diff --git a/AGENTS.md b/AGENTS.md index f6e26cc3..1f55440a 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -19,6 +19,9 @@ - `uv run pre-commit run --all-files` — enforce formatting and lint rules before pushing. - `uv run pytest` — execute the suite with the active interpreter. +- `scripts/check_test_parity.sh` — run changed tests and report sync/async + parity gaps (accepts optional base/head refs, defaults to + `upstream/main..HEAD`). - `uv build` — produce wheels and sdists identical to the release workflow. - `uvx nox -s tests` — create matrix virtualenvs via nox and execute the pytest session. diff --git a/README.md b/README.md index 99ff6936..3a454d4f 100644 --- a/README.md +++ b/README.md @@ -36,3 +36,9 @@ Run the test suite with: ```bash uv run pytest ``` + +Check sync/async parity for changed tests (defaults to `upstream/main..HEAD`): + +```bash +scripts/check_test_parity.sh +``` diff --git a/TESTING_GUIDELINES.md b/TESTING_GUIDELINES.md index c0852a26..f2603b20 100644 --- a/TESTING_GUIDELINES.md +++ b/TESTING_GUIDELINES.md @@ -13,6 +13,9 @@ iteration required. request customization, validation failures, file helpers, and live calls. Do not hide the transport behind a parameter; the test name itself should reveal which client is under test. +- **Check parity regularly.** Run `scripts/check_test_parity.sh` (defaults to + `upstream/main..HEAD`) to spot missing sync/async counterparts, keeping + parameterized test IDs aligned between transports. - **Exercise both sides of the contract.** Hermetic tests (via `httpx.MockTransport`) validate serialization and local validation. Live suites prove the server behaves the same way, including invalid literal diff --git a/pyproject.toml b/pyproject.toml index 69d2a422..a28be645 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ requires-python = ">=3.10" dependencies = [ "exceptiongroup>=1.3.0", "httpx>=0.28.1", + "langcodes>=3.4.0", "pydantic>=2.12.0", ] diff --git a/scripts/check_test_parity.sh b/scripts/check_test_parity.sh new file mode 100755 index 00000000..0b8f13d3 --- /dev/null +++ b/scripts/check_test_parity.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash +set -euo pipefail +IFS=$'\n\t' + +base_ref="${1:-upstream/main}" +head_ref="${2:-HEAD}" + +if ! git rev-parse --verify "$base_ref" > /dev/null 2>&1; then + echo "Base ref '$base_ref' not found." >&2 + exit 1 +fi + +if ! git rev-parse --verify "$head_ref" > /dev/null 2>&1; then + echo "Head ref '$head_ref' not found." >&2 + exit 1 +fi + +test_files=() +while IFS= read -r file; do + if [[ -n "$file" ]]; then + test_files+=("$file") + fi +done < <( + git diff --name-only --diff-filter=d "$base_ref..$head_ref" -- tests | grep -E '\.py$' || true +) + +if [[ ${#test_files[@]} -eq 0 ]]; then + echo "No changed test files under tests/ for $base_ref..$head_ref." + exit 0 +fi + +tmp_output="$(mktemp)" +tmp_tests="$(mktemp)" +tmp_counts="$(mktemp)" +tmp_missing_sync="$(mktemp)" +tmp_missing_async="$(mktemp)" +tmp_payload="$(mktemp)" +trap 'rm -f "$tmp_output" "$tmp_tests" "$tmp_counts" "$tmp_missing_sync" "$tmp_missing_async" "$tmp_payload"' EXIT + +echo "Running pytest on changed tests:" +printf ' - %s\n' "${test_files[@]}" + +uv run pytest -vv -rA -n auto "${test_files[@]}" | tee "$tmp_output" + +awk ' +{ + line = $0; + sub(/^\[[^]]+\][[:space:]]+/, "", line); + sub(/[[:space:]]+\[[^]]+\]$/, "", line); + if (line ~ /^(PASSED|FAILED|SKIPPED|XFAIL|XPASS|ERROR)[[:space:]]+tests\/.*::/) { + sub(/^(PASSED|FAILED|SKIPPED|XFAIL|XPASS|ERROR)[[:space:]]+/, "", line); + print line; + } else if (line ~ /^tests\/.*::.*[[:space:]]+(PASSED|FAILED|SKIPPED|XFAIL|XPASS|ERROR)$/) { + sub(/[[:space:]]+(PASSED|FAILED|SKIPPED|XFAIL|XPASS|ERROR)$/, "", line); + print line; + } +} +' "$tmp_output" > "$tmp_tests" + +if [[ ! -s "$tmp_tests" ]]; then + echo "No test node IDs detected in pytest output; try rerunning with -vv." >&2 + exit 1 +fi + +awk -v sync_file="$tmp_missing_sync" \ + -v async_file="$tmp_missing_async" \ + -v payload_file="$tmp_payload" \ + -v counts_file="$tmp_counts" ' +function is_async(nodeid) { + return (nodeid ~ /::test_.*async_/); +} +function normalize(nodeid) { + sub(/::test_live_async_/, "::test_live_", nodeid); + sub(/::test_async_/, "::test_", nodeid); + return nodeid; +} +{ + total++; + if ($0 ~ /::test_.*(payload|validation)/) { + payload_like[$0] = 1; + } + if (is_async($0)) { + async_count++; + norm = normalize($0); + async_norm[norm] = 1; + async_orig[norm] = $0; + } else { + sync_count++; + norm = normalize($0); + sync_norm[norm] = 1; + sync_orig[norm] = $0; + } +} +END { + missing_sync = 0; + missing_async = 0; + + for (n in async_norm) { + if (!(n in sync_norm)) { + missing_sync++; + print async_orig[n] >> sync_file; + } + } + for (n in sync_norm) { + if (!(n in async_norm)) { + missing_async++; + print sync_orig[n] >> async_file; + } + } + payload_count = 0; + for (t in payload_like) { + payload_count++; + print t >> payload_file; + } + + print "total=" total > counts_file; + print "sync_count=" sync_count >> counts_file; + print "async_count=" async_count >> counts_file; + print "missing_sync=" missing_sync >> counts_file; + print "missing_async=" missing_async >> counts_file; + print "payload_count=" payload_count >> counts_file; +} +' "$tmp_tests" + +total=0 +sync_count=0 +async_count=0 +missing_sync=0 +missing_async=0 +payload_count=0 +while IFS='=' read -r key value; do + case "$key" in + total) total="$value" ;; + sync_count) sync_count="$value" ;; + async_count) async_count="$value" ;; + missing_sync) missing_sync="$value" ;; + missing_async) missing_async="$value" ;; + payload_count) payload_count="$value" ;; + esac +done < "$tmp_counts" + +echo "" +echo "Test parity report" +echo "Total tests: $total" +echo "Sync tests: $sync_count" +echo "Async tests: $async_count" +echo "Missing sync counterparts: $missing_sync" +if [[ "$missing_sync" -gt 0 ]]; then + sort "$tmp_missing_sync" | while read -r line; do + echo " - $line" + done +fi +echo "Missing async counterparts: $missing_async" +if [[ "$missing_async" -gt 0 ]]; then + sort "$tmp_missing_async" | while read -r line; do + echo " - $line" + done +fi +echo "Payload/validation-style tests (name contains payload/validation): $payload_count" +if [[ "$payload_count" -gt 0 ]]; then + sort "$tmp_payload" | while read -r line; do + echo " - $line" + done +fi diff --git a/src/pdfrest/client.py b/src/pdfrest/client.py index 8818899f..0035227c 100644 --- a/src/pdfrest/client.py +++ b/src/pdfrest/client.py @@ -66,41 +66,72 @@ PdfRestFileBasedResponse, PdfRestFileID, PdfRestInfoResponse, + SummarizePdfTextResponse, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, UpResponse, ) - -__all__ = ("AsyncPdfRestClient", "PdfRestClient") - from .models._internal import ( BasePdfRestGraphicPayload, BmpPdfRestPayload, + ConvertToMarkdownPayload, DeletePayload, + ExtractImagesPayload, + ExtractTextPayload, GifPdfRestPayload, JpegPdfRestPayload, + OcrPdfPayload, PdfCompressPayload, + PdfFlattenAnnotationsPayload, PdfFlattenFormsPayload, + PdfFlattenTransparenciesPayload, PdfInfoPayload, + PdfLinearizePayload, PdfMergePayload, + PdfRasterizePayload, PdfRedactionApplyPayload, PdfRedactionPreviewPayload, PdfRestRawFileResponse, PdfSplitPayload, + PdfToExcelPayload, + PdfToPdfaPayload, PdfToPdfxPayload, + PdfToPowerpointPayload, PdfToWordPayload, + PdfXfaToAcroformsPayload, PngPdfRestPayload, + SummarizePdfTextPayload, TiffPdfRestPayload, + TranslatePdfTextPayload, UploadURLs, ) from .types import ( ALL_PDF_INFO_QUERIES, + BmpColorModel, + CompressionLevel, + ExtractTextGranularity, + FlattenQuality, + GifColorModel, + GraphicSmoothing, + JpegColorModel, + OcrLanguage, + PdfAType, PdfInfoQuery, PdfMergeInput, PdfPageSelection, PdfRedactionInstruction, PdfRGBColor, PdfXType, + PngColorModel, + SummaryFormat, + SummaryOutputFormat, + TiffColorModel, + TranslateOutputFormat, ) +__all__ = ("AsyncPdfRestClient", "PdfRestClient") +FileResponseModel = TypeVar("FileResponseModel", bound=PdfRestFileBasedResponse) + DEFAULT_BASE_URL = "https://api.pdfrest.com" API_KEY_ENV_VAR = "PDFREST_API_KEY" API_KEY_HEADER_NAME = "Api-Key" @@ -965,11 +996,12 @@ def _post_file_operation( endpoint: str, payload: dict[str, Any], payload_model: type[BaseModel], + response_model: type[FileResponseModel] = PdfRestFileBasedResponse, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: + ) -> FileResponseModel: job_options = payload_model.model_validate(payload) json_body = job_options.model_dump( mode="json", by_alias=True, exclude_none=True, exclude_unset=True @@ -997,15 +1029,17 @@ def _post_file_operation( for file_id in output_ids ] - return PdfRestFileBasedResponse.model_validate( - { - "input_id": [str(file_id) for file_id in raw_response.input_id], - "output_file": [ - file.model_dump(mode="json", by_alias=True) for file in output_files - ], - "warning": raw_response.warning, - } - ) + response_payload: dict[str, Any] = { + "input_id": [str(file_id) for file_id in raw_response.input_id], + "output_file": [ + file.model_dump(mode="json", by_alias=True) for file in output_files + ], + "warning": raw_response.warning, + } + if raw_response.model_extra: + response_payload.update(raw_response.model_extra) + + return response_model.model_validate(response_payload) def send_request(self, request: _RequestModel) -> Any: return self._send_request(request) @@ -1229,11 +1263,12 @@ async def _post_file_operation( endpoint: str, payload: dict[str, Any], payload_model: type[BaseModel], + response_model: type[FileResponseModel] = PdfRestFileBasedResponse, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: + ) -> FileResponseModel: job_options = payload_model.model_validate(payload) request = self.prepare_request( "POST", @@ -1269,15 +1304,17 @@ async def throttled_fetch_file_info(file_id: str) -> PdfRestFile: ) ) - return PdfRestFileBasedResponse.model_validate( - { - "input_id": [str(file_id) for file_id in raw_response.input_id], - "output_file": [ - file.model_dump(mode="json", by_alias=True) for file in output_files - ], - "warning": raw_response.warning, - } - ) + response_payload: dict[str, Any] = { + "input_id": [str(file_id) for file_id in raw_response.input_id], + "output_file": [ + file.model_dump(mode="json", by_alias=True) for file in output_files + ], + "warning": raw_response.warning, + } + if raw_response.model_extra: + response_payload.update(raw_response.model_extra) + + return response_model.model_validate(response_payload) async def send_request(self, request: _RequestModel) -> Any: return await self._send_request(request) @@ -2105,506 +2142,1296 @@ def query_pdf_info( raw_payload = self._send_request(request) return PdfRestInfoResponse.model_validate(raw_payload) - def preview_redactions( + def summarize_text( self, file: PdfRestFile | Sequence[PdfRestFile], *, - redactions: PdfRedactionInstruction | Sequence[PdfRedactionInstruction], + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Generate a PDF redaction preview with annotated redaction rectangles.""" + ) -> SummarizePdfTextResponse: + """Summarize the textual content of a PDF, Markdown, or text document. + + Always requests JSON output and returns the inline summary response defined in + the pdfRest API reference. + """ payload: dict[str, Any] = { "files": file, - "redactions": redactions, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "json", } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output - return self._post_file_operation( - endpoint="/pdf-with-redacted-text-preview", - payload=payload, - payload_model=PdfRedactionPreviewPayload, + validated_payload = SummarizePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/summarized-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) + raw_payload = self._send_request(request) + return SummarizePdfTextResponse.model_validate(raw_payload) - def apply_redactions( + def summarize_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - rgb_color: PdfRGBColor | Sequence[int] | None = None, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Apply previously previewed redactions and return the final redacted PDF.""" + """Summarize a document and return the result as a downloadable file.""" payload: dict[str, Any] = { "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "file", } - if rgb_color is not None: - payload["rgb_color"] = rgb_color + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/pdf-with-redacted-text-applied", + endpoint="/summarized-pdf-text", payload=payload, - payload_model=PdfRedactionApplyPayload, + payload_model=SummarizePdfTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def split_pdf( + def convert_to_markdown( self, file: PdfRestFile | Sequence[PdfRestFile], *, - page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None, - output_prefix: str | None = None, + pages: PdfPageSelection | None = None, + page_break_comments: bool = False, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Split a PDF into one or more PDF files based on the provided page groups.""" + """Convert a PDF to Markdown and return a file-based response.""" - payload: dict[str, Any] = {"files": file} - if page_groups is not None: - payload["page_groups"] = page_groups - if output_prefix is not None: - payload["output_prefix"] = output_prefix + payload: dict[str, Any] = { + "files": file, + "output_type": "file", + "page_break_comments": page_break_comments, + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output return self._post_file_operation( - endpoint="/split-pdf", + endpoint="/markdown", payload=payload, - payload_model=PdfSplitPayload, + payload_model=ConvertToMarkdownPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def merge_pdfs( + def ocr_pdf( self, - sources: Sequence[PdfMergeInput], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, + languages: OcrLanguage | Sequence[OcrLanguage] = "English", + pages: PdfPageSelection | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Merge multiple PDFs (or page subsets) into a single PDF file.""" + """Perform OCR on a PDF to make text searchable and extractable.""" - payload: dict[str, Any] = {"sources": sources} - if output_prefix is not None: - payload["output_prefix"] = output_prefix + payload: dict[str, Any] = {"files": file, "languages": languages} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output return self._post_file_operation( - endpoint="/merged-pdf", + endpoint="/pdf-with-ocr-text", payload=payload, - payload_model=PdfMergePayload, + payload_model=OcrPdfPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_word( + def translate_pdf_text( self, file: PdfRestFile | Sequence[PdfRestFile], *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Convert a PDF to a Word document.""" + ) -> TranslatePdfTextResponse: + """Translate the textual content of a PDF, Markdown, or text document (JSON).""" - payload: dict[str, Any] = {"files": file} + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + validated_payload = TranslatePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/translated-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = self._send_request(request) + return TranslatePdfTextResponse.model_validate(raw_payload) + + def translate_pdf_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> TranslatePdfTextFileResponse: + """Translate textual content and receive a file-based response.""" + + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/word", + endpoint="/translated-pdf-text", payload=payload, - payload_model=PdfToWordPayload, + payload_model=TranslatePdfTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, + response_model=TranslatePdfTextFileResponse, ) - def flatten_pdf_forms( + def extract_images( self, file: PdfRestFile | Sequence[PdfRestFile], *, + pages: PdfPageSelection | None = None, output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Flatten form fields in a PDF so they are no longer editable.""" + """Extract embedded images from a PDF.""" payload: dict[str, Any] = {"files": file} + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/flattened-forms-pdf", + endpoint="/extracted-images", payload=payload, - payload_model=PdfFlattenFormsPayload, + payload_model=ExtractImagesPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def compress_pdf( + def extract_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - compression_level: Literal["low", "medium", "high", "custom"], - profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + pages: PdfPageSelection | None = None, + full_text: ExtractTextGranularity = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Compress a PDF using preset or custom compression profiles.""" + """Extract text content from a PDF and return a file-based response.""" payload: dict[str, Any] = { "files": file, - "compression_level": compression_level, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "file", } - if profile is not None: - payload["profile"] = profile + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/compressed-pdf", + endpoint="/extracted-text", payload=payload, - payload_model=PdfCompressPayload, + payload_model=ExtractTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_pdfx( + def preview_redactions( self, file: PdfRestFile | Sequence[PdfRestFile], *, - output_type: PdfXType, + redactions: PdfRedactionInstruction | Sequence[PdfRedactionInstruction], output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert a PDF to a specified PDF/X version.""" + """Generate a PDF redaction preview with annotated redaction rectangles.""" - payload: dict[str, Any] = {"files": file, "output_type": output_type} + payload: dict[str, Any] = { + "files": file, + "redactions": redactions, + } if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/pdfx", + endpoint="/pdf-with-redacted-text-preview", payload=payload, - payload_model=PdfToPdfxPayload, + payload_model=PdfRedactionPreviewPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_png( + def apply_redactions( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "rgba", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + rgb_color: PdfRGBColor | Sequence[int] | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to PNG images.""" + """Apply previously previewed redactions and return the final redacted PDF.""" payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, + "files": file, } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + if rgb_color is not None: + payload["rgb_color"] = rgb_color + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/png", + return self._post_file_operation( + endpoint="/pdf-with-redacted-text-applied", payload=payload, - payload_model=PngPdfRestPayload, + payload_model=PdfRedactionApplyPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_bmp( + def split_pdf( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, + page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None, output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to BMP images.""" + """Split a PDF into one or more PDF files based on the provided page groups.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } + payload: dict[str, Any] = {"files": file} + if page_groups is not None: + payload["page_groups"] = page_groups if output_prefix is not None: payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - return self._convert_to_graphic( - endpoint="/bmp", + return self._post_file_operation( + endpoint="/split-pdf", payload=payload, - payload_model=BmpPdfRestPayload, + payload_model=PdfSplitPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_gif( + def merge_pdfs( self, - files: PdfRestFile | Sequence[PdfRestFile], + sources: Sequence[PdfMergeInput], *, output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to GIF images.""" + """Merge multiple PDFs (or page subsets) into a single PDF file.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } + payload: dict[str, Any] = {"sources": sources} if output_prefix is not None: payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - return self._convert_to_graphic( - endpoint="/gif", + return self._post_file_operation( + endpoint="/merged-pdf", payload=payload, - payload_model=GifPdfRestPayload, + payload_model=PdfMergePayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_jpeg( + def convert_to_excel( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "cmyk", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, - jpeg_quality: int | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to JPEG images.""" + """Convert a PDF to an Excel spreadsheet.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - if jpeg_quality is not None: - payload["jpeg_quality"] = jpeg_quality + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/jpg", + return self._post_file_operation( + endpoint="/excel", payload=payload, - payload_model=JpegPdfRestPayload, + payload_model=PdfToExcelPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_tiff( + def convert_to_powerpoint( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "rgba", "cmyk", "lab", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to TIFF images.""" + """Convert a PDF to a PowerPoint presentation.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/tif", + return self._post_file_operation( + endpoint="/powerpoint", payload=payload, - payload_model=TiffPdfRestPayload, + payload_model=PdfToPowerpointPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - -class AsyncPdfRestClient(_AsyncApiClient): - """Asynchronous client for interacting with the pdfrest API.""" - + def convert_xfa_to_acroforms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert an XFA PDF to an AcroForm-enabled PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdf-with-acroforms", + payload=payload, + payload_model=PdfXfaToAcroformsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_word( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a Word document.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/word", + payload=payload, + payload_model=PdfToWordPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_pdf_forms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten form fields in a PDF so they are no longer editable.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-forms-pdf", + payload=payload, + payload_model=PdfFlattenFormsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def compress_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + compression_level: CompressionLevel, + profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Compress a PDF using preset or custom compression profiles.""" + + payload: dict[str, Any] = { + "files": file, + "compression_level": compression_level, + } + if profile is not None: + payload["profile"] = profile + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/compressed-pdf", + payload=payload, + payload_model=PdfCompressPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_transparencies( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + quality: FlattenQuality = "medium", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten transparent objects in a PDF.""" + + payload: dict[str, Any] = {"files": file, "quality": quality} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-transparencies-pdf", + payload=payload, + payload_model=PdfFlattenTransparenciesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def linearize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Linearize a PDF for optimized fast web view.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/linearized-pdf", + payload=payload, + payload_model=PdfLinearizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_annotations( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten annotations into the PDF content.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-annotations-pdf", + payload=payload, + payload_model=PdfFlattenAnnotationsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def rasterize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Rasterize a PDF into a flattened bitmap-based PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/rasterized-pdf", + payload=payload, + payload_model=PdfRasterizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_pdfa( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfAType, + output: str | None = None, + rasterize_if_errors_encountered: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a specified PDF/A version.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": output_type, + "rasterize_if_errors_encountered": rasterize_if_errors_encountered, + } + if output is not None: + payload["output"] = output + return self._post_file_operation( + endpoint="/pdfa", + payload=payload, + payload_model=PdfToPdfaPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_pdfx( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfXType, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a specified PDF/X version.""" + + payload: dict[str, Any] = {"files": file, "output_type": output_type} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdfx", + payload=payload, + payload_model=PdfToPdfxPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_png( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: PngColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to PNG images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + "smoothing": smoothing, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + + return self._convert_to_graphic( + endpoint="/png", + payload=payload, + payload_model=PngPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_bmp( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: BmpColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to BMP images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + "smoothing": smoothing, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + + return self._convert_to_graphic( + endpoint="/bmp", + payload=payload, + payload_model=BmpPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_gif( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: GifColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to GIF images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + "smoothing": smoothing, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + + return self._convert_to_graphic( + endpoint="/gif", + payload=payload, + payload_model=GifPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_jpeg( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: JpegColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", + jpeg_quality: int = 75, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to JPEG images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + "smoothing": smoothing, + "jpeg_quality": jpeg_quality, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + + return self._convert_to_graphic( + endpoint="/jpg", + payload=payload, + payload_model=JpegPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_tiff( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: TiffColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to TIFF images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + "smoothing": smoothing, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + + return self._convert_to_graphic( + endpoint="/tif", + payload=payload, + payload_model=TiffPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + +class AsyncPdfRestClient(_AsyncApiClient): + """Asynchronous client for interacting with the pdfrest API.""" + def __init__( self, *, api_key: str | None = None, base_url: str | URL | None = None, timeout: TimeoutTypes | None = None, - headers: AnyMapping | None = None, - http_client: httpx.AsyncClient | None = None, - transport: httpx.AsyncBaseTransport | None = None, - concurrency_limit: int = DEFAULT_FILE_INFO_CONCURRENCY, - max_retries: int = DEFAULT_MAX_RETRIES, - ) -> None: - """Create an asynchronous pdfRest client.""" + headers: AnyMapping | None = None, + http_client: httpx.AsyncClient | None = None, + transport: httpx.AsyncBaseTransport | None = None, + concurrency_limit: int = DEFAULT_FILE_INFO_CONCURRENCY, + max_retries: int = DEFAULT_MAX_RETRIES, + ) -> None: + """Create an asynchronous pdfRest client.""" + + super().__init__( + api_key=api_key, + base_url=base_url, + timeout=timeout, + headers=headers, + http_client=http_client, + transport=transport, + concurrency_limit=concurrency_limit, + max_retries=max_retries, + ) + self._files_client = _AsyncFilesClient(self) + + @override + async def __aenter__(self) -> AsyncPdfRestClient: + _ = await super().__aenter__() + return self + + @override + async def __aexit__(self, exc_type: Any, exc: Any, traceback: Any) -> None: + await super().__aexit__(exc_type, exc, traceback) + + @property + def files(self) -> _AsyncFilesClient: + return self._files_client + + async def query_pdf_info( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + queries: Sequence[PdfInfoQuery] | PdfInfoQuery = ALL_PDF_INFO_QUERIES, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestInfoResponse: + """Query pdfRest for metadata describing a PDF document asynchronously.""" + + payload = PdfInfoPayload.model_validate({"file": file, "queries": queries}) + request = self.prepare_request( + "POST", + "/pdf-info", + json_body=payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_defaults=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return PdfRestInfoResponse.model_validate(raw_payload) + + async def summarize_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> SummarizePdfTextResponse: + """Summarize the textual content of a PDF, Markdown, or text document. + + Always requests JSON output and returns the inline summary response defined in + the pdfRest API reference. + """ + + payload: dict[str, Any] = { + "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + validated_payload = SummarizePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/summarized-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return SummarizePdfTextResponse.model_validate(raw_payload) + + async def summarize_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Summarize a document and return the result as a downloadable file.""" + + payload: dict[str, Any] = { + "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/summarized-pdf-text", + payload=payload, + payload_model=SummarizePdfTextPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_markdown( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + page_break_comments: bool = False, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to Markdown and return a file-based response.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": "file", + "page_break_comments": page_break_comments, + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/markdown", + payload=payload, + payload_model=ConvertToMarkdownPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def ocr_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + languages: OcrLanguage | Sequence[OcrLanguage] = "English", + pages: PdfPageSelection | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Perform OCR on a PDF to make text searchable and extractable.""" + + payload: dict[str, Any] = {"files": file, "languages": languages} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-ocr-text", + payload=payload, + payload_model=OcrPdfPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def translate_pdf_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> TranslatePdfTextResponse: + """Translate the textual content of a PDF, Markdown, or text document (JSON).""" + + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + validated_payload = TranslatePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/translated-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return TranslatePdfTextResponse.model_validate(raw_payload) + + async def translate_pdf_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> TranslatePdfTextFileResponse: + """Translate textual content and receive a file-based response.""" - super().__init__( - api_key=api_key, - base_url=base_url, + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/translated-pdf-text", + payload=payload, + payload_model=TranslatePdfTextPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, timeout=timeout, - headers=headers, - http_client=http_client, - transport=transport, - concurrency_limit=concurrency_limit, - max_retries=max_retries, + response_model=TranslatePdfTextFileResponse, ) - self._files_client = _AsyncFilesClient(self) - @override - async def __aenter__(self) -> AsyncPdfRestClient: - _ = await super().__aenter__() - return self + async def extract_images( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Extract embedded images from a PDF.""" - @override - async def __aexit__(self, exc_type: Any, exc: Any, traceback: Any) -> None: - await super().__aexit__(exc_type, exc, traceback) + payload: dict[str, Any] = {"files": file} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output - @property - def files(self) -> _AsyncFilesClient: - return self._files_client + return await self._post_file_operation( + endpoint="/extracted-images", + payload=payload, + payload_model=ExtractImagesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) - async def query_pdf_info( + async def extract_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - queries: Sequence[PdfInfoQuery] | PdfInfoQuery = ALL_PDF_INFO_QUERIES, + pages: PdfPageSelection | None = None, + full_text: ExtractTextGranularity = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestInfoResponse: - """Query pdfRest for metadata describing a PDF document asynchronously.""" + ) -> PdfRestFileBasedResponse: + """Extract text content from a PDF and return a file-based response.""" - payload = PdfInfoPayload.model_validate({"file": file, "queries": queries}) - request = self.prepare_request( - "POST", - "/pdf-info", - json_body=payload.model_dump( - mode="json", by_alias=True, exclude_none=True, exclude_defaults=True - ), + payload: dict[str, Any] = { + "files": file, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/extracted-text", + payload=payload, + payload_model=ExtractTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - raw_payload = await self._send_request(request) - return PdfRestInfoResponse.model_validate(raw_payload) async def preview_redactions( self, @@ -2764,6 +3591,84 @@ async def merge_pdfs( timeout=timeout, ) + async def convert_to_excel( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to an Excel spreadsheet.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/excel", + payload=payload, + payload_model=PdfToExcelPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_powerpoint( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to a PowerPoint presentation.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/powerpoint", + payload=payload, + payload_model=PdfToPowerpointPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_xfa_to_acroforms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert an XFA PDF to an AcroForm-enabled PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-acroforms", + payload=payload, + payload_model=PdfXfaToAcroformsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + async def convert_to_word( self, file: PdfRestFile | Sequence[PdfRestFile], @@ -2820,7 +3725,7 @@ async def compress_pdf( self, file: PdfRestFile | Sequence[PdfRestFile], *, - compression_level: Literal["low", "medium", "high", "custom"], + compression_level: CompressionLevel, profile: PdfRestFile | Sequence[PdfRestFile] | None = None, output: str | None = None, extra_query: Query | None = None, @@ -2849,6 +3754,143 @@ async def compress_pdf( timeout=timeout, ) + async def flatten_transparencies( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + quality: FlattenQuality = "medium", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously flatten transparent objects in a PDF.""" + + payload: dict[str, Any] = {"files": file, "quality": quality} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/flattened-transparencies-pdf", + payload=payload, + payload_model=PdfFlattenTransparenciesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def linearize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously linearize a PDF for optimized fast web view.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/linearized-pdf", + payload=payload, + payload_model=PdfLinearizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def flatten_annotations( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously flatten annotations into the PDF content.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/flattened-annotations-pdf", + payload=payload, + payload_model=PdfFlattenAnnotationsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def rasterize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously rasterize a PDF into a flattened bitmap-based PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/rasterized-pdf", + payload=payload, + payload_model=PdfRasterizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_pdfa( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfAType, + output: str | None = None, + rasterize_if_errors_encountered: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to a specified PDF/A version.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": output_type, + "rasterize_if_errors_encountered": rasterize_if_errors_encountered, + } + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdfa", + payload=payload, + payload_model=PdfToPdfaPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + async def convert_to_pdfx( self, file: PdfRestFile | Sequence[PdfRestFile], @@ -2883,10 +3925,8 @@ async def convert_to_png( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "rgba", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: PngColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -2898,13 +3938,12 @@ async def convert_to_png( "files": files, "resolution": resolution, "color_model": color_model, + "smoothing": smoothing, } if output_prefix is not None: payload["output_prefix"] = output_prefix if page_range is not None: payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing return await self._convert_to_graphic( endpoint="/png", @@ -2923,10 +3962,8 @@ async def convert_to_bmp( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: BmpColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -2938,13 +3975,12 @@ async def convert_to_bmp( "files": files, "resolution": resolution, "color_model": color_model, + "smoothing": smoothing, } if output_prefix is not None: payload["output_prefix"] = output_prefix if page_range is not None: payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing return await self._convert_to_graphic( endpoint="/bmp", @@ -2963,10 +3999,8 @@ async def convert_to_gif( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: GifColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -2978,13 +4012,12 @@ async def convert_to_gif( "files": files, "resolution": resolution, "color_model": color_model, + "smoothing": smoothing, } if output_prefix is not None: payload["output_prefix"] = output_prefix if page_range is not None: payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing return await self._convert_to_graphic( endpoint="/gif", @@ -3003,11 +4036,9 @@ async def convert_to_jpeg( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "cmyk", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, - jpeg_quality: int | None = None, + color_model: JpegColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", + jpeg_quality: int = 75, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -3019,15 +4050,13 @@ async def convert_to_jpeg( "files": files, "resolution": resolution, "color_model": color_model, + "smoothing": smoothing, + "jpeg_quality": jpeg_quality, } if output_prefix is not None: payload["output_prefix"] = output_prefix if page_range is not None: payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - if jpeg_quality is not None: - payload["jpeg_quality"] = jpeg_quality return await self._convert_to_graphic( endpoint="/jpg", @@ -3046,10 +4075,8 @@ async def convert_to_tiff( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "rgba", "cmyk", "lab", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: TiffColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] = "none", extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -3061,13 +4088,12 @@ async def convert_to_tiff( "files": files, "resolution": resolution, "color_model": color_model, + "smoothing": smoothing, } if output_prefix is not None: payload["output_prefix"] = output_prefix if page_range is not None: payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing return await self._convert_to_graphic( endpoint="/tif", diff --git a/src/pdfrest/models/__init__.py b/src/pdfrest/models/__init__.py index 54c9aeb4..ef10e565 100644 --- a/src/pdfrest/models/__init__.py +++ b/src/pdfrest/models/__init__.py @@ -5,6 +5,9 @@ PdfRestFileBasedResponse, PdfRestFileID, PdfRestInfoResponse, + SummarizePdfTextResponse, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, UpResponse, ) @@ -15,5 +18,8 @@ "PdfRestFileBasedResponse", "PdfRestFileID", "PdfRestInfoResponse", + "SummarizePdfTextResponse", + "TranslatePdfTextFileResponse", + "TranslatePdfTextResponse", "UpResponse", ] diff --git a/src/pdfrest/models/_internal.py b/src/pdfrest/models/_internal.py index 33cb8747..1c654f0d 100644 --- a/src/pdfrest/models/_internal.py +++ b/src/pdfrest/models/_internal.py @@ -6,6 +6,7 @@ from pathlib import PurePath from typing import Annotated, Any, Generic, Literal, TypeVar +from langcodes import tag_is_valid from pydantic import ( AfterValidator, AliasChoices, @@ -21,7 +22,16 @@ from pdfrest.types.public import PdfRedactionPreset -from ..types import PdfInfoQuery, PdfXType +from ..types import ( + OcrLanguage, + PdfAType, + PdfInfoQuery, + PdfXType, + SummaryFormat, + SummaryOutputFormat, + SummaryOutputType, + TranslateOutputFormat, +) from . import PdfRestFile from .public import PdfRestFileID @@ -112,6 +122,12 @@ def _serialize_file_ids(value: list[PdfRestFile]) -> str: return ",".join(str(file.id) for file in value) +def _bool_to_on_off(value: Any) -> Any: + if isinstance(value, bool): + return "on" if value else "off" + return value + + def _serialize_page_ranges(value: list[str | int | tuple[str | int, ...]]) -> str: def join_tuple(value: str | int | tuple[str | int, ...]) -> str: if isinstance(value, tuple): @@ -160,6 +176,45 @@ def _int_to_string(value: Any) -> Any: return value +_OUTPUT_LANGUAGE_ERROR = ( + "The provided 'output_language' language tag is invalid. Format 'output_language' as " + "a valid 2-3 character ISO 639 language code (e.g., 'en', 'es', 'fra'), optionally " + "with a script, alphabetic region, or numeric region (e.g., 'zh-Hant', 'eng-US', " + "'es-419'). See documentation for recommended formats." +) + + +def _validate_output_language(value: str) -> str: + if not value: + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + trimmed = value.strip() + if not trimmed: + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + segments = trimmed.split("-") + if len(segments) > 2: + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + language = segments[0] + if not re.fullmatch(r"[A-Za-z]{2,3}", language): + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + if len(segments) == 2: + subtag = segments[1] + if not ( + re.fullmatch(r"[A-Za-z]{4}", subtag) + or re.fullmatch(r"[A-Za-z]{2}", subtag) + or re.fullmatch(r"[0-9]{3}", subtag) + ): + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + if not tag_is_valid(trimmed): + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + return trimmed + + class UploadURLs(BaseModel): url: Annotated[ list[HttpUrl] | HttpUrl, @@ -248,6 +303,265 @@ class PdfInfoPayload(BaseModel): ] +class SummarizePdfTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready summarize request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types( + "application/pdf", + "text/markdown", + "text/plain", + error_msg="Must be a PDF, Markdown, or plain text file", + ) + ), + PlainSerializer(_serialize_as_first_file_id), + ] + target_word_count: Annotated[ + int | None, Field(serialization_alias="target_word_count", ge=1, default=400) + ] = 400 + summary_format: Annotated[ + SummaryFormat, Field(serialization_alias="summary_format", default="overview") + ] = "overview" + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_format: Annotated[ + SummaryOutputFormat, + Field(serialization_alias="output_format", default="markdown"), + ] = "markdown" + output_type: Annotated[ + SummaryOutputType, Field(serialization_alias="output_type", default="json") + ] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class OcrPdfPayload(BaseModel): + """Adapt caller options into a pdfRest-ready OCR request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + languages: Annotated[ + list[OcrLanguage], + Field( + serialization_alias="languages", + validation_alias=AliasChoices("languages", "language"), + min_length=1, + default_factory=lambda: ["English"], + ), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + PlainSerializer(_serialize_as_comma_separated_string), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ExtractTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready extract text request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + full_text: Literal["off", "by_page", "document"] = "document" + preserve_line_breaks: Annotated[ + Literal["off", "on"], BeforeValidator(_bool_to_on_off) + ] = "off" + word_style: Annotated[Literal["off", "on"], BeforeValidator(_bool_to_on_off)] = ( + "off" + ) + word_coordinates: Annotated[ + Literal["off", "on"], BeforeValidator(_bool_to_on_off) + ] = "off" + output_type: Literal["json", "file"] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ConvertToMarkdownPayload(BaseModel): + """Adapt caller options into a pdfRest-ready markdown conversion payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_type: Annotated[ + SummaryOutputType, Field(serialization_alias="output_type", default="json") + ] = "json" + page_break_comments: Annotated[ + Literal["on", "off"] | None, + Field(serialization_alias="page_break_comments", default=None), + BeforeValidator(_bool_to_on_off), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class TranslatePdfTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready translate request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types( + "application/pdf", + "text/markdown", + "text/plain", + error_msg="Must be a PDF, Markdown, or plain text file", + ) + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output_language: Annotated[ + str, + Field(serialization_alias="output_language"), + AfterValidator(_validate_output_language), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_format: Annotated[ + TranslateOutputFormat, + Field(serialization_alias="output_format", default="markdown"), + ] = "markdown" + output_type: Annotated[ + Literal["json", "file"], + Field(serialization_alias="output_type", default="json"), + ] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ExtractImagesPayload(BaseModel): + """Adapt caller options into a pdfRest-ready extract images request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + RgbChannel = Annotated[int, Field(ge=0, le=255)] @@ -519,6 +833,87 @@ class PdfToWordPayload(BaseModel): ] = None +class PdfToExcelPayload(BaseModel): + """Adapt caller options into a pdfRest-ready Excel request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfToPowerpointPayload(BaseModel): + """Adapt caller options into a pdfRest-ready PowerPoint request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfToPdfaPayload(BaseModel): + """Adapt caller options into a pdfRest-ready PDF/A request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output_type: Annotated[PdfAType, Field(serialization_alias="output_type")] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + rasterize_if_errors_encountered: Annotated[ + Literal["on", "off"] | None, + Field( + serialization_alias="rasterize_if_errors_encountered", + default=None, + ), + BeforeValidator(_bool_to_on_off), + ] = None + + class PdfToPdfxPayload(BaseModel): """Adapt caller options into a pdfRest-ready PDF/X request payload.""" @@ -626,6 +1021,127 @@ def _validate_profile_dependency(self) -> PdfCompressPayload: return self +class PdfXfaToAcroformsPayload(BaseModel): + """Adapt caller options into a pdfRest-ready XFA-to-AcroForms request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfLinearizePayload(BaseModel): + """Adapt caller options into a pdfRest-ready linearize PDF request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfRasterizePayload(BaseModel): + """Adapt caller options into a pdfRest-ready rasterize PDF request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfFlattenTransparenciesPayload(BaseModel): + """Adapt caller options into a pdfRest-ready flatten-transparencies request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + quality: Literal["low", "medium", "high"] = "medium" + + +class PdfFlattenAnnotationsPayload(BaseModel): + """Adapt caller options into a pdfRest-ready flatten-annotations request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + class BmpPdfRestPayload(BasePdfRestGraphicPayload[Literal["rgb", "gray"]]): """Adapt caller options into a pdfRest-ready BMP request payload.""" diff --git a/src/pdfrest/models/public.py b/src/pdfrest/models/public.py index 3de11476..e4dc8a3a 100644 --- a/src/pdfrest/models/public.py +++ b/src/pdfrest/models/public.py @@ -26,6 +26,9 @@ "PdfRestFileBasedResponse", "PdfRestFileID", "PdfRestInfoResponse", + "SummarizePdfTextResponse", + "TranslatePdfTextFileResponse", + "TranslatePdfTextResponse", "UpResponse", ) @@ -312,6 +315,93 @@ class PdfRestDeletionResponse(BaseModel): ] +class SummarizePdfTextResponse(BaseModel): + """Response returned by the summarize-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + summary: Annotated[ + str | None, + Field( + description="Summary content", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + + +class TranslatePdfTextResponse(BaseModel): + """Response returned by the translated-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + source_languages: Annotated[ + list[str] | None, + Field( + alias="source_languages", + validation_alias=AliasChoices("source_languages", "sourceLanguages"), + description="Languages detected in the source content.", + default=None, + ), + ] = None + output_language: Annotated[ + str | None, + Field( + alias="output_language", + validation_alias=AliasChoices("output_language", "outputLanguage"), + description="Target language used for the translation.", + default=None, + ), + ] = None + translated_text: Annotated[ + str | None, + Field( + alias="translated_text", + validation_alias=AliasChoices("translated_text", "translatedText"), + description="Inline translation content when output_type is json.", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + + +class TranslatePdfTextFileResponse(PdfRestFileBasedResponse): + """File-based response returned by the translated-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + source_languages: Annotated[ + list[str] | None, + Field( + alias="source_languages", + validation_alias=AliasChoices("source_languages", "sourceLanguages"), + description="Languages detected in the source content.", + default=None, + ), + ] = None + output_language: Annotated[ + str | None, + Field( + alias="output_language", + validation_alias=AliasChoices("output_language", "outputLanguage"), + description="Target language used for the translation.", + default=None, + ), + ] = None + + class PdfRestInfoResponse(BaseModel): """A response containing the output from the /info route.""" diff --git a/src/pdfrest/types/__init__.py b/src/pdfrest/types/__init__.py index 9bc36a87..48f78b03 100644 --- a/src/pdfrest/types/__init__.py +++ b/src/pdfrest/types/__init__.py @@ -1,7 +1,17 @@ """Public import surface for shared pdfrest types.""" from .public import ( + ALL_OCR_LANGUAGES, ALL_PDF_INFO_QUERIES, + BmpColorModel, + CompressionLevel, + ExtractTextGranularity, + FlattenQuality, + GifColorModel, + GraphicSmoothing, + JpegColorModel, + OcrLanguage, + PdfAType, PdfInfoQuery, PdfMergeInput, PdfMergeSource, @@ -11,10 +21,26 @@ PdfRedactionType, PdfRGBColor, PdfXType, + PngColorModel, + SummaryFormat, + SummaryOutputFormat, + SummaryOutputType, + TiffColorModel, + TranslateOutputFormat, ) __all__ = [ + "ALL_OCR_LANGUAGES", "ALL_PDF_INFO_QUERIES", + "BmpColorModel", + "CompressionLevel", + "ExtractTextGranularity", + "FlattenQuality", + "GifColorModel", + "GraphicSmoothing", + "JpegColorModel", + "OcrLanguage", + "PdfAType", "PdfInfoQuery", "PdfMergeInput", "PdfMergeSource", @@ -24,4 +50,10 @@ "PdfRedactionPreset", "PdfRedactionType", "PdfXType", + "PngColorModel", + "SummaryFormat", + "SummaryOutputFormat", + "SummaryOutputType", + "TiffColorModel", + "TranslateOutputFormat", ] diff --git a/src/pdfrest/types/public.py b/src/pdfrest/types/public.py index 1df53284..6472f2e7 100644 --- a/src/pdfrest/types/public.py +++ b/src/pdfrest/types/public.py @@ -13,7 +13,17 @@ PdfRestFile = Any __all__ = ( + "ALL_OCR_LANGUAGES", "ALL_PDF_INFO_QUERIES", + "BmpColorModel", + "CompressionLevel", + "ExtractTextGranularity", + "FlattenQuality", + "GifColorModel", + "GraphicSmoothing", + "JpegColorModel", + "OcrLanguage", + "PdfAType", "PdfInfoQuery", "PdfMergeInput", "PdfMergeSource", @@ -23,6 +33,12 @@ "PdfRedactionPreset", "PdfRedactionType", "PdfXType", + "PngColorModel", + "SummaryFormat", + "SummaryOutputFormat", + "SummaryOutputType", + "TiffColorModel", + "TranslateOutputFormat", ) PdfInfoQuery = Literal[ @@ -98,4 +114,49 @@ class PdfMergeSource(TypedDict, total=False): PdfMergeInput = PdfRestFile | PdfMergeSource | tuple[PdfRestFile, PdfPageSelection] +PdfAType = Literal["PDF/A-1b", "PDF/A-2b", "PDF/A-2u", "PDF/A-3b", "PDF/A-3u"] PdfXType = Literal["PDF/X-1a", "PDF/X-3", "PDF/X-4", "PDF/X-6"] +ExtractTextGranularity = Literal["off", "by_page", "document"] +CompressionLevel = Literal["low", "medium", "high", "custom"] +FlattenQuality = Literal["low", "medium", "high"] +PngColorModel = Literal["rgb", "rgba", "gray"] +BmpColorModel = Literal["rgb", "gray"] +GifColorModel = Literal["rgb", "gray"] +JpegColorModel = Literal["rgb", "cmyk", "gray"] +TiffColorModel = Literal["rgb", "rgba", "cmyk", "lab", "gray"] +GraphicSmoothing = Literal["none", "all", "text", "line", "image"] + +SummaryFormat = Literal[ + "overview", + "highlight", + "abstract", + "bullet_points", + "numbered_list", + "table_of_contents", + "outline", + "question_answer", + "action_items", +] + +SummaryOutputFormat = Literal["plaintext", "markdown"] +SummaryOutputType = Literal["json", "file"] + +TranslateOutputFormat = Literal["plaintext", "markdown"] + +OcrLanguage = Literal[ + "ChineseSimplified", + "ChineseTraditional", + "Dutch", + "English", + "French", + "German", + "Italian", + "Japanese", + "Korean", + "Portuguese", + "Spanish", +] + +ALL_OCR_LANGUAGES: tuple[OcrLanguage, ...] = cast( + tuple[OcrLanguage, ...], get_args(OcrLanguage) +) diff --git a/tests/graphics_test_helpers.py b/tests/graphics_test_helpers.py index 8fff8bfa..b94d5fc3 100644 --- a/tests/graphics_test_helpers.py +++ b/tests/graphics_test_helpers.py @@ -49,7 +49,7 @@ def assert_conversion_payload( for key, value in expected.items(): assert payload[key] == value extra_keys = set(payload) - set(expected) - permitted = {"color_model", "resolution"} + permitted = {"color_model", "resolution", "smoothing"} if allowed_extras is not None: permitted.update(allowed_extras) assert extra_keys <= permitted @@ -57,3 +57,5 @@ def assert_conversion_payload( assert payload["resolution"] == 300 if "color_model" not in expected and "color_model" in payload: assert payload["color_model"] == "rgb" + if "smoothing" not in expected and "smoothing" in payload: + assert payload["smoothing"] == "none" diff --git a/tests/live/test_live_compress_pdf.py b/tests/live/test_live_compress_pdf.py index 6ee8b365..0b3cdf66 100644 --- a/tests/live/test_live_compress_pdf.py +++ b/tests/live/test_live_compress_pdf.py @@ -158,7 +158,7 @@ def test_live_compress_pdf_invalid_level( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)compression"), ): client.compress_pdf( uploaded_pdf_for_compression, @@ -177,7 +177,7 @@ async def test_live_async_compress_pdf_invalid_level( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)compression"): await client.compress_pdf( uploaded_pdf_for_compression, compression_level="low", diff --git a/tests/live/test_live_convert_to_excel.py b/tests/live/test_live_convert_to_excel.py new file mode 100644 index 00000000..3816af34 --- /dev/null +++ b/tests/live/test_live_convert_to_excel.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_excel( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-excel", id="custom-output"), + ], +) +def test_live_convert_to_excel_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_excel(uploaded_pdf_for_excel, **kwargs) + + assert response.output_files + output_file = response.output_file + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_excel.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".xlsx") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("async-excel", id="custom-output"), + ], +) +async def test_live_async_convert_to_excel_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_excel(uploaded_pdf_for_excel, **kwargs) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".xlsx") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_excel.id) + + +def test_live_convert_to_excel_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_to_excel( + uploaded_pdf_for_excel, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_excel_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_excel( + uploaded_pdf_for_excel, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_to_markdown.py b/tests/live/test_live_convert_to_markdown.py new file mode 100644 index 00000000..760e1798 --- /dev/null +++ b/tests/live/test_live_convert_to_markdown.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_convert_to_markdown_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.convert_to_markdown(uploaded) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_convert_to_markdown_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.convert_to_markdown(uploaded, output="async-md") + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_convert_to_markdown_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.convert_to_markdown( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_markdown_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.convert_to_markdown( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_convert_to_pdfa.py b/tests/live/test_live_convert_to_pdfa.py new file mode 100644 index 00000000..8b40221d --- /dev/null +++ b/tests/live/test_live_convert_to_pdfa.py @@ -0,0 +1,180 @@ +from __future__ import annotations + +from typing import cast, get_args + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile +from pdfrest.types import PdfAType + +from ..resources import get_test_resource_path + +PDFA_TYPES: tuple[PdfAType, ...] = cast(tuple[PdfAType, ...], get_args(PdfAType)) +PDFA_TYPE_PARAMS = [ + pytest.param(output_type, id=output_type) for output_type in PDFA_TYPES +] + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_pdfa( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize("output_type", PDFA_TYPE_PARAMS) +def test_live_convert_to_pdfa_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + output_type: PdfAType, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type=output_type, + output="pdfa-live", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + assert output_file.name.startswith("pdfa-live") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("output_type", PDFA_TYPE_PARAMS) +async def test_live_async_convert_to_pdfa_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + output_type: PdfAType, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type=output_type, + output="async-pdfa", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-pdfa") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + + +def test_live_convert_to_pdfa_with_rasterize_option( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-2b", + rasterize_if_errors_encountered="on", + output="pdfa-rasterize", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("pdfa-rasterize") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_pdfa_with_rasterize_option( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-2b", + rasterize_if_errors_encountered="on", + output="async-pdfa-rasterize", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-pdfa-rasterize") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + + +@pytest.mark.parametrize( + "invalid_output_type", + [ + pytest.param("PDF/A-0", id="pdfa-0"), + pytest.param("PDF/A-99", id="pdfa-99"), + pytest.param("pdf/a-2b", id="lowercase"), + ], +) +def test_live_convert_to_pdfa_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + invalid_output_type: str, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)pdf.?a"), + ): + client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-1b", + extra_body={"output_type": invalid_output_type}, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "invalid_output_type", + [ + pytest.param("PDF/A-0", id="pdfa-0"), + pytest.param("PDF/A-99", id="pdfa-99"), + pytest.param("pdf/a-2b", id="lowercase"), + ], +) +async def test_live_async_convert_to_pdfa_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + invalid_output_type: str, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)pdf.?a"): + await client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-1b", + extra_body={"output_type": invalid_output_type}, + ) diff --git a/tests/live/test_live_convert_to_pdfx.py b/tests/live/test_live_convert_to_pdfx.py index a08088b0..2e02ee42 100644 --- a/tests/live/test_live_convert_to_pdfx.py +++ b/tests/live/test_live_convert_to_pdfx.py @@ -4,7 +4,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.types import PdfXType @@ -50,6 +50,31 @@ def test_live_convert_to_pdfx_success( assert output_file.name.startswith("pdfx-live") +@pytest.mark.asyncio +@pytest.mark.parametrize("output_type", PDFX_TYPES, ids=list(PDFX_TYPES)) +async def test_live_async_convert_to_pdfx_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfx: PdfRestFile, + output_type: PdfXType, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_pdfx( + uploaded_pdf_for_pdfx, + output_type=output_type, + output="async-pdfx", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-pdfx") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfx.id) + + @pytest.mark.parametrize( "invalid_output_type", [ @@ -69,10 +94,37 @@ def test_live_convert_to_pdfx_invalid_output_type( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)pdf.?x"), ): client.convert_to_pdfx( uploaded_pdf_for_pdfx, output_type="PDF/X-1a", extra_body={"output_type": invalid_output_type}, ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "invalid_output_type", + [ + pytest.param("PDF/X-0", id="pdfx-0"), + pytest.param("PDF/X-99", id="pdfx-99"), + pytest.param("pdf/x-4", id="lowercase"), + ], +) +async def test_live_async_convert_to_pdfx_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfx: PdfRestFile, + invalid_output_type: str, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)pdf.?x"): + await client.convert_to_pdfx( + uploaded_pdf_for_pdfx, + output_type="PDF/X-1a", + extra_body={"output_type": invalid_output_type}, + ) diff --git a/tests/live/test_live_convert_to_powerpoint.py b/tests/live/test_live_convert_to_powerpoint.py new file mode 100644 index 00000000..248c04e7 --- /dev/null +++ b/tests/live/test_live_convert_to_powerpoint.py @@ -0,0 +1,135 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_powerpoint( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-powerpoint", id="custom-output"), + ], +) +def test_live_convert_to_powerpoint_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_powerpoint(uploaded_pdf_for_powerpoint, **kwargs) + + assert response.output_files + output_file = response.output_file + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_powerpoint.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pptx") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("async-powerpoint", id="custom-output"), + ], +) +async def test_live_async_convert_to_powerpoint_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, **kwargs + ) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pptx") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_powerpoint.id) + + +def test_live_convert_to_powerpoint_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_powerpoint_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_to_word.py b/tests/live/test_live_convert_to_word.py index c3c5822e..dcebe926 100644 --- a/tests/live/test_live_convert_to_word.py +++ b/tests/live/test_live_convert_to_word.py @@ -2,7 +2,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from ..resources import get_test_resource_path @@ -57,6 +57,46 @@ def test_live_convert_to_word_success( assert output_file.name.endswith(".docx") +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("async-word", id="custom-output"), + ], +) +async def test_live_async_convert_to_word_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_word: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_word( + uploaded_pdf_for_word, + **kwargs, + ) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".docx") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + assert str(response.input_id) == str(uploaded_pdf_for_word.id) + + def test_live_convert_to_word_invalid_file_id( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -67,9 +107,26 @@ def test_live_convert_to_word_invalid_file_id( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), ): client.convert_to_word( uploaded_pdf_for_word, extra_body={"id": "00000000-0000-0000-0000-000000000000"}, ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_word_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_word: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_word( + uploaded_pdf_for_word, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_xfa_to_acroforms.py b/tests/live/test_live_convert_xfa_to_acroforms.py new file mode 100644 index 00000000..adc2e8ab --- /dev/null +++ b/tests/live/test_live_convert_xfa_to_acroforms.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_acroforms( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("xfa.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-acroforms", id="custom-output"), + ], +) +def test_live_convert_xfa_to_acroforms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_xfa_to_acroforms(uploaded_pdf_for_acroforms, **kwargs) + + assert str(response.input_id) == str(uploaded_pdf_for_acroforms.id) + assert response.warning is None + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +def test_live_convert_xfa_to_acroforms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("async-acroforms", id="custom-output"), + ], +) +async def test_live_async_convert_xfa_to_acroforms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, **kwargs + ) + + assert str(response.input_id) == str(uploaded_pdf_for_acroforms.id) + assert response.warning is None + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + + +@pytest.mark.asyncio +async def test_live_async_convert_xfa_to_acroforms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_delete.py b/tests/live/test_live_delete.py index 75727fef..52bdf6fd 100644 --- a/tests/live/test_live_delete.py +++ b/tests/live/test_live_delete.py @@ -57,7 +57,7 @@ def test_live_delete_files_invalid_id( base_url=pdfrest_live_base_url, ) as client: uploaded = client.files.create_from_paths([resource])[0] - with pytest.raises(ValidationError): + with pytest.raises(ValidationError, match=r"(?i)ids?"): client.files.delete(uploaded, extra_body={"ids": token_urlsafe(16)}) @@ -72,7 +72,7 @@ async def test_live_async_delete_files_invalid_id( base_url=pdfrest_live_base_url, ) as client: uploaded = (await client.files.create_from_paths([resource]))[0] - with pytest.raises(ValidationError): + with pytest.raises(ValidationError, match=r"(?i)ids?"): await client.files.delete(uploaded, extra_body={"ids": token_urlsafe(16)}) diff --git a/tests/live/test_live_extract_images.py b/tests/live/test_live_extract_images.py new file mode 100644 index 00000000..3410622a --- /dev/null +++ b/tests/live/test_live_extract_images.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_extract_images_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.extract_images(uploaded) + + assert isinstance(response, PdfRestFileBasedResponse) + output_files = response.output_files + assert output_files + assert all(file.name for file in output_files) + assert all( + file.type and (file.type.startswith("image/") or file.type == "application/zip") + for file in output_files + ) + assert all(file.size > 0 for file in output_files) + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_extract_images_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.extract_images(uploaded, output="async-images") + + assert isinstance(response, PdfRestFileBasedResponse) + output_files = response.output_files + assert output_files + assert output_files[0].name.startswith("async-images") + assert all(file.name for file in output_files) + assert all( + file.type and (file.type.startswith("image/") or file.type == "application/zip") + for file in output_files + ) + assert all(file.size > 0 for file in output_files) + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_extract_images_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.extract_images( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_extract_images_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.extract_images( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_extract_pdf_text_to_file.py b/tests/live/test_live_extract_pdf_text_to_file.py new file mode 100644 index 00000000..d6e58652 --- /dev/null +++ b/tests/live/test_live_extract_pdf_text_to_file.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_extract_pdf_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.extract_pdf_text_to_file( + uploaded, + full_text="document", + preserve_line_breaks="on", + word_style="off", + word_coordinates="off", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".json") + assert output_file.type == "application/json" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_extract_pdf_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.extract_pdf_text_to_file( + uploaded, + full_text="document", + preserve_line_breaks="on", + word_style="off", + word_coordinates="off", + output="async-text", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-text") + assert output_file.name.endswith(".json") + assert output_file.type == "application/json" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_extract_pdf_text_to_file_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.extract_pdf_text_to_file( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_extract_pdf_text_to_file_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.extract_pdf_text_to_file( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_flatten_annotations.py b/tests/live/test_live_flatten_annotations.py new file mode 100644 index 00000000..27cd0934 --- /dev/null +++ b/tests/live/test_live_flatten_annotations.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_annotations( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("flatten-annotations", id="custom-output"), + ], +) +def test_live_flatten_annotations_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.flatten_annotations(uploaded_pdf_for_annotations, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_annotations.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("flatten-annotations", id="custom-output"), + ], +) +async def test_live_async_flatten_annotations_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_annotations( + uploaded_pdf_for_annotations, **kwargs + ) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_annotations.id) + + +def test_live_flatten_annotations_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.flatten_annotations( + uploaded_pdf_for_annotations, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_annotations_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_annotations( + uploaded_pdf_for_annotations, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_flatten_pdf_forms.py b/tests/live/test_live_flatten_pdf_forms.py index c6ad7fdb..2c6b939f 100644 --- a/tests/live/test_live_flatten_pdf_forms.py +++ b/tests/live/test_live_flatten_pdf_forms.py @@ -2,7 +2,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from ..resources import get_test_resource_path @@ -54,6 +54,43 @@ def test_live_flatten_pdf_forms( assert output_file.name.endswith(".pdf") +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("flattened-live", id="custom-output"), + ], +) +async def test_live_async_flatten_pdf_forms( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_with_forms: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_pdf_forms( + uploaded_pdf_with_forms, + **kwargs, + ) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_with_forms.id) + + def test_live_flatten_pdf_forms_invalid_file_id( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -64,9 +101,26 @@ def test_live_flatten_pdf_forms_invalid_file_id( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), ): client.flatten_pdf_forms( uploaded_pdf_with_forms, extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_pdf_forms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_with_forms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_pdf_forms( + uploaded_pdf_with_forms, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) diff --git a/tests/live/test_live_flatten_transparencies.py b/tests/live/test_live_flatten_transparencies.py new file mode 100644 index 00000000..2438e68b --- /dev/null +++ b/tests/live/test_live_flatten_transparencies.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_transparencies( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + ("output_name", "quality"), + [ + pytest.param(None, "medium", id="default-output"), + pytest.param("flatten-transparency", "high", id="custom-output-high"), + ], +) +def test_live_flatten_transparencies_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, + output_name: str | None, + quality: str, +) -> None: + kwargs: dict[str, str] = {"quality": quality} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.flatten_transparencies( + uploaded_pdf_for_transparencies, **kwargs + ) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_transparencies.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("output_name", "quality"), + [ + pytest.param(None, "medium", id="default-output"), + pytest.param("flatten-transparency", "high", id="custom-output-high"), + ], +) +async def test_live_async_flatten_transparencies_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, + output_name: str | None, + quality: str, +) -> None: + kwargs: dict[str, str] = {"quality": quality} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_transparencies( + uploaded_pdf_for_transparencies, **kwargs + ) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_transparencies.id) + + +def test_live_flatten_transparencies_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.flatten_transparencies( + uploaded_pdf_for_transparencies, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_transparencies_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_transparencies( + uploaded_pdf_for_transparencies, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_graphic_conversions.py b/tests/live/test_live_graphic_conversions.py index 2b68edb3..9c13ae31 100644 --- a/tests/live/test_live_graphic_conversions.py +++ b/tests/live/test_live_graphic_conversions.py @@ -5,7 +5,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.models._internal import ( BasePdfRestGraphicPayload, @@ -36,6 +36,14 @@ class _GraphicEndpointSpec(NamedTuple): "tiff": _GraphicEndpointSpec("convert_to_tiff", TiffPdfRestPayload), } +_EXPECTED_FILE_FORMATS: dict[str, tuple[str, str]] = { + "png": ("image/png", ".png"), + "bmp": ("image/bmp", ".bmp"), + "gif": ("image/gif", ".gif"), + "jpeg": ("image/jpeg", ".jpg"), + "tiff": ("image/tiff", ".tif"), +} + def _enumerate_color_models( payload_model: type[BasePdfRestGraphicPayload[Any]], @@ -108,6 +116,22 @@ def _invalid_smoothing_cases() -> list[Any]: return cases +def _expected_file_format(label: str) -> tuple[str, str]: + return _EXPECTED_FILE_FORMATS[label] + + +def _assert_output_files( + output_files: Sequence[PdfRestFile], + *, + expected_mime: str, + expected_suffix: str, +) -> None: + assert output_files + assert all(file_info.name.endswith(expected_suffix) for file_info in output_files) + assert all(file_info.type == expected_mime for file_info in output_files) + assert all(file_info.size > 0 for file_info in output_files) + + @pytest.fixture(scope="module") def uploaded_20_page_pdf( pdfrest_api_key: str, @@ -121,6 +145,55 @@ def uploaded_20_page_pdf( return client.files.create_from_paths([resource])[0] +def test_live_convert_to_png_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.convert_to_png( + uploaded, + output_prefix="live-png", + resolution=150, + ) + + _assert_output_files( + response.output_files, + expected_mime="image/png", + expected_suffix=".png", + ) + assert str(response.input_id) == str(uploaded.id) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_png_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.convert_to_png( + uploaded, + output_prefix="async-png", + resolution=150, + ) + + _assert_output_files( + response.output_files, + expected_mime="image/png", + expected_suffix=".png", + ) + assert str(response.input_id) == str(uploaded.id) + + @pytest.mark.parametrize( ("_endpoint_label", "spec", "color_model"), _valid_color_cases(), @@ -140,12 +213,51 @@ def test_live_graphic_valid_color_models( ) as client: uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) + expected_mime, expected_suffix = _expected_file_format(_endpoint_label) response = client_method( uploaded, color_model=color_model, resolution=resolution, ) - assert response.output_files + _assert_output_files( + response.output_files, + expected_mime=expected_mime, + expected_suffix=expected_suffix, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("_endpoint_label", "spec", "color_model"), + _valid_color_cases(), +) +async def test_live_async_graphic_valid_color_models( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + _endpoint_label: str, + spec: _GraphicEndpointSpec, + color_model: str, +) -> None: + resource = get_test_resource_path("report.pdf") + payload_model = spec.payload_model + resolution = _resolution_bounds(payload_model)[0] + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + client_method = getattr(client, spec.method_name) + expected_mime, expected_suffix = _expected_file_format(_endpoint_label) + response = await client_method( + uploaded, + color_model=color_model, + resolution=resolution, + ) + _assert_output_files( + response.output_files, + expected_mime=expected_mime, + expected_suffix=expected_suffix, + ) @pytest.mark.parametrize( @@ -168,7 +280,7 @@ def test_live_graphic_invalid_color_model( uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) resolution = _resolution_bounds(payload_model)[0] - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)color"): client_method( uploaded, resolution=resolution, @@ -176,6 +288,35 @@ def test_live_graphic_invalid_color_model( ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("_endpoint_label", "spec", "invalid_color"), + _invalid_color_cases(), +) +async def test_live_async_graphic_invalid_color_model( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + _endpoint_label: str, + spec: _GraphicEndpointSpec, + invalid_color: str, +) -> None: + payload_model = spec.payload_model + + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + client_method = getattr(client, spec.method_name) + resolution = _resolution_bounds(payload_model)[0] + with pytest.raises(PdfRestApiError, match=r"(?i)color"): + await client_method( + uploaded, + resolution=resolution, + extra_body={"color_model": invalid_color}, + ) + + @pytest.mark.parametrize( ("_endpoint_label", "spec"), PNG_PAYLOAD_ONLY.items(), @@ -213,11 +354,64 @@ def test_live_graphic_resolution_bounds( if should_raise: call_kwargs["extra_body"] = {"resolution": base_resolution + offset} - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)resolution"): client_method(uploaded, **call_kwargs) else: response = client_method(uploaded, **call_kwargs) - assert response.output_files + _assert_output_files( + response.output_files, + expected_mime="image/png", + expected_suffix=".png", + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("_endpoint_label", "spec"), + PNG_PAYLOAD_ONLY.items(), + ids=list(PNG_PAYLOAD_ONLY), +) +@pytest.mark.parametrize( + ("bound", "offset", "should_raise"), + [ + pytest.param("min", 0, False, id="min"), + pytest.param("max", 0, False, id="max"), + pytest.param("min", -1, True, id="below-min"), + pytest.param("max", 1, True, id="above-max"), + ], +) +async def test_live_async_graphic_resolution_bounds( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + _endpoint_label: str, + spec: _GraphicEndpointSpec, + bound: str, + offset: int, + should_raise: bool, +) -> None: + payload_model = spec.payload_model + min_res, max_res = _resolution_bounds(payload_model) + resource = get_test_resource_path("report.pdf") + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + client_method = getattr(client, spec.method_name) + base_resolution = min_res if bound == "min" else max_res + call_kwargs: dict[str, Any] = {"resolution": base_resolution} + + if should_raise: + call_kwargs["extra_body"] = {"resolution": base_resolution + offset} + with pytest.raises(PdfRestApiError, match=r"(?i)resolution"): + await client_method(uploaded, **call_kwargs) + else: + response = await client_method(uploaded, **call_kwargs) + _assert_output_files( + response.output_files, + expected_mime="image/png", + expected_suffix=".png", + ) @pytest.mark.parametrize( @@ -237,11 +431,46 @@ def test_live_graphic_valid_smoothing( ) as client: uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) + expected_mime, expected_suffix = _expected_file_format(_endpoint_label) response = client_method( uploaded, smoothing=smoothing_value, ) - assert response.output_files + _assert_output_files( + response.output_files, + expected_mime=expected_mime, + expected_suffix=expected_suffix, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("_endpoint_label", "spec", "smoothing_value"), + _valid_smoothing_cases(), +) +async def test_live_async_graphic_valid_smoothing( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + _endpoint_label: str, + spec: _GraphicEndpointSpec, + smoothing_value: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + client_method = getattr(client, spec.method_name) + expected_mime, expected_suffix = _expected_file_format(_endpoint_label) + response = await client_method( + uploaded, + smoothing=smoothing_value, + ) + _assert_output_files( + response.output_files, + expected_mime=expected_mime, + expected_suffix=expected_suffix, + ) @pytest.mark.parametrize( @@ -261,7 +490,7 @@ def test_live_graphic_invalid_smoothing( ) as client: uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)smooth"): client_method( uploaded, smoothing="none", @@ -269,6 +498,32 @@ def test_live_graphic_invalid_smoothing( ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("_endpoint_label", "spec", "invalid_smoothing"), + _invalid_smoothing_cases(), +) +async def test_live_async_graphic_invalid_smoothing( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + _endpoint_label: str, + spec: _GraphicEndpointSpec, + invalid_smoothing: Any, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + client_method = getattr(client, spec.method_name) + with pytest.raises(PdfRestApiError, match=r"(?i)smooth"): + await client_method( + uploaded, + smoothing="none", + extra_body={"smoothing": invalid_smoothing}, + ) + + @pytest.mark.parametrize( ("page_range", "expect_success"), [ @@ -307,16 +562,14 @@ def test_live_png_page_range_variants( expected_pages = _expand_page_selection(page_range, total_pages=20) assert len(response.output_files) == len(expected_pages) - assert any( - file_info.name.endswith(".png") for file_info in response.output_files - ) - assert all( - file_info.type == "image/png" and file_info.size > 0 - for file_info in response.output_files + _assert_output_files( + response.output_files, + expected_mime="image/png", + expected_suffix=".png", ) assert str(response.input_id) == str(uploaded_20_page_pdf.id) else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.convert_to_png( uploaded_20_page_pdf, output_prefix=f"live-range-{case_id}", @@ -324,6 +577,60 @@ def test_live_png_page_range_variants( ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("page_range", "expect_success"), + [ + pytest.param("5", True, id="single"), + pytest.param("3-7", True, id="ascending-range"), + pytest.param("last", True, id="last"), + pytest.param("1-last", True, id="entire-document"), + pytest.param(["1", "3", "5-7"], True, id="list-mixed"), + ], +) +async def test_live_async_png_page_range_variants( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_20_page_pdf: PdfRestFile, + page_range: Any, + expect_success: bool, + request: pytest.FixtureRequest, +) -> None: + case_id = request.node.callspec.id + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + info = await client.query_pdf_info(uploaded_20_page_pdf) + + assert info.page_count == 20 + assert str(info.input_id) == str(uploaded_20_page_pdf.id) + assert info.filename is None or info.filename.endswith(".pdf") + + if expect_success: + response = await client.convert_to_png( + uploaded_20_page_pdf, + output_prefix=f"live-async-range-{case_id}", + page_range=page_range, + ) + + expected_pages = _expand_page_selection(page_range, total_pages=20) + assert len(response.output_files) == len(expected_pages) + _assert_output_files( + response.output_files, + expected_mime="image/png", + expected_suffix=".png", + ) + assert str(response.input_id) == str(uploaded_20_page_pdf.id) + else: + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.convert_to_png( + uploaded_20_page_pdf, + output_prefix=f"live-async-range-{case_id}", + extra_body={"page_range": page_range}, + ) + + @pytest.mark.parametrize( "page_override", [ @@ -348,7 +655,10 @@ def test_live_png_page_range_invalid_overrides( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises( + PdfRestApiError, + match=r"There was an issue processing your file\. Validate all fields and try again\.", + ), ): client.convert_to_png( uploaded_20_page_pdf, @@ -358,6 +668,42 @@ def test_live_png_page_range_invalid_overrides( ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + "page_override", + [ + pytest.param("0", id="zero"), + pytest.param("last-0", id="range-with-zero"), + pytest.param("7-3", id="descending-range"), + pytest.param("even", id="even"), + pytest.param("odd", id="odd"), + pytest.param("odd,even", id="odd-even"), + ], +) +async def test_live_async_png_page_range_invalid_overrides( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_20_page_pdf: PdfRestFile, + page_override: str, + request: pytest.FixtureRequest, +) -> None: + case_id = request.node.callspec.id + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises( + PdfRestApiError, + match=r"There was an issue processing your file\. Validate all fields and try again\.", + ): + await client.convert_to_png( + uploaded_20_page_pdf, + output_prefix=f"live-async-range-invalid-{case_id}", + page_range="1", + extra_body={"pages": page_override}, + ) + + def _expand_page_selection( selection: Any, *, diff --git a/tests/live/test_live_linearize_pdf.py b/tests/live/test_live_linearize_pdf.py new file mode 100644 index 00000000..1d43f9b2 --- /dev/null +++ b/tests/live/test_live_linearize_pdf.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_linearize( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("linearized-live", id="custom-output"), + ], +) +def test_live_linearize_pdf( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.linearize_pdf(uploaded_pdf_for_linearize, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_linearize.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("linearized-live", id="custom-output"), + ], +) +async def test_live_async_linearize_pdf( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.linearize_pdf( + uploaded_pdf_for_linearize, + **kwargs, + ) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_linearize.id) + + +def test_live_linearize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.linearize_pdf( + uploaded_pdf_for_linearize, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_linearize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.linearize_pdf( + uploaded_pdf_for_linearize, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) diff --git a/tests/live/test_live_ocr_pdf.py b/tests/live/test_live_ocr_pdf.py new file mode 100644 index 00000000..89625a2e --- /dev/null +++ b/tests/live/test_live_ocr_pdf.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_ocr_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report-image.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.ocr_pdf(uploaded, languages=["English", "German"]) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_ocr_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report-image.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.ocr_pdf(uploaded, output="async-ocr") + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + assert response.output_file.name.startswith("async-ocr") + assert response.output_file.type == "application/pdf" + assert response.output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_ocr_pdf_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report-image.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.ocr_pdf( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_ocr_pdf_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report-image.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.ocr_pdf( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_pdf_info.py b/tests/live/test_live_pdf_info.py index 977fe87d..7d361512 100644 --- a/tests/live/test_live_pdf_info.py +++ b/tests/live/test_live_pdf_info.py @@ -93,6 +93,27 @@ def test_live_pdf_info_queries( _assert_expected_value(query_name, value) +@pytest.mark.asyncio +@pytest.mark.parametrize("query_name", ALLOWED_QUERIES, ids=list(ALLOWED_QUERIES)) +async def test_live_async_pdf_info_queries( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf: PdfRestFile, + query_name: PdfInfoQuery, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + response = await client.query_pdf_info(uploaded_pdf, queries=query_name) + + assert isinstance(response, PdfRestInfoResponse) + assert str(response.input_id) == str(uploaded_pdf.id) + assert response.all_queries_processed is True + + value = getattr(response, query_name) + _assert_expected_value(query_name, value) + + @pytest.mark.parametrize( "invalid_query", [ @@ -111,7 +132,7 @@ def test_live_pdf_info_invalid_query( PdfRestClient( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)quer"), ): client.query_pdf_info( uploaded_pdf, @@ -120,6 +141,32 @@ def test_live_pdf_info_invalid_query( ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + "invalid_query", + [ + pytest.param("invalid_query", id="invalid-query"), + pytest.param("tagged,!!invalid!!", id="mixed-invalid"), + pytest.param("🚫", id="emoji"), + ], +) +async def test_live_async_pdf_info_invalid_query( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf: PdfRestFile, + invalid_query: str, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)quer"): + await client.query_pdf_info( + uploaded_pdf, + queries="tagged", + extra_body={"queries": invalid_query}, + ) + + @pytest.mark.parametrize( "query_group", [ @@ -146,7 +193,50 @@ def test_live_pdf_info_multiple_queries( @pytest.mark.asyncio -async def test_live_pdf_info_async_all_queries( +@pytest.mark.parametrize( + "query_group", + [ + pytest.param(("tagged", "filename"), id="two-values"), + pytest.param(("page_count", "file_size", "pdf_version"), id="three-values"), + ], +) +async def test_live_async_pdf_info_multiple_queries( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf: PdfRestFile, + query_group: tuple[PdfInfoQuery, ...], +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + response = await client.query_pdf_info(uploaded_pdf, queries=query_group) + + assert isinstance(response, PdfRestInfoResponse) + assert str(response.input_id) == str(uploaded_pdf.id) + assert response.all_queries_processed is True + for item in query_group: + _assert_expected_value(item, getattr(response, item)) + + +def test_live_pdf_info_all_queries( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf: PdfRestFile, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, base_url=pdfrest_live_base_url + ) as client: + response = client.query_pdf_info(uploaded_pdf, queries=ALLOWED_QUERIES) + + assert isinstance(response, PdfRestInfoResponse) + assert str(response.input_id) == str(uploaded_pdf.id) + assert response.all_queries_processed is True + for query in ALLOWED_QUERIES: + _assert_expected_value(query, getattr(response, query)) + + +@pytest.mark.asyncio +async def test_live_async_pdf_info_all_queries( pdfrest_api_key: str, pdfrest_live_base_url: str, uploaded_pdf: PdfRestFile, diff --git a/tests/live/test_live_pdf_redactions.py b/tests/live/test_live_pdf_redactions.py index 796785a1..0b7aee90 100644 --- a/tests/live/test_live_pdf_redactions.py +++ b/tests/live/test_live_pdf_redactions.py @@ -4,7 +4,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.types import PdfRedactionInstruction, PdfRedactionPreset @@ -135,6 +135,113 @@ def test_live_redaction_preview_and_apply_multiple( assert final_file.type == "application/pdf" +@pytest.mark.asyncio +@pytest.mark.parametrize( + "instructions", + [ + pytest.param( + [ + { + "type": "literal", + "value": "The quick brown fox jumped over the lazy dog.", + }, + {"type": "regex", "value": r"\b\d{3}-\d{2}-\d{4}\b"}, + ], + id="literal-and-regex", + ), + pytest.param( + [ + {"type": "preset", "value": "email"}, + {"type": "preset", "value": "phone_number"}, + ], + id="preset-email-and-phone", + ), + pytest.param( + [ + {"type": "preset", "value": "credit_card"}, + {"type": "preset", "value": "bank_routing_number"}, + {"type": "preset", "value": "swift_bic_number"}, + ], + id="multiple-presets", + ), + ], +) +async def test_live_async_redaction_preview_and_apply_multiple( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_redaction: PdfRestFile, + instructions: list[PdfRedactionInstruction], +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + preview = await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=instructions, + output="redaction-preview-multi", + ) + + assert preview.output_files + preview_file = preview.output_files[0] + applied = await client.apply_redactions( + preview_file, + output="redaction-final-multi", + ) + + final_file = applied.output_files[0] + assert final_file.name.endswith("redaction-final-multi.pdf") + assert final_file.type == "application/pdf" + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "instruction", + [ + pytest.param( + { + "type": "literal", + "value": "The quick brown fox jumped over the lazy dog.", + }, + id="literal", + ), + pytest.param({"type": "regex", "value": r"\b\d{3}-\d{2}-\d{4}\b"}, id="regex"), + *[ + pytest.param({"type": "preset", "value": preset}, id=f"preset-{preset}") + for preset in get_args(PdfRedactionPreset) + ], + ], +) +async def test_live_async_redaction_preview_and_apply_single( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_redaction: PdfRestFile, + instruction: PdfRedactionInstruction, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + preview = await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=[instruction], + output="redaction-preview", + ) + + preview_file = preview.output_files[0] + applied = await client.apply_redactions( + preview_file, + output="redaction-final", + ) + + assert preview.output_files + assert preview_file.name.endswith("redaction-preview.pdf") + assert applied.output_files + final_file = applied.output_files[0] + assert final_file.name.endswith("redaction-final.pdf") + assert final_file.type == "application/pdf" + + @pytest.mark.parametrize( "extra_body", [ @@ -153,7 +260,13 @@ def test_live_redactions_invalid_payloads( base_url=pdfrest_live_base_url, ) as client: if "redactions" in extra_body: - with pytest.raises(PdfRestApiError): + with pytest.raises( + PdfRestApiError, + match=( + r"The JSON data provided is not properly formatted\. Please check " + r"your syntax and try again\." + ), + ): client.preview_redactions( uploaded_pdf_for_redaction, redactions=[{"type": "literal", "value": "placeholder"}], @@ -165,5 +278,46 @@ def test_live_redactions_invalid_payloads( redactions=[{"type": "literal", "value": "placeholder"}], ) preview_file = preview.output_files[0] - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)rgb"): client.apply_redactions(preview_file, extra_body=extra_body) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "extra_body", + [ + pytest.param({"redactions": "invalid"}, id="invalid-redactions"), + pytest.param({"rgb_color": "-1,-1,-1"}, id="invalid-rgb"), + ], +) +async def test_live_async_redactions_invalid_payloads( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_redaction: PdfRestFile, + extra_body: dict[str, object], +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + if "redactions" in extra_body: + with pytest.raises( + PdfRestApiError, + match=( + r"The JSON data provided is not properly formatted\. Please check " + r"your syntax and try again\." + ), + ): + await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=[{"type": "literal", "value": "placeholder"}], + extra_body=extra_body, + ) + else: + preview = await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=[{"type": "literal", "value": "placeholder"}], + ) + preview_file = preview.output_files[0] + with pytest.raises(PdfRestApiError, match=r"(?i)rgb"): + await client.apply_redactions(preview_file, extra_body=extra_body) diff --git a/tests/live/test_live_pdf_split_merge.py b/tests/live/test_live_pdf_split_merge.py index 979f7b1e..9ae630cb 100644 --- a/tests/live/test_live_pdf_split_merge.py +++ b/tests/live/test_live_pdf_split_merge.py @@ -151,6 +151,65 @@ def test_live_split_pdf_page_groups( assert str(response.input_id) == str(split_source.id) +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("page_groups", "expected_count"), + [ + pytest.param(["1-5", "6-last"], 2, id="two-ranges"), + pytest.param([["1", "3", "5"], "2-4"], 2, id="alternating-selection"), + pytest.param(["even"], 1, id="even-only"), + pytest.param(["9-2"], 1, id="descending-single"), + pytest.param(["odd", "even"], 2, id="odd-and-even"), + ], +) +async def test_live_async_split_pdf_page_groups( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_live_pdfs: tuple[PdfRestFile, PdfRestFile], + page_groups: list[PdfPageSelection], + expected_count: int, +) -> None: + split_source, _ = uploaded_live_pdfs + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + total_pages = await _fetch_page_count_async(client, split_source) + + response = await client.split_pdf( + split_source, + page_groups=page_groups, + output_prefix="live-async-split", + ) + + assert len(response.output_files) == expected_count + + output_infos = [ + await client.query_pdf_info(output_file) + for output_file in response.output_files + ] + + assert all( + output_file.name.startswith("live-async-split") + and output_file.name.endswith(".pdf") + and output_file.type == "application/pdf" + and output_file.size > 0 + for output_file in response.output_files + ) + page_counts_optional = [info.page_count for info in output_infos] + assert all(count is not None for count in page_counts_optional) + expected_page_counts = [ + len(_expand_page_selection(group, total_pages=total_pages)) + for group in page_groups + ][: len(page_counts_optional)] + page_counts = [ + int(count) for count in page_counts_optional if count is not None + ] + assert page_counts == expected_page_counts + assert str(response.input_id) == str(split_source.id) + + def test_live_split_pdf_default_outputs( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -186,6 +245,43 @@ def test_live_split_pdf_default_outputs( assert str(response.input_id) == str(split_source.id) +@pytest.mark.asyncio +async def test_live_async_split_pdf_default_outputs( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_live_pdfs: tuple[PdfRestFile, PdfRestFile], +) -> None: + split_source, _ = uploaded_live_pdfs + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + total_pages = await _fetch_page_count_async(client, split_source) + + response = await client.split_pdf( + split_source, + output_prefix="live-async-split-default", + ) + + assert len(response.output_files) == total_pages + + output_infos = [ + await client.query_pdf_info(output_file) + for output_file in response.output_files + ] + assert all( + output_file.name.startswith("live-async-split-default") + and output_file.name.endswith(".pdf") + and output_file.type == "application/pdf" + and output_file.size > 0 + for output_file in response.output_files + ) + assert all(info.page_count == 1 for info in output_infos) + + assert str(response.input_id) == str(split_source.id) + + def test_live_split_pdf_invalid_pages( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -198,7 +294,7 @@ def test_live_split_pdf_invalid_pages( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)page"), ): client.split_pdf( split_source, @@ -207,6 +303,26 @@ def test_live_split_pdf_invalid_pages( ) +@pytest.mark.asyncio +async def test_live_async_split_pdf_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_live_pdfs: tuple[PdfRestFile, PdfRestFile], +) -> None: + split_source, _ = uploaded_live_pdfs + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.split_pdf( + split_source, + page_groups=["1-2"], + extra_body={"pages": ["0"]}, + ) + + def test_live_merge_pdfs_success( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -254,6 +370,30 @@ def test_live_merge_pdfs_success( assert output_info.page_count == expected_total_pages +@pytest.mark.asyncio +async def test_live_async_merge_pdfs_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_live_pdfs: tuple[PdfRestFile, PdfRestFile], +) -> None: + split_source, merge_partner = uploaded_live_pdfs + sources: list[PdfMergeInput] = [ + {"file": split_source, "pages": "even"}, + {"file": merge_partner, "pages": "1"}, + ] + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.merge_pdfs( + sources, + output_prefix="live-async-merge-invalid", + extra_body={"pages": ["even", "0"]}, + ) + + def test_live_merge_pdfs_invalid_pages( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -270,7 +410,7 @@ def test_live_merge_pdfs_invalid_pages( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)page"), ): client.merge_pdfs( sources, @@ -280,7 +420,7 @@ def test_live_merge_pdfs_invalid_pages( @pytest.mark.asyncio -async def test_live_async_merge_pdfs( +async def test_live_async_merge_pdfs_success( pdfrest_api_key: str, pdfrest_live_base_url: str, uploaded_live_pdfs: tuple[PdfRestFile, PdfRestFile], @@ -373,7 +513,7 @@ def test_live_split_pdf_page_range_variants( output_pages = client.query_pdf_info(response.output_files[0]).page_count assert output_pages == len(expected_pages) else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.split_pdf( split_source, page_groups=[selection if not requires_override else "1"], @@ -382,6 +522,52 @@ def test_live_split_pdf_page_range_variants( ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("selection", "expect_success", "requires_override"), SPLIT_RANGE_CASES +) +async def test_live_async_split_pdf_page_range_variants( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_live_pdfs: tuple[PdfRestFile, PdfRestFile], + selection: PdfPageSelection, + expect_success: bool, + requires_override: bool, + request: pytest.FixtureRequest, +) -> None: + split_source, _ = uploaded_live_pdfs + case_id = request.node.callspec.id + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + total_pages = await _fetch_page_count_async(client, split_source) + override_body = None + if requires_override: + override_body = {"pages": [str(selection)]} + + if expect_success: + response = await client.split_pdf( + split_source, + page_groups=[selection if not requires_override else "1"], + output_prefix=f"live-async-split-range-{case_id}", + extra_body=override_body, + ) + expected_pages = _expand_page_selection(selection, total_pages=total_pages) + output_pages = ( + await client.query_pdf_info(response.output_files[0]) + ).page_count + assert output_pages == len(expected_pages) + else: + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.split_pdf( + split_source, + page_groups=[selection if not requires_override else "1"], + output_prefix=f"live-async-split-range-{case_id}", + extra_body=override_body, + ) + + MERGE_RANGE_CASES = [ pytest.param("3", True, False, id="single-str"), pytest.param(3, True, False, id="single-int"), @@ -446,9 +632,69 @@ def test_live_merge_pdf_page_range_variants( output_info = client.query_pdf_info(response.output_file) assert output_info.page_count == expected_total_pages else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.merge_pdfs( sources, output_prefix=f"live-merge-range-{case_id}", extra_body=override_body, ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("selection", "expect_success", "requires_override"), MERGE_RANGE_CASES +) +async def test_live_async_merge_pdf_page_range_variants( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_live_pdfs: tuple[PdfRestFile, PdfRestFile], + selection: PdfPageSelection, + expect_success: bool, + requires_override: bool, + request: pytest.FixtureRequest, +) -> None: + split_source, merge_partner = uploaded_live_pdfs + case_id = request.node.callspec.id + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + source_page_counts = { + str(split_source.id): await _fetch_page_count_async(client, split_source), + str(merge_partner.id): await _fetch_page_count_async(client, merge_partner), + } + sources: list[PdfMergeInput] = [ + { + "file": split_source, + "pages": selection if not requires_override else "1", + }, + {"file": merge_partner, "pages": "1"}, + ] + override_body = {"pages": [str(selection), "1"]} if requires_override else None + + if expect_success: + response = await client.merge_pdfs( + sources, + output_prefix=f"live-async-merge-range-{case_id}", + extra_body=override_body, + ) + expected_total_pages = sum( + len( + _expand_page_selection( + chosen_selection, + total_pages=source_page_counts[str(file.id)], + ) + ) + for file, chosen_selection in ( + _extract_merge_entry(entry) for entry in sources + ) + ) + output_info = await client.query_pdf_info(response.output_file) + assert output_info.page_count == expected_total_pages + else: + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.merge_pdfs( + sources, + output_prefix=f"live-async-merge-range-{case_id}", + extra_body=override_body, + ) diff --git a/tests/live/test_live_rasterize_pdf.py b/tests/live/test_live_rasterize_pdf.py new file mode 100644 index 00000000..6ad9fd72 --- /dev/null +++ b/tests/live/test_live_rasterize_pdf.py @@ -0,0 +1,127 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_rasterize( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("rasterized-live", id="custom-output"), + ], +) +def test_live_rasterize_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.rasterize_pdf(uploaded_pdf_for_rasterize, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_rasterize.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("rasterized-live", id="custom-output"), + ], +) +async def test_live_async_rasterize_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.rasterize_pdf(uploaded_pdf_for_rasterize, **kwargs) + + assert response.output_files + output_file = response.output_file + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_rasterize.id) + + +def test_live_rasterize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.rasterize_pdf( + uploaded_pdf_for_rasterize, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_rasterize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.rasterize_pdf( + uploaded_pdf_for_rasterize, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_summarize_pdf_text.py b/tests/live/test_live_summarize_pdf_text.py new file mode 100644 index 00000000..4317c8d5 --- /dev/null +++ b/tests/live/test_live_summarize_pdf_text.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse, SummarizePdfTextResponse + +from ..resources import get_test_resource_path + + +def test_live_summarize_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.summarize_text( + uploaded, + target_word_count=40, + summary_format="overview", + ) + + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary + assert response.input_id == uploaded.id + + +def test_live_summarize_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.summarize_text_to_file( + uploaded, + target_word_count=40, + summary_format="overview", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_summarize_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.summarize_text_to_file( + uploaded, + target_word_count=30, + summary_format="overview", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_summarize_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.summarize_text( + uploaded, + target_word_count=30, + summary_format="overview", + ) + + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary + assert response.input_id == uploaded.id + + +def test_live_summarize_text_invalid_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)summary"): + client.summarize_text( + uploaded, + extra_body={"summary_format": "invalid-style"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_summarize_text_invalid_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)summary"): + await client.summarize_text( + uploaded, + extra_body={"summary_format": "invalid-style"}, + ) diff --git a/tests/live/test_live_translate_pdf_text.py b/tests/live/test_live_translate_pdf_text.py new file mode 100644 index 00000000..a2366ec8 --- /dev/null +++ b/tests/live/test_live_translate_pdf_text.py @@ -0,0 +1,157 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import ( + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, +) + +from ..resources import get_test_resource_path + + +def test_live_translate_pdf_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.translate_pdf_text( + uploaded, + output_language="fr", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text + assert response.output_language == "fr" + assert response.source_languages + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_translate_pdf_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.translate_pdf_text( + uploaded, + output_language="es", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text + assert response.output_language == "es" + assert response.input_id == uploaded.id + + +def test_live_translate_pdf_text_invalid_output_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises( + PdfRestApiError, + match=r"invalid-format is not a valid input for 'output_format'", + ): + client.translate_pdf_text( + uploaded, + output_language="es", + extra_body={"output_format": "invalid-format"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_translate_pdf_text_invalid_output_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises( + PdfRestApiError, + match=r"invalid-format is not a valid input for 'output_format'", + ): + await client.translate_pdf_text( + uploaded, + output_language="de", + extra_body={"output_format": "invalid-format"}, + ) + + +def test_live_translate_pdf_text_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.translate_pdf_text_to_file( + uploaded, + output_language="fr", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".txt") + assert output_file.type == "text/plain" + assert output_file.size > 0 + assert response.warning is None + assert response.output_language == "fr" + assert response.source_languages + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_translate_pdf_text_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.translate_pdf_text_to_file( + uploaded, + output_language="de", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".txt") + assert output_file.type == "text/plain" + assert output_file.size > 0 + assert response.warning is None + assert response.output_language == "de" + assert response.source_languages + assert response.input_id == uploaded.id diff --git a/tests/resources/duckhat.pdf b/tests/resources/duckhat.pdf new file mode 100644 index 00000000..8dbaff23 Binary files /dev/null and b/tests/resources/duckhat.pdf differ diff --git a/tests/resources/report-image.pdf b/tests/resources/report-image.pdf new file mode 100644 index 00000000..47c42a29 Binary files /dev/null and b/tests/resources/report-image.pdf differ diff --git a/tests/resources/xfa.pdf b/tests/resources/xfa.pdf new file mode 100755 index 00000000..8a3ffe2c Binary files /dev/null and b/tests/resources/xfa.pdf differ diff --git a/tests/test_convert_to_excel.py b/tests/test_convert_to_excel.py new file mode 100644 index 00000000..8debea4c --- /dev/null +++ b/tests/test_convert_to_excel.py @@ -0,0 +1,303 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfToExcelPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_convert_to_excel_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToExcelPayload.model_validate( + {"files": [input_file], "output": "report"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "report.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_excel(input_file, output="report") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "report.xlsx" + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_to_excel_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] is True + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_excel( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.xlsx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_excel_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToExcelPayload.model_validate({"files": [input_file]}).model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_excel(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.xlsx" + assert ( + response.output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_to_excel_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_excel( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.55, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.xlsx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.55) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.55) + + +def test_convert_to_excel_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_to_excel(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_to_excel([pdf_file, make_pdf_file(PdfRestFileID.generate())]) + + +@pytest.mark.asyncio +async def test_async_convert_to_excel_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="Must be a PDF file"): + await client.convert_to_excel(png_file) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ): + await client.convert_to_excel( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) diff --git a/tests/test_convert_to_jpeg.py b/tests/test_convert_to_jpeg.py index 46e5f648..4ebf5a4e 100644 --- a/tests/test_convert_to_jpeg.py +++ b/tests/test_convert_to_jpeg.py @@ -83,7 +83,7 @@ def handler(request: httpx.Request) -> httpx.Response: assert str(output_file.url).endswith(output_id) -def test_convert_to_jpeg_defaults_excluded(monkeypatch: pytest.MonkeyPatch) -> None: +def test_convert_to_jpeg_defaults_included(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("PDFREST_API_KEY", raising=False) input_file = make_pdf_file(PdfRestFileID.generate(1)) output_id = "8e9f0011-2222-4bcd-9f00-abcdefabcdef" @@ -98,7 +98,9 @@ def handler(request: httpx.Request) -> httpx.Response: assert_conversion_payload( payload, request_payload, allowed_extras={"jpeg_quality"} ) - assert "jpeg_quality" not in payload + assert payload["jpeg_quality"] == 75 + assert payload["resolution"] == 300 + assert payload["color_model"] == "rgb" return httpx.Response( 200, json={"inputId": [input_file.id], "outputId": [output_id]}, @@ -122,6 +124,50 @@ def handler(request: httpx.Request) -> httpx.Response: assert output_file.type == "image/jpeg" +@pytest.mark.asyncio +async def test_async_convert_to_jpeg_defaults_included( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = "8e9f0011-2222-4bcd-9f00-abcdefabcdef" + + request_payload = JpegPdfRestPayload.model_validate( + {"files": input_file} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_defaults=True) + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/jpg": + payload = json.loads(request.content.decode("utf-8")) + assert_conversion_payload( + payload, request_payload, allowed_extras={"jpeg_quality"} + ) + assert payload["jpeg_quality"] == 75 + assert payload["resolution"] == 300 + assert payload["color_model"] == "rgb" + return httpx.Response( + 200, + json={"inputId": [input_file.id], "outputId": [output_id]}, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "example-001.jpg", "image/jpeg" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_jpeg(input_file) + + output_file = response.output_files[0] + assert output_file.name == "example-001.jpg" + assert output_file.type == "image/jpeg" + + @pytest.mark.parametrize("resolution", [12, 2400]) def test_convert_to_jpeg_resolution_limits( monkeypatch: pytest.MonkeyPatch, resolution: int @@ -172,6 +218,57 @@ def handler(request: httpx.Request) -> httpx.Response: assert response.output_files[0].name == f"example-resolution-{resolution}.jpg" +@pytest.mark.asyncio +@pytest.mark.parametrize("resolution", [12, 2400]) +async def test_async_convert_to_jpeg_resolution_limits( + monkeypatch: pytest.MonkeyPatch, resolution: int +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + request_payload = JpegPdfRestPayload.model_validate( + { + "files": [input_file], + "resolution": resolution, + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_defaults=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/jpg": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert_conversion_payload( + payload, request_payload, allowed_extras={"jpeg_quality"} + ) + return httpx.Response( + 200, + json={"inputId": [input_file.id], "outputId": [output_id]}, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, f"example-resolution-{resolution}.jpg", "image/jpeg" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_jpeg( + input_file, + resolution=resolution, + ) + + assert seen == {"post": 1, "get": 1} + assert response.output_files[0].name == f"example-resolution-{resolution}.jpg" + + @pytest.mark.parametrize("invalid_resolution", [11, 2401]) def test_convert_to_jpeg_resolution_out_of_bounds( monkeypatch: pytest.MonkeyPatch, invalid_resolution: int @@ -195,6 +292,28 @@ def handler(_: httpx.Request) -> httpx.Response: ) +@pytest.mark.asyncio +@pytest.mark.parametrize("invalid_resolution", [11, 2401]) +async def test_async_convert_to_jpeg_resolution_out_of_bounds( + monkeypatch: pytest.MonkeyPatch, invalid_resolution: int +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + + def handler(_: httpx.Request) -> httpx.Response: + pytest.fail("Request should not be sent when validation fails.") + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match=r"less than or equal to 2400|greater than or equal to 12", + ): + await client.convert_to_jpeg( + make_pdf_file(PdfRestFileID.generate(1)), + resolution=invalid_resolution, + ) + + @pytest.mark.parametrize( "invalid_color", [pytest.param("rgba", id="rgba"), pytest.param("lab", id="lab")], @@ -221,6 +340,31 @@ def handler(_: httpx.Request) -> httpx.Response: ) +@pytest.mark.asyncio +@pytest.mark.parametrize( + "invalid_color", + [pytest.param("rgba", id="rgba"), pytest.param("lab", id="lab")], +) +async def test_async_convert_to_jpeg_invalid_color_model( + monkeypatch: pytest.MonkeyPatch, invalid_color: str +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + + def handler(_: httpx.Request) -> httpx.Response: + pytest.fail("Request should not be sent when validation fails.") + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match=re.escape("Input should be 'rgb', 'cmyk' or 'gray'"), + ): + await client.convert_to_jpeg( + make_pdf_file(PdfRestFileID.generate(1)), + color_model=invalid_color, # type: ignore[arg-type] + ) + + def test_convert_to_jpeg_invalid_quality(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("PDFREST_API_KEY", raising=False) @@ -242,10 +386,39 @@ def handler(_: httpx.Request) -> httpx.Response: @pytest.mark.asyncio -async def test_async_convert_to_jpeg_success( +async def test_async_convert_to_jpeg_invalid_quality( monkeypatch: pytest.MonkeyPatch, ) -> None: monkeypatch.delenv("PDFREST_API_KEY", raising=False) + + def handler(_: httpx.Request) -> httpx.Response: + pytest.fail("Request should not be sent when validation fails.") + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match=re.escape("Input should be greater than or equal to 1"), + ): + await client.convert_to_jpeg( + make_pdf_file(PdfRestFileID.generate(1)), + jpeg_quality=0, + ) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "color_model", + [ + pytest.param("rgb", id="rgb"), + pytest.param("cmyk", id="cmyk"), + pytest.param("gray", id="gray"), + ], +) +async def test_async_convert_to_jpeg_success( + monkeypatch: pytest.MonkeyPatch, color_model: str +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) input_file = make_pdf_file(PdfRestFileID.generate(1)) output_id = "9f001122-3333-4cde-af01-cdefabcdef12" @@ -255,7 +428,7 @@ async def test_async_convert_to_jpeg_success( "output_prefix": "async-output", "page_range": "1-2", "resolution": 500, - "color_model": "gray", + "color_model": color_model, "jpeg_quality": 85, "smoothing": ["all"], } @@ -293,7 +466,7 @@ def handler(request: httpx.Request) -> httpx.Response: output_prefix="async-output", page_range="1-2", resolution=500, - color_model="gray", + color_model=color_model, smoothing=["all"], jpeg_quality=85, ) @@ -381,6 +554,27 @@ def handler(_: httpx.Request) -> httpx.Response: ) +@pytest.mark.asyncio +async def test_async_convert_to_jpeg_validation_error( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + + def handler(_: httpx.Request) -> httpx.Response: + pytest.fail("Request should not be sent when validation fails.") + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match="less than or equal to 2400", + ): + await client.convert_to_jpeg( + make_pdf_file(PdfRestFileID.generate(1)), + resolution=5000, + ) + + def test_convert_to_jpeg_invalid_smoothing_value( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -403,6 +597,27 @@ def handler(_: httpx.Request) -> httpx.Response: ) +@pytest.mark.asyncio +async def test_async_convert_to_jpeg_invalid_smoothing_value( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + + def handler(_: httpx.Request) -> httpx.Response: + pytest.fail("Request should not be sent when validation fails.") + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match=re.escape("Input should be 'none', 'all', 'text', 'line' or 'image'"), + ): + await client.convert_to_jpeg( + make_pdf_file(PdfRestFileID.generate(1)), + smoothing="invalid", # type: ignore[arg-type] + ) + + def test_convert_to_jpeg_multiple_files_rejected( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -424,6 +639,26 @@ def handler(_: httpx.Request) -> httpx.Response: client.convert_to_jpeg([first, second]) +@pytest.mark.asyncio +async def test_async_convert_to_jpeg_multiple_files_rejected( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + + def handler(_: httpx.Request) -> httpx.Response: + pytest.fail("Request should not be sent when validation fails.") + + first = make_pdf_file(PdfRestFileID.generate(1)) + second = make_pdf_file(PdfRestFileID.generate(1)) + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match=re.escape("List should have at most 1 item after validation"), + ): + await client.convert_to_jpeg([first, second]) + + def test_convert_to_jpeg_empty_page_range_rejected( monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -446,6 +681,27 @@ def handler(_: httpx.Request) -> httpx.Response: ) +@pytest.mark.asyncio +async def test_async_convert_to_jpeg_empty_page_range_rejected( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + + def handler(_: httpx.Request) -> httpx.Response: + pytest.fail("Request should not be sent when validation fails.") + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match=re.escape("List should have at least 1 item after validation"), + ): + await client.convert_to_jpeg( + make_pdf_file(PdfRestFileID.generate(1)), + page_range=[], + ) + + def test_convert_to_jpeg_sequence_arguments(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("PDFREST_API_KEY", raising=False) input_file = make_pdf_file(PdfRestFileID.generate(1)) @@ -457,7 +713,7 @@ def test_convert_to_jpeg_sequence_arguments(monkeypatch: pytest.MonkeyPatch) -> "page_range": "1, 3", "smoothing": "text", } - ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_defaults=True) + ).model_dump(mode="json", by_alias=True, exclude_none=True) seen: dict[str, int] = {"post": 0, "get": 0} @@ -493,6 +749,56 @@ def handler(request: httpx.Request) -> httpx.Response: assert response.output_files[0].name == "example-001.jpg" +@pytest.mark.asyncio +async def test_async_convert_to_jpeg_sequence_arguments( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = "cdef0123-7777-4ab0-9123-aaaaaaaabbbb" + + request_payload = JpegPdfRestPayload.model_validate( + { + "files": [input_file], + "page_range": "1, 3", + "smoothing": "text", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/jpg": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert_conversion_payload(payload, request_payload, allowed_extras=set()) + return httpx.Response( + 200, + json={"inputId": [input_file.id], "outputId": [output_id]}, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "example-async-001.jpg", "image/jpeg" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_jpeg( + [input_file], + page_range="1, 3", + smoothing="text", + ) + + assert seen == {"post": 1, "get": 1} + assert response.output_files[0].name == "example-async-001.jpg" + + @pytest.mark.asyncio async def test_async_convert_to_jpeg_request_customization( monkeypatch: pytest.MonkeyPatch, diff --git a/tests/test_convert_to_markdown.py b/tests/test_convert_to_markdown.py new file mode 100644 index 00000000..140a1c22 --- /dev/null +++ b/tests/test_convert_to_markdown.py @@ -0,0 +1,353 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileBasedResponse, + PdfRestFileID, +) +from pdfrest.models._internal import ConvertToMarkdownPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_markdown_file(file_id: str, name: str = "markdown.md") -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/markdown", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_convert_to_markdown_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ConvertToMarkdownPayload.model_validate({"files": [text_file]}) + + +def test_convert_to_markdown_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "pages": ["5-2"]} + ) + + +def test_convert_to_markdown_payload_invalid_page_break_comments() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="Input should be 'on' or 'off'"): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "page_break_comments": "maybe"} + ) + + +@pytest.mark.asyncio +async def test_async_convert_to_markdown_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ConvertToMarkdownPayload.model_validate({"files": [text_file]}) + + +@pytest.mark.asyncio +async def test_async_convert_to_markdown_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "pages": ["5-2"]} + ) + + +@pytest.mark.asyncio +async def test_async_convert_to_markdown_payload_invalid_page_break_comments() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="Input should be 'on' or 'off'"): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "page_break_comments": "maybe"} + ) + + +def test_convert_to_markdown_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ConvertToMarkdownPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "md", + "output_type": "file", + "page_break_comments": "on", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_markdown_file(output_id).model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_markdown( + input_file, + pages=["1-3"], + output="md", + page_break_comments="on", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.input_id == input_file.id + assert len(response.output_files) == 1 + + +def test_convert_to_markdown_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = ConvertToMarkdownPayload.model_validate( + { + "files": [input_file], + "output_type": "file", + "page_break_comments": "off", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["get"] = request.extensions.get("timeout") + return httpx.Response( + 200, + json=_make_markdown_file(output_id, "debug.md").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_markdown( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + page_break_comments="off", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + post_timeout = captured_timeout["post"] + get_timeout = captured_timeout["get"] + assert post_timeout is not None + assert get_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.4) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.4) + if isinstance(get_timeout, dict): + assert all( + component == pytest.approx(0.4) for component in get_timeout.values() + ) + else: + assert get_timeout == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_markdown_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = ConvertToMarkdownPayload.model_validate( + { + "files": [input_file], + "output_type": "file", + "page_break_comments": "off", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=_make_markdown_file(output_id, "debug-async.md").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_markdown( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": True}, + timeout=0.4, + page_break_comments="off", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_markdown_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ConvertToMarkdownPayload.model_validate( + {"files": [input_file], "output_type": "file", "page_break_comments": "off"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_markdown_file(output_id, "async.md").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_markdown( + input_file, page_break_comments="off" + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 diff --git a/tests/test_convert_to_pdfa.py b/tests/test_convert_to_pdfa.py new file mode 100644 index 00000000..96af385b --- /dev/null +++ b/tests/test_convert_to_pdfa.py @@ -0,0 +1,367 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfToPdfaPayload +from pdfrest.types import PdfAType + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +@pytest.mark.parametrize( + "output_type", + [ + pytest.param("PDF/A-1b", id="pdfa-1b"), + pytest.param("PDF/A-2b", id="pdfa-2b"), + pytest.param("PDF/A-2u", id="pdfa-2u"), + pytest.param("PDF/A-3b", id="pdfa-3b"), + pytest.param("PDF/A-3u", id="pdfa-3u"), + ], +) +def test_convert_to_pdfa_success( + monkeypatch: pytest.MonkeyPatch, output_type: PdfAType +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = PdfToPdfaPayload.model_validate( + { + "files": [input_file], + "output_type": output_type, + "output": "archive", + "rasterize_if_errors_encountered": "off", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "archive.pdf", "application/pdf" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_pdfa( + input_file, + output_type=output_type, + output="archive", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "archive.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + assert response.warning is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_type", + [ + pytest.param("PDF/A-1b", id="pdfa-1b"), + pytest.param("PDF/A-2b", id="pdfa-2b"), + pytest.param("PDF/A-2u", id="pdfa-2u"), + pytest.param("PDF/A-3b", id="pdfa-3b"), + pytest.param("PDF/A-3u", id="pdfa-3u"), + ], +) +async def test_async_convert_to_pdfa_success( + monkeypatch: pytest.MonkeyPatch, output_type: PdfAType +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = PdfToPdfaPayload.model_validate( + { + "files": [input_file], + "output_type": output_type, + "rasterize_if_errors_encountered": "off", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "async.pdf", "application/pdf"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_pdfa( + input_file, + output_type=output_type, + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +def test_convert_to_pdfa_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["output_type"] == "PDF/A-3b" + assert payload["rasterize_if_errors_encountered"] == "on" + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={"inputId": [input_file.id], "outputId": [output_id]}, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "custom.pdf", "application/pdf" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_pdfa( + input_file, + output_type="PDF/A-3b", + output="custom", + rasterize_if_errors_encountered="on", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.33, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.33) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.33) + + +@pytest.mark.asyncio +async def test_async_convert_to_pdfa_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["output_type"] == "PDF/A-2u" + assert payload["id"] == str(input_file.id) + assert payload["extra"] == {"note": "async"} + assert payload["rasterize_if_errors_encountered"] == "off" + return httpx.Response( + 200, + json={"inputId": [input_file.id], "outputId": [output_id]}, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "async-custom.pdf", "application/pdf" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_pdfa( + input_file, + output_type="PDF/A-2u", + rasterize_if_errors_encountered="off", + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"extra": {"note": "async"}}, + timeout=0.72, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.72) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.72) + + +def test_convert_to_pdfa_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, + match=( + "Input should be 'PDF/A-1b', 'PDF/A-2b', 'PDF/A-2u', " + "'PDF/A-3b' or 'PDF/A-3u'" + ), + ), + ): + client.convert_to_pdfa(pdf_file, output_type=None) # type: ignore[arg-type] + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_to_pdfa(png_file, output_type="PDF/A-2b") + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="PDF/A-1b"), + ): + client.convert_to_pdfa(pdf_file, output_type="PDF/A-4") # type: ignore[arg-type] + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_to_pdfa( + [pdf_file, make_pdf_file(PdfRestFileID.generate())], + output_type="PDF/A-2b", + ) + + +@pytest.mark.asyncio +async def test_async_convert_to_pdfa_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, + match=( + "Input should be 'PDF/A-1b', 'PDF/A-2b', 'PDF/A-2u', " + "'PDF/A-3b' or 'PDF/A-3u'" + ), + ): + await client.convert_to_pdfa( + pdf_file, + output_type=None, # type: ignore[arg-type] + ) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="Must be a PDF file"): + await client.convert_to_pdfa(png_file, output_type="PDF/A-2b") + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="PDF/A-1b"): + await client.convert_to_pdfa( + pdf_file, + output_type="PDF/A-4", # type: ignore[arg-type] + ) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ): + await client.convert_to_pdfa( + [pdf_file, make_pdf_file(PdfRestFileID.generate())], + output_type="PDF/A-2b", + ) diff --git a/tests/test_convert_to_powerpoint.py b/tests/test_convert_to_powerpoint.py new file mode 100644 index 00000000..26653678 --- /dev/null +++ b/tests/test_convert_to_powerpoint.py @@ -0,0 +1,304 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfToPowerpointPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_convert_to_powerpoint_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToPowerpointPayload.model_validate( + {"files": [input_file], "output": "slides"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "slides.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_powerpoint(input_file, output="slides") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "slides.pptx" + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_to_powerpoint_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] is True + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_powerpoint( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pptx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_powerpoint_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToPowerpointPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_powerpoint(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pptx" + assert ( + response.output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_to_powerpoint_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_powerpoint( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.55, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pptx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.55) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.55) + + +def test_convert_to_powerpoint_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_to_powerpoint(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_to_powerpoint( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + +@pytest.mark.asyncio +async def test_async_convert_to_powerpoint_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="Must be a PDF file"): + await client.convert_to_powerpoint(png_file) + + with pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ): + await client.convert_to_powerpoint( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) diff --git a/tests/test_convert_xfa_to_acroforms.py b/tests/test_convert_xfa_to_acroforms.py new file mode 100644 index 00000000..0d634cc7 --- /dev/null +++ b/tests/test_convert_xfa_to_acroforms.py @@ -0,0 +1,299 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfXfaToAcroformsPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_convert_xfa_to_acroforms_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfXfaToAcroformsPayload.model_validate( + {"files": [input_file], "output": "acro"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "acro.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_xfa_to_acroforms(input_file, output="acro") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "acro.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_xfa_to_acroforms_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_xfa_to_acroforms( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.31, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.31) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.31) + + +@pytest.mark.asyncio +async def test_async_convert_xfa_to_acroforms_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfXfaToAcroformsPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_xfa_to_acroforms(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_xfa_to_acroforms_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_xfa_to_acroforms( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_convert_xfa_to_acroforms_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_xfa_to_acroforms(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_xfa_to_acroforms( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + +@pytest.mark.asyncio +async def test_async_convert_xfa_to_acroforms_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="Must be a PDF file"): + await client.convert_xfa_to_acroforms(png_file) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ): + await client.convert_xfa_to_acroforms( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py new file mode 100644 index 00000000..89ef81ce --- /dev/null +++ b/tests/test_extract_images.py @@ -0,0 +1,300 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import ExtractImagesPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_png_file(file_id: str, name: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_extract_images_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ExtractImagesPayload.model_validate({"files": [text_file]}) + + +def test_extract_images_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ExtractImagesPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +@pytest.mark.asyncio +async def test_async_extract_images_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ExtractImagesPayload.model_validate({"files": [text_file]}) + + +@pytest.mark.asyncio +async def test_async_extract_images_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ExtractImagesPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_extract_images_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id_1 = str(PdfRestFileID.generate()) + output_id_2 = str(PdfRestFileID.generate()) + + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file], "pages": ["1-3"], "output": "images"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id_1, output_id_2], + }, + ) + if request.method == "GET" and request.url.path in { + f"/resource/{output_id_1}", + f"/resource/{output_id_2}", + }: + seen["get"] += 1 + return httpx.Response( + 200, + json=_make_png_file( + output_id_1 + if request.url.path.endswith(output_id_1) + else output_id_2, + "image.png", + ).model_dump(mode="json", by_alias=True), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_images(input_file, pages=["1-3"], output="images") + + assert seen == {"post": 1, "get": 2} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 2 + assert response.input_id == input_file.id + + +def test_extract_images_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file], "pages": ["1-last"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": str(input_file.id), + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=_make_png_file(output_id, "debug.png").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_images( + input_file, + pages=["1-last"], + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + +@pytest.mark.asyncio +async def test_async_extract_images_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file], "pages": ["1-last"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": str(input_file.id), + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=_make_png_file(output_id, "debug-async.png").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_images( + input_file, + pages=["1-last"], + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + +@pytest.mark.asyncio +async def test_async_extract_images_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=_make_png_file(output_id, "async.png").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_images(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + assert response.input_id == input_file.id diff --git a/tests/test_extract_pdf_text_to_file.py b/tests/test_extract_pdf_text_to_file.py new file mode 100644 index 00000000..fb875fc9 --- /dev/null +++ b/tests/test_extract_pdf_text_to_file.py @@ -0,0 +1,346 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import ExtractTextPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_text_file(file_id: str, name: str = "extracted.txt") -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_extract_pdf_text_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ExtractTextPayload.model_validate({"files": [text_file]}) + + +def test_extract_pdf_text_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ExtractTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ExtractTextPayload.model_validate({"files": [text_file]}) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ExtractTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_extract_pdf_text_to_file_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "text", + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_text_file(output_id).model_dump(mode="json", by_alias=True), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text_to_file( + input_file, + pages=["1-3"], + output="text", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.input_id == input_file.id + assert len(response.output_files) == 1 + assert response.output_file.id == output_id + + +def test_extract_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "output": "file-output", + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["get"] = request.extensions.get("timeout") + return httpx.Response( + 200, + json=_make_text_file(output_id, "debug.txt").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text_to_file( + input_file, + output="file-output", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.35, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + post_timeout = captured_timeout["post"] + get_timeout = captured_timeout["get"] + assert post_timeout is not None + assert get_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.35) + if isinstance(get_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in get_timeout.values() + ) + else: + assert get_timeout == pytest.approx(0.35) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "output": "file-output", + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + captured_timeout["get"] = request.extensions.get("timeout") + return httpx.Response( + 200, + json=_make_text_file(output_id, "debug-async.txt").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_pdf_text_to_file( + input_file, + output="file-output", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": True}, + timeout=0.35, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + post_timeout = captured_timeout["post"] + get_timeout = captured_timeout["get"] + assert post_timeout is not None + assert get_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.35) + if isinstance(get_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in get_timeout.values() + ) + else: + assert get_timeout == pytest.approx(0.35) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_text_file(output_id, "async.txt").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_pdf_text_to_file(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + assert response.input_id == input_file.id diff --git a/tests/test_flatten_annotations.py b/tests/test_flatten_annotations.py new file mode 100644 index 00000000..b34cf576 --- /dev/null +++ b/tests/test_flatten_annotations.py @@ -0,0 +1,308 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfFlattenAnnotationsPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_flatten_annotations_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenAnnotationsPayload.model_validate( + {"files": [input_file], "output": "flattened"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "flattened.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_annotations(input_file, output="flattened") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "flattened.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_flatten_annotations_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_annotations( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_flatten_annotations_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenAnnotationsPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_annotations(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_flatten_annotations_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_annotations( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_flatten_annotations_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.flatten_annotations(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.flatten_annotations([pdf_file, make_pdf_file(PdfRestFileID.generate())]) + + +@pytest.mark.asyncio +async def test_async_flatten_annotations_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="Must be a PDF file"): + await client.flatten_annotations(png_file) + + with pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ): + await client.flatten_annotations( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) diff --git a/tests/test_flatten_transparencies.py b/tests/test_flatten_transparencies.py new file mode 100644 index 00000000..2f50ead1 --- /dev/null +++ b/tests/test_flatten_transparencies.py @@ -0,0 +1,332 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfFlattenTransparenciesPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_flatten_transparencies_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenTransparenciesPayload.model_validate( + {"files": [input_file], "output": "flattened", "quality": "high"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "flattened.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_transparencies( + input_file, output="flattened", quality="high" + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "flattened.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_flatten_transparencies_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + assert payload["quality"] == "low" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_transparencies( + input_file, + output="custom", + quality="low", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_flatten_transparencies_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenTransparenciesPayload.model_validate( + {"files": [input_file], "quality": "medium"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_transparencies(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_flatten_transparencies_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["quality"] == "high" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_transparencies( + input_file, + quality="high", + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_flatten_transparencies_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.flatten_transparencies(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.flatten_transparencies( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="Input should be 'low', 'medium' or 'high'" + ), + ): + client.flatten_transparencies(pdf_file, quality="ultra") # type: ignore[arg-type] + + +@pytest.mark.asyncio +async def test_async_flatten_transparencies_validation( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="Must be a PDF file"): + await client.flatten_transparencies(png_file) + + with pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ): + await client.flatten_transparencies( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + with pytest.raises( + ValidationError, match="Input should be 'low', 'medium' or 'high'" + ): + await client.flatten_transparencies( + pdf_file, + quality="ultra", # type: ignore[arg-type] + ) diff --git a/tests/test_linearize_pdf.py b/tests/test_linearize_pdf.py new file mode 100644 index 00000000..68e25fbc --- /dev/null +++ b/tests/test_linearize_pdf.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfLinearizePayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_linearize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfLinearizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "linearized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.linearize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "linearized.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + assert response.warning is None + + +def test_linearize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "linearized" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "linearized-out.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.linearize_pdf( + input_file, + output="linearized", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.61, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "linearized-out.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.61) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.61) + + +@pytest.mark.asyncio +async def test_async_linearize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfLinearizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-linearized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.linearize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-linearized.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_linearize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["flags"] == ["a", "b"] + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-linearized-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.linearize_pdf( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"flags": ["a", "b"]}, + timeout=0.83, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-linearized-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.83) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.83) + + +@pytest.mark.parametrize( + ("files", "match"), + [ + pytest.param( + "png", + "Must be a PDF file", + id="non-pdf-file", + ), + pytest.param( + "multiple", + "List should have at most 1 item after validation", + id="multiple-files", + ), + ], +) +def test_linearize_pdf_validation( + monkeypatch: pytest.MonkeyPatch, + files: str, + match: str, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + files_argument = ( + png_file + if files == "png" + else [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match=match), + ): + client.linearize_pdf(files_argument) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + ("files", "match"), + [ + pytest.param( + "png", + "Must be a PDF file", + id="non-pdf-file", + ), + pytest.param( + "multiple", + "List should have at most 1 item after validation", + id="multiple-files", + ), + ], +) +async def test_async_linearize_pdf_validation( + monkeypatch: pytest.MonkeyPatch, + files: str, + match: str, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + files_argument = ( + png_file + if files == "png" + else [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match=match): + await client.linearize_pdf(files_argument) diff --git a/tests/test_ocr_pdf.py b/tests/test_ocr_pdf.py new file mode 100644 index 00000000..fe55df9e --- /dev/null +++ b/tests/test_ocr_pdf.py @@ -0,0 +1,327 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import OcrPdfPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def test_ocr_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + OcrPdfPayload.model_validate({"files": [text_file]}) + + +def test_ocr_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + OcrPdfPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +@pytest.mark.asyncio +async def test_async_ocr_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + OcrPdfPayload.model_validate({"files": [text_file]}) + + +@pytest.mark.asyncio +async def test_async_ocr_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + OcrPdfPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_ocr_payload_languages() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + payload = OcrPdfPayload.model_validate( + {"files": [file_repr], "languages": ["English", "German"]} + ) + assert payload.languages == ["English", "German"] + assert ( + payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + )["languages"] + == "English,German" + ) + + +@pytest.mark.asyncio +async def test_async_ocr_payload_languages() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + payload = OcrPdfPayload.model_validate( + {"files": [file_repr], "languages": ["English", "German"]} + ) + assert payload.languages == ["English", "German"] + assert ( + payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + )["languages"] + == "English,German" + ) + + +def test_ocr_payload_invalid_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="ChineseSimplified"): + OcrPdfPayload.model_validate({"files": [file_repr], "languages": ["Klingon"]}) + + +@pytest.mark.asyncio +async def test_async_ocr_payload_invalid_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="ChineseSimplified"): + OcrPdfPayload.model_validate({"files": [file_repr], "languages": ["Klingon"]}) + + +def test_ocr_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = OcrPdfPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "ocr", + "languages": ["English"], + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": str(input_file.id), + "outputId": output_id, + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=make_pdf_file(output_id, "ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.ocr_pdf( + input_file, + pages=["1-3"], + output="ocr", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "ocr.pdf" + assert response.input_id == input_file.id + + +def test_ocr_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = OcrPdfPayload.model_validate( + {"files": [input_file], "languages": ["English"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=make_pdf_file(output_id, "custom-ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.ocr_pdf( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_ocr_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = OcrPdfPayload.model_validate( + {"files": [input_file], "languages": ["English"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=make_pdf_file(output_id, "custom-async-ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.ocr_pdf( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_ocr_pdf_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = OcrPdfPayload.model_validate( + {"files": [input_file], "languages": ["English"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=make_pdf_file(output_id, "async-ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.ocr_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.input_id == input_file.id diff --git a/tests/test_rasterize_pdf.py b/tests/test_rasterize_pdf.py new file mode 100644 index 00000000..d5a189e1 --- /dev/null +++ b/tests/test_rasterize_pdf.py @@ -0,0 +1,290 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfRasterizePayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_rasterize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfRasterizePayload.model_validate( + {"files": [input_file], "output": "rasterized"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "rasterized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.rasterize_pdf(input_file, output="rasterized") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "rasterized.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_rasterize_pdf_request_customization(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.rasterize_pdf( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.31, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.31) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.31) + + +@pytest.mark.asyncio +async def test_async_rasterize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfRasterizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.rasterize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_rasterize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.rasterize_pdf( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_rasterize_pdf_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.rasterize_pdf(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.rasterize_pdf([pdf_file, make_pdf_file(PdfRestFileID.generate())]) + + +@pytest.mark.asyncio +async def test_async_rasterize_pdf_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + with pytest.raises(ValidationError, match="Must be a PDF file"): + await client.rasterize_pdf(png_file) + + with pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ): + await client.rasterize_pdf( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) diff --git a/tests/test_summarize_pdf_text.py b/tests/test_summarize_pdf_text.py new file mode 100644 index 00000000..2f8c3ef9 --- /dev/null +++ b/tests/test_summarize_pdf_text.py @@ -0,0 +1,521 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileBasedResponse, + PdfRestFileID, + SummarizePdfTextResponse, +) +from pdfrest.models._internal import SummarizePdfTextPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def _make_text_file(file_id: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_summarize_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + SummarizePdfTextPayload.model_validate({"files": [image_file]}) + + +def test_summarize_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + SummarizePdfTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +@pytest.mark.asyncio +async def test_async_summarize_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + SummarizePdfTextPayload.model_validate({"files": [image_file]}) + + +@pytest.mark.asyncio +async def test_async_summarize_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + SummarizePdfTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_summarize_text_json_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_text_file(str(PdfRestFileID.generate(1))) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "target_word_count": 120, + "summary_format": "bullet_points", + "pages": ["1-3"], + "output_format": "plaintext", + "output_type": "json", + "output": "summary", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "summary": "Key points...", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text( + input_file, + target_word_count=120, + summary_format="bullet_points", + pages=["1-3"], + output_format="plaintext", + output="summary", + ) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Key points..." + assert response.input_id == input_file.id + + +@pytest.mark.asyncio +async def test_async_summarize_text_json_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_text_file(str(PdfRestFileID.generate(1))) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "target_word_count": 120, + "summary_format": "bullet_points", + "pages": ["1-3"], + "output_format": "plaintext", + "output_type": "json", + "output": "summary", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "summary": "Async key points...", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text( + input_file, + target_word_count=120, + summary_format="bullet_points", + pages=["1-3"], + output_format="plaintext", + output="summary", + ) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Async key points..." + assert response.input_id == input_file.id + + +def test_summarize_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_text_file(str(PdfRestFileID.generate(1))) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "target_word_count": 200, + "summary_format": "bullet_points", + "pages": ["2-last"], + "output_format": "plaintext", + "output_type": "file", + "output": "summary", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "summary.txt", "text/plain"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text_to_file( + input_file, + target_word_count=200, + summary_format="bullet_points", + pages=["2-last"], + output_format="plaintext", + output="summary", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "summary.txt" + assert response.input_id == input_file.id + + +def test_summarize_text_to_file_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "output_type": "file", + "output_format": "markdown", + "summary_format": "overview", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + captured_timeout: dict[str, float | dict[str, float] | None] = {} + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "summary.txt", "text/plain"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text_to_file( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.25, + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "summary.txt" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.25) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.25) + + +@pytest.mark.asyncio +async def test_async_summarize_text_to_file_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "output_type": "file", + "output_format": "markdown", + "summary_format": "overview", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + captured_timeout: dict[str, float | dict[str, float] | None] = {} + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "async-summary.txt", "text/plain" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text_to_file( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": True}, + timeout=0.25, + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "async-summary.txt" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.25) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.25) + + +def test_summarize_text_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + {"files": [input_file], "output_type": "json"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "summary": "Sync summary", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text(input_file) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Sync summary" + + +@pytest.mark.asyncio +async def test_async_summarize_text_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + {"files": [input_file], "output_type": "json"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "summary": "Async summary", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text(input_file) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Async summary" + assert response.input_id == input_file.id + + +@pytest.mark.asyncio +async def test_async_summarize_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + {"files": [input_file], "output_type": "file"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "async-summary.txt", "text/plain" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text_to_file(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.input_id == input_file.id diff --git a/tests/test_translate_pdf_text.py b/tests/test_translate_pdf_text.py new file mode 100644 index 00000000..1d244031 --- /dev/null +++ b/tests/test_translate_pdf_text.py @@ -0,0 +1,527 @@ +from __future__ import annotations + +import json +import re + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileID, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, +) +from pdfrest.models._internal import TranslatePdfTextPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + +OUTPUT_LANGUAGE_ERROR = ( + "The provided 'output_language' language tag is invalid. Format 'output_language' " + "as a valid 2-3 character ISO 639 language code (e.g., 'en', 'es', 'fra'), " + "optionally with a script, alphabetic region, or numeric region (e.g., 'zh-Hant', " + "'eng-US', 'es-419'). See documentation for recommended formats." +) + + +def _make_markdown_file(file_id: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.md", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/markdown", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_translate_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + TranslatePdfTextPayload.model_validate( + {"files": [image_file], "output_language": "fr"} + ) + + +@pytest.mark.asyncio +async def test_async_translate_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + TranslatePdfTextPayload.model_validate( + {"files": [image_file], "output_language": "fr"} + ) + + +@pytest.mark.parametrize( + "output_language", + [ + pytest.param("en", id="language-2-letter"), + pytest.param("fra", id="language-3-letter"), + pytest.param("zh-Hant", id="script"), + pytest.param("eng-US", id="alpha-region"), + pytest.param("es-419", id="numeric-region"), + ], +) +def test_translate_payload_accepts_valid_output_language( + output_language: str, +) -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + payload = TranslatePdfTextPayload.model_validate( + {"files": [file_repr], "output_language": output_language} + ) + + assert payload.output_language == output_language + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_language", + [ + pytest.param("en", id="language-2-letter"), + pytest.param("fra", id="language-3-letter"), + pytest.param("zh-Hant", id="script"), + pytest.param("eng-US", id="alpha-region"), + pytest.param("es-419", id="numeric-region"), + ], +) +async def test_async_translate_payload_accepts_valid_output_language( + output_language: str, +) -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + payload = TranslatePdfTextPayload.model_validate( + {"files": [file_repr], "output_language": output_language} + ) + + assert payload.output_language == output_language + + +@pytest.mark.parametrize( + "output_language", + [ + pytest.param("", id="empty"), + pytest.param("e", id="too-short"), + pytest.param("english", id="not-a-code"), + pytest.param("eng-USA", id="long-subtag"), + pytest.param("en-1234", id="long-numeric-region"), + pytest.param("en-US-extra", id="too-many-subtags"), + ], +) +def test_translate_payload_rejects_invalid_output_language( + output_language: str, +) -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, + match=re.escape(OUTPUT_LANGUAGE_ERROR), + ): + TranslatePdfTextPayload.model_validate( + {"files": [file_repr], "output_language": output_language} + ) + + +def test_translate_payload_requires_target_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError): + TranslatePdfTextPayload.model_validate({"files": [file_repr]}) + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_language", + [ + pytest.param("", id="empty"), + pytest.param("e", id="too-short"), + pytest.param("english", id="not-a-code"), + pytest.param("eng-USA", id="long-subtag"), + pytest.param("en-1234", id="long-numeric-region"), + pytest.param("en-US-extra", id="too-many-subtags"), + ], +) +async def test_async_translate_payload_rejects_invalid_output_language( + output_language: str, +) -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, + match=re.escape(OUTPUT_LANGUAGE_ERROR), + ): + TranslatePdfTextPayload.model_validate( + {"files": [file_repr], "output_language": output_language} + ) + + +@pytest.mark.asyncio +async def test_async_translate_payload_requires_target_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError): + TranslatePdfTextPayload.model_validate({"files": [file_repr]}) + + +def test_translate_pdf_text_json_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_markdown_file(str(PdfRestFileID.generate(1))) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "fr", + "pages": ["1-2"], + "output_format": "plaintext", + "output_type": "json", + "output": "translation", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "translated_text": "Bonjour", + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "fr", + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.translate_pdf_text( + input_file, + output_language="fr", + pages=["1-2"], + output_format="plaintext", + output="translation", + ) + + assert seen == {"post": 1} + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text == "Bonjour" + assert response.source_languages == ["en"] + assert response.output_language == "fr" + assert response.input_id == input_file.id + + +@pytest.mark.asyncio +async def test_async_translate_pdf_text_json_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_markdown_file(str(PdfRestFileID.generate(1))) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "fr", + "pages": ["1-2"], + "output_format": "plaintext", + "output_type": "json", + "output": "translation", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "translated_text": "Bonjour", + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "fr", + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.translate_pdf_text( + input_file, + output_language="fr", + pages=["1-2"], + output_format="plaintext", + output="translation", + ) + + assert seen == {"post": 1} + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text == "Bonjour" + assert response.source_languages == ["en"] + assert response.output_language == "fr" + assert response.input_id == input_file.id + + +def test_translate_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "es", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputUrl": f"https://api.pdfrest.com/resource/{output_id}?format=file", + "outputId": output_id, + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "es", + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=_make_markdown_file(output_id).model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.translate_pdf_text_to_file( + input_file, + output_language="es", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_file.id == output_id + assert response.output_language == "es" + assert response.source_languages == ["en"] + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + +@pytest.mark.asyncio +async def test_async_translate_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "es", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputUrl": f"https://api.pdfrest.com/resource/{output_id}?format=file", + "outputId": output_id, + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "es", + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=_make_markdown_file(output_id).model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.translate_pdf_text_to_file( + input_file, + output_language="es", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_file.id == output_id + assert response.output_language == "es" + assert response.source_languages == ["en"] + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + +def test_translate_pdf_text_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = TranslatePdfTextPayload.model_validate( + {"files": [input_file], "output_language": "de", "output_type": "json"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "translated_text": "Hallo", + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "de", + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.translate_pdf_text( + input_file, + output_language="de", + ) + + assert seen == {"post": 1} + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text == "Hallo" + assert response.source_languages == ["en"] + + +@pytest.mark.asyncio +async def test_async_translate_pdf_text_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = TranslatePdfTextPayload.model_validate( + {"files": [input_file], "output_language": "de", "output_type": "json"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "translated_text": "Hallo", + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "de", + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.translate_pdf_text( + input_file, + output_language="de", + ) + + assert seen == {"post": 1} + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text == "Hallo" + assert response.source_languages == ["en"] + assert response.output_language == "de" + assert response.input_id == input_file.id diff --git a/uv.lock b/uv.lock index ba0e7705..aa0a1f3c 100644 --- a/uv.lock +++ b/uv.lock @@ -439,6 +439,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "langcodes" +version = "3.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/75/f9edc5d72945019312f359e69ded9f82392a81d49c5051ed3209b100c0d2/langcodes-3.5.1.tar.gz", hash = "sha256:40bff315e01b01d11c2ae3928dd4f5cbd74dd38f9bd912c12b9a3606c143f731", size = 191084, upload-time = "2025-12-02T16:22:01.627Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/c1/d10b371bcba7abce05e2b33910e39c33cfa496a53f13640b7b8e10bb4d2b/langcodes-3.5.1-py3-none-any.whl", hash = "sha256:b6a9c25c603804e2d169165091d0cdb23934610524a21d226e4f463e8e958a72", size = 183050, upload-time = "2025-12-02T16:21:59.954Z" }, +] + [[package]] name = "license-expression" version = "30.4.4" @@ -601,6 +610,7 @@ source = { editable = "." } dependencies = [ { name = "exceptiongroup" }, { name = "httpx" }, + { name = "langcodes" }, { name = "pydantic" }, ] @@ -626,6 +636,7 @@ dev = [ requires-dist = [ { name = "exceptiongroup", specifier = ">=1.3.0" }, { name = "httpx", specifier = ">=0.28.1" }, + { name = "langcodes", specifier = ">=3.4.0" }, { name = "pydantic", specifier = ">=2.12.0" }, ]