diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index ea3ef34..3cd574d 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -25,8 +25,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install -r requirements-dev.txt - pip install -e . + pip install -e ".[test]" - name: Run tests run: | @@ -46,12 +45,12 @@ jobs: - name: Set up Python uses: actions/setup-python@v6 with: - python-version: "3.10" + python-version: "3.14" - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff mypy + pip install -e ".[lint]" - name: Lint with ruff run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c347fad..7528dd3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,6 +22,8 @@ cd pyUSPTO ### Set Up Development Environment +> **Note:** Python 3.11+ is required for the development environment. Some dev dependencies (e.g. `myst-parser`) do not support Python 3.10. The package itself supports Python 3.10+. + ```bash # Create and activate a virtual environment python -m venv venv diff --git a/examples/ifw_example.py b/examples/ifw_example.py index 9e185e8..8e51643 100644 --- a/examples/ifw_example.py +++ b/examples/ifw_example.py @@ -1,7 +1,26 @@ """Example usage of pyUSPTO for IFW data. -This example demonstrates how to use the PatentDataClient to interact with the USPTO Patent Data API. -It shows how to retrieve IFW based on various identifying values. +This example demonstrates how to use the PatentDataClient to retrieve Image File +Wrapper (IFW) data from the USPTO Patent Data API. It covers: + +- get_IFW_metadata(): retrieve a PatentFileWrapper (with populated document_bag) + using any of the five supported identifiers: + - application_number + - patent_number + - publication_number + - PCT_app_number + - PCT_pub_number + +- get_IFW(): retrieve metadata AND bulk-download all prosecution history documents + (PDF preferred, DOCX fallback; XML and formatless docs are skipped). Returns an + IFWResult with: + - wrapper: the PatentFileWrapper + - output_path: path to the ZIP archive (as_zip=True, default) or output directory + - downloaded_documents: dict mapping document_identifier -> filename, allowing + each Document in document_bag to be linked to its downloaded file + +- download_archive() / download_publication(): download the pgpub or grant XML + archive from PrintedMetaData. """ import json @@ -61,6 +80,25 @@ print(f" - IFW Found based on PCT Pub No: {PCT_pub_number}") +print("\nGet IFW + download all prosecution docs as a ZIP archive -->") +ifw_result = client.get_IFW(application_number=application_number, destination="./download-example", overwrite=True) +if ifw_result: + print(f"Title: {ifw_result.wrapper.application_meta_data.invention_title if ifw_result.wrapper.application_meta_data else 'N/A'}") + print(f"Output: {ifw_result.output_path}") + doc_bag = ifw_result.wrapper.document_bag or [] + print(f"Documents downloaded: {len(ifw_result.downloaded_documents)} of {len(doc_bag)}") + for doc in doc_bag: + if doc.document_identifier: + filename = ifw_result.downloaded_documents.get(doc.document_identifier) + status = f"-> {filename}" if filename else "(skipped)" + print(f" {doc.document_code} [{doc.document_identifier}] {status}") + +print("\nGet IFW + download all prosecution docs as a directory (no ZIP) -->") +ifw_dir_result = client.get_IFW(application_number=application_number, destination="./download-example", overwrite=True, as_zip=False) +if ifw_dir_result: + print(f"Output directory: {ifw_dir_result.output_path}") + + print("\nNow let's download the Patent Publication Text -->") if app_no_ifw and app_no_ifw.pgpub_document_meta_data: pgpub_archive = app_no_ifw.pgpub_document_meta_data diff --git a/examples/patent_data_example.py b/examples/patent_data_example.py index 48e0998..5809925 100644 --- a/examples/patent_data_example.py +++ b/examples/patent_data_example.py @@ -302,6 +302,28 @@ print(f"Error with POST search: {e}") +# Search by CPC classification code +# CPC codes containing spaces or slashes are automatically quoted for the Lucene query. +# cpc_classification_bag on ApplicationMetaData is a list[str] of all CPC codes assigned +# to the application, so each result may have multiple codes. +try: + print("\nSearching by CPC classification code 'H10D 64/667'...") + cpc_response = client.search_applications( + classification_q="H10D 64/667", limit=3 + ) + print(f"Found {cpc_response.count} applications with CPC code H10D 64/667.") + for patent_wrapper in cpc_response.patent_file_wrapper_data_bag: + app_meta = patent_wrapper.application_meta_data + if app_meta: + print( + f" - App No: {patent_wrapper.application_number_text}, Title: {app_meta.invention_title}" + ) + if app_meta.cpc_classification_bag: + print(f" CPC codes: {', '.join(app_meta.cpc_classification_bag)}") +except Exception as e: + print(f"Error searching by CPC classification: {e}") + + # Example of getting status codes try: print("\nGetting first 5 status codes...") diff --git a/pyproject.toml b/pyproject.toml index 6b86858..f591731 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,24 +51,17 @@ dependencies = [ dynamic = ["version"] [project.optional-dependencies] -dev = [ - # Testing - "pytest>=9.0.2", - "pytest-cov>=7.0.0", - "pytest-mock>=3.15.1", - # Documentation - "sphinx==8.1.3", - "sphinx-rtd-theme>=3.0.0", - "sphinx_immaterial>=0.13.8", - "sphinx-autodoc-typehints==3.0.1", +test = ["pytest>=9.0.2", "pytest-cov>=7.0.0", "pytest-mock>=3.15.1", "typing_extensions>=4.15.0"] +docs = [ + "sphinx>=9.1.0", + "sphinx-rtd-theme>=3.1.0", + "sphinx_immaterial>=0.13.9", + "sphinx-autodoc-typehints>=3.9.5", "sphinx-copybutton>=0.5.2", - "myst-parser>=4.0.1", - # Type checking - "mypy>=1.19.0", - "types-requests>=2.32.4", - # Code quality and formatting - "ruff>=0.15.0", + "myst-parser>=5.0.0", ] +lint = ["mypy>=1.19.0", "types-requests>=2.32.4", "ruff>=0.15.0"] +dev = ["pyUSPTO[test]", "pyUSPTO[docs]", "pyUSPTO[lint]"] [project.urls] GitHub = "https://github.com/DunlapCoddingPC/pyUSPTO" diff --git a/requirements-dev.txt b/requirements-dev.txt index b36ed0d..195a534 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,14 +22,14 @@ colorama==0.4.6 # sphinx coverage==7.13.4 # via pytest-cov -docutils==0.21.2 +docutils==0.22.4 # via # myst-parser # sphinx # sphinx-rtd-theme idna==3.11 # via requests -imagesize==1.4.1 +imagesize==2.0.0 # via sphinx iniconfig==2.3.0 # via pytest @@ -39,7 +39,7 @@ jinja2==3.1.6 # sphinx librt==0.8.1 # via mypy -markdown-it-py==3.0.0 +markdown-it-py==4.0.0 # via # mdit-py-plugins # myst-parser @@ -52,11 +52,11 @@ mdit-py-plugins==0.5.0 mdurl==0.1.2 # via markdown-it-py mypy==1.19.1 - # via pyUSPTO (pyproject.toml) + # via pyuspto mypy-extensions==1.1.0 # via mypy -myst-parser==4.0.1 - # via pyUSPTO (pyproject.toml) +myst-parser==5.0.0 + # via pyuspto packaging==26.0 # via # pytest @@ -81,41 +81,48 @@ pygments==2.19.2 # sphinx pytest==9.0.2 # via - # pyUSPTO (pyproject.toml) # pytest-cov # pytest-mock + # pyuspto pytest-cov==7.0.0 - # via pyUSPTO (pyproject.toml) + # via pyuspto pytest-mock==3.15.1 - # via pyUSPTO (pyproject.toml) + # via pyuspto +pyuspto @ file:///C:/Users/andrewp/Documents/GitHub/pyUSPTO + # via + # pyUSPTO (pyproject.toml) + # pyuspto pyyaml==6.0.3 # via myst-parser requests==2.32.5 # via # pyUSPTO (pyproject.toml) + # pyuspto # sphinx # sphinx-immaterial +roman-numerals==4.1.0 + # via sphinx ruff==0.15.4 - # via pyUSPTO (pyproject.toml) + # via pyuspto snowballstemmer==3.0.1 # via sphinx -sphinx==8.1.3 +sphinx==9.1.0 # via # myst-parser - # pyUSPTO (pyproject.toml) + # pyuspto # sphinx-autodoc-typehints # sphinx-copybutton # sphinx-immaterial # sphinx-rtd-theme # sphinxcontrib-jquery -sphinx-autodoc-typehints==3.0.1 - # via pyUSPTO (pyproject.toml) +sphinx-autodoc-typehints==3.9.6 + # via pyuspto sphinx-copybutton==0.5.2 - # via pyUSPTO (pyproject.toml) + # via pyuspto sphinx-immaterial==0.13.9 - # via pyUSPTO (pyproject.toml) + # via pyuspto sphinx-rtd-theme==3.1.0 - # via pyUSPTO (pyproject.toml) + # via pyuspto sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 @@ -131,19 +138,22 @@ sphinxcontrib-qthelp==2.0.0 sphinxcontrib-serializinghtml==2.0.0 # via sphinx types-requests==2.32.4.20260107 - # via pyUSPTO (pyproject.toml) + # via pyuspto typing-extensions==4.15.0 # via # mypy # pydantic # pydantic-core # pydantic-extra-types + # pyuspto # sphinx-immaterial # typing-inspection typing-inspection==0.4.2 # via pydantic tzdata==2025.3 - # via pyUSPTO (pyproject.toml) + # via + # pyUSPTO (pyproject.toml) + # pyuspto urllib3==2.6.3 # via # requests diff --git a/src/pyUSPTO/clients/patent_data.py b/src/pyUSPTO/clients/patent_data.py index ab2ce96..c815ca3 100644 --- a/src/pyUSPTO/clients/patent_data.py +++ b/src/pyUSPTO/clients/patent_data.py @@ -4,7 +4,11 @@ It allows you to search for and retrieve patent application data. """ +import dataclasses +import os +import tempfile import warnings +import zipfile from collections.abc import Iterator from typing import Any @@ -20,6 +24,7 @@ DocumentMimeType, EventData, ForeignPriority, + IFWResult, PatentDataResponse, PatentFileWrapper, PatentTermAdjustmentData, @@ -121,6 +126,9 @@ def sanitize_application_number(self, input_number: str) -> str: # Example: "PCT/US2024/012345" -> "PCTUS2412345" if raw.startswith("PCT"): parts = raw.split("/") + if len(parts) == 1: + # Already sanitized (e.g. "PCTUS0812705"), return as-is + return raw if len(parts) != 3: raise ValueError( f"Invalid PCT application format: {input_number}. " @@ -298,9 +306,12 @@ def search_applications( ) q_parts.append(f"assignmentBag.assigneeBag.assigneeNameText:{v}") if classification_q: - q_parts.append( - f"applicationMetaData.cpcClassificationBag:{classification_q}" + v = ( + f'"{classification_q}"' + if any(c in classification_q for c in [" ", "/"]) + else classification_q ) + q_parts.append(f"applicationMetaData.cpcClassificationBag:{v}") if earliestPublicationNumber_q: q_parts.append( f"applicationMetaData.earliestPublicationNumber:{earliestPublicationNumber_q}" @@ -439,9 +450,12 @@ def get_search_results( ) q_parts.append(f"assignmentBag.assigneeBag.assigneeNameText:{v}") if classification_q: - q_parts.append( - f"applicationMetaData.cpcClassificationBag:{classification_q}" + v = ( + f'"{classification_q}"' + if any(c in classification_q for c in [" ", "/"]) + else classification_q ) + q_parts.append(f"applicationMetaData.cpcClassificationBag:{v}") if filing_date_from_q and filing_date_to_q: q_parts.append( @@ -1058,27 +1072,169 @@ def get_IFW_metadata( comprehensive data if found using one of the identifiers, otherwise None. """ + wrapper = None if application_number: - return self.get_application_by_number(application_number=application_number) - if patent_number: + wrapper = self.get_application_by_number( + application_number=application_number + ) + elif patent_number: pdr = self.search_applications(patent_number_q=patent_number, limit=1) if pdr.patent_file_wrapper_data_bag: - return pdr.patent_file_wrapper_data_bag[0] - if publication_number: + wrapper = pdr.patent_file_wrapper_data_bag[0] + elif publication_number: pdr = self.search_applications( earliestPublicationNumber_q=publication_number, limit=1 ) if pdr.patent_file_wrapper_data_bag: - return pdr.patent_file_wrapper_data_bag[0] - if PCT_app_number: - return self.get_application_by_number(application_number=PCT_app_number) - if PCT_pub_number: + wrapper = pdr.patent_file_wrapper_data_bag[0] + elif PCT_app_number: + wrapper = self.get_application_by_number(application_number=PCT_app_number) + elif PCT_pub_number: pdr = self.search_applications( pctPublicationNumber_q=PCT_pub_number, limit=1 ) if pdr.patent_file_wrapper_data_bag: - return pdr.patent_file_wrapper_data_bag[0] - return None + wrapper = pdr.patent_file_wrapper_data_bag[0] + if wrapper is None: + return None + doc_bag = self.get_application_documents(wrapper.application_number_text) + return dataclasses.replace(wrapper, document_bag=doc_bag) + + def get_IFW( + self, + *, + application_number: str | None = None, + publication_number: str | None = None, + patent_number: str | None = None, + PCT_app_number: str | None = None, + PCT_pub_number: str | None = None, + destination: str | None = None, + overwrite: bool = False, + as_zip: bool = True, + ) -> IFWResult | None: + """Retrieve IFW metadata and download all prosecution documents. + + Combines `get_IFW_metadata` with a bulk download of all available prosecution + history documents (PDF preferred, DOCX fallback). Documents with no downloadable + format (e.g., NPL references) are silently skipped. A warning is issued only + if a document has a download URL but the download itself fails. + + Args: + application_number: USPTO application number (e.g., "16123456"). + publication_number: USPTO pre-grant publication number. + patent_number: USPTO patent number. + PCT_app_number: PCT application number. + PCT_pub_number: PCT publication number. + destination: Directory for output. Defaults to current directory. + overwrite: Whether to overwrite an existing output. Default False. + as_zip: If True (default), package all downloads into a ZIP archive + at ``{destination}/{app_no}_ifw.zip``. If False, download files + directly into ``{destination}/{app_no}_ifw/``. + + Returns: + IFWResult with the PatentFileWrapper, the output path, and a mapping + of document_identifier to filename for each downloaded document. + Returns None if no application was found. + + Raises: + FileExistsError: If the output path already exists and overwrite=False. + """ + wrapper = self.get_IFW_metadata( + application_number=application_number, + publication_number=publication_number, + patent_number=patent_number, + PCT_app_number=PCT_app_number, + PCT_pub_number=PCT_pub_number, + ) + if wrapper is None: + return None + + dest_dir = destination or "." + app_no = wrapper.application_number_text or "unknown" + downloaded_documents: dict[str, str] = {} + + if as_zip: + output_path = os.path.join(dest_dir, f"{app_no}_ifw.zip") + if os.path.exists(output_path) and not overwrite: + raise FileExistsError( + f"ZIP archive already exists: {output_path}. Use overwrite=True to replace." + ) + os.makedirs(dest_dir, exist_ok=True) + with tempfile.TemporaryDirectory() as tmp_dir: + with zipfile.ZipFile( + output_path, "w", compression=zipfile.ZIP_DEFLATED + ) as zf: + for doc in wrapper.document_bag or []: + if not doc.document_identifier: + continue + fmt_obj = next( + ( + f + for f in doc.document_formats + if f.mime_type_identifier in ("PDF", "MS_WORD") + and f.download_url + ), + None, + ) + if fmt_obj is None or not fmt_obj.download_url: + continue + try: + downloaded = self._download_and_extract( + url=fmt_obj.download_url, + destination=tmp_dir, + overwrite=True, + ) + arcname = os.path.basename(downloaded) + zf.write(downloaded, arcname=arcname) + downloaded_documents[doc.document_identifier] = arcname + except Exception as exc: + warnings.warn( + f"Failed to download document {doc.document_identifier} " + f"({doc.document_code}): {exc}", + stacklevel=2, + ) + else: + output_path = os.path.join(dest_dir, f"{app_no}_ifw") + if os.path.exists(output_path) and not overwrite: + raise FileExistsError( + f"Output directory already exists: {output_path}. Use overwrite=True to replace." + ) + os.makedirs(output_path, exist_ok=True) + for doc in wrapper.document_bag or []: + if not doc.document_identifier: + continue + fmt_obj = next( + ( + f + for f in doc.document_formats + if f.mime_type_identifier in ("PDF", "MS_WORD") + and f.download_url + ), + None, + ) + if fmt_obj is None or not fmt_obj.download_url: + continue + try: + downloaded = self._download_and_extract( + url=fmt_obj.download_url, + destination=output_path, + overwrite=overwrite, + ) + downloaded_documents[doc.document_identifier] = os.path.basename( + downloaded + ) + except Exception as exc: + warnings.warn( + f"Failed to download document {doc.document_identifier} " + f"({doc.document_code}): {exc}", + stacklevel=2, + ) + + return IFWResult( + wrapper=wrapper, + output_path=os.path.abspath(output_path), + downloaded_documents=downloaded_documents, + ) def download_archive( self, diff --git a/src/pyUSPTO/models/patent_data.py b/src/pyUSPTO/models/patent_data.py index 800ecdd..36fde89 100644 --- a/src/pyUSPTO/models/patent_data.py +++ b/src/pyUSPTO/models/patent_data.py @@ -2048,6 +2048,7 @@ class PatentFileWrapper: pgpub_document_meta_data: `PrintedMetaData` for Pre-Grant Publication. grant_document_meta_data: `PrintedMetaData` for the granted patent. last_ingestion_date_time: Timestamp of when this data was last ingested by the API (UTC). + document_bag: `DocumentBag` containing associated documents and their metadata. """ application_number_text: str @@ -2063,6 +2064,7 @@ class PatentFileWrapper: pgpub_document_meta_data: PrintedMetaData | None = None grant_document_meta_data: PrintedMetaData | None = None last_ingestion_date_time: datetime | None = None + document_bag: DocumentBag | None = None @classmethod def from_dict( @@ -2209,6 +2211,25 @@ def to_dict(self) -> dict[str, Any]: } +@dataclass(frozen=True) +class IFWResult: + """Result of a get_IFW call: metadata wrapper, output path, and document map. + + Attributes: + wrapper: The PatentFileWrapper containing all IFW metadata and document_bag. + output_path: Absolute path to the ZIP archive (when as_zip=True) or the + output directory (when as_zip=False). + downloaded_documents: Maps document_identifier to the filename of the + downloaded file — the arcname inside the ZIP (as_zip=True) or the + basename inside the output directory (as_zip=False). Documents that + were skipped (no PDF/DOCX URL) or failed to download are absent. + """ + + wrapper: PatentFileWrapper + output_path: str + downloaded_documents: dict[str, str] + + @dataclass(frozen=True) class PatentDataResponse: """Represents the overall response from a patent data API request. diff --git a/tests/clients/test_patent_data_clients.py b/tests/clients/test_patent_data_clients.py index a14a5a4..e16f652 100644 --- a/tests/clients/test_patent_data_clients.py +++ b/tests/clients/test_patent_data_clients.py @@ -7,11 +7,12 @@ import csv import io +import os from collections.abc import Iterator from datetime import date, datetime, timezone from typing import Any from unittest import mock -from unittest.mock import MagicMock, mock_open, patch +from unittest.mock import MagicMock, call, mock_open, patch import pytest import requests @@ -33,6 +34,7 @@ DocumentMimeType, EventData, ForeignPriority, + IFWResult, Inventor, ParentContinuity, PatentDataResponse, @@ -384,6 +386,10 @@ def test_search_applications_post( {"classification_q": "H04L"}, "applicationMetaData.cpcClassificationBag:H04L", ), + ( + {"classification_q": "H10D 64/667"}, + 'applicationMetaData.cpcClassificationBag:"H10D 64/667"', + ), ( {"earliestPublicationNumber_q": "*12345678*"}, "applicationMetaData.earliestPublicationNumber:*12345678*", @@ -1227,20 +1233,22 @@ def test_get_ifw_by_application_number( ) -> None: """Test get_IFW with application_number calls get_application_by_number.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] app_num = "12345678" result = client.get_IFW_metadata(application_number=app_num) - # Should call get_application_by_number - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint=f"api/v1/patent/applications/{app_num}", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_patent_number( self, @@ -1249,15 +1257,16 @@ def test_get_ifw_by_patent_number( ) -> None: """Test get_IFW with patent_number calls search_applications.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] patent_num = "10000000" result = client.get_IFW_metadata(patent_number=patent_num) - # Should call search_applications with patent_number_q - mock_make_request.assert_called_once_with( + # Should call search_applications with patent_number_q first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/search", params={ @@ -1267,7 +1276,8 @@ def test_get_ifw_by_patent_number( }, response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_publication_number( self, @@ -1276,15 +1286,16 @@ def test_get_ifw_by_publication_number( ) -> None: """Test get_IFW with publication_number calls search_applications.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pub_num = "US20240123456A1" result = client.get_IFW_metadata(publication_number=pub_num) - # Should call search_applications with earliestPublicationNumber_q - mock_make_request.assert_called_once_with( + # Should call search_applications with earliestPublicationNumber_q first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/search", params={ @@ -1294,7 +1305,8 @@ def test_get_ifw_by_publication_number( }, response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_pct_app_number( self, @@ -1308,9 +1320,10 @@ def test_get_ifw_by_pct_app_number( This is expected test behavior for validating the warning system. """ client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pct_app = "PCT/US2024/012345" @@ -1318,13 +1331,14 @@ def test_get_ifw_by_pct_app_number( with pytest.warns(USPTODataMismatchWarning): result = client.get_IFW_metadata(PCT_app_number=pct_app) - # Should call get_application_by_number - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/PCTUS24012345", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_short_pct_app_number( self, @@ -1341,9 +1355,10 @@ def test_get_ifw_by_short_pct_app_number( This is expected test behavior for validating the warning system. """ client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pct_app = "PCT/US24/012345" @@ -1351,13 +1366,14 @@ def test_get_ifw_by_short_pct_app_number( with pytest.warns(USPTODataMismatchWarning): result = client.get_IFW_metadata(PCT_app_number=pct_app) - # Should call get_application_by_number - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/PCTUS24012345", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_pct_app_number_malformed( self, @@ -1463,15 +1479,16 @@ def test_get_ifw_by_pct_pub_number( ) -> None: """Test get_IFW with PCT_pub_number calls search_applications.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pct_pub = "WO2024012345A1" result = client.get_IFW_metadata(PCT_pub_number=pct_pub) - # Should call search_applications with pctPublicationNumber_q - mock_make_request.assert_called_once_with( + # Should call search_applications with pctPublicationNumber_q first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/search", params={ @@ -1481,7 +1498,8 @@ def test_get_ifw_by_pct_pub_number( }, response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_no_parameters_returns_none( self, patent_data_client: PatentDataClient @@ -1509,9 +1527,10 @@ def test_get_ifw_prioritizes_first_parameter( ) -> None: """Test get_IFW uses application_number when multiple parameters provided.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] app_num = "12345678" # Provide multiple parameters - should use application_number @@ -1521,13 +1540,14 @@ def test_get_ifw_prioritizes_first_parameter( publication_number="US20240123456A1", ) - # Should only call get_application_by_number, not search - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first, not search + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint=f"api/v1/patent/applications/{app_num}", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) class TestDownloadArchive: @@ -1971,6 +1991,10 @@ def test_get_search_results_get_with_combined_q_convenience_params( {"classification_q": "H04L"}, "applicationMetaData.cpcClassificationBag:H04L", ), + ( + {"classification_q": "H10D 64/667"}, + 'applicationMetaData.cpcClassificationBag:"H10D 64/667"', + ), ( {"filing_date_from_q": "2021-01-01"}, "applicationMetaData.filingDate:>=2021-01-01", @@ -2528,6 +2552,9 @@ def test_sanitize_invalid_series_code_format_raises( with pytest.raises(ValueError, match="Expected format: NNNNNNNN or NN/NNNNNN"): patent_data_client.sanitize_application_number("08/123/456") + # Already-sanitized PCT number passes through unchanged + assert patent_data_client.sanitize_application_number("PCTUS0812705") == "PCTUS0812705" + class TestRawDataFeature: """Tests for the include_raw_data feature.""" @@ -3069,3 +3096,349 @@ def test_to_csv_with_multiple_wrappers( assert data_rows[1][1] == "APP002" assert data_rows[1][2] == serialize_date(wrapper2_meta.filing_date) assert data_rows[1][7] == wrapper2_meta.first_inventor_name + + +class TestGetIFWDownload: + """Tests for the get_IFW method (metadata + bulk document download).""" + + @pytest.fixture + def pdf_doc(self) -> Document: + """A document that has a PDF download URL.""" + return Document( + document_identifier="DOC001", + document_code="CTNF", + document_formats=[ + DocumentFormat( + mime_type_identifier="PDF", + download_url="https://example.com/doc001.pdf", + ), + ], + ) + + @pytest.fixture + def xml_only_doc(self) -> Document: + """A document with only XML (no PDF/DOCX) — should be skipped silently.""" + return Document( + document_identifier="DOC002", + document_code="SPEC", + document_formats=[ + DocumentFormat( + mime_type_identifier="XML", + download_url="https://example.com/doc002.xml", + ), + ], + ) + + @pytest.fixture + def no_url_doc(self) -> Document: + """A document with a PDF format entry but no download URL (e.g. NPL ref).""" + return Document( + document_identifier="DOC003", + document_code="NPL", + document_formats=[ + DocumentFormat(mime_type_identifier="PDF", download_url=None), + ], + ) + + @pytest.fixture + def docx_doc(self) -> Document: + """A document that has only a DOCX (MS_WORD) download URL.""" + return Document( + document_identifier="DOC004", + document_code="AMND", + document_formats=[ + DocumentFormat( + mime_type_identifier="MS_WORD", + download_url="https://example.com/doc004.docx", + ), + ], + ) + + def _make_wrapper(self, *docs: Document) -> PatentFileWrapper: + return PatentFileWrapper( + application_number_text="12345678", + document_bag=DocumentBag(documents=list(docs)), + ) + + def test_returns_none_when_not_found( + self, patent_data_client: PatentDataClient + ) -> None: + """get_IFW returns None when no application is found.""" + with patch.object(patent_data_client, "get_IFW_metadata", return_value=None): + result = patent_data_client.get_IFW(application_number="00000000") + assert result is None + + def test_returns_ifw_result_with_zip( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """get_IFW returns IFWResult with a valid ZIP and populated downloaded_documents.""" + wrapper = self._make_wrapper(pdf_doc) + fake_pdf = tmp_path / "staging" / "doc001.pdf" + fake_pdf.parent.mkdir() + fake_pdf.write_bytes(b"%PDF fake content") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_pdf), + ), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path / "out"), + ) + + assert isinstance(result, IFWResult) + assert result.wrapper is wrapper + assert result.output_path.endswith("12345678_ifw.zip") + assert result.downloaded_documents == {"DOC001": "doc001.pdf"} + import zipfile as zf + with zf.ZipFile(result.output_path) as z: + assert "doc001.pdf" in z.namelist() + + def test_returns_ifw_result_as_directory( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """as_zip=False downloads into a subdirectory and populates downloaded_documents.""" + wrapper = self._make_wrapper(pdf_doc) + out_dir = tmp_path / "out" / "12345678_ifw" + out_dir.mkdir(parents=True) + fake_pdf = out_dir / "doc001.pdf" + fake_pdf.write_bytes(b"%PDF fake content") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_pdf), + ), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path / "out"), + as_zip=False, + overwrite=True, + ) + + assert isinstance(result, IFWResult) + assert result.output_path.endswith("12345678_ifw") + assert os.path.isdir(result.output_path) + assert result.downloaded_documents == {"DOC001": "doc001.pdf"} + + def test_skips_xml_only_docs_silently( + self, patent_data_client: PatentDataClient, xml_only_doc: Document, tmp_path + ) -> None: + """Documents with only XML format are silently skipped — _download_and_extract not called.""" + wrapper = self._make_wrapper(xml_only_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + ) + mock_dl.assert_not_called() + assert isinstance(result, IFWResult) + assert result.downloaded_documents == {} + + def test_skips_no_url_docs_silently( + self, patent_data_client: PatentDataClient, no_url_doc: Document, tmp_path + ) -> None: + """Documents with no download URL are silently skipped.""" + wrapper = self._make_wrapper(no_url_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + ) + mock_dl.assert_not_called() + assert isinstance(result, IFWResult) + assert result.downloaded_documents == {} + + def test_warns_on_download_failure( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """A warning is issued when a doc has a URL but the download raises.""" + wrapper = self._make_wrapper(pdf_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + side_effect=OSError("network error"), + ), + pytest.warns(match="DOC001"), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + ) + assert isinstance(result, IFWResult) + assert result.downloaded_documents == {} + + def test_raises_file_exists_error_zip( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """FileExistsError raised if ZIP already exists and overwrite=False.""" + wrapper = self._make_wrapper(pdf_doc) + (tmp_path / "12345678_ifw.zip").write_bytes(b"") + + with patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper): + with pytest.raises(FileExistsError): + patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + overwrite=False, + ) + + def test_raises_file_exists_error_directory( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """FileExistsError raised if output directory already exists and overwrite=False.""" + wrapper = self._make_wrapper(pdf_doc) + (tmp_path / "12345678_ifw").mkdir() + + with patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper): + with pytest.raises(FileExistsError): + patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + overwrite=False, + as_zip=False, + ) + + def test_overwrite_replaces_existing_zip( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """overwrite=True replaces an existing ZIP without error.""" + wrapper = self._make_wrapper(pdf_doc) + (tmp_path / "12345678_ifw.zip").write_bytes(b"old content") + + fake_pdf = tmp_path / "staging" / "doc001.pdf" + fake_pdf.parent.mkdir() + fake_pdf.write_bytes(b"%PDF new") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_pdf), + ), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + overwrite=True, + ) + assert isinstance(result, IFWResult) + + def test_docx_downloaded_when_no_pdf( + self, patent_data_client: PatentDataClient, docx_doc: Document, tmp_path + ) -> None: + """DOCX format is used as fallback when PDF is not available.""" + wrapper = self._make_wrapper(docx_doc) + fake_docx = tmp_path / "staging" / "doc004.docx" + fake_docx.parent.mkdir() + fake_docx.write_bytes(b"fake docx") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_docx), + ) as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path / "out"), + ) + + mock_dl.assert_called_once_with( + url="https://example.com/doc004.docx", + destination=mock.ANY, + overwrite=True, + ) + assert isinstance(result, IFWResult) + assert result.downloaded_documents == {"DOC004": "doc004.docx"} + + def test_directory_skips_xml_only_docs( + self, patent_data_client: PatentDataClient, xml_only_doc: Document, tmp_path + ) -> None: + """as_zip=False: docs with only XML are silently skipped.""" + wrapper = self._make_wrapper(xml_only_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + as_zip=False, + ) + mock_dl.assert_not_called() + assert result.downloaded_documents == {} + + def test_directory_warns_on_download_failure( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """as_zip=False: warning issued when download raises despite having a URL.""" + wrapper = self._make_wrapper(pdf_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + side_effect=OSError("network error"), + ), + pytest.warns(match="DOC001"), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + as_zip=False, + ) + assert result.downloaded_documents == {} + + def test_skips_docs_with_no_identifier( + self, patent_data_client: PatentDataClient, tmp_path + ) -> None: + """Documents with no document_identifier are silently skipped in both modes.""" + no_id_doc = Document( + document_identifier=None, + document_code="CTNF", + document_formats=[ + DocumentFormat( + mime_type_identifier="PDF", + download_url="https://example.com/doc.pdf", + ), + ], + ) + wrapper = self._make_wrapper(no_id_doc) + + for as_zip in (True, False): + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + as_zip=as_zip, + overwrite=True, + ) + mock_dl.assert_not_called() + assert result.downloaded_documents == {} diff --git a/tox.ini b/tox.ini index cc55d4a..ac508a9 100644 --- a/tox.ini +++ b/tox.ini @@ -13,9 +13,7 @@ basepython = py314: {env:LOCALAPPDATA}\Python\pythoncore-3.14-64\python.exe # py315: {env:LOCALAPPDATA}\Python\pythoncore-3.15-64\python.exe deps = - pytest>=9.0.2 - pytest-cov>=7.0.0 - pytest-mock>=3.15.1 - typing_extensions>=4.15.0 + -r requirements.txt +extras = test commands = pytest tests/ --cov=src/pyUSPTO