From f59622f9e2133ed050009b055a86c584fa870c75 Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:00:23 -0600 Subject: [PATCH 1/9] fix: auto-quote classification_q values containing spaces or slashes; closes #101 --- src/pyUSPTO/clients/patent_data.py | 14 ++++++++++---- tests/clients/test_patent_data_clients.py | 8 ++++++++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/pyUSPTO/clients/patent_data.py b/src/pyUSPTO/clients/patent_data.py index ab2ce96..3ca9efd 100644 --- a/src/pyUSPTO/clients/patent_data.py +++ b/src/pyUSPTO/clients/patent_data.py @@ -298,9 +298,12 @@ def search_applications( ) q_parts.append(f"assignmentBag.assigneeBag.assigneeNameText:{v}") if classification_q: - q_parts.append( - f"applicationMetaData.cpcClassificationBag:{classification_q}" + v = ( + f'"{classification_q}"' + if any(c in classification_q for c in [" ", "/"]) + else classification_q ) + q_parts.append(f"applicationMetaData.cpcClassificationBag:{v}") if earliestPublicationNumber_q: q_parts.append( f"applicationMetaData.earliestPublicationNumber:{earliestPublicationNumber_q}" @@ -439,9 +442,12 @@ def get_search_results( ) q_parts.append(f"assignmentBag.assigneeBag.assigneeNameText:{v}") if classification_q: - q_parts.append( - f"applicationMetaData.cpcClassificationBag:{classification_q}" + v = ( + f'"{classification_q}"' + if any(c in classification_q for c in [" ", "/"]) + else classification_q ) + q_parts.append(f"applicationMetaData.cpcClassificationBag:{v}") if filing_date_from_q and filing_date_to_q: q_parts.append( diff --git a/tests/clients/test_patent_data_clients.py b/tests/clients/test_patent_data_clients.py index a14a5a4..9cd378e 100644 --- a/tests/clients/test_patent_data_clients.py +++ b/tests/clients/test_patent_data_clients.py @@ -384,6 +384,10 @@ def test_search_applications_post( {"classification_q": "H04L"}, "applicationMetaData.cpcClassificationBag:H04L", ), + ( + {"classification_q": "H10D 64/667"}, + 'applicationMetaData.cpcClassificationBag:"H10D 64/667"', + ), ( {"earliestPublicationNumber_q": "*12345678*"}, "applicationMetaData.earliestPublicationNumber:*12345678*", @@ -1971,6 +1975,10 @@ def test_get_search_results_get_with_combined_q_convenience_params( {"classification_q": "H04L"}, "applicationMetaData.cpcClassificationBag:H04L", ), + ( + {"classification_q": "H10D 64/667"}, + 'applicationMetaData.cpcClassificationBag:"H10D 64/667"', + ), ( {"filing_date_from_q": "2021-01-01"}, "applicationMetaData.filingDate:>=2021-01-01", From a0d1a0cf1d18e757f900961979157ccf62cc0dfe Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:31:17 -0600 Subject: [PATCH 2/9] feat: populate document_bag in get_IFW_metadata (closes #99) --- examples/patent_data_example.py | 17 ++++ src/pyUSPTO/clients/patent_data.py | 30 ++++--- src/pyUSPTO/models/patent_data.py | 1 + tests/clients/test_patent_data_clients.py | 103 +++++++++++++--------- 4 files changed, 98 insertions(+), 53 deletions(-) diff --git a/examples/patent_data_example.py b/examples/patent_data_example.py index 48e0998..43ffa5f 100644 --- a/examples/patent_data_example.py +++ b/examples/patent_data_example.py @@ -302,6 +302,23 @@ print(f"Error with POST search: {e}") +# Search by CPC classification code +# CPC codes containing spaces or slashes are automatically quoted for the Lucene query. +try: + print("\nSearching by CPC classification code 'H10D 64/667'...") + cpc_response = client.search_applications( + classification_q="H10D 64/667", limit=3 + ) + print(f"Found {cpc_response.count} applications with CPC code H10D 64/667.") + for patent_wrapper in cpc_response.patent_file_wrapper_data_bag: + if patent_wrapper.application_meta_data: + print( + f" - App No: {patent_wrapper.application_number_text}, Title: {patent_wrapper.application_meta_data.invention_title}" + ) +except Exception as e: + print(f"Error searching by CPC classification: {e}") + + # Example of getting status codes try: print("\nGetting first 5 status codes...") diff --git a/src/pyUSPTO/clients/patent_data.py b/src/pyUSPTO/clients/patent_data.py index 3ca9efd..7cc27ae 100644 --- a/src/pyUSPTO/clients/patent_data.py +++ b/src/pyUSPTO/clients/patent_data.py @@ -4,6 +4,7 @@ It allows you to search for and retrieve patent application data. """ +import dataclasses import warnings from collections.abc import Iterator from typing import Any @@ -121,6 +122,9 @@ def sanitize_application_number(self, input_number: str) -> str: # Example: "PCT/US2024/012345" -> "PCTUS2412345" if raw.startswith("PCT"): parts = raw.split("/") + if len(parts) == 1: + # Already sanitized (e.g. "PCTUS0812705"), return as-is + return raw if len(parts) != 3: raise ValueError( f"Invalid PCT application format: {input_number}. " @@ -1064,27 +1068,33 @@ def get_IFW_metadata( comprehensive data if found using one of the identifiers, otherwise None. """ + wrapper = None if application_number: - return self.get_application_by_number(application_number=application_number) - if patent_number: + wrapper = self.get_application_by_number( + application_number=application_number + ) + elif patent_number: pdr = self.search_applications(patent_number_q=patent_number, limit=1) if pdr.patent_file_wrapper_data_bag: - return pdr.patent_file_wrapper_data_bag[0] - if publication_number: + wrapper = pdr.patent_file_wrapper_data_bag[0] + elif publication_number: pdr = self.search_applications( earliestPublicationNumber_q=publication_number, limit=1 ) if pdr.patent_file_wrapper_data_bag: - return pdr.patent_file_wrapper_data_bag[0] - if PCT_app_number: - return self.get_application_by_number(application_number=PCT_app_number) - if PCT_pub_number: + wrapper = pdr.patent_file_wrapper_data_bag[0] + elif PCT_app_number: + wrapper = self.get_application_by_number(application_number=PCT_app_number) + elif PCT_pub_number: pdr = self.search_applications( pctPublicationNumber_q=PCT_pub_number, limit=1 ) if pdr.patent_file_wrapper_data_bag: - return pdr.patent_file_wrapper_data_bag[0] - return None + wrapper = pdr.patent_file_wrapper_data_bag[0] + if wrapper is None: + return None + doc_bag = self.get_application_documents(wrapper.application_number_text) + return dataclasses.replace(wrapper, document_bag=doc_bag) def download_archive( self, diff --git a/src/pyUSPTO/models/patent_data.py b/src/pyUSPTO/models/patent_data.py index 800ecdd..7f174ce 100644 --- a/src/pyUSPTO/models/patent_data.py +++ b/src/pyUSPTO/models/patent_data.py @@ -2063,6 +2063,7 @@ class PatentFileWrapper: pgpub_document_meta_data: PrintedMetaData | None = None grant_document_meta_data: PrintedMetaData | None = None last_ingestion_date_time: datetime | None = None + document_bag: DocumentBag | None = None @classmethod def from_dict( diff --git a/tests/clients/test_patent_data_clients.py b/tests/clients/test_patent_data_clients.py index 9cd378e..4424be1 100644 --- a/tests/clients/test_patent_data_clients.py +++ b/tests/clients/test_patent_data_clients.py @@ -11,7 +11,7 @@ from datetime import date, datetime, timezone from typing import Any from unittest import mock -from unittest.mock import MagicMock, mock_open, patch +from unittest.mock import MagicMock, call, mock_open, patch import pytest import requests @@ -1231,20 +1231,22 @@ def test_get_ifw_by_application_number( ) -> None: """Test get_IFW with application_number calls get_application_by_number.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] app_num = "12345678" result = client.get_IFW_metadata(application_number=app_num) - # Should call get_application_by_number - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint=f"api/v1/patent/applications/{app_num}", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_patent_number( self, @@ -1253,15 +1255,16 @@ def test_get_ifw_by_patent_number( ) -> None: """Test get_IFW with patent_number calls search_applications.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] patent_num = "10000000" result = client.get_IFW_metadata(patent_number=patent_num) - # Should call search_applications with patent_number_q - mock_make_request.assert_called_once_with( + # Should call search_applications with patent_number_q first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/search", params={ @@ -1271,7 +1274,8 @@ def test_get_ifw_by_patent_number( }, response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_publication_number( self, @@ -1280,15 +1284,16 @@ def test_get_ifw_by_publication_number( ) -> None: """Test get_IFW with publication_number calls search_applications.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pub_num = "US20240123456A1" result = client.get_IFW_metadata(publication_number=pub_num) - # Should call search_applications with earliestPublicationNumber_q - mock_make_request.assert_called_once_with( + # Should call search_applications with earliestPublicationNumber_q first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/search", params={ @@ -1298,7 +1303,8 @@ def test_get_ifw_by_publication_number( }, response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_pct_app_number( self, @@ -1312,9 +1318,10 @@ def test_get_ifw_by_pct_app_number( This is expected test behavior for validating the warning system. """ client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pct_app = "PCT/US2024/012345" @@ -1322,13 +1329,14 @@ def test_get_ifw_by_pct_app_number( with pytest.warns(USPTODataMismatchWarning): result = client.get_IFW_metadata(PCT_app_number=pct_app) - # Should call get_application_by_number - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/PCTUS24012345", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_short_pct_app_number( self, @@ -1345,9 +1353,10 @@ def test_get_ifw_by_short_pct_app_number( This is expected test behavior for validating the warning system. """ client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pct_app = "PCT/US24/012345" @@ -1355,13 +1364,14 @@ def test_get_ifw_by_short_pct_app_number( with pytest.warns(USPTODataMismatchWarning): result = client.get_IFW_metadata(PCT_app_number=pct_app) - # Should call get_application_by_number - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/PCTUS24012345", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_by_pct_app_number_malformed( self, @@ -1467,15 +1477,16 @@ def test_get_ifw_by_pct_pub_number( ) -> None: """Test get_IFW with PCT_pub_number calls search_applications.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] pct_pub = "WO2024012345A1" result = client.get_IFW_metadata(PCT_pub_number=pct_pub) - # Should call search_applications with pctPublicationNumber_q - mock_make_request.assert_called_once_with( + # Should call search_applications with pctPublicationNumber_q first + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint="api/v1/patent/applications/search", params={ @@ -1485,7 +1496,8 @@ def test_get_ifw_by_pct_pub_number( }, response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) def test_get_ifw_no_parameters_returns_none( self, patent_data_client: PatentDataClient @@ -1513,9 +1525,10 @@ def test_get_ifw_prioritizes_first_parameter( ) -> None: """Test get_IFW uses application_number when multiple parameters provided.""" client, mock_make_request = client_with_mocked_request - mock_make_request.return_value = PatentDataResponse( - count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper] - ) + mock_make_request.side_effect = [ + PatentDataResponse(count=1, patent_file_wrapper_data_bag=[mock_patent_file_wrapper]), + {"documentBag": []}, + ] app_num = "12345678" # Provide multiple parameters - should use application_number @@ -1525,13 +1538,14 @@ def test_get_ifw_prioritizes_first_parameter( publication_number="US20240123456A1", ) - # Should only call get_application_by_number, not search - mock_make_request.assert_called_once_with( + # Should call get_application_by_number first, not search + assert mock_make_request.call_args_list[0] == call( method="GET", endpoint=f"api/v1/patent/applications/{app_num}", response_class=PatentDataResponse, ) - assert result is mock_patent_file_wrapper + assert result.application_number_text == mock_patent_file_wrapper.application_number_text + assert isinstance(result.document_bag, DocumentBag) class TestDownloadArchive: @@ -2536,6 +2550,9 @@ def test_sanitize_invalid_series_code_format_raises( with pytest.raises(ValueError, match="Expected format: NNNNNNNN or NN/NNNNNN"): patent_data_client.sanitize_application_number("08/123/456") + # Already-sanitized PCT number passes through unchanged + assert patent_data_client.sanitize_application_number("PCTUS0812705") == "PCTUS0812705" + class TestRawDataFeature: """Tests for the include_raw_data feature.""" From 3c2e002c07dd1956ab39aafaffaf8e5ea6dcd8f5 Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:43:02 -0600 Subject: [PATCH 3/9] chore(deps): update dependencies --- requirements-dev.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index b36ed0d..6bb3543 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -22,14 +22,14 @@ colorama==0.4.6 # sphinx coverage==7.13.4 # via pytest-cov -docutils==0.21.2 +docutils==0.22.4 # via # myst-parser # sphinx # sphinx-rtd-theme idna==3.11 # via requests -imagesize==1.4.1 +imagesize==2.0.0 # via sphinx iniconfig==2.3.0 # via pytest @@ -39,7 +39,7 @@ jinja2==3.1.6 # sphinx librt==0.8.1 # via mypy -markdown-it-py==3.0.0 +markdown-it-py==4.0.0 # via # mdit-py-plugins # myst-parser @@ -55,7 +55,7 @@ mypy==1.19.1 # via pyUSPTO (pyproject.toml) mypy-extensions==1.1.0 # via mypy -myst-parser==4.0.1 +myst-parser==5.0.0 # via pyUSPTO (pyproject.toml) packaging==26.0 # via @@ -71,7 +71,7 @@ pydantic==2.12.5 # via # pydantic-extra-types # sphinx-immaterial -pydantic-core==2.41.5 +pydantic-core==2.42.0 # via pydantic pydantic-extra-types==2.11.0 # via sphinx-immaterial @@ -99,7 +99,7 @@ ruff==0.15.4 # via pyUSPTO (pyproject.toml) snowballstemmer==3.0.1 # via sphinx -sphinx==8.1.3 +sphinx==9.1.0 # via # myst-parser # pyUSPTO (pyproject.toml) @@ -108,7 +108,7 @@ sphinx==8.1.3 # sphinx-immaterial # sphinx-rtd-theme # sphinxcontrib-jquery -sphinx-autodoc-typehints==3.0.1 +sphinx-autodoc-typehints==3.9.5 # via pyUSPTO (pyproject.toml) sphinx-copybutton==0.5.2 # via pyUSPTO (pyproject.toml) From 22693f167818970c46b24464480880ffda0b6e65 Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Tue, 3 Mar 2026 11:04:03 -0600 Subject: [PATCH 4/9] [DOCS] Example how to search CPC codes (closes #102) --- examples/patent_data_example.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/patent_data_example.py b/examples/patent_data_example.py index 43ffa5f..5809925 100644 --- a/examples/patent_data_example.py +++ b/examples/patent_data_example.py @@ -304,6 +304,8 @@ # Search by CPC classification code # CPC codes containing spaces or slashes are automatically quoted for the Lucene query. +# cpc_classification_bag on ApplicationMetaData is a list[str] of all CPC codes assigned +# to the application, so each result may have multiple codes. try: print("\nSearching by CPC classification code 'H10D 64/667'...") cpc_response = client.search_applications( @@ -311,10 +313,13 @@ ) print(f"Found {cpc_response.count} applications with CPC code H10D 64/667.") for patent_wrapper in cpc_response.patent_file_wrapper_data_bag: - if patent_wrapper.application_meta_data: + app_meta = patent_wrapper.application_meta_data + if app_meta: print( - f" - App No: {patent_wrapper.application_number_text}, Title: {patent_wrapper.application_meta_data.invention_title}" + f" - App No: {patent_wrapper.application_number_text}, Title: {app_meta.invention_title}" ) + if app_meta.cpc_classification_bag: + print(f" CPC codes: {', '.join(app_meta.cpc_classification_bag)}") except Exception as e: print(f"Error searching by CPC classification: {e}") From 2000f48e5801fb144f43dff9a2ba19e7fe64cba1 Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Wed, 4 Mar 2026 10:07:14 -0600 Subject: [PATCH 5/9] feat: add get_IFW method with IFWResult and bulk download --- examples/ifw_example.py | 8 + src/pyUSPTO/clients/patent_data.py | 92 +++++++++ src/pyUSPTO/models/patent_data.py | 13 ++ tests/clients/test_patent_data_clients.py | 228 ++++++++++++++++++++++ 4 files changed, 341 insertions(+) diff --git a/examples/ifw_example.py b/examples/ifw_example.py index 9e185e8..00ac391 100644 --- a/examples/ifw_example.py +++ b/examples/ifw_example.py @@ -61,6 +61,14 @@ print(f" - IFW Found based on PCT Pub No: {PCT_pub_number}") +print("\nGet IFW + download all prosecution docs as a ZIP archive -->") +ifw_result = client.get_IFW(application_number=application_number, destination="./download-example", overwrite=True) +if ifw_result: + print(f"Title: {ifw_result.wrapper.application_meta_data.invention_title if ifw_result.wrapper.application_meta_data else 'N/A'}") + print(f"Archive: {ifw_result.archive_path}") + print(f"Documents in bag: {len(ifw_result.wrapper.document_bag)}") + + print("\nNow let's download the Patent Publication Text -->") if app_no_ifw and app_no_ifw.pgpub_document_meta_data: pgpub_archive = app_no_ifw.pgpub_document_meta_data diff --git a/src/pyUSPTO/clients/patent_data.py b/src/pyUSPTO/clients/patent_data.py index 7cc27ae..7b98dd7 100644 --- a/src/pyUSPTO/clients/patent_data.py +++ b/src/pyUSPTO/clients/patent_data.py @@ -5,7 +5,10 @@ """ import dataclasses +import os +import tempfile import warnings +import zipfile from collections.abc import Iterator from typing import Any @@ -21,6 +24,7 @@ DocumentMimeType, EventData, ForeignPriority, + IFWResult, PatentDataResponse, PatentFileWrapper, PatentTermAdjustmentData, @@ -1096,6 +1100,94 @@ def get_IFW_metadata( doc_bag = self.get_application_documents(wrapper.application_number_text) return dataclasses.replace(wrapper, document_bag=doc_bag) + def get_IFW( + self, + *, + application_number: str | None = None, + publication_number: str | None = None, + patent_number: str | None = None, + PCT_app_number: str | None = None, + PCT_pub_number: str | None = None, + destination: str | None = None, + overwrite: bool = False, + ) -> IFWResult | None: + """Retrieve IFW metadata and download all prosecution documents as a ZIP archive. + + Combines `get_IFW_metadata` with a bulk download of all available prosecution + history documents (PDF preferred, DOCX fallback). Documents with no downloadable + format (e.g., NPL references) are silently skipped. A warning is issued only + if a document has a download URL but the download itself fails. + + Args: + application_number: USPTO application number (e.g., "16123456"). + publication_number: USPTO pre-grant publication number. + patent_number: USPTO patent number. + PCT_app_number: PCT application number. + PCT_pub_number: PCT publication number. + destination: Directory to save the ZIP archive. Defaults to current directory. + overwrite: Whether to overwrite an existing ZIP. Default False. + + Returns: + IFWResult with the PatentFileWrapper and the path to the ZIP archive, + or None if no application was found. + + Raises: + FileExistsError: If the ZIP archive already exists and overwrite=False. + """ + wrapper = self.get_IFW_metadata( + application_number=application_number, + publication_number=publication_number, + patent_number=patent_number, + PCT_app_number=PCT_app_number, + PCT_pub_number=PCT_pub_number, + ) + if wrapper is None: + return None + + dest_dir = destination or "." + app_no = wrapper.application_number_text or "unknown" + zip_name = f"{app_no}_ifw.zip" + zip_path = os.path.join(dest_dir, zip_name) + + if os.path.exists(zip_path) and not overwrite: + raise FileExistsError( + f"ZIP archive already exists: {zip_path}. Use overwrite=True to replace." + ) + + os.makedirs(dest_dir, exist_ok=True) + + with tempfile.TemporaryDirectory() as tmp_dir: + with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: + for doc in wrapper.document_bag: + # Prefer PDF, fall back to MS_WORD; skip XML and formatless docs. + fmt_obj = next( + ( + f + for f in doc.document_formats + if f.mime_type_identifier in ("PDF", "MS_WORD") + and f.download_url + ), + None, + ) + if fmt_obj is None: + continue + + try: + downloaded = self._download_and_extract( + url=fmt_obj.download_url, + destination=tmp_dir, + overwrite=True, + ) + zf.write(downloaded, arcname=os.path.basename(downloaded)) + except Exception as exc: + warnings.warn( + f"Failed to download document {doc.document_identifier} " + f"({doc.document_code}): {exc}", + stacklevel=2, + ) + + return IFWResult(wrapper=wrapper, archive_path=os.path.abspath(zip_path)) + def download_archive( self, printed_metadata: PrintedMetaData, diff --git a/src/pyUSPTO/models/patent_data.py b/src/pyUSPTO/models/patent_data.py index 7f174ce..5bbad3d 100644 --- a/src/pyUSPTO/models/patent_data.py +++ b/src/pyUSPTO/models/patent_data.py @@ -2210,6 +2210,19 @@ def to_dict(self) -> dict[str, Any]: } +@dataclass(frozen=True) +class IFWResult: + """Result of a get_IFW call: metadata wrapper and path to the downloaded archive. + + Attributes: + wrapper: The PatentFileWrapper containing all IFW metadata and document_bag. + archive_path: Absolute path to the ZIP archive of downloaded prosecution documents. + """ + + wrapper: PatentFileWrapper + archive_path: str + + @dataclass(frozen=True) class PatentDataResponse: """Represents the overall response from a patent data API request. diff --git a/tests/clients/test_patent_data_clients.py b/tests/clients/test_patent_data_clients.py index 4424be1..c8f410a 100644 --- a/tests/clients/test_patent_data_clients.py +++ b/tests/clients/test_patent_data_clients.py @@ -33,6 +33,7 @@ DocumentMimeType, EventData, ForeignPriority, + IFWResult, Inventor, ParentContinuity, PatentDataResponse, @@ -3094,3 +3095,230 @@ def test_to_csv_with_multiple_wrappers( assert data_rows[1][1] == "APP002" assert data_rows[1][2] == serialize_date(wrapper2_meta.filing_date) assert data_rows[1][7] == wrapper2_meta.first_inventor_name + + +class TestGetIFWDownload: + """Tests for the get_IFW method (metadata + bulk document download).""" + + @pytest.fixture + def pdf_doc(self) -> Document: + """A document that has a PDF download URL.""" + return Document( + document_identifier="DOC001", + document_code="CTNF", + document_formats=[ + DocumentFormat( + mime_type_identifier="PDF", + download_url="https://example.com/doc001.pdf", + ), + ], + ) + + @pytest.fixture + def xml_only_doc(self) -> Document: + """A document with only XML (no PDF/DOCX) — should be skipped silently.""" + return Document( + document_identifier="DOC002", + document_code="SPEC", + document_formats=[ + DocumentFormat( + mime_type_identifier="XML", + download_url="https://example.com/doc002.xml", + ), + ], + ) + + @pytest.fixture + def no_url_doc(self) -> Document: + """A document with a PDF format entry but no download URL (e.g. NPL ref).""" + return Document( + document_identifier="DOC003", + document_code="NPL", + document_formats=[ + DocumentFormat(mime_type_identifier="PDF", download_url=None), + ], + ) + + @pytest.fixture + def docx_doc(self) -> Document: + """A document that has only a DOCX (MS_WORD) download URL.""" + return Document( + document_identifier="DOC004", + document_code="AMND", + document_formats=[ + DocumentFormat( + mime_type_identifier="MS_WORD", + download_url="https://example.com/doc004.docx", + ), + ], + ) + + def _make_wrapper(self, *docs: Document) -> PatentFileWrapper: + return PatentFileWrapper( + application_number_text="12345678", + document_bag=DocumentBag(documents=list(docs)), + ) + + def test_returns_none_when_not_found( + self, patent_data_client: PatentDataClient + ) -> None: + """get_IFW returns None when no application is found.""" + with patch.object(patent_data_client, "get_IFW_metadata", return_value=None): + result = patent_data_client.get_IFW(application_number="00000000") + assert result is None + + def test_returns_ifw_result_with_zip( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """get_IFW returns IFWResult with a valid ZIP containing the downloaded doc.""" + wrapper = self._make_wrapper(pdf_doc) + fake_pdf = tmp_path / "staging" / "doc001.pdf" + fake_pdf.parent.mkdir() + fake_pdf.write_bytes(b"%PDF fake content") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_pdf), + ), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path / "out"), + ) + + assert isinstance(result, IFWResult) + assert result.wrapper is wrapper + assert result.archive_path.endswith("12345678_ifw.zip") + import zipfile as zf + with zf.ZipFile(result.archive_path) as z: + assert "doc001.pdf" in z.namelist() + + def test_skips_xml_only_docs_silently( + self, patent_data_client: PatentDataClient, xml_only_doc: Document, tmp_path + ) -> None: + """Documents with only XML format are silently skipped — _download_and_extract not called.""" + wrapper = self._make_wrapper(xml_only_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + ) + mock_dl.assert_not_called() + assert isinstance(result, IFWResult) + + def test_skips_no_url_docs_silently( + self, patent_data_client: PatentDataClient, no_url_doc: Document, tmp_path + ) -> None: + """Documents with no download URL are silently skipped.""" + wrapper = self._make_wrapper(no_url_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + ) + mock_dl.assert_not_called() + assert isinstance(result, IFWResult) + + def test_warns_on_download_failure( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """A warning is issued when a doc has a URL but the download raises.""" + wrapper = self._make_wrapper(pdf_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + side_effect=OSError("network error"), + ), + pytest.warns(match="DOC001"), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + ) + assert isinstance(result, IFWResult) + + def test_raises_file_exists_error( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """FileExistsError raised if ZIP already exists and overwrite=False.""" + wrapper = self._make_wrapper(pdf_doc) + existing_zip = tmp_path / "12345678_ifw.zip" + existing_zip.write_bytes(b"") + + with patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper): + with pytest.raises(FileExistsError): + patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + overwrite=False, + ) + + def test_overwrite_replaces_existing_zip( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """overwrite=True replaces an existing ZIP without error.""" + wrapper = self._make_wrapper(pdf_doc) + existing_zip = tmp_path / "12345678_ifw.zip" + existing_zip.write_bytes(b"old content") + + fake_pdf = tmp_path / "staging" / "doc001.pdf" + fake_pdf.parent.mkdir() + fake_pdf.write_bytes(b"%PDF new") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_pdf), + ), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + overwrite=True, + ) + assert isinstance(result, IFWResult) + + def test_docx_downloaded_when_no_pdf( + self, patent_data_client: PatentDataClient, docx_doc: Document, tmp_path + ) -> None: + """DOCX format is used as fallback when PDF is not available.""" + wrapper = self._make_wrapper(docx_doc) + fake_docx = tmp_path / "staging" / "doc004.docx" + fake_docx.parent.mkdir() + fake_docx.write_bytes(b"fake docx") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_docx), + ) as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path / "out"), + ) + + mock_dl.assert_called_once_with( + url="https://example.com/doc004.docx", + destination=mock.ANY, + overwrite=True, + ) + assert isinstance(result, IFWResult) From be604d53c210c5c291570f1ad19bfc10b316037b Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Wed, 4 Mar 2026 10:33:13 -0600 Subject: [PATCH 6/9] feat: add IFWResult with document map and optional ZIP output --- examples/ifw_example.py | 38 +++++- src/pyUSPTO/clients/patent_data.py | 136 +++++++++++++++------- src/pyUSPTO/models/patent_data.py | 13 ++- tests/clients/test_patent_data_clients.py | 136 ++++++++++++++++++++-- 4 files changed, 264 insertions(+), 59 deletions(-) diff --git a/examples/ifw_example.py b/examples/ifw_example.py index 00ac391..8e51643 100644 --- a/examples/ifw_example.py +++ b/examples/ifw_example.py @@ -1,7 +1,26 @@ """Example usage of pyUSPTO for IFW data. -This example demonstrates how to use the PatentDataClient to interact with the USPTO Patent Data API. -It shows how to retrieve IFW based on various identifying values. +This example demonstrates how to use the PatentDataClient to retrieve Image File +Wrapper (IFW) data from the USPTO Patent Data API. It covers: + +- get_IFW_metadata(): retrieve a PatentFileWrapper (with populated document_bag) + using any of the five supported identifiers: + - application_number + - patent_number + - publication_number + - PCT_app_number + - PCT_pub_number + +- get_IFW(): retrieve metadata AND bulk-download all prosecution history documents + (PDF preferred, DOCX fallback; XML and formatless docs are skipped). Returns an + IFWResult with: + - wrapper: the PatentFileWrapper + - output_path: path to the ZIP archive (as_zip=True, default) or output directory + - downloaded_documents: dict mapping document_identifier -> filename, allowing + each Document in document_bag to be linked to its downloaded file + +- download_archive() / download_publication(): download the pgpub or grant XML + archive from PrintedMetaData. """ import json @@ -65,8 +84,19 @@ ifw_result = client.get_IFW(application_number=application_number, destination="./download-example", overwrite=True) if ifw_result: print(f"Title: {ifw_result.wrapper.application_meta_data.invention_title if ifw_result.wrapper.application_meta_data else 'N/A'}") - print(f"Archive: {ifw_result.archive_path}") - print(f"Documents in bag: {len(ifw_result.wrapper.document_bag)}") + print(f"Output: {ifw_result.output_path}") + doc_bag = ifw_result.wrapper.document_bag or [] + print(f"Documents downloaded: {len(ifw_result.downloaded_documents)} of {len(doc_bag)}") + for doc in doc_bag: + if doc.document_identifier: + filename = ifw_result.downloaded_documents.get(doc.document_identifier) + status = f"-> {filename}" if filename else "(skipped)" + print(f" {doc.document_code} [{doc.document_identifier}] {status}") + +print("\nGet IFW + download all prosecution docs as a directory (no ZIP) -->") +ifw_dir_result = client.get_IFW(application_number=application_number, destination="./download-example", overwrite=True, as_zip=False) +if ifw_dir_result: + print(f"Output directory: {ifw_dir_result.output_path}") print("\nNow let's download the Patent Publication Text -->") diff --git a/src/pyUSPTO/clients/patent_data.py b/src/pyUSPTO/clients/patent_data.py index 7b98dd7..c815ca3 100644 --- a/src/pyUSPTO/clients/patent_data.py +++ b/src/pyUSPTO/clients/patent_data.py @@ -1110,8 +1110,9 @@ def get_IFW( PCT_pub_number: str | None = None, destination: str | None = None, overwrite: bool = False, + as_zip: bool = True, ) -> IFWResult | None: - """Retrieve IFW metadata and download all prosecution documents as a ZIP archive. + """Retrieve IFW metadata and download all prosecution documents. Combines `get_IFW_metadata` with a bulk download of all available prosecution history documents (PDF preferred, DOCX fallback). Documents with no downloadable @@ -1124,15 +1125,19 @@ def get_IFW( patent_number: USPTO patent number. PCT_app_number: PCT application number. PCT_pub_number: PCT publication number. - destination: Directory to save the ZIP archive. Defaults to current directory. - overwrite: Whether to overwrite an existing ZIP. Default False. + destination: Directory for output. Defaults to current directory. + overwrite: Whether to overwrite an existing output. Default False. + as_zip: If True (default), package all downloads into a ZIP archive + at ``{destination}/{app_no}_ifw.zip``. If False, download files + directly into ``{destination}/{app_no}_ifw/``. Returns: - IFWResult with the PatentFileWrapper and the path to the ZIP archive, - or None if no application was found. + IFWResult with the PatentFileWrapper, the output path, and a mapping + of document_identifier to filename for each downloaded document. + Returns None if no application was found. Raises: - FileExistsError: If the ZIP archive already exists and overwrite=False. + FileExistsError: If the output path already exists and overwrite=False. """ wrapper = self.get_IFW_metadata( application_number=application_number, @@ -1146,47 +1151,90 @@ def get_IFW( dest_dir = destination or "." app_no = wrapper.application_number_text or "unknown" - zip_name = f"{app_no}_ifw.zip" - zip_path = os.path.join(dest_dir, zip_name) + downloaded_documents: dict[str, str] = {} - if os.path.exists(zip_path) and not overwrite: - raise FileExistsError( - f"ZIP archive already exists: {zip_path}. Use overwrite=True to replace." - ) - - os.makedirs(dest_dir, exist_ok=True) - - with tempfile.TemporaryDirectory() as tmp_dir: - with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf: - for doc in wrapper.document_bag: - # Prefer PDF, fall back to MS_WORD; skip XML and formatless docs. - fmt_obj = next( - ( - f - for f in doc.document_formats - if f.mime_type_identifier in ("PDF", "MS_WORD") - and f.download_url - ), - None, - ) - if fmt_obj is None: - continue - - try: - downloaded = self._download_and_extract( - url=fmt_obj.download_url, - destination=tmp_dir, - overwrite=True, - ) - zf.write(downloaded, arcname=os.path.basename(downloaded)) - except Exception as exc: - warnings.warn( - f"Failed to download document {doc.document_identifier} " - f"({doc.document_code}): {exc}", - stacklevel=2, + if as_zip: + output_path = os.path.join(dest_dir, f"{app_no}_ifw.zip") + if os.path.exists(output_path) and not overwrite: + raise FileExistsError( + f"ZIP archive already exists: {output_path}. Use overwrite=True to replace." + ) + os.makedirs(dest_dir, exist_ok=True) + with tempfile.TemporaryDirectory() as tmp_dir: + with zipfile.ZipFile( + output_path, "w", compression=zipfile.ZIP_DEFLATED + ) as zf: + for doc in wrapper.document_bag or []: + if not doc.document_identifier: + continue + fmt_obj = next( + ( + f + for f in doc.document_formats + if f.mime_type_identifier in ("PDF", "MS_WORD") + and f.download_url + ), + None, ) + if fmt_obj is None or not fmt_obj.download_url: + continue + try: + downloaded = self._download_and_extract( + url=fmt_obj.download_url, + destination=tmp_dir, + overwrite=True, + ) + arcname = os.path.basename(downloaded) + zf.write(downloaded, arcname=arcname) + downloaded_documents[doc.document_identifier] = arcname + except Exception as exc: + warnings.warn( + f"Failed to download document {doc.document_identifier} " + f"({doc.document_code}): {exc}", + stacklevel=2, + ) + else: + output_path = os.path.join(dest_dir, f"{app_no}_ifw") + if os.path.exists(output_path) and not overwrite: + raise FileExistsError( + f"Output directory already exists: {output_path}. Use overwrite=True to replace." + ) + os.makedirs(output_path, exist_ok=True) + for doc in wrapper.document_bag or []: + if not doc.document_identifier: + continue + fmt_obj = next( + ( + f + for f in doc.document_formats + if f.mime_type_identifier in ("PDF", "MS_WORD") + and f.download_url + ), + None, + ) + if fmt_obj is None or not fmt_obj.download_url: + continue + try: + downloaded = self._download_and_extract( + url=fmt_obj.download_url, + destination=output_path, + overwrite=overwrite, + ) + downloaded_documents[doc.document_identifier] = os.path.basename( + downloaded + ) + except Exception as exc: + warnings.warn( + f"Failed to download document {doc.document_identifier} " + f"({doc.document_code}): {exc}", + stacklevel=2, + ) - return IFWResult(wrapper=wrapper, archive_path=os.path.abspath(zip_path)) + return IFWResult( + wrapper=wrapper, + output_path=os.path.abspath(output_path), + downloaded_documents=downloaded_documents, + ) def download_archive( self, diff --git a/src/pyUSPTO/models/patent_data.py b/src/pyUSPTO/models/patent_data.py index 5bbad3d..36fde89 100644 --- a/src/pyUSPTO/models/patent_data.py +++ b/src/pyUSPTO/models/patent_data.py @@ -2048,6 +2048,7 @@ class PatentFileWrapper: pgpub_document_meta_data: `PrintedMetaData` for Pre-Grant Publication. grant_document_meta_data: `PrintedMetaData` for the granted patent. last_ingestion_date_time: Timestamp of when this data was last ingested by the API (UTC). + document_bag: `DocumentBag` containing associated documents and their metadata. """ application_number_text: str @@ -2212,15 +2213,21 @@ def to_dict(self) -> dict[str, Any]: @dataclass(frozen=True) class IFWResult: - """Result of a get_IFW call: metadata wrapper and path to the downloaded archive. + """Result of a get_IFW call: metadata wrapper, output path, and document map. Attributes: wrapper: The PatentFileWrapper containing all IFW metadata and document_bag. - archive_path: Absolute path to the ZIP archive of downloaded prosecution documents. + output_path: Absolute path to the ZIP archive (when as_zip=True) or the + output directory (when as_zip=False). + downloaded_documents: Maps document_identifier to the filename of the + downloaded file — the arcname inside the ZIP (as_zip=True) or the + basename inside the output directory (as_zip=False). Documents that + were skipped (no PDF/DOCX URL) or failed to download are absent. """ wrapper: PatentFileWrapper - archive_path: str + output_path: str + downloaded_documents: dict[str, str] @dataclass(frozen=True) diff --git a/tests/clients/test_patent_data_clients.py b/tests/clients/test_patent_data_clients.py index c8f410a..e16f652 100644 --- a/tests/clients/test_patent_data_clients.py +++ b/tests/clients/test_patent_data_clients.py @@ -7,6 +7,7 @@ import csv import io +import os from collections.abc import Iterator from datetime import date, datetime, timezone from typing import Any @@ -3170,7 +3171,7 @@ def test_returns_none_when_not_found( def test_returns_ifw_result_with_zip( self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path ) -> None: - """get_IFW returns IFWResult with a valid ZIP containing the downloaded doc.""" + """get_IFW returns IFWResult with a valid ZIP and populated downloaded_documents.""" wrapper = self._make_wrapper(pdf_doc) fake_pdf = tmp_path / "staging" / "doc001.pdf" fake_pdf.parent.mkdir() @@ -3191,11 +3192,42 @@ def test_returns_ifw_result_with_zip( assert isinstance(result, IFWResult) assert result.wrapper is wrapper - assert result.archive_path.endswith("12345678_ifw.zip") + assert result.output_path.endswith("12345678_ifw.zip") + assert result.downloaded_documents == {"DOC001": "doc001.pdf"} import zipfile as zf - with zf.ZipFile(result.archive_path) as z: + with zf.ZipFile(result.output_path) as z: assert "doc001.pdf" in z.namelist() + def test_returns_ifw_result_as_directory( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """as_zip=False downloads into a subdirectory and populates downloaded_documents.""" + wrapper = self._make_wrapper(pdf_doc) + out_dir = tmp_path / "out" / "12345678_ifw" + out_dir.mkdir(parents=True) + fake_pdf = out_dir / "doc001.pdf" + fake_pdf.write_bytes(b"%PDF fake content") + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + return_value=str(fake_pdf), + ), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path / "out"), + as_zip=False, + overwrite=True, + ) + + assert isinstance(result, IFWResult) + assert result.output_path.endswith("12345678_ifw") + assert os.path.isdir(result.output_path) + assert result.downloaded_documents == {"DOC001": "doc001.pdf"} + def test_skips_xml_only_docs_silently( self, patent_data_client: PatentDataClient, xml_only_doc: Document, tmp_path ) -> None: @@ -3212,6 +3244,7 @@ def test_skips_xml_only_docs_silently( ) mock_dl.assert_not_called() assert isinstance(result, IFWResult) + assert result.downloaded_documents == {} def test_skips_no_url_docs_silently( self, patent_data_client: PatentDataClient, no_url_doc: Document, tmp_path @@ -3229,6 +3262,7 @@ def test_skips_no_url_docs_silently( ) mock_dl.assert_not_called() assert isinstance(result, IFWResult) + assert result.downloaded_documents == {} def test_warns_on_download_failure( self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path @@ -3250,14 +3284,14 @@ def test_warns_on_download_failure( destination=str(tmp_path), ) assert isinstance(result, IFWResult) + assert result.downloaded_documents == {} - def test_raises_file_exists_error( + def test_raises_file_exists_error_zip( self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path ) -> None: """FileExistsError raised if ZIP already exists and overwrite=False.""" wrapper = self._make_wrapper(pdf_doc) - existing_zip = tmp_path / "12345678_ifw.zip" - existing_zip.write_bytes(b"") + (tmp_path / "12345678_ifw.zip").write_bytes(b"") with patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper): with pytest.raises(FileExistsError): @@ -3267,13 +3301,28 @@ def test_raises_file_exists_error( overwrite=False, ) + def test_raises_file_exists_error_directory( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """FileExistsError raised if output directory already exists and overwrite=False.""" + wrapper = self._make_wrapper(pdf_doc) + (tmp_path / "12345678_ifw").mkdir() + + with patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper): + with pytest.raises(FileExistsError): + patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + overwrite=False, + as_zip=False, + ) + def test_overwrite_replaces_existing_zip( self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path ) -> None: """overwrite=True replaces an existing ZIP without error.""" wrapper = self._make_wrapper(pdf_doc) - existing_zip = tmp_path / "12345678_ifw.zip" - existing_zip.write_bytes(b"old content") + (tmp_path / "12345678_ifw.zip").write_bytes(b"old content") fake_pdf = tmp_path / "staging" / "doc001.pdf" fake_pdf.parent.mkdir() @@ -3322,3 +3371,74 @@ def test_docx_downloaded_when_no_pdf( overwrite=True, ) assert isinstance(result, IFWResult) + assert result.downloaded_documents == {"DOC004": "doc004.docx"} + + def test_directory_skips_xml_only_docs( + self, patent_data_client: PatentDataClient, xml_only_doc: Document, tmp_path + ) -> None: + """as_zip=False: docs with only XML are silently skipped.""" + wrapper = self._make_wrapper(xml_only_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + as_zip=False, + ) + mock_dl.assert_not_called() + assert result.downloaded_documents == {} + + def test_directory_warns_on_download_failure( + self, patent_data_client: PatentDataClient, pdf_doc: Document, tmp_path + ) -> None: + """as_zip=False: warning issued when download raises despite having a URL.""" + wrapper = self._make_wrapper(pdf_doc) + + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object( + patent_data_client, + "_download_and_extract", + side_effect=OSError("network error"), + ), + pytest.warns(match="DOC001"), + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + as_zip=False, + ) + assert result.downloaded_documents == {} + + def test_skips_docs_with_no_identifier( + self, patent_data_client: PatentDataClient, tmp_path + ) -> None: + """Documents with no document_identifier are silently skipped in both modes.""" + no_id_doc = Document( + document_identifier=None, + document_code="CTNF", + document_formats=[ + DocumentFormat( + mime_type_identifier="PDF", + download_url="https://example.com/doc.pdf", + ), + ], + ) + wrapper = self._make_wrapper(no_id_doc) + + for as_zip in (True, False): + with ( + patch.object(patent_data_client, "get_IFW_metadata", return_value=wrapper), + patch.object(patent_data_client, "_download_and_extract") as mock_dl, + ): + result = patent_data_client.get_IFW( + application_number="12345678", + destination=str(tmp_path), + as_zip=as_zip, + overwrite=True, + ) + mock_dl.assert_not_called() + assert result.downloaded_documents == {} From e5f04f3961e18034fb89b562fe4323869d5c6839 Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:23:11 -0600 Subject: [PATCH 7/9] fix: Update CI to not include dev requirements. --- .github/workflows/python-tests.yml | 2 +- CONTRIBUTING.md | 2 ++ tox.ini | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index ea3ef34..011764b 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -25,7 +25,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install -r requirements-dev.txt + pip install pytest pytest-cov pytest-mock pip install -e . - name: Run tests diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c347fad..7528dd3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,6 +22,8 @@ cd pyUSPTO ### Set Up Development Environment +> **Note:** Python 3.11+ is required for the development environment. Some dev dependencies (e.g. `myst-parser`) do not support Python 3.10. The package itself supports Python 3.10+. + ```bash # Create and activate a virtual environment python -m venv venv diff --git a/tox.ini b/tox.ini index cc55d4a..fe6db8e 100644 --- a/tox.ini +++ b/tox.ini @@ -13,6 +13,7 @@ basepython = py314: {env:LOCALAPPDATA}\Python\pythoncore-3.14-64\python.exe # py315: {env:LOCALAPPDATA}\Python\pythoncore-3.15-64\python.exe deps = + -r requirements.txt pytest>=9.0.2 pytest-cov>=7.0.0 pytest-mock>=3.15.1 From dc5eb461970cceb03435d1053ab276a1a692424d Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:33:05 -0600 Subject: [PATCH 8/9] fix: refactor CI --- .github/workflows/python-tests.yml | 7 +++--- pyproject.toml | 25 +++++++------------ requirements-dev.txt | 39 ++++++++++++++++++------------ tox.ini | 5 +--- 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index 011764b..3cd574d 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -25,8 +25,7 @@ jobs: run: | python -m pip install --upgrade pip pip install -r requirements.txt - pip install pytest pytest-cov pytest-mock - pip install -e . + pip install -e ".[test]" - name: Run tests run: | @@ -46,12 +45,12 @@ jobs: - name: Set up Python uses: actions/setup-python@v6 with: - python-version: "3.10" + python-version: "3.14" - name: Install dependencies run: | python -m pip install --upgrade pip - pip install ruff mypy + pip install -e ".[lint]" - name: Lint with ruff run: | diff --git a/pyproject.toml b/pyproject.toml index 6b86858..b46dd25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,24 +51,17 @@ dependencies = [ dynamic = ["version"] [project.optional-dependencies] -dev = [ - # Testing - "pytest>=9.0.2", - "pytest-cov>=7.0.0", - "pytest-mock>=3.15.1", - # Documentation - "sphinx==8.1.3", - "sphinx-rtd-theme>=3.0.0", - "sphinx_immaterial>=0.13.8", - "sphinx-autodoc-typehints==3.0.1", +test = ["pytest>=9.0.2", "pytest-cov>=7.0.0", "pytest-mock>=3.15.1"] +docs = [ + "sphinx>=9.1.0", + "sphinx-rtd-theme>=3.1.0", + "sphinx_immaterial>=0.13.9", + "sphinx-autodoc-typehints>=3.9.5", "sphinx-copybutton>=0.5.2", - "myst-parser>=4.0.1", - # Type checking - "mypy>=1.19.0", - "types-requests>=2.32.4", - # Code quality and formatting - "ruff>=0.15.0", + "myst-parser>=5.0.0", ] +lint = ["mypy>=1.19.0", "types-requests>=2.32.4", "ruff>=0.15.0"] +dev = ["pyUSPTO[test]", "pyUSPTO[docs]", "pyUSPTO[lint]"] [project.urls] GitHub = "https://github.com/DunlapCoddingPC/pyUSPTO" diff --git a/requirements-dev.txt b/requirements-dev.txt index 6bb3543..82cbf57 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -52,11 +52,11 @@ mdit-py-plugins==0.5.0 mdurl==0.1.2 # via markdown-it-py mypy==1.19.1 - # via pyUSPTO (pyproject.toml) + # via pyuspto mypy-extensions==1.1.0 # via mypy myst-parser==5.0.0 - # via pyUSPTO (pyproject.toml) + # via pyuspto packaging==26.0 # via # pytest @@ -71,7 +71,7 @@ pydantic==2.12.5 # via # pydantic-extra-types # sphinx-immaterial -pydantic-core==2.42.0 +pydantic-core==2.41.5 # via pydantic pydantic-extra-types==2.11.0 # via sphinx-immaterial @@ -81,41 +81,48 @@ pygments==2.19.2 # sphinx pytest==9.0.2 # via - # pyUSPTO (pyproject.toml) # pytest-cov # pytest-mock + # pyuspto pytest-cov==7.0.0 - # via pyUSPTO (pyproject.toml) + # via pyuspto pytest-mock==3.15.1 - # via pyUSPTO (pyproject.toml) + # via pyuspto +pyuspto @ file:///C:/Users/andrewp/Documents/GitHub/pyUSPTO + # via + # pyUSPTO (pyproject.toml) + # pyuspto pyyaml==6.0.3 # via myst-parser requests==2.32.5 # via # pyUSPTO (pyproject.toml) + # pyuspto # sphinx # sphinx-immaterial +roman-numerals==4.1.0 + # via sphinx ruff==0.15.4 - # via pyUSPTO (pyproject.toml) + # via pyuspto snowballstemmer==3.0.1 # via sphinx sphinx==9.1.0 # via # myst-parser - # pyUSPTO (pyproject.toml) + # pyuspto # sphinx-autodoc-typehints # sphinx-copybutton # sphinx-immaterial # sphinx-rtd-theme # sphinxcontrib-jquery -sphinx-autodoc-typehints==3.9.5 - # via pyUSPTO (pyproject.toml) +sphinx-autodoc-typehints==3.9.6 + # via pyuspto sphinx-copybutton==0.5.2 - # via pyUSPTO (pyproject.toml) + # via pyuspto sphinx-immaterial==0.13.9 - # via pyUSPTO (pyproject.toml) + # via pyuspto sphinx-rtd-theme==3.1.0 - # via pyUSPTO (pyproject.toml) + # via pyuspto sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 @@ -131,7 +138,7 @@ sphinxcontrib-qthelp==2.0.0 sphinxcontrib-serializinghtml==2.0.0 # via sphinx types-requests==2.32.4.20260107 - # via pyUSPTO (pyproject.toml) + # via pyuspto typing-extensions==4.15.0 # via # mypy @@ -143,7 +150,9 @@ typing-extensions==4.15.0 typing-inspection==0.4.2 # via pydantic tzdata==2025.3 - # via pyUSPTO (pyproject.toml) + # via + # pyUSPTO (pyproject.toml) + # pyuspto urllib3==2.6.3 # via # requests diff --git a/tox.ini b/tox.ini index fe6db8e..ac508a9 100644 --- a/tox.ini +++ b/tox.ini @@ -14,9 +14,6 @@ basepython = # py315: {env:LOCALAPPDATA}\Python\pythoncore-3.15-64\python.exe deps = -r requirements.txt - pytest>=9.0.2 - pytest-cov>=7.0.0 - pytest-mock>=3.15.1 - typing_extensions>=4.15.0 +extras = test commands = pytest tests/ --cov=src/pyUSPTO From 10a21df98f54e9343f34159c03468eb93411cf47 Mon Sep 17 00:00:00 2001 From: Andrew <3300522+dpieski@users.noreply.github.com> Date: Thu, 5 Mar 2026 09:05:28 -0600 Subject: [PATCH 9/9] fix: missing typing_extensions --- pyproject.toml | 2 +- requirements-dev.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b46dd25..f591731 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ dynamic = ["version"] [project.optional-dependencies] -test = ["pytest>=9.0.2", "pytest-cov>=7.0.0", "pytest-mock>=3.15.1"] +test = ["pytest>=9.0.2", "pytest-cov>=7.0.0", "pytest-mock>=3.15.1", "typing_extensions>=4.15.0"] docs = [ "sphinx>=9.1.0", "sphinx-rtd-theme>=3.1.0", diff --git a/requirements-dev.txt b/requirements-dev.txt index 82cbf57..195a534 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -145,6 +145,7 @@ typing-extensions==4.15.0 # pydantic # pydantic-core # pydantic-extra-types + # pyuspto # sphinx-immaterial # typing-inspection typing-inspection==0.4.2