-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathbulk_data_example.py
More file actions
133 lines (105 loc) · 4.25 KB
/
bulk_data_example.py
File metadata and controls
133 lines (105 loc) · 4.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
"""Example usage of pyUSPTO for bulk data products.
Demonstrates the BulkDataClient for searching products, listing files,
and downloading bulk data archives.
"""
import os
from pyUSPTO import BulkDataClient, FileData, USPTOConfig
# Directory passed as the ``destination`` of the download examples below.
DEST_PATH = "./notes/download-example"
def format_size(size_bytes: int | float) -> str:
"""Format a size in bytes to a human-readable string (KB, MB, GB, etc.)."""
if size_bytes == 0:
return "0 B"
size_names = ["B", "KB", "MB", "GB", "TB", "PB"]
i = 0
while size_bytes >= 1024 and i < len(size_names) - 1:
size_bytes /= 1024
i += 1
return f"{size_bytes:.2f} {size_names[i]}"
# --- Client Initialization ---
# Read the API key from the environment. An unset, empty or whitespace-only
# variable, as well as the literal placeholder, is rejected up front so the
# script stops with a clear message instead of continuing with an unusable key.
api_key = os.environ.get("USPTO_API_KEY", "").strip()
if not api_key or api_key == "YOUR_API_KEY_HERE":
    raise ValueError(
        "API key is not set. Set the USPTO_API_KEY environment variable."
    )
config = USPTOConfig(api_key=api_key)
client = BulkDataClient(config=config)
print("-" * 40)
print("Example 1: Search for products")
print("-" * 40)
# Run a keyword search and print a short summary of every returned product.
response = client.search_products(query="patent", limit=5)
print(f"Found {response.count} products matching 'patent'")
max_description_chars = 100
for product in response.bulk_data_product_bag:
    # NOTE(review): falls back to "" in case the description is missing;
    # confirm against the model whether it can actually be None.
    description = product.product_description_text or ""
    # Only append the ellipsis when the text was really cut short.
    if len(description) > max_description_chars:
        description = f"{description[:max_description_chars]}..."
    print(f"\n Product: {product.product_title_text}")
    print(f" ID: {product.product_identifier}")
    print(f" Description: {description}")
    print(f" Total files: {product.product_file_total_quantity}")
    print(f" Total size: {format_size(product.product_total_file_size)}")
print("-" * 40)
print("Example 2: Paginate through products")
print("-" * 40)
# Number the paginated results from 1 and stop once the cap is reached.
max_items = 20
count = 0
trademark_products = client.paginate_products(query="trademark", limit=10)
for count, product in enumerate(trademark_products, start=1):
    print(f" {count}. {product.product_title_text} ({product.product_identifier})")
    if count < max_items:
        continue
    print(f" ... (stopping at {max_items} products)")
    break
print("-" * 40)
print("Example 3: Get product by ID")
print("-" * 40)
# "PTGRXML" is Patent Grant Full-Text Data (No Images) - XML
product_id = "PTGRXML"
product = client.get_product_by_id(product_id, include_files=True, latest=True)
# Gather the summary lines first, then emit them one per line.
product_summary = (
    f"Product: {product.product_title_text}",
    f"Description: {product.product_description_text}",
    f"Frequency: {product.product_frequency_text}",
    f"Labels: {product.product_label_array_text}",
    f"Categories: {product.product_dataset_category_array_text}",
    f"Date range: {product.product_from_date} to {product.product_to_date}",
)
print(*product_summary, sep="\n")
print("-" * 40)
print("Example 4: List files for a product")
print("-" * 40)
# A missing file bag and an empty file list are both treated as "no files".
has_files = bool(product.product_file_bag) and bool(
    product.product_file_bag.file_data_bag
)
if not has_files:
    print("No files found for this product")
else:
    print(f"Found {len(product.product_file_bag.file_data_bag)} file(s):")
    for file_data in product.product_file_bag.file_data_bag:
        # One indented detail section per file
        print(f"\n File: {file_data.file_name}")
        print(f" Size: {format_size(file_data.file_size)}")
        print(f" Type: {file_data.file_type_text}")
        print(
            f" Data range: {file_data.file_data_from_date} to {file_data.file_data_to_date}"
        )
        print(f" Released: {file_data.file_release_date}")
        print(f" Download URI: {file_data.file_download_uri}")
print("-" * 40)
print("Example 5: Download a file (with extraction)")
print("-" * 40)
# Pick the smallest file of the product fetched above and download it with
# extraction enabled.
min_file: FileData | None = None
if product.product_file_bag and product.product_file_bag.file_data_bag:
    # min() returns the first of several equally small files, which matches
    # a manual scan using a strict "<" comparison.
    min_file = min(
        product.product_file_bag.file_data_bag,
        key=lambda candidate: candidate.file_size,
    )
if min_file:
    print(f"Downloading smallest file: {min_file.file_name}")
    print(f"Size: {format_size(min_file.file_size)}")
    downloaded_path = client.download_file(
        file_data=min_file,
        destination=DEST_PATH,
        overwrite=True,
        extract=True,
    )
    print(f"Downloaded to {downloaded_path}")
else:
    # Say why nothing was downloaded instead of leaving the section empty.
    print("No files available to download")
print("-" * 40)
print("Example 6: Download without extraction")
print("-" * 40)
# Download the previously selected file again, this time with extract=False.
file_bag = product.product_file_bag
can_download = bool(file_bag and file_bag.file_data_bag and min_file)
if can_download:
    download_options = {
        "file_data": min_file,
        "destination": DEST_PATH,
        "overwrite": True,
        "extract": False,
    }
    downloaded_path = client.download_file(**download_options)
    print(f"Archive saved to {downloaded_path}")