Skip to content

Commit 1e17aea

Browse files
Avoid mutating extract tool params and validate schema JSON
Co-authored-by: Shri Sukhani <shrisukhani@users.noreply.github.com>
1 parent 260c51f commit 1e17aea

File tree

2 files changed

+109
-6
lines changed

2 files changed

+109
-6
lines changed

hyperbrowser/tools/__init__.py

Lines changed: 25 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,7 @@
11
import json
2+
from typing import Any, Dict, Mapping
3+
4+
from hyperbrowser.exceptions import HyperbrowserError
25
from hyperbrowser.models.agents.browser_use import StartBrowserUseTaskParams
36
from hyperbrowser.models.crawl import StartCrawlJobParams
47
from hyperbrowser.models.extract import StartExtractJobParams
@@ -21,6 +24,20 @@
2124
)
2225

2326

27+
def _prepare_extract_tool_params(params: Mapping[str, Any]) -> Dict[str, Any]:
28+
normalized_params: Dict[str, Any] = dict(params)
29+
schema_value = normalized_params.get("schema")
30+
if isinstance(schema_value, str):
31+
try:
32+
normalized_params["schema"] = json.loads(schema_value)
33+
except json.JSONDecodeError as exc:
34+
raise HyperbrowserError(
35+
"Invalid JSON string provided for `schema` in extract tool params",
36+
original_error=exc,
37+
) from exc
38+
return normalized_params
39+
40+
2441
class WebsiteScrapeTool:
2542
openai_tool_definition = SCRAPE_TOOL_OPENAI
2643
anthropic_tool_definition = SCRAPE_TOOL_ANTHROPIC
@@ -86,16 +103,18 @@ class WebsiteExtractTool:
86103

87104
@staticmethod
def runnable(hb: Hyperbrowser, params: dict) -> str:
    """Run an extract job synchronously and return its data as a JSON string.

    ``params`` is normalised through ``_prepare_extract_tool_params`` first,
    so the caller's dict is not modified. Returns ``""`` when the response
    carries no (truthy) data.
    """
    job_params = StartExtractJobParams(**_prepare_extract_tool_params(params))
    result = hb.extract.start_and_wait(params=job_params)
    if not result.data:
        return ""
    return json.dumps(result.data)
93111

94112
@staticmethod
async def async_runnable(hb: AsyncHyperbrowser, params: dict) -> str:
    """Run an extract job asynchronously and return its data as a JSON string.

    ``params`` is normalised through ``_prepare_extract_tool_params`` first,
    so the caller's dict is not modified. Returns ``""`` when the response
    carries no (truthy) data.
    """
    job_params = StartExtractJobParams(**_prepare_extract_tool_params(params))
    result = await hb.extract.start_and_wait(params=job_params)
    if not result.data:
        return ""
    return json.dumps(result.data)
100119

101120

tests/test_tools_extract.py

Lines changed: 84 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,84 @@
1+
import asyncio
2+
3+
import pytest
4+
5+
from hyperbrowser.exceptions import HyperbrowserError
6+
from hyperbrowser.models.extract import StartExtractJobParams
7+
from hyperbrowser.tools import WebsiteExtractTool
8+
9+
10+
class _Response:
11+
def __init__(self, data):
12+
self.data = data
13+
14+
15+
class _SyncExtractManager:
    """Fake synchronous extract manager that records the params it is given."""

    def __init__(self):
        # Set by start_and_wait; None until the first call.
        self.last_params = None

    def start_and_wait(self, params: StartExtractJobParams):
        """Remember ``params`` and hand back a canned ``{"ok": True}`` response."""
        self.last_params = params
        canned = _Response({"ok": True})
        return canned
22+
23+
24+
class _AsyncExtractManager:
    """Fake asynchronous extract manager that records the params it is given."""

    def __init__(self):
        # Set by start_and_wait; None until the first call.
        self.last_params = None

    async def start_and_wait(self, params: StartExtractJobParams):
        """Remember ``params`` and hand back a canned ``{"ok": True}`` response."""
        self.last_params = params
        canned = _Response({"ok": True})
        return canned
31+
32+
33+
class _SyncClient:
    """Fake sync client exposing just the ``extract`` attribute the tool uses."""

    def __init__(self):
        extract_manager = _SyncExtractManager()
        self.extract = extract_manager
36+
37+
38+
class _AsyncClient:
    """Fake async client exposing just the ``extract`` attribute the tool uses."""

    def __init__(self):
        extract_manager = _AsyncExtractManager()
        self.extract = extract_manager
41+
42+
43+
def test_extract_tool_runnable_does_not_mutate_input_params():
    """Sync runnable decodes a JSON ``schema`` without altering the caller's dict."""
    schema_json = '{"type":"object","properties":{"name":{"type":"string"}}}'
    tool_params = {"urls": ["https://example.com"], "schema": schema_json}
    fake_client = _SyncClient()

    result = WebsiteExtractTool.runnable(fake_client, tool_params)

    assert result == '{"ok": true}'
    sent = fake_client.extract.last_params
    assert isinstance(sent, StartExtractJobParams)
    assert isinstance(sent.schema_, dict)
    # The input must still hold the original string, not the decoded dict.
    assert tool_params["schema"] == schema_json
56+
57+
58+
def test_extract_tool_async_runnable_does_not_mutate_input_params():
    """Async runnable decodes a JSON ``schema`` without altering the caller's dict."""
    schema_json = '{"type":"object","properties":{"name":{"type":"string"}}}'
    tool_params = {"urls": ["https://example.com"], "schema": schema_json}
    fake_client = _AsyncClient()

    async def _invoke():
        return await WebsiteExtractTool.async_runnable(fake_client, tool_params)

    result = asyncio.run(_invoke())

    assert result == '{"ok": true}'
    sent = fake_client.extract.last_params
    assert isinstance(sent, StartExtractJobParams)
    assert isinstance(sent.schema_, dict)
    # The input must still hold the original string, not the decoded dict.
    assert tool_params["schema"] == schema_json
74+
75+
76+
def test_extract_tool_runnable_raises_for_invalid_schema_json():
    """A ``schema`` string that is not valid JSON surfaces as HyperbrowserError."""
    fake_client = _SyncClient()
    bad_params = {"urls": ["https://example.com"], "schema": "{invalid-json}"}
    expected_message = "Invalid JSON string provided for `schema`"

    with pytest.raises(HyperbrowserError, match=expected_message):
        WebsiteExtractTool.runnable(fake_client, bad_params)

0 commit comments

Comments
 (0)