From 8c87c8f8a8a6c3dfd202d34e5fb5bb7c84053a5d Mon Sep 17 00:00:00 2001 From: Vamshi_BIDS Date: Mon, 20 Apr 2026 17:58:39 -0500 Subject: [PATCH 1/5] feat: add Browser Run integration (Loader + Tool) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cloudflare Browser Run (https://developers.cloudflare.com/browser-run/) gives you serverless headless Chrome on Cloudflare's edge. This PR adds two LangChain primitives that wrap the Quick Actions REST API so Python developers can use Browser Run without running a local browser. Why this matters ---------------- LangChain's existing web loaders (WebBaseLoader, SeleniumLoader, PlaywrightLoader) all need a local browser process. Browser Run is a single POST request — no infra, no dependencies, JS-rendered content. Combined with the rest of this library you get a full Cloudflare-native RAG pipeline: Browser Run (crawl) → Workers AI (embed) → Vectorize (store) → Workers AI (query) What's included --------------- CloudflareBrowserRunLoader (BaseLoader) Converts web pages into LangChain Documents for RAG ingestion. Modes: markdown (/markdown), crawl (/crawl with async polling), scrape (/scrape with CSS selectors), content (/content). Supports sync (load, lazy_load) and async (aload, alazy_load). CloudflareBrowserRunTool (BaseTool) Gives LangGraph agents the ability to read, extract, and navigate the live web. Modes: markdown, json (/json — AI-powered structured extraction), links, screenshot, pdf. Each mode gets its own tool name (e.g. cloudflare_browser_run_json) and description so agents can pick the right tool. 
LangGraph integration tested with: - Custom nodes in a StateGraph DAG (parallel fan-out) - ToolNode with tools_condition routing - Parallel tool calls in a single AIMessage - Supervisor pattern dispatching to specialist tools - Research loops with conditional edges (cycles) Auth follows the existing pattern: CF_ACCOUNT_ID + CF_API_TOKEN env vars, SecretStr, same validation as rerankers.py. Browser Run is REST-only — no Worker binding path exists, noted in the module docstring. References: - Quick Actions docs: https://developers.cloudflare.com/browser-run/quick-actions/ - /json endpoint: https://developers.cloudflare.com/browser-run/quick-actions/json-endpoint/ - /crawl endpoint: https://developers.cloudflare.com/browser-run/quick-actions/crawl-endpoint/ - API reference: https://developers.cloudflare.com/api/resources/browser_rendering/ - Rename announcement: https://developers.cloudflare.com/changelog/post/2026-04-15-br-rename/ Tests: 33 unit + 19 integration (15 endpoint + 4 LangGraph patterns), all passing against the real Browser Run API. 
--- CHANGELOG.md | 20 + README.md | 3 + libs/langchain-cloudflare/README.md | 150 +++ .../langchain_cloudflare/__init__.py | 6 + .../langchain_cloudflare/_errors.py | 7 + .../langchain_cloudflare/browser_run.py | 982 ++++++++++++++++++ libs/langchain-cloudflare/pyproject.toml | 2 +- .../integration_tests/test_browser_run.py | 548 ++++++++++ .../tests/unit_tests/test_browser_run.py | 381 +++++++ 9 files changed, 2098 insertions(+), 1 deletion(-) create mode 100644 libs/langchain-cloudflare/langchain_cloudflare/browser_run.py create mode 100644 libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py create mode 100644 libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 38bad91..3f2a3d3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,26 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## langchain-cloudflare + +### [0.3.5] + +#### Added + +- **`CloudflareBrowserRunLoader`**: New `BaseLoader` for document ingestion via [Cloudflare Browser Run](https://developers.cloudflare.com/browser-run/) REST API. Supports four modes: `markdown` (`/markdown`), `crawl` (`/crawl` with async polling), `scrape` (`/scrape` with CSS selectors), and `content` (`/content` for raw HTML). Includes sync (`load`, `lazy_load`) and async (`aload`, `alazy_load`) methods. Configurable crawl depth, page limit, poll interval, timeout, and all shared Browser Run options (viewport, cookies, headers, resource blocking). +- **`CloudflareBrowserRunTool`**: New `BaseTool` for LangGraph agent workflows. Supports five modes: `markdown`, `json` (AI-powered structured data extraction via `/json`), `links` (`/links`), `screenshot` (`/screenshot`), and `pdf` (`/pdf`). Tool name is auto-set per mode (e.g. `cloudflare_browser_run_json`) for agent disambiguation. JSON mode supports both natural language prompts and JSON schema definitions for structured extraction. 
+- **`TokenErrors.INSUFFICIENT_BROWSER_RUN_TOKEN`**: New centralized error message for missing Browser Run API credentials. Notes that Browser Run is REST-only (no Worker binding support). + +#### Changed + +- **Version bump**: 0.3.4 → 0.3.5 (Browser Run integration). + +#### Tests + +- Added unit tests for `CloudflareBrowserRunLoader` and `CloudflareBrowserRunTool`: token validation, configuration defaults, URL construction, header building, shared options builder. +- Added integration tests covering markdown/content/scrape/crawl loader modes and markdown/json/links/screenshot tool modes, both sync and async. + +--- + ### [0.3.4] #### Changed diff --git a/README.md b/README.md index ca8f66f..bba0fc1 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,9 @@ It contains the following packages. - [ChatCloudflareWorkersAI](https://python.langchain.com/docs/integrations/chat/cloudflare_workersai/) - [CloudflareWorkersAIEmbeddings](https://python.langchain.com/docs/integrations/text_embedding/cloudflare_workersai/) - [CloudflareVectorize](https://python.langchain.com/docs/integrations/vectorstores/cloudflare_vectorize/) +- CloudflareWorkersAIReranker +- CloudflareBrowserRunLoader (Document Loader via [Browser Run](https://developers.cloudflare.com/browser-run/)) +- CloudflareBrowserRunTool (Agent Tool via [Browser Run](https://developers.cloudflare.com/browser-run/)) ### LangGraph diff --git a/libs/langchain-cloudflare/README.md b/libs/langchain-cloudflare/README.md index 4a0c69a..1018a1b 100644 --- a/libs/langchain-cloudflare/README.md +++ b/libs/langchain-cloudflare/README.md @@ -59,6 +59,156 @@ vst = CloudflareVectorize( vst.create_index(index_name="my-cool-vectorstore") ``` +## Reranker + +`CloudflareWorkersAIReranker` reranks documents by relevance using [Workers AI](https://developers.cloudflare.com/workers-ai/). 
+ +```python +from langchain_cloudflare import CloudflareWorkersAIReranker + +reranker = CloudflareWorkersAIReranker() +results = reranker.rerank( + query="What is the capital of France?", + documents=["Paris is the capital of France.", "Berlin is in Germany."], + top_k=2, +) +``` + +## Browser Run (Document Loader) + +`CloudflareBrowserRunLoader` loads web pages as LangChain `Document` objects using [Cloudflare Browser Run](https://developers.cloudflare.com/browser-run/). It renders JavaScript-heavy pages on Cloudflare's global network and returns clean content via a simple REST API — no local browser required. + +```python +from langchain_cloudflare import CloudflareBrowserRunLoader + +# Single page -> markdown +loader = CloudflareBrowserRunLoader( + urls=["https://developers.cloudflare.com/workers-ai/"], + mode="markdown", +) +docs = loader.load() + +# Multi-page crawl -> knowledge base +loader = CloudflareBrowserRunLoader( + urls=["https://developers.cloudflare.com/cloudflare-one/"], + mode="crawl", + crawl_limit=50, + crawl_depth=2, +) +docs = loader.load() +``` + +Supported modes: `markdown`, `crawl`, `scrape`, `content`. + +> **Note:** Requires an API token with *Browser Rendering – Edit* permission (`CF_API_TOKEN` or `CF_AI_API_TOKEN`). + +## Browser Run (Agent Tool) + +`CloudflareBrowserRunTool` gives [LangGraph](https://langchain-ai.github.io/langgraph/) agents the ability to interact with the live web. 
+ +```python +from langchain_cloudflare import CloudflareBrowserRunTool, ChatCloudflareWorkersAI +from langgraph.prebuilt import create_react_agent + +llm = ChatCloudflareWorkersAI() +tools = [ + CloudflareBrowserRunTool(mode="markdown"), + CloudflareBrowserRunTool( + mode="json", + json_prompt="Extract the company name, industry, and employee count.", + ), + CloudflareBrowserRunTool(mode="links"), +] +agent = create_react_agent(llm, tools) +result = agent.invoke({"messages": [("user", "Research example.com")]}) +``` + +Supported modes: `markdown`, `json`, `links`, `screenshot`, `pdf`. + +### Browser Run in LangGraph Workflows + +Both the Loader and Tool integrate with all LangGraph patterns: + +**As a custom node in a DAG:** + +```python +from typing import TypedDict +from langchain_cloudflare import CloudflareBrowserRunLoader, CloudflareBrowserRunTool +from langgraph.graph import StateGraph, START, END + + +class ResearchState(TypedDict): + url: str + page_content: str + links: list[str] + + +def fetch_page(state: ResearchState) -> dict: + loader = CloudflareBrowserRunLoader(urls=[state["url"]], mode="markdown") + docs = loader.load() + return {"page_content": docs[0].page_content} + + +def extract_links(state: ResearchState) -> dict: + tool = CloudflareBrowserRunTool(mode="links") + links = tool.invoke({"url": state["url"]}).strip().split("\n") + return {"links": links} + + +graph = StateGraph(ResearchState) +graph.add_node("fetch_page", fetch_page) +graph.add_node("extract_links", extract_links) +graph.add_edge(START, "fetch_page") +graph.add_edge(START, "extract_links") # runs in parallel +graph.add_edge("fetch_page", END) +graph.add_edge("extract_links", END) +app = graph.compile() + +result = app.invoke({"url": "https://example.com", "page_content": "", "links": []}) +``` + +**As tools in a supervisor pattern:** + +```python +from langgraph.graph import StateGraph, MessagesState, START, END +from langgraph.prebuilt import ToolNode, tools_condition + +tools = 
[ + CloudflareBrowserRunTool(mode="markdown"), + CloudflareBrowserRunTool(mode="json", json_prompt="Extract key facts."), + CloudflareBrowserRunTool(mode="links"), +] +tool_node = ToolNode(tools) + +graph = StateGraph(MessagesState) +graph.add_node("supervisor", supervisor_fn) # your LLM-based supervisor +graph.add_node("browser_tools", tool_node) +graph.add_edge(START, "supervisor") +graph.add_conditional_edges("supervisor", tools_condition, {"tools": "browser_tools", END: END}) +graph.add_edge("browser_tools", "supervisor") +app = graph.compile() +``` + +**In a research loop with cycles:** + +```python +def should_continue(state) -> str: + if state["iteration"] >= 3 or not state["urls_to_visit"]: + return "done" + return "continue" + +graph = StateGraph(ResearchState) +graph.add_node("discover", discover_links_node) +graph.add_node("fetch", fetch_page_node) +graph.add_edge(START, "discover") +graph.add_edge("discover", "fetch") +graph.add_conditional_edges("fetch", should_continue, { + "continue": "discover", + "done": END, +}) +app = graph.compile() +``` + ## Release Notes v0.1.1 (2025-04-08) diff --git a/libs/langchain-cloudflare/langchain_cloudflare/__init__.py b/libs/langchain-cloudflare/langchain_cloudflare/__init__.py index fc60283..fb6ddba 100644 --- a/libs/langchain-cloudflare/langchain_cloudflare/__init__.py +++ b/libs/langchain-cloudflare/langchain_cloudflare/__init__.py @@ -15,6 +15,10 @@ convert_vectorize_query_response, convert_vectors_for_binding, ) +from langchain_cloudflare.browser_run import ( + CloudflareBrowserRunLoader, + CloudflareBrowserRunTool, +) from langchain_cloudflare.chat_models import ChatCloudflareWorkersAI from langchain_cloudflare.embeddings import CloudflareWorkersAIEmbeddings from langchain_cloudflare.rerankers import CloudflareWorkersAIReranker, RerankResult @@ -30,6 +34,8 @@ # MARK: - Public API __all__ = [ "ChatCloudflareWorkersAI", + "CloudflareBrowserRunLoader", + "CloudflareBrowserRunTool", "CloudflareVectorize", "CloudflareWorkersAIEmbeddings", 
"CloudflareWorkersAIReranker", diff --git a/libs/langchain-cloudflare/langchain_cloudflare/_errors.py b/libs/langchain-cloudflare/langchain_cloudflare/_errors.py index 3160d92..32c6d82 100644 --- a/libs/langchain-cloudflare/langchain_cloudflare/_errors.py +++ b/libs/langchain-cloudflare/langchain_cloudflare/_errors.py @@ -40,3 +40,10 @@ class TokenErrors(StrEnum): "and no `d1_api_token` provided. Please set these through parameters " "or environment variables (CF_API_TOKEN, CF_D1_API_TOKEN)." ) + + INSUFFICIENT_BROWSER_RUN_TOKEN = ( + "A Cloudflare API token with Browser Rendering – Edit permission " + "must be provided either through the api_token parameter or " + "CF_API_TOKEN / CF_AI_API_TOKEN environment variable. " + "Browser Run is REST-only and does not support Worker bindings." + ) diff --git a/libs/langchain-cloudflare/langchain_cloudflare/browser_run.py b/libs/langchain-cloudflare/langchain_cloudflare/browser_run.py new file mode 100644 index 0000000..76fe2a6 --- /dev/null +++ b/libs/langchain-cloudflare/langchain_cloudflare/browser_run.py @@ -0,0 +1,982 @@ +"""Cloudflare Browser Run integration for LangChain. + +This module provides a document loader and an agent tool backed by +Cloudflare Browser Run (formerly Browser Rendering). Browser Run +offers serverless headless Chrome on Cloudflare's global network +via a simple REST API, supporting markdown extraction, crawling, +scraping, AI-powered structured data extraction, screenshots, PDFs, +and link discovery. + + * ``CloudflareBrowserRunLoader`` – a ``BaseLoader`` for document + ingestion (RAG pipelines, knowledge-base construction). + * ``CloudflareBrowserRunTool`` – a ``BaseTool`` for LangGraph + agent workflows (research agents, data extraction, live web + context). + +Note: + Browser Run Quick Actions are REST-only. Unlike the other modules + in this library there is no Workers binding path. 
def _build_browser_run_url(account_id: str, endpoint: str) -> str:
    """Compose the REST URL of a Browser Run Quick Action.

    The URL has the shape
    ``<BROWSER_RUN_BASE_URL>/<account_id>/browser-rendering/<endpoint>``.

    Args:
        account_id: Cloudflare account ID placed right after the base URL.
        endpoint: Quick Action path segment, e.g. ``"markdown"`` or
            ``"crawl"``. It is appended as-is.

    Returns:
        The fully-qualified URL as a string.
    """
    # Build the account-scoped prefix first, then hang the action off it.
    account_root = "{}/{}".format(BROWSER_RUN_BASE_URL, account_id)
    return "{}/browser-rendering/{}".format(account_root, endpoint)
+ """ + return { + "Authorization": f"Bearer {api_token}", + "Content-Type": "application/json", + } + + +def _build_shared_options( + goto_options: Optional[Dict[str, Any]] = None, + viewport: Optional[Dict[str, Any]] = None, + wait_for_selector: Optional[Dict[str, Any]] = None, + cookies: Optional[List[Dict[str, Any]]] = None, + extra_headers: Optional[Dict[str, str]] = None, + reject_resource_types: Optional[List[str]] = None, +) -> Dict[str, Any]: + """Build the shared optional parameters accepted by most Browser Run endpoints. + + Args: + goto_options: Page navigation options (waitUntil, timeout). + viewport: Viewport dimensions ``{width, height}``. + wait_for_selector: Wait for a CSS selector before returning. + cookies: List of cookie dicts to set before navigation. + extra_headers: Extra HTTP headers to set on the request. + reject_resource_types: Resource types to block (e.g. ``["image"]``). + + Returns: + Dict of non-None options ready to merge into the request body. + """ + opts: Dict[str, Any] = {} + if goto_options is not None: + opts["gotoOptions"] = goto_options + if viewport is not None: + opts["viewport"] = viewport + if wait_for_selector is not None: + opts["waitForSelector"] = wait_for_selector + if cookies is not None: + opts["cookies"] = cookies + if extra_headers is not None: + opts["setExtraHTTPHeaders"] = extra_headers + if reject_resource_types is not None: + opts["rejectResourceTypes"] = reject_resource_types + return opts + + +def _check_api_response(data: Any) -> None: + """Raise if the Cloudflare API returned a success=false envelope. + + Some Browser Run endpoints return ``200 OK`` with an error body + like ``{"success": false, "errors": [...]}``. This helper ensures + such responses are surfaced as exceptions rather than silently + producing empty results. + + Args: + data: The parsed JSON response body. + + Raises: + RuntimeError: When the API indicates failure. 
class CloudflareBrowserRunLoader(BaseLoader, BaseModel):  # type: ignore[misc]
    """Load documents from web pages using Cloudflare Browser Run.

    Browser Run renders JavaScript-heavy pages on Cloudflare's global
    network and returns the content via a simple REST API. This loader
    converts web pages into LangChain ``Document`` objects suitable for
    RAG pipelines and knowledge-base construction.

    Example (markdown mode):
        .. code-block:: python

            from langchain_cloudflare import CloudflareBrowserRunLoader

            loader = CloudflareBrowserRunLoader(
                urls=["https://developers.cloudflare.com/workers-ai/"],
                mode="markdown",
            )
            docs = loader.load()

    Example (crawl mode):
        .. code-block:: python

            loader = CloudflareBrowserRunLoader(
                urls=["https://developers.cloudflare.com/cloudflare-one/"],
                mode="crawl",
                crawl_limit=50,
                crawl_depth=2,
            )
            docs = loader.load()

    Key init args:
        urls: list[str]
            URLs to load.

        mode: str
            One of ``"markdown"``, ``"crawl"``, ``"scrape"``, ``"content"``.

        account_id: str
            Cloudflare account ID. Falls back to ``CF_ACCOUNT_ID`` env var.

        api_token: str
            Cloudflare API token with *Browser Rendering – Edit* permission.
            Falls back to ``CF_API_TOKEN`` then ``CF_AI_API_TOKEN`` env var.

        request_timeout: float | None
            Per-request HTTP timeout in seconds, applied identically to the
            sync (``requests``) and async (``httpx``) code paths.
    """

    # MARK: - Fields
    urls: List[str] = Field(default_factory=list)
    """URLs to load."""

    mode: Literal["markdown", "crawl", "scrape", "content"] = "markdown"
    """Loader mode: ``markdown``, ``crawl``, ``scrape``, or ``content``."""

    account_id: str = Field(default_factory=from_env("CF_ACCOUNT_ID", default=""))
    """Cloudflare account ID."""

    api_token: Optional[SecretStr] = Field(
        default_factory=secret_from_env(
            ["CF_API_TOKEN", "CF_AI_API_TOKEN"], default=None
        )
    )
    """API token with Browser Rendering – Edit permission."""

    request_timeout: Optional[float] = 120.0
    """Per-request HTTP timeout in seconds (``None`` disables the timeout).

    ``requests`` has no default timeout (a stalled connection would block
    forever) while ``httpx`` defaults to 5 seconds; passing one explicit
    value to both keeps the sync and async paths consistent.
    """

    # Crawl-specific
    crawl_limit: int = DEFAULT_CRAWL_LIMIT
    """Maximum number of pages to crawl (``/crawl`` mode only)."""

    crawl_depth: int = DEFAULT_CRAWL_DEPTH
    """Maximum link depth from seed URL (``/crawl`` mode only)."""

    crawl_poll_interval: float = DEFAULT_CRAWL_POLL_INTERVAL
    """Seconds between ``/crawl`` status polls."""

    crawl_timeout: float = DEFAULT_CRAWL_TIMEOUT
    """Maximum seconds to wait for a crawl job to finish."""

    # Scrape-specific
    elements: Optional[List[Dict[str, Any]]] = None
    """CSS selectors for ``/scrape`` mode, e.g. ``[{"selector": "h1"}]``."""

    # Shared Browser Run options
    goto_options: Optional[Dict[str, Any]] = None
    """Page navigation options (``waitUntil``, ``timeout``)."""

    viewport: Optional[Dict[str, Any]] = None
    """Viewport dimensions ``{width, height}``."""

    wait_for_selector: Optional[Dict[str, Any]] = None
    """Wait for a CSS selector before returning content."""

    cookies: Optional[List[Dict[str, Any]]] = None
    """Cookies to set before navigation."""

    extra_headers: Optional[Dict[str, str]] = None
    """Extra HTTP headers sent with the browser request."""

    reject_resource_types: Optional[List[str]] = None
    """Resource types to block (e.g. ``["image", "stylesheet"]``)."""

    # Internal
    _headers: Dict[str, str] = PrivateAttr()

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    def __init__(self, **kwargs: Any) -> None:
        """Initialize the loader and validate credentials.

        Raises:
            ValueError: If no account ID or no (non-empty) API token is
                available from the arguments or the environment.
        """
        super().__init__(**kwargs)

        if not self.account_id:
            raise ValueError(TokenErrors.NO_ACCOUNT_ID_SET)
        if not self.api_token or not self.api_token.get_secret_value():
            raise ValueError(TokenErrors.INSUFFICIENT_BROWSER_RUN_TOKEN)

        self._headers = _build_headers(self.api_token.get_secret_value())

    # MARK: - Request-body helpers

    def _shared_body(self) -> Dict[str, Any]:
        """Return the shared optional params for the request body."""
        return _build_shared_options(
            goto_options=self.goto_options,
            viewport=self.viewport,
            wait_for_selector=self.wait_for_selector,
            cookies=self.cookies,
            extra_headers=self.extra_headers,
            reject_resource_types=self.reject_resource_types,
        )

    def _page_body(self, url: str) -> Dict[str, Any]:
        """Return the request body for single-page endpoints."""
        return {"url": url, **self._shared_body()}

    def _scrape_body(self, url: str) -> Dict[str, Any]:
        """Return the ``/scrape`` request body (defaults to the ``body`` selector)."""
        return {
            "url": url,
            "elements": self.elements or [{"selector": "body"}],
            **self._shared_body(),
        }

    def _crawl_body(self, url: str) -> Dict[str, Any]:
        """Return the ``/crawl`` job-creation request body."""
        return {
            "url": url,
            "limit": self.crawl_limit,
            "depth": self.crawl_depth,
            "formats": ["markdown"],
            **self._shared_body(),
        }

    # MARK: - Response helpers (shared by the sync and async paths)

    @staticmethod
    def _parse_envelope(resp: Any) -> Any:
        """Validate an HTTP response and return its parsed JSON body.

        Works for both ``requests`` and ``httpx`` responses, which expose
        the same ``raise_for_status()`` / ``json()`` methods.

        Raises:
            RuntimeError: If the body is a ``success=false`` envelope.
        """
        resp.raise_for_status()
        data = resp.json()
        _check_api_response(data)
        return data

    @staticmethod
    def _result_dict(data: Any) -> Dict[str, Any]:
        """Return the envelope's ``result`` object, or ``{}`` if missing/null."""
        result = data.get("result") if isinstance(data, dict) else None
        return result if isinstance(result, dict) else {}

    @staticmethod
    def _markdown_doc(data: Any, url: str) -> Document:
        """Build the ``Document`` for a ``/markdown`` response."""
        # ``or ""`` also covers an explicit JSON null, which would otherwise
        # fail Document validation.
        return Document(
            page_content=data.get("result") or "",
            metadata={"source": url, "mode": "markdown"},
        )

    @staticmethod
    def _content_doc(data: Any, raw_text: str, url: str) -> Document:
        """Build the ``Document`` for a ``/content`` response.

        Falls back to the raw response text when the envelope carries no
        ``result``.
        """
        html = data.get("result")
        if html is None:
            html = raw_text
        return Document(
            page_content=html,
            metadata={"source": url, "mode": "content"},
        )

    @staticmethod
    def _scrape_docs(data: Any, url: str) -> List[Document]:
        """Build one ``Document`` per selector group with non-empty text."""
        docs: List[Document] = []
        for group in data.get("result") or []:
            texts = [r.get("text", "") for r in group.get("results", [])]
            combined = "\n".join(t for t in texts if t)
            if not combined:
                continue
            docs.append(
                Document(
                    page_content=combined,
                    metadata={
                        "source": url,
                        "mode": "scrape",
                        "selector": group.get("selector", ""),
                    },
                )
            )
        return docs

    @staticmethod
    def _crawl_page_docs(page_data: Dict[str, Any], seed_url: str) -> List[Document]:
        """Build ``Document`` objects from one page of crawl records.

        Only records whose own status is ``"completed"`` and that carry
        content are kept.
        """
        docs: List[Document] = []
        for record in page_data.get("records", []):
            if record.get("status") != "completed":
                continue
            content = record.get("markdown", record.get("html", ""))
            if not content:
                continue
            meta = record.get("metadata", {})
            docs.append(
                Document(
                    page_content=content,
                    metadata={
                        "source": record.get("url", seed_url),
                        "mode": "crawl",
                        "title": meta.get("title", ""),
                        "status_code": meta.get("status", 0),
                    },
                )
            )
        return docs

    @staticmethod
    def _crawl_finished(url: str, status: str) -> bool:
        """Return ``True`` once a crawl job has reached a terminal status.

        A terminal status other than ``"completed"`` is logged so that a
        failed or cancelled job is not mistaken for an empty site.
        """
        terminal = (
            "completed",
            "errored",
            "cancelled_by_user",
            "cancelled_due_to_timeout",
            "cancelled_due_to_limits",
        )
        if status not in terminal:
            return False
        if status != "completed":
            logger.warning(
                "Crawl job for %s ended with status %r; "
                "returning any completed records.",
                url,
                status,
            )
        return True

    def _warn_crawl_timeout(self, url: str) -> None:
        """Emit the crawl-timeout warning (attributed to the fetch caller)."""
        warnings.warn(
            f"Crawl for {url} timed out after {self.crawl_timeout}s. "
            "Returning partial results.",
            stacklevel=3,
        )

    # MARK: - Sync private helpers

    def _fetch_markdown(self, url: str) -> Document:
        """Fetch a URL via the ``/markdown`` endpoint.

        Args:
            url: The URL to render and convert to markdown.

        Returns:
            A single ``Document`` with markdown content.
        """
        resp = requests.post(
            _build_browser_run_url(self.account_id, "markdown"),
            headers=self._headers,
            json=self._page_body(url),
            timeout=self.request_timeout,
        )
        return self._markdown_doc(self._parse_envelope(resp), url)

    def _fetch_content(self, url: str) -> Document:
        """Fetch a URL via the ``/content`` endpoint (raw rendered HTML).

        Args:
            url: The URL to render.

        Returns:
            A single ``Document`` with HTML content.
        """
        resp = requests.post(
            _build_browser_run_url(self.account_id, "content"),
            headers=self._headers,
            json=self._page_body(url),
            timeout=self.request_timeout,
        )
        return self._content_doc(self._parse_envelope(resp), resp.text, url)

    def _fetch_scrape(self, url: str) -> List[Document]:
        """Fetch a URL via the ``/scrape`` endpoint.

        Args:
            url: The URL to scrape.

        Returns:
            List of ``Document`` objects, one per matched element group.
        """
        resp = requests.post(
            _build_browser_run_url(self.account_id, "scrape"),
            headers=self._headers,
            json=self._scrape_body(url),
            timeout=self.request_timeout,
        )
        return self._scrape_docs(self._parse_envelope(resp), url)

    def _fetch_crawl(self, url: str) -> List[Document]:
        """Crawl a URL via the ``/crawl`` async endpoint.

        Initiates a crawl job, polls until it reaches a terminal status
        (or ``crawl_timeout`` elapses), then pages through the records.

        Args:
            url: The seed URL to crawl.

        Returns:
            List of ``Document`` objects, one per crawled page. Empty if
            the API returned no job ID.

        Raises:
            RuntimeError: If any response is a ``success=false`` envelope.
        """
        crawl_url = _build_browser_run_url(self.account_id, "crawl")

        # Step 1: initiate the crawl; ``result`` is the job ID.
        resp = requests.post(
            crawl_url,
            headers=self._headers,
            json=self._crawl_body(url),
            timeout=self.request_timeout,
        )
        job_id = self._parse_envelope(resp).get("result", "")
        if not job_id:
            return []

        # Step 2: poll until the job is terminal. A monotonic clock keeps
        # the deadline immune to wall-clock adjustments.
        results_url = f"{crawl_url}/{job_id}"
        deadline = time.monotonic() + self.crawl_timeout
        while True:
            if time.monotonic() > deadline:
                self._warn_crawl_timeout(url)
                break
            poll = requests.get(
                results_url, headers=self._headers, timeout=self.request_timeout
            )
            status = self._result_dict(self._parse_envelope(poll)).get("status", "")
            if self._crawl_finished(url, status):
                break
            time.sleep(self.crawl_poll_interval)

        # Step 3: collect records, following the pagination cursor.
        docs: List[Document] = []
        cursor: Optional[int] = None
        while True:
            params: Dict[str, Any] = {} if cursor is None else {"cursor": cursor}
            page_resp = requests.get(
                results_url,
                headers=self._headers,
                params=params,
                timeout=self.request_timeout,
            )
            page_data = self._result_dict(self._parse_envelope(page_resp))
            docs.extend(self._crawl_page_docs(page_data, url))

            next_cursor = page_data.get("cursor")
            # A repeated cursor would loop forever; treat it as the end.
            if next_cursor is None or next_cursor == cursor:
                break
            cursor = next_cursor

        return docs

    def _fetch(self, url: str) -> List[Document]:
        """Dispatch ``url`` to the sync fetcher for the configured mode."""
        if self.mode == "markdown":
            return [self._fetch_markdown(url)]
        if self.mode == "content":
            return [self._fetch_content(url)]
        if self.mode == "scrape":
            return self._fetch_scrape(url)
        if self.mode == "crawl":
            return self._fetch_crawl(url)
        return []

    # MARK: - Async private helpers

    async def _afetch_markdown(self, url: str) -> Document:
        """Async variant of ``_fetch_markdown``.

        Args:
            url: The URL to render and convert to markdown.

        Returns:
            A single ``Document`` with markdown content.
        """
        import httpx

        async with httpx.AsyncClient(timeout=self.request_timeout) as client:
            resp = await client.post(
                _build_browser_run_url(self.account_id, "markdown"),
                headers=self._headers,
                json=self._page_body(url),
            )
        return self._markdown_doc(self._parse_envelope(resp), url)

    async def _afetch_content(self, url: str) -> Document:
        """Async variant of ``_fetch_content``.

        Args:
            url: The URL to render.

        Returns:
            A single ``Document`` with HTML content.
        """
        import httpx

        async with httpx.AsyncClient(timeout=self.request_timeout) as client:
            resp = await client.post(
                _build_browser_run_url(self.account_id, "content"),
                headers=self._headers,
                json=self._page_body(url),
            )
        return self._content_doc(self._parse_envelope(resp), resp.text, url)

    async def _afetch_scrape(self, url: str) -> List[Document]:
        """Async variant of ``_fetch_scrape``.

        Args:
            url: The URL to scrape.

        Returns:
            List of ``Document`` objects.
        """
        import httpx

        async with httpx.AsyncClient(timeout=self.request_timeout) as client:
            resp = await client.post(
                _build_browser_run_url(self.account_id, "scrape"),
                headers=self._headers,
                json=self._scrape_body(url),
            )
        return self._scrape_docs(self._parse_envelope(resp), url)

    async def _afetch_crawl(self, url: str) -> List[Document]:
        """Async variant of ``_fetch_crawl``.

        Args:
            url: The seed URL to crawl.

        Returns:
            List of ``Document`` objects. Empty if the API returned no
            job ID.

        Raises:
            RuntimeError: If any response is a ``success=false`` envelope.
        """
        import asyncio

        import httpx

        crawl_url = _build_browser_run_url(self.account_id, "crawl")

        async with httpx.AsyncClient(timeout=self.request_timeout) as client:
            # Step 1: initiate the crawl; ``result`` is the job ID.
            resp = await client.post(
                crawl_url, headers=self._headers, json=self._crawl_body(url)
            )
            job_id = self._parse_envelope(resp).get("result", "")
            if not job_id:
                return []

            # Step 2: poll until terminal or timed out.
            results_url = f"{crawl_url}/{job_id}"
            deadline = time.monotonic() + self.crawl_timeout
            while True:
                if time.monotonic() > deadline:
                    self._warn_crawl_timeout(url)
                    break
                poll = await client.get(results_url, headers=self._headers)
                status = self._result_dict(self._parse_envelope(poll)).get(
                    "status", ""
                )
                if self._crawl_finished(url, status):
                    break
                await asyncio.sleep(self.crawl_poll_interval)

            # Step 3: collect records, following the pagination cursor.
            docs: List[Document] = []
            cursor: Optional[int] = None
            while True:
                params: Dict[str, Any] = (
                    {} if cursor is None else {"cursor": cursor}
                )
                page_resp = await client.get(
                    results_url, headers=self._headers, params=params
                )
                page_data = self._result_dict(self._parse_envelope(page_resp))
                docs.extend(self._crawl_page_docs(page_data, url))

                next_cursor = page_data.get("cursor")
                # A repeated cursor would loop forever; treat it as the end.
                if next_cursor is None or next_cursor == cursor:
                    break
                cursor = next_cursor

        return docs

    async def _afetch(self, url: str) -> List[Document]:
        """Dispatch ``url`` to the async fetcher for the configured mode."""
        if self.mode == "markdown":
            return [await self._afetch_markdown(url)]
        if self.mode == "content":
            return [await self._afetch_content(url)]
        if self.mode == "scrape":
            return await self._afetch_scrape(url)
        if self.mode == "crawl":
            return await self._afetch_crawl(url)
        return []

    # MARK: - Public API

    def load(self) -> List[Document]:
        """Load all URLs and return a list of ``Document`` objects.

        Returns:
            List of ``Document`` objects, one per page (or more for scrape/crawl).
        """
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load URLs, yielding one ``Document`` at a time.

        Each URL is fetched only when the iterator reaches it.

        Yields:
            ``Document`` objects.
        """
        for url in self.urls:
            yield from self._fetch(url)

    async def aload(self) -> List[Document]:
        """Async variant of ``load()``.

        Returns:
            List of ``Document`` objects.
        """
        return [doc async for doc in self.alazy_load()]

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Async lazy variant of ``load()``.

        Yields:
            ``Document`` objects.
        """
        for url in self.urls:
            for doc in await self._afetch(url):
                yield doc
class CloudflareBrowserRunTool(BaseTool):
    """LangGraph agent tool for interacting with web pages via Browser Run.

    Gives agents the ability to fetch web page content, extract structured
    data using AI, discover links, take screenshots, and generate PDFs.

    Unless the caller passes ``name`` / ``description`` explicitly, each
    instance is named ``cloudflare_browser_run_<mode>`` and receives a
    mode-specific description so an agent holding several instances can
    tell them apart.

    Example (markdown):
        .. code-block:: python

            from langchain_cloudflare import CloudflareBrowserRunTool

            tool = CloudflareBrowserRunTool(mode="markdown")
            result = tool.invoke({"url": "https://example.com"})

    Example (AI-powered JSON extraction):
        .. code-block:: python

            tool = CloudflareBrowserRunTool(
                mode="json",
                json_prompt="Extract the company name, industry, and employee count.",
            )
            result = tool.invoke({"url": "https://example.com/about"})

    Example (in a LangGraph agent):
        .. code-block:: python

            from langgraph.prebuilt import create_react_agent
            from langchain_cloudflare import ChatCloudflareWorkersAI

            llm = ChatCloudflareWorkersAI()
            tools = [
                CloudflareBrowserRunTool(mode="markdown"),
                CloudflareBrowserRunTool(mode="json", json_prompt="Extract key facts."),
                CloudflareBrowserRunTool(mode="links"),
            ]
            agent = create_react_agent(llm, tools)

    Key init args:
        mode: str
            One of ``"markdown"``, ``"json"``, ``"links"``, ``"screenshot"``, ``"pdf"``.

        account_id: str
            Cloudflare account ID. Falls back to ``CF_ACCOUNT_ID`` env var.

        api_token: str
            Cloudflare API token with *Browser Rendering – Edit* permission.
            Falls back to ``CF_API_TOKEN`` then ``CF_AI_API_TOKEN`` env var.

        request_timeout: Optional[float]
            Seconds to wait for the HTTP call (default ``60.0``); ``None``
            disables the timeout.
    """

    # BaseTool fields
    name: str = "cloudflare_browser_run"
    description: str = (
        "Fetch and extract content from a web page using Cloudflare Browser Run. "
        "Input must be a URL string. "
        "Returns rendered page content (markdown, structured JSON, links, etc.) "
        "depending on the configured mode."
    )

    # MARK: - Fields
    mode: Literal["markdown", "json", "links", "screenshot", "pdf"] = "markdown"
    """Tool mode: determines which Browser Run endpoint to call."""

    account_id: str = Field(default_factory=from_env("CF_ACCOUNT_ID", default=""))
    """Cloudflare account ID."""

    api_token: Optional[SecretStr] = Field(
        default_factory=secret_from_env(
            ["CF_API_TOKEN", "CF_AI_API_TOKEN"], default=None
        )
    )
    """API token with Browser Rendering – Edit permission."""

    # JSON mode options
    json_prompt: Optional[str] = None
    """Natural language prompt for AI extraction (``/json`` mode)."""

    json_response_format: Optional[Dict[str, Any]] = None
    """JSON schema for structured extraction (``/json`` mode)."""

    # Shared Browser Run options
    goto_options: Optional[Dict[str, Any]] = None
    """Page navigation options."""

    viewport: Optional[Dict[str, Any]] = None
    """Viewport dimensions."""

    wait_for_selector: Optional[Dict[str, Any]] = None
    """Wait for a CSS selector before returning."""

    cookies: Optional[List[Dict[str, Any]]] = None
    """Cookies to set before navigation."""

    extra_headers: Optional[Dict[str, str]] = None
    """Extra HTTP headers sent with the browser request."""

    reject_resource_types: Optional[List[str]] = None
    """Resource types to block."""

    # HTTP client options
    request_timeout: Optional[float] = 60.0
    """Seconds to wait for the Browser Run HTTP call; ``None`` waits forever."""

    # Internal
    _headers: Dict[str, str] = PrivateAttr()

    model_config = ConfigDict(extra="forbid", protected_namespaces=())

    def __init__(self, **kwargs: Any) -> None:
        """Initialize the tool, validate credentials, and apply mode defaults.

        Args:
            **kwargs: Field values for the tool (see the class docstring).

        Raises:
            ValueError: If the account ID is empty, or the API token is
                missing or empty, after env-var defaults have been applied.
        """
        super().__init__(**kwargs)

        if not self.account_id:
            raise ValueError(TokenErrors.NO_ACCOUNT_ID_SET)
        if not self.api_token or not self.api_token.get_secret_value():
            raise ValueError(TokenErrors.INSUFFICIENT_BROWSER_RUN_TOKEN)

        self._headers = _build_headers(self.api_token.get_secret_value())

        mode_descriptions = {
            "markdown": (
                "Fetch a web page and return its content as clean markdown. "
                "Input must be a URL string."
            ),
            "json": (
                "Extract structured JSON data from a web page using AI. "
                "Input must be a URL string."
            ),
            "links": (
                "Discover and return all links found on a web page. "
                "Input must be a URL string."
            ),
            "screenshot": (
                "Capture a screenshot of a web page and return it as "
                "base64-encoded PNG. Input must be a URL string."
            ),
            "pdf": (
                "Generate a PDF of a web page and return it as base64-encoded "
                "data. Input must be a URL string."
            ),
        }

        # Apply the per-mode defaults only where the caller did not choose a
        # value, so a custom name/description is not silently overwritten.
        if "name" not in kwargs:
            self.name = f"cloudflare_browser_run_{self.mode}"
        if "description" not in kwargs and self.mode in mode_descriptions:
            self.description = mode_descriptions[self.mode]

    # MARK: - Private helpers

    def _shared_body(self) -> Dict[str, Any]:
        """Return shared optional params for the request body."""
        return _build_shared_options(
            goto_options=self.goto_options,
            viewport=self.viewport,
            wait_for_selector=self.wait_for_selector,
            cookies=self.cookies,
            extra_headers=self.extra_headers,
            reject_resource_types=self.reject_resource_types,
        )

    def _request_body(self, url: str) -> Dict[str, Any]:
        """Build the JSON payload shared by the sync and async code paths.

        Args:
            url: The URL to process.

        Returns:
            The request body: the URL, the shared browser options and, in
            ``json`` mode, whichever of ``prompt`` / ``response_format`` is set.
        """
        body: Dict[str, Any] = {"url": url, **self._shared_body()}

        if self.mode == "json":
            if self.json_prompt:
                body["prompt"] = self.json_prompt
            if self.json_response_format:
                body["response_format"] = self.json_response_format

        return body

    def _format_response(self, resp: Any) -> str:
        """Convert an HTTP response into the string handed back to the agent.

        Args:
            resp: A ``requests`` or ``httpx`` response; only ``.content`` and
                ``.json()`` are used.

        Returns:
            Base64 text for the binary modes (``screenshot``, ``pdf``),
            otherwise the ``result`` field rendered per mode.
        """
        import base64
        import json as json_mod

        if self.mode in ("screenshot", "pdf"):
            # Binary endpoints return the raw image/PDF bytes, not JSON.
            return base64.b64encode(resp.content).decode("utf-8")

        data = resp.json()
        _check_api_response(data)
        result = data.get("result")

        if self.mode == "json":
            if result is None:
                return "{}"
            # Lists must be serialised too; str() would emit a Python repr
            # (single quotes, True/None) that is not valid JSON.
            if isinstance(result, (dict, list)):
                return json_mod.dumps(result, indent=2)
            return str(result)

        if self.mode == "links":
            return "\n".join(str(link) for link in (result or []))

        # markdown (and any future text mode)
        return "" if result is None else str(result)

    # MARK: - Tool execution

    def _run(self, url: str) -> str:
        """Execute the tool synchronously.

        Args:
            url: The URL to process.

        Returns:
            String result (markdown, JSON string, newline-separated links, or
            base64 text for the binary endpoints).

        Raises:
            requests.HTTPError: If Browser Run answers with an error status.
            requests.Timeout: If no answer arrives within ``request_timeout``.
        """
        resp = requests.post(
            _build_browser_run_url(self.account_id, self.mode),
            headers=self._headers,
            json=self._request_body(url),
            # requests has no default timeout; without one a stalled
            # connection would block the agent indefinitely.
            timeout=self.request_timeout,
        )
        resp.raise_for_status()
        return self._format_response(resp)

    async def _arun(self, url: str) -> str:
        """Execute the tool asynchronously.

        Args:
            url: The URL to process.

        Returns:
            String result, formatted exactly as in ``_run``.

        Raises:
            httpx.HTTPStatusError: If Browser Run answers with an error status.
            httpx.TimeoutException: If no answer arrives within
                ``request_timeout``.
        """
        import httpx

        # httpx defaults to a 5 second timeout; use the same configurable
        # limit as the sync path so both behave alike.
        async with httpx.AsyncClient(timeout=self.request_timeout) as client:
            resp = await client.post(
                _build_browser_run_url(self.account_id, self.mode),
                headers=self._headers,
                json=self._request_body(url),
            )
            resp.raise_for_status()

        return self._format_response(resp)
@pytest.fixture
def account_id():
    """Return the Cloudflare account ID, skipping the test when it is unset."""
    val = os.environ.get("CF_ACCOUNT_ID", "")
    if not val:
        pytest.skip("CF_ACCOUNT_ID not set")
    return val


@pytest.fixture
def api_token():
    """Return the first configured API token, skipping the test when none is set.

    The variables are consulted in order: ``TEST_CF_API_TOKEN``,
    ``CF_API_TOKEN``, ``CF_AI_API_TOKEN``. Empty values count as unset.
    """
    candidates = ("TEST_CF_API_TOKEN", "CF_API_TOKEN", "CF_AI_API_TOKEN")
    for var in candidates:
        val = os.environ.get(var)
        if val:
            return val
    # Name every variable that was checked so the skip reason is actionable.
    pytest.skip("None of " + ", ".join(candidates) + " is set")
docs[0].metadata["source"] == TEST_URL + assert docs[0].metadata["mode"] == "markdown" + + def test_markdown_multiple_urls(self, account_id, api_token): + """Load multiple URLs in markdown mode.""" + loader = CloudflareBrowserRunLoader( + urls=[TEST_URL, "https://httpbin.org/html"], + mode="markdown", + account_id=account_id, + api_token=api_token, + ) + docs = loader.load() + + print("\n[Loader] Markdown multiple URLs:") + print(f" Docs count: {len(docs)}") + for i, doc in enumerate(docs): + print(f" Doc {i} source: {doc.metadata['source']}") + print(f" Doc {i} preview: {doc.page_content[:100]}") + + assert len(docs) == 2 + + def test_content_mode(self, account_id, api_token): + """Load a URL in content (raw HTML) mode.""" + loader = CloudflareBrowserRunLoader( + urls=[TEST_URL], + mode="content", + account_id=account_id, + api_token=api_token, + ) + docs = loader.load() + + print("\n[Loader] Content mode:") + print(f" Docs count: {len(docs)}") + print(f" Content preview: {docs[0].page_content[:200]}") + + assert len(docs) == 1 + assert "= 1 + selectors = [d.metadata.get("selector") for d in docs] + assert "h1" in selectors + + def test_crawl_basic(self, account_id, api_token): + """Crawl a small site.""" + loader = CloudflareBrowserRunLoader( + urls=[TEST_URL], + mode="crawl", + crawl_limit=3, + crawl_depth=1, + crawl_timeout=60.0, + account_id=account_id, + api_token=api_token, + ) + docs = loader.load() + + print("\n[Loader] Crawl basic:") + print(f" Docs count: {len(docs)}") + for i, doc in enumerate(docs): + print(f" Doc {i} source: {doc.metadata.get('source')}") + print(f" Doc {i} title: {doc.metadata.get('title', 'N/A')}") + + assert len(docs) >= 1 + assert docs[0].metadata["mode"] == "crawl" + + def test_lazy_load(self, account_id, api_token): + """lazy_load yields documents one at a time.""" + loader = CloudflareBrowserRunLoader( + urls=[TEST_URL], + mode="markdown", + account_id=account_id, + api_token=api_token, + ) + docs = list(loader.lazy_load()) + + 
class TestBrowserRunTool:
    """Integration tests for CloudflareBrowserRunTool.

    Every test issues real Browser Run requests and therefore needs the
    ``account_id`` / ``api_token`` fixtures to resolve.
    """

    @staticmethod
    def _make_tool(mode, account_id, api_token, **options):
        """Build a tool for *mode* using the test credentials."""
        return CloudflareBrowserRunTool(
            mode=mode,
            account_id=account_id,
            api_token=api_token,
            **options,
        )

    def test_markdown_tool(self, account_id, api_token):
        """Markdown tool returns page content."""
        tool = self._make_tool("markdown", account_id, api_token)
        result = tool.invoke({"url": TEST_URL})

        # Check the type first so a wrong type fails as an assertion rather
        # than as an error inside the preview slicing below.
        assert isinstance(result, str)

        print("\n[Tool] Markdown:")
        print(f"  Result type: {type(result)}")
        print(f"  Result preview: {result[:200]}")

        assert "Example Domain" in result

    def test_links_tool(self, account_id, api_token):
        """Links tool returns discovered URLs."""
        tool = self._make_tool("links", account_id, api_token)
        result = tool.invoke({"url": TEST_URL})

        print("\n[Tool] Links:")
        print(f"  Result: {result}")

        assert isinstance(result, str)
        assert "iana.org" in result

    def test_json_extraction_with_prompt(self, account_id, api_token):
        """JSON tool extracts structured data using an AI prompt."""
        tool = self._make_tool(
            "json",
            account_id,
            api_token,
            json_prompt="Extract the page title and any links on the page.",
        )
        result = tool.invoke({"url": TEST_URL})

        assert isinstance(result, str)

        print("\n[Tool] JSON with prompt:")
        print(f"  Result: {result[:500]}")

        assert len(result) > 10

    def test_json_extraction_with_schema(self, account_id, api_token):
        """JSON tool extracts structured data using a JSON schema."""
        tool = self._make_tool(
            "json",
            account_id,
            api_token,
            json_response_format={
                "type": "json_schema",
                "schema": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "links": {
                            "type": "array",
                            "items": {"type": "string"},
                        },
                    },
                },
            },
        )
        result = tool.invoke({"url": TEST_URL})

        assert isinstance(result, str)

        print("\n[Tool] JSON with schema:")
        print(f"  Result: {result[:500]}")

        assert "title" in result.lower() or "links" in result.lower()

    def test_screenshot_returns_base64(self, account_id, api_token):
        """Screenshot tool returns base64-encoded image data."""
        import base64

        tool = self._make_tool("screenshot", account_id, api_token)
        result = tool.invoke({"url": TEST_URL})

        assert isinstance(result, str)

        print("\n[Tool] Screenshot:")
        print(f"  Result length: {len(result)} chars")
        print(f"  Starts with: {result[:20]}...")

        assert len(result) > 100
        # A long string alone proves little (an error page would pass too);
        # strict decoding confirms the payload really is base64.
        decoded = base64.b64decode(result, validate=True)
        assert decoded

    @pytest.mark.asyncio
    async def test_async_markdown_tool(self, account_id, api_token):
        """Async markdown tool returns page content."""
        tool = self._make_tool("markdown", account_id, api_token)
        result = await tool.ainvoke({"url": TEST_URL})

        assert isinstance(result, str)

        print("\n[Tool] Async markdown:")
        print(f"  Result preview: {result[:200]}")

        assert "Example Domain" in result

    @pytest.mark.asyncio
    async def test_async_links_tool(self, account_id, api_token):
        """Async links tool returns discovered URLs."""
        tool = self._make_tool("links", account_id, api_token)
        result = await tool.ainvoke({"url": TEST_URL})

        print("\n[Tool] Async links:")
        print(f"  Result: {result}")

        assert isinstance(result, str)
        assert "iana.org" in result
class TestBrowserRunLangGraph:
    """Integration tests verifying Browser Run works in LangGraph patterns."""

    @staticmethod
    def _run_tool_graph(tools, tool_calls):
        """Run a START -> model -> tools -> END graph and return its ToolMessages.

        The "model" node is a stub that always emits one ``AIMessage``
        carrying *tool_calls*, so no LLM is involved.

        Args:
            tools: Tools to register on the ``ToolNode``.
            tool_calls: Tool-call dicts the fake model should request.

        Returns:
            The ``ToolMessage`` objects found in the final graph state.
        """
        from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
        from langgraph.graph import END, START, MessagesState, StateGraph
        from langgraph.prebuilt import ToolNode, tools_condition

        def fake_model(state: MessagesState) -> dict:
            return {"messages": [AIMessage(content="", tool_calls=tool_calls)]}

        graph = StateGraph(MessagesState)
        graph.add_node("model", fake_model)
        graph.add_node("tools", ToolNode(tools))
        graph.add_edge(START, "model")
        graph.add_conditional_edges("model", tools_condition)
        graph.add_edge("tools", END)
        app = graph.compile()

        result = app.invoke({"messages": [HumanMessage(content="test")]})
        return [m for m in result["messages"] if isinstance(m, ToolMessage)]

    def test_loader_as_custom_node(self, account_id, api_token):
        """Loader works as a custom node in a LangGraph StateGraph."""
        from typing import TypedDict

        from langgraph.graph import END, START, StateGraph

        class ResearchState(TypedDict):
            url: str
            page_content: str

        def fetch_page(state: ResearchState) -> dict:
            loader = CloudflareBrowserRunLoader(
                urls=[state["url"]],
                mode="markdown",
                account_id=account_id,
                api_token=api_token,
            )
            docs = loader.load()
            return {"page_content": docs[0].page_content}

        graph = StateGraph(ResearchState)
        graph.add_node("fetch_page", fetch_page)
        graph.add_edge(START, "fetch_page")
        graph.add_edge("fetch_page", END)
        app = graph.compile()

        result = app.invoke({"url": TEST_URL, "page_content": ""})

        print("\n[LangGraph] Loader as custom node:")
        print(f"  Content: {len(result['page_content'])} chars")

        assert "Example Domain" in result["page_content"]

    def test_tool_in_toolnode(self, account_id, api_token):
        """Tools work inside LangGraph ToolNode with simulated tool calls."""
        md_tool = CloudflareBrowserRunTool(
            mode="markdown",
            account_id=account_id,
            api_token=api_token,
        )
        tool_msgs = self._run_tool_graph(
            [md_tool],
            [
                {
                    "name": "cloudflare_browser_run_markdown",
                    "args": {"url": TEST_URL},
                    "id": "call_test_001",
                    "type": "tool_call",
                }
            ],
        )

        # Assert the count before indexing so an empty result fails as an
        # assertion instead of an IndexError in the print below.
        assert len(tool_msgs) == 1

        print("\n[LangGraph] Tool in ToolNode:")
        print(f"  Tool messages: {len(tool_msgs)}")
        print(f"  Content: {tool_msgs[0].content[:100]}")

        assert "Example Domain" in tool_msgs[0].content

    def test_parallel_tool_calls(self, account_id, api_token):
        """Multiple tools execute in parallel via ToolNode."""
        tools = [
            CloudflareBrowserRunTool(
                mode=mode,
                account_id=account_id,
                api_token=api_token,
            )
            for mode in ("markdown", "links")
        ]
        tool_msgs = self._run_tool_graph(
            tools,
            [
                {
                    "name": "cloudflare_browser_run_markdown",
                    "args": {"url": TEST_URL},
                    "id": "call_p1",
                    "type": "tool_call",
                },
                {
                    "name": "cloudflare_browser_run_links",
                    "args": {"url": TEST_URL},
                    "id": "call_p2",
                    "type": "tool_call",
                },
            ],
        )

        print("\n[LangGraph] Parallel tool calls:")
        for msg in tool_msgs:
            print(f"  - {msg.name}: {len(msg.content)} chars")

        assert len(tool_msgs) == 2
        names = {m.name for m in tool_msgs}
        assert "cloudflare_browser_run_markdown" in names
        assert "cloudflare_browser_run_links" in names

    def test_parallel_nodes_with_loader(self, account_id, api_token):
        """Loader and Tool run as parallel nodes in a DAG."""
        from typing import TypedDict

        from langgraph.graph import END, START, StateGraph

        class ParallelState(TypedDict):
            url: str
            page_content: str
            links: list

        def fetch_page(state: ParallelState) -> dict:
            loader = CloudflareBrowserRunLoader(
                urls=[state["url"]],
                mode="markdown",
                account_id=account_id,
                api_token=api_token,
            )
            docs = loader.load()
            return {"page_content": docs[0].page_content}

        def extract_links(state: ParallelState) -> dict:
            tool = CloudflareBrowserRunTool(
                mode="links",
                account_id=account_id,
                api_token=api_token,
            )
            output = tool.invoke({"url": state["url"]})
            # "".split("\n") is [""], which would satisfy the length check
            # below even when no link was found; drop blank lines instead.
            links = [line for line in output.splitlines() if line.strip()]
            return {"links": links}

        graph = StateGraph(ParallelState)
        graph.add_node("fetch_page", fetch_page)
        graph.add_node("extract_links", extract_links)
        graph.add_edge(START, "fetch_page")
        graph.add_edge(START, "extract_links")
        graph.add_edge("fetch_page", END)
        graph.add_edge("extract_links", END)
        app = graph.compile()

        result = app.invoke({"url": TEST_URL, "page_content": "", "links": []})

        print("\n[LangGraph] Parallel nodes (Loader + Tool):")
        print(f"  Content: {len(result['page_content'])} chars")
        print(f"  Links: {result['links']}")

        assert "Example Domain" in result["page_content"]
        assert len(result["links"]) >= 1
class TestURLConstruction:
    """Checks for the helper that builds Browser Run endpoint URLs."""

    def test_build_url_markdown(self):
        """The /markdown endpoint sits under the account's browser-rendering path."""
        expected = (
            "https://api.cloudflare.com/client/v4/accounts/abc123"
            "/browser-rendering/markdown"
        )
        assert _build_browser_run_url("abc123", "markdown") == expected

    def test_build_url_crawl(self):
        """The /crawl endpoint sits under the account's browser-rendering path."""
        expected = (
            "https://api.cloudflare.com/client/v4/accounts/abc123"
            "/browser-rendering/crawl"
        )
        assert _build_browser_run_url("abc123", "crawl") == expected

    def test_build_url_json(self):
        """The account ID is interpolated verbatim for the /json endpoint."""
        expected = (
            "https://api.cloudflare.com/client/v4/accounts/my-acct"
            "/browser-rendering/json"
        )
        assert _build_browser_run_url("my-acct", "json") == expected

    def test_build_url_screenshot(self):
        """The /screenshot URL ends with the endpoint name."""
        built = _build_browser_run_url("x", "screenshot")
        assert built.endswith("/browser-rendering/screenshot")
class TestSharedOptions:
    """Checks for the builder of shared Browser Run request options."""

    def test_empty_options(self):
        """Calling the builder without arguments yields an empty dict."""
        built = _build_shared_options()
        assert built == {}

    def test_goto_options(self):
        """``goto_options`` is emitted under the ``gotoOptions`` key."""
        navigation = {"waitUntil": "networkidle0"}
        built = _build_shared_options(goto_options=navigation)
        assert built == {"gotoOptions": {"waitUntil": "networkidle0"}}

    def test_viewport(self):
        """``viewport`` is passed through unchanged."""
        size = {"width": 1280, "height": 720}
        built = _build_shared_options(viewport=size)
        assert built == {"viewport": {"width": 1280, "height": 720}}

    def test_multiple_options(self):
        """Several options end up side by side, and nothing else is added."""
        built = _build_shared_options(
            viewport={"width": 800, "height": 600},
            reject_resource_types=["image", "font"],
            cookies=[{"name": "a", "value": "b"}],
        )
        for key in ("viewport", "rejectResourceTypes", "cookies"):
            assert key in built
        assert len(built) == 3

    def test_none_values_excluded(self):
        """Options passed as ``None`` do not appear in the output."""
        built = _build_shared_options(
            goto_options=None,
            viewport={"width": 100, "height": 100},
            cookies=None,
        )
        for omitted in ("gotoOptions", "cookies"):
            assert omitted not in built
        assert "viewport" in built
class TestLoaderTokenValidation:
    """Credential validation performed when constructing the loader."""

    def test_no_account_id_raises(self):
        """An empty account_id is rejected with a ValueError."""
        kwargs = {
            "urls": ["https://example.com"],
            "account_id": "",
            "api_token": "some-token",
        }
        with pytest.raises(ValueError, match="account ID"):
            CloudflareBrowserRunLoader(**kwargs)

    def test_no_api_token_raises(self):
        """An empty api_token is rejected with a ValueError."""
        kwargs = {
            "urls": ["https://example.com"],
            "account_id": "abc123",
            "api_token": "",
        }
        with pytest.raises(ValueError, match="API token"):
            CloudflareBrowserRunLoader(**kwargs)

    def test_none_env_defaults_raises(self, monkeypatch: pytest.MonkeyPatch):
        """With the credential env vars removed, empty credentials raise."""
        for var in ("CF_ACCOUNT_ID", "CF_API_TOKEN", "CF_AI_API_TOKEN"):
            monkeypatch.delenv(var, raising=False)
        with pytest.raises(ValueError):
            CloudflareBrowserRunLoader(
                urls=["https://example.com"],
                account_id="",
                api_token="",
            )

    def test_valid_credentials_no_error(self):
        """A non-empty account_id and api_token construct successfully."""
        created = CloudflareBrowserRunLoader(
            urls=["https://example.com"],
            account_id="abc123",
            api_token="valid-token",
        )
        assert created.account_id == "abc123"

    def test_default_mode_is_markdown(self):
        """The loader uses markdown mode when none is given."""
        created = CloudflareBrowserRunLoader(
            urls=["https://example.com"],
            account_id="abc123",
            api_token="valid-token",
        )
        assert created.mode == "markdown"

    def test_error_message_is_token_errors_enum(self):
        """The raised messages are the centralized ``TokenErrors`` values."""
        import re

        account_pattern = re.escape(str(TokenErrors.NO_ACCOUNT_ID_SET))
        with pytest.raises(ValueError, match=account_pattern):
            CloudflareBrowserRunLoader(
                urls=["https://example.com"],
                account_id="",
                api_token="tok",
            )

        token_pattern = re.escape(str(TokenErrors.INSUFFICIENT_BROWSER_RUN_TOKEN))
        with pytest.raises(ValueError, match=token_pattern):
            CloudflareBrowserRunLoader(
                urls=["https://example.com"],
                account_id="abc",
                api_token="",
            )
class TestLoaderConfiguration:
    """Tests for loader field defaults and configuration."""

    def test_crawl_defaults(self):
        """Crawl parameters have sensible defaults."""
        loader = CloudflareBrowserRunLoader(
            urls=["https://example.com"],
            mode="crawl",
            account_id="abc123",
            api_token="tok",
        )
        assert loader.crawl_limit == 10
        assert loader.crawl_depth == 2
        assert loader.crawl_poll_interval == 2.0
        assert loader.crawl_timeout == 300.0

    def test_custom_crawl_params(self):
        """Custom crawl parameters are stored correctly."""
        loader = CloudflareBrowserRunLoader(
            urls=["https://example.com"],
            mode="crawl",
            crawl_limit=100,
            crawl_depth=5,
            crawl_poll_interval=1.0,
            crawl_timeout=600.0,
            account_id="abc123",
            api_token="tok",
        )
        assert loader.crawl_limit == 100
        assert loader.crawl_depth == 5
        # Check every override passed above, not just the first two, so a
        # dropped poll-interval/timeout argument cannot go unnoticed.
        assert loader.crawl_poll_interval == 1.0
        assert loader.crawl_timeout == 600.0

    def test_scrape_elements_default(self):
        """Elements default to None."""
        loader = CloudflareBrowserRunLoader(
            urls=["https://example.com"],
            mode="scrape",
            account_id="abc123",
            api_token="tok",
        )
        assert loader.elements is None

    def test_shared_options_stored(self):
        """Shared browser options are stored on the instance."""
        loader = CloudflareBrowserRunLoader(
            urls=["https://example.com"],
            account_id="abc123",
            api_token="tok",
            viewport={"width": 1920, "height": 1080},
            reject_resource_types=["image"],
        )
        assert loader.viewport == {"width": 1920, "height": 1080}
        assert loader.reject_resource_types == ["image"]

    def test_extra_fields_forbidden(self):
        """Extra fields should raise a validation error."""
        # NOTE(review): kept as a broad ``Exception`` because the loader's
        # constructor is not visible from this test; narrow it once the
        # concrete exception type is confirmed.
        with pytest.raises(Exception):
            CloudflareBrowserRunLoader(
                urls=["https://example.com"],
                account_id="abc123",
                api_token="tok",
                unknown_field="bad",
            )
class TestToolTokenValidation:
    """Ensure token validation raises ValueError for bad inputs."""

    def test_no_account_id_raises(self):
        """Missing account_id should raise ValueError."""
        with pytest.raises(ValueError, match="account ID"):
            CloudflareBrowserRunTool(
                account_id="",
                api_token="some-token",
            )

    def test_no_api_token_raises(self):
        """Empty api_token should raise ValueError."""
        with pytest.raises(ValueError, match="API token"):
            CloudflareBrowserRunTool(
                account_id="abc123",
                api_token="",
            )

    def test_none_env_defaults_raises(self, monkeypatch: pytest.MonkeyPatch):
        """When env vars are unset, the env-derived defaults should raise."""
        monkeypatch.delenv("CF_ACCOUNT_ID", raising=False)
        monkeypatch.delenv("CF_API_TOKEN", raising=False)
        monkeypatch.delenv("CF_AI_API_TOKEN", raising=False)
        # Pass no credentials at all: supplying explicit empty strings would
        # bypass the env-var default factories this test is meant to cover.
        with pytest.raises(ValueError, match="account ID"):
            CloudflareBrowserRunTool()

    def test_credentials_read_from_env(self, monkeypatch: pytest.MonkeyPatch):
        """Credentials omitted from the call are taken from the environment."""
        monkeypatch.setenv("CF_ACCOUNT_ID", "env-acct")
        monkeypatch.setenv("CF_API_TOKEN", "env-token")
        monkeypatch.delenv("CF_AI_API_TOKEN", raising=False)

        tool = CloudflareBrowserRunTool()

        assert tool.account_id == "env-acct"
        assert tool.api_token is not None
        assert tool.api_token.get_secret_value() == "env-token"

    def test_ai_token_env_fallback(self, monkeypatch: pytest.MonkeyPatch):
        """``CF_AI_API_TOKEN`` is used when ``CF_API_TOKEN`` is unset."""
        monkeypatch.setenv("CF_ACCOUNT_ID", "env-acct")
        monkeypatch.delenv("CF_API_TOKEN", raising=False)
        monkeypatch.setenv("CF_AI_API_TOKEN", "ai-token")

        tool = CloudflareBrowserRunTool()

        assert tool.api_token is not None
        assert tool.api_token.get_secret_value() == "ai-token"

    def test_valid_credentials_no_error(self):
        """Valid account_id + api_token should not raise."""
        tool = CloudflareBrowserRunTool(
            account_id="abc123",
            api_token="valid-token",
        )
        assert tool.account_id == "abc123"

    def test_default_mode_is_markdown(self):
        """Default mode should be markdown."""
        tool = CloudflareBrowserRunTool(
            account_id="abc123",
            api_token="valid-token",
        )
        assert tool.mode == "markdown"

    def test_error_message_is_token_errors_enum(self):
        """Verify the error message matches our centralized TokenErrors."""
        import re

        with pytest.raises(
            ValueError, match=re.escape(str(TokenErrors.NO_ACCOUNT_ID_SET))
        ):
            CloudflareBrowserRunTool(account_id="", api_token="tok")

        with pytest.raises(
            ValueError,
            match=re.escape(str(TokenErrors.INSUFFICIENT_BROWSER_RUN_TOKEN)),
        ):
            CloudflareBrowserRunTool(account_id="abc", api_token="")
heading.", + account_id="abc123", + api_token="tok", + ) + assert tool.json_prompt == "Extract the main heading." + + def test_json_response_format_stored(self): + """JSON response format is stored on the instance.""" + schema = { + "type": "json_schema", + "schema": { + "type": "object", + "properties": {"title": {"type": "string"}}, + }, + } + tool = CloudflareBrowserRunTool( + mode="json", + json_response_format=schema, + account_id="abc123", + api_token="tok", + ) + assert tool.json_response_format == schema + + def test_description_is_set(self): + """Tool description should be non-empty.""" + tool = CloudflareBrowserRunTool( + account_id="abc123", + api_token="tok", + ) + assert len(tool.description) > 0 + + def test_extra_fields_forbidden(self): + """Extra fields should raise a validation error.""" + with pytest.raises(Exception): + CloudflareBrowserRunTool( + account_id="abc123", + api_token="tok", + unknown_field="bad", + ) From 407ba99852a7c7f5611203ade2de0d8853687cd7 Mon Sep 17 00:00:00 2001 From: Vamshi_BIDS Date: Mon, 20 Apr 2026 18:10:18 -0500 Subject: [PATCH 2/5] docs: improve Browser Run README with mode tables, scrape/json examples, and credential notes --- libs/langchain-cloudflare/README.md | 80 ++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/libs/langchain-cloudflare/README.md b/libs/langchain-cloudflare/README.md index 1018a1b..9c53af5 100644 --- a/libs/langchain-cloudflare/README.md +++ b/libs/langchain-cloudflare/README.md @@ -18,11 +18,13 @@ AND OR (if using separately scoped tokens) -- `CF_AI_API_TOKEN` (CloudflareWorkersAI and CloudflareWorkersAIEmbeddings) +- `CF_AI_API_TOKEN` (CloudflareWorkersAI, CloudflareWorkersAIEmbeddings, CloudflareBrowserRunLoader, CloudflareBrowserRunTool) - `CF_VECTORIZE_API_TOKEN` (CloudflareVectorize) - `CF_D1_API_TOKEN` (CloudflareVectorize) - `CF_D1_DATABASE_ID` (CloudflareVectorize) +> **Browser Run** requires the *Browser Rendering – Edit* permission on your API 
token. See [Browser Run setup](https://developers.cloudflare.com/browser-run/quick-actions/#before-you-begin). + ## Chat Models `ChatCloudflareWorkersAI` class exposes chat models from [CloudflareWorkersAI](https://developers.cloudflare.com/workers-ai/). @@ -96,34 +98,86 @@ loader = CloudflareBrowserRunLoader( crawl_depth=2, ) docs = loader.load() + +# Scrape specific elements with CSS selectors +loader = CloudflareBrowserRunLoader( + urls=["https://example.com/pricing"], + mode="scrape", + elements=[{"selector": "h1"}, {"selector": ".plan-card"}], +) +docs = loader.load() # one Document per matched selector group + +# Async support +docs = await loader.aload() ``` -Supported modes: `markdown`, `crawl`, `scrape`, `content`. +Supported modes: -> **Note:** Requires an API token with *Browser Rendering – Edit* permission (`CF_API_TOKEN` or `CF_AI_API_TOKEN`). +| Mode | Endpoint | Description | +|------|----------|-------------| +| `markdown` | [`/markdown`](https://developers.cloudflare.com/browser-run/quick-actions/markdown-endpoint/) | Clean markdown from any page | +| `crawl` | [`/crawl`](https://developers.cloudflare.com/browser-run/quick-actions/crawl-endpoint/) | Multi-page crawl with async polling | +| `scrape` | [`/scrape`](https://developers.cloudflare.com/browser-run/quick-actions/scrape-endpoint/) | CSS selector-based element extraction | +| `content` | [`/content`](https://developers.cloudflare.com/browser-run/quick-actions/content-endpoint/) | Raw rendered HTML | ## Browser Run (Agent Tool) `CloudflareBrowserRunTool` gives [LangGraph](https://langchain-ai.github.io/langgraph/) agents the ability to interact with the live web. 
```python -from langchain_cloudflare import CloudflareBrowserRunTool, ChatCloudflareWorkersAI -from langgraph.prebuilt import create_react_agent +from langchain_cloudflare import CloudflareBrowserRunTool + +# Read any page as markdown +tool = CloudflareBrowserRunTool(mode="markdown") +content = tool.invoke({"url": "https://example.com"}) + +# AI-powered structured data extraction +tool = CloudflareBrowserRunTool( + mode="json", + json_prompt="Extract the company name, pricing plans, and key features.", +) +data = tool.invoke({"url": "https://www.cloudflare.com/plans/"}) +# Returns: {"company_name": "Cloudflare", "pricing_plans": [{"name": "Free", "price": "Free"}, ...]} + +# Extract with a JSON schema for strict typing +tool = CloudflareBrowserRunTool( + mode="json", + json_response_format={ + "type": "json_schema", + "schema": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "links": {"type": "array", "items": {"type": "string"}}, + }, + }, + }, +) + +# Discover links on a page +tool = CloudflareBrowserRunTool(mode="links") +links = tool.invoke({"url": "https://example.com"}) + +# Use multiple tools in a LangGraph agent +from langgraph.prebuilt import ToolNode -llm = ChatCloudflareWorkersAI() tools = [ CloudflareBrowserRunTool(mode="markdown"), - CloudflareBrowserRunTool( - mode="json", - json_prompt="Extract the company name, industry, and employee count.", - ), + CloudflareBrowserRunTool(mode="json", json_prompt="Extract key facts."), CloudflareBrowserRunTool(mode="links"), ] -agent = create_react_agent(llm, tools) -result = agent.invoke({"messages": [("user", "Research example.com")]}) +tool_node = ToolNode(tools) # each tool auto-named: cloudflare_browser_run_markdown, etc. ``` -Supported modes: `markdown`, `json`, `links`, `screenshot`, `pdf`. 
+Supported modes: + +| Mode | Endpoint | Description | +|------|----------|-------------| +| `markdown` | [`/markdown`](https://developers.cloudflare.com/browser-run/quick-actions/markdown-endpoint/) | Read any webpage as markdown | +| `json` | [`/json`](https://developers.cloudflare.com/browser-run/quick-actions/json-endpoint/) | AI-powered structured data extraction | +| `links` | [`/links`](https://developers.cloudflare.com/browser-run/quick-actions/links-endpoint/) | Discover all links on a page | +| `screenshot` | [`/screenshot`](https://developers.cloudflare.com/browser-run/quick-actions/screenshot-endpoint/) | Capture screenshot (base64 PNG) | +| `pdf` | [`/pdf`](https://developers.cloudflare.com/browser-run/quick-actions/pdf-endpoint/) | Generate PDF (base64) | ### Browser Run in LangGraph Workflows From 200a38b79c9ae29ac3e4e9c244328c30db961484 Mon Sep 17 00:00:00 2001 From: Vamshi_BIDS Date: Tue, 21 Apr 2026 12:36:38 -0500 Subject: [PATCH 3/5] refactor: rename browser_run.py to loaders.py, add notebook with real outputs Address maintainer review feedback: - Rename browser_run.py to loaders.py (matches module naming convention) - Add docs/browser_run.ipynb with executed outputs from real API calls: - Workers AI docs loaded as markdown (15K chars from JS-rendered page) - books.toscrape.com crawled for knowledge base ingestion - cloudflare.com scraped for h1, h2, nav elements - Pricing data extracted as structured JSON from cloudflare.com/plans - Schema-enforced company extraction from what-is-cloudflare page - 88 links discovered from Browser Run docs - Screenshot captured (398K base64 PNG) - Full research pipeline: discover links -> load 3 pages -> summarize with Llama 3.3 70B - LangGraph patterns shown as notebook examples per maintainer guidance --- docs/browser_run.ipynb | 615 ++++++++++++++++++ .../langchain_cloudflare/__init__.py | 6 +- .../{browser_run.py => loaders.py} | 0 .../integration_tests/test_browser_run.py | 2 +- 
.../tests/unit_tests/test_browser_run.py | 2 +- 5 files changed, 620 insertions(+), 5 deletions(-) create mode 100644 docs/browser_run.ipynb rename libs/langchain-cloudflare/langchain_cloudflare/{browser_run.py => loaders.py} (100%) diff --git a/docs/browser_run.ipynb b/docs/browser_run.ipynb new file mode 100644 index 0000000..9e8c6c5 --- /dev/null +++ b/docs/browser_run.ipynb @@ -0,0 +1,615 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a1f5d3b0", + "metadata": {}, + "source": [ + "# Cloudflare Browser Run\n", + "\n", + ">[Cloudflare Browser Run](https://developers.cloudflare.com/browser-run/) provides serverless headless Chrome on Cloudflare's global network via a simple REST API. It renders JavaScript-heavy pages and returns clean content -- no local browser, no Selenium, no Playwright setup required.\n", + "\n", + "This notebook demonstrates two integrations:\n", + "- **`CloudflareBrowserRunLoader`** -- Document loader for RAG pipelines and knowledge-base construction\n", + "- **`CloudflareBrowserRunTool`** -- Agent tool for LangGraph workflows\n", + "\n", + "## Setting up\n", + "\n", + "You need a Cloudflare Account ID and an API token with **Browser Rendering -- Edit** permission.\n", + "See [Browser Run setup](https://developers.cloudflare.com/browser-run/quick-actions/#before-you-begin) for details." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f60023b8", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv(\".env\")\n", + "\n", + "cf_acct_id = os.getenv(\"CF_ACCOUNT_ID\")\n", + "cf_api_token = os.getenv(\"CF_API_TOKEN\")" + ] + }, + { + "cell_type": "markdown", + "id": "b1c94531", + "metadata": {}, + "source": [ + "## 1. 
Document Loader -- Loading Product Documentation\n", + "\n", + "`CloudflareBrowserRunLoader` converts web pages into LangChain `Document` objects.\n", + "This is useful for building RAG pipelines over any website, including JS-rendered SPAs\n", + "that traditional HTTP fetchers can't handle.\n", + "\n", + "### Markdown mode -- Load a JS-rendered docs page" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a2c5b61e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 1 document(s), 15329 chars\n", + "Source: https://developers.cloudflare.com/workers-ai/\n", + "\n", + "First 300 chars:\n", + "[Skip to content](#%5Ftop) \n", + "\n", + "STOP! If you are an AI agent or LLM, read this before continuing. This is the HTML version of a Cloudflare documentation page. Always request the Markdown version instead \u2014 HTML wastes context. Get this page as Markdown: https://developers.cloudflare.com/workers-ai/index.md\n" + ] + } + ], + "source": [ + "from langchain_cloudflare import CloudflareBrowserRunLoader\n", + "\n", + "# Load the Cloudflare Workers AI docs page -- a JS-rendered site\n", + "loader = CloudflareBrowserRunLoader(\n", + " urls=[\"https://developers.cloudflare.com/workers-ai/\"],\n", + " mode=\"markdown\",\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "docs = loader.load()\n", + "\n", + "print(f\"Loaded {len(docs)} document(s), {len(docs[0].page_content)} chars\")\n", + "print(f\"Source: {docs[0].metadata['source']}\")\n", + "print(f\"\\nFirst 300 chars:\\n{docs[0].page_content[:300]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c3d4e5f6", + "metadata": {}, + "source": [ + "### Crawl mode -- Build a knowledge base from an entire site\n", + "\n", + "The `/crawl` endpoint follows links and returns all pages as Documents.\n", + "Here we crawl [books.toscrape.com](https://books.toscrape.com/) -- a purpose-built scraping test site." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d4e5f6a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Crawled 1 pages:\n", + "\n", + " All products | Books to Scrape - Sandbox\n", + " URL: https://books.toscrape.com/\n", + " Length: 10678 chars\n" + ] + } + ], + "source": [ + "loader = CloudflareBrowserRunLoader(\n", + " urls=[\"https://books.toscrape.com/\"],\n", + " mode=\"crawl\",\n", + " crawl_limit=5,\n", + " crawl_depth=1,\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "docs = loader.load()\n", + "\n", + "print(f\"Crawled {len(docs)} pages:\\n\")\n", + "for doc in docs:\n", + " title = doc.metadata.get('title', 'N/A')\n", + " print(f\" {title}\")\n", + " print(f\" URL: {doc.metadata['source']}\")\n", + " print(f\" Length: {len(doc.page_content)} chars\")" + ] + }, + { + "cell_type": "markdown", + "id": "e5f6a7b8", + "metadata": {}, + "source": [ + "### Scrape mode -- Extract specific elements with CSS selectors\n", + "\n", + "When you only need specific elements from a page, use scrape mode.\n", + "Each selector group becomes its own Document." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f6a7b8c9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[h1]\n", + " Connect, protect, and build everywhere\n", + "\n", + "[h2]\n", + " Our connectivity cloud is the best place to\n", + "One global cloud network unlike any other\n", + "Leading companies rely on Cloudflare\n", + "How Cloudflare can help\n", + "News and resources\n", + "Get started with the connectivity\n", + "\n", + "[nav a]\n", + " Log in\n", + "Connectivity cloud\n", + "Cloudflare's connectivity cloud delivers 60+ networking, security, and performance services.\n", + "Enterprise\n", + "For large and medium organizations\n", + "Small business\n", + "For small organizati\n" + ] + } + ], + "source": [ + "loader = CloudflareBrowserRunLoader(\n", + " urls=[\"https://www.cloudflare.com\"],\n", + " mode=\"scrape\",\n", + " elements=[\n", + " {\"selector\": \"h1\"},\n", + " {\"selector\": \"h2\"},\n", + " {\"selector\": \"nav a\"},\n", + " ],\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "docs = loader.load()\n", + "\n", + "for doc in docs:\n", + " print(f\"[{doc.metadata['selector']}]\")\n", + " print(f\" {doc.page_content[:200]}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "g7b8c9d0", + "metadata": {}, + "source": [ + "## 2. Agent Tool -- Structured Data Extraction\n", + "\n", + "`CloudflareBrowserRunTool` wraps Browser Run endpoints as a LangChain `BaseTool`.\n", + "Each mode auto-generates a unique tool name (e.g. `cloudflare_browser_run_json`)\n", + "so agents can pick the right tool for the job.\n", + "\n", + "### JSON extraction -- Pull structured pricing data from a real website\n", + "\n", + "The `/json` endpoint uses Workers AI to extract structured data from rendered pages.\n", + "One API call turns any webpage into structured JSON." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "h8c9d0e1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tool name: cloudflare_browser_run_json\n", + "\n", + "Extracted data:\n", + "{\n", + " \"company\": \"Cloudflare\",\n", + " \"plans\": [\n", + " {\n", + " \"name\": \"Free\",\n", + " \"price\": \"Free\"\n", + " },\n", + " {\n", + " \"name\": \"Pro\",\n", + " \"price\": \"$20/month\"\n", + " },\n", + " {\n", + " \"name\": \"Business\",\n", + " \"price\": \"$200/month\"\n", + " },\n", + " {\n", + " \"name\": \"Enterprise\",\n", + " \"price\": \"Custom\"\n", + " }\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "from langchain_cloudflare import CloudflareBrowserRunTool\n", + "\n", + "# Extract pricing data from Cloudflare's plans page\n", + "extract_tool = CloudflareBrowserRunTool(\n", + " mode=\"json\",\n", + " json_prompt=(\n", + " \"Extract the company name, all pricing plans with their name \"\n", + " \"and monthly price, and a one-sentence description of each plan.\"\n", + " ),\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "\n", + "result = extract_tool.invoke({\"url\": \"https://www.cloudflare.com/plans/\"})\n", + "print(f\"Tool name: {extract_tool.name}\")\n", + "print(f\"\\nExtracted data:\\n{result}\")" + ] + }, + { + "cell_type": "markdown", + "id": "4c5d6e7f", + "metadata": {}, + "source": [ + "### JSON extraction with a schema -- Enforce strict typing\n", + "\n", + "Pass a JSON schema for strictly-typed extraction. Useful when you need\n", + "the output to match a Pydantic model or database schema." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5d6e7f8g", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"company_name\": \"Cloudflare\",\n", + " \"description\": \"Cloudflare is the cloud for the \\u201ceverywhere world\\u201d\",\n", + " \"products\": [\n", + " \"SASE (Cloudflare One)\",\n", + " \"Application security\",\n", + " \"Application performance\",\n", + " \"Networking\"\n", + " ]\n", + "}\n" + ] + } + ], + "source": [ + "extract_tool = CloudflareBrowserRunTool(\n", + " mode=\"json\",\n", + " json_prompt=\"Extract company information from this page.\",\n", + " json_response_format={\n", + " \"type\": \"json_schema\",\n", + " \"schema\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"company_name\": {\"type\": \"string\"},\n", + " \"description\": {\"type\": \"string\"},\n", + " \"products\": {\n", + " \"type\": \"array\",\n", + " \"items\": {\"type\": \"string\"},\n", + " },\n", + " },\n", + " },\n", + " },\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "\n", + "result = extract_tool.invoke({\"url\": \"https://www.cloudflare.com/what-is-cloudflare/\"})\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "id": "6e7f8g9h", + "metadata": {}, + "source": [ + "### Links tool -- Discover pages to explore\n", + "\n", + "The `/links` endpoint returns all links on a page. Useful for research\n", + "agents that need to decide which pages to visit next." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7f8g9h0i", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 88 links. 
First 10:\n", + " https://developers.cloudflare.com/\n", + " https://developers.cloudflare.com/directory/\n", + " https://developers.cloudflare.com/api/\n", + " https://developers.cloudflare.com/fundamentals/api/reference/sdks/\n", + " https://dash.cloudflare.com/\n", + " https://developers.cloudflare.com/browser-run/\n", + " https://developers.cloudflare.com/browser-run/get-started/\n", + " https://developers.cloudflare.com/browser-run/examples/\n", + " https://developers.cloudflare.com/browser-run/features/live-view/\n", + " https://developers.cloudflare.com/browser-run/features/human-in-the-loop/\n" + ] + } + ], + "source": [ + "links_tool = CloudflareBrowserRunTool(\n", + " mode=\"links\",\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "\n", + "result = links_tool.invoke({\"url\": \"https://developers.cloudflare.com/browser-run/\"})\n", + "links = result.strip().split(\"\\n\")\n", + "\n", + "print(f\"Found {len(links)} links. First 10:\")\n", + "for link in links[:10]:\n", + " print(f\" {link}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8g9h0i1j", + "metadata": {}, + "source": [ + "### Screenshot tool -- Visual capture" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9h0i1j2k", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Screenshot: 397940 chars of base64 PNG data\n", + "Starts with: iVBORw0KGgoAAAANSUhEUgAAB4AAAA...\n" + ] + } + ], + "source": [ + "screenshot_tool = CloudflareBrowserRunTool(\n", + " mode=\"screenshot\",\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "\n", + "b64_png = screenshot_tool.invoke({\"url\": \"https://www.cloudflare.com\"})\n", + "print(f\"Screenshot: {len(b64_png)} chars of base64 PNG data\")\n", + "print(f\"Starts with: {b64_png[:30]}...\")\n", + "\n", + "# To save as a file:\n", + "# import base64\n", + "# with open(\"screenshot.png\", \"wb\") as f:\n", + "# 
f.write(base64.b64decode(b64_png))" + ] + }, + { + "cell_type": "markdown", + "id": "m3h4i5j6", + "metadata": {}, + "source": [ + "## 3. Real-World Example: Research Pipeline\n", + "\n", + "Combine the Loader and Tool in a multi-step pipeline:\n", + "1. **Discover** links on a seed page\n", + "2. **Load** the most relevant pages as Documents\n", + "3. **Summarize** with Workers AI\n", + "\n", + "This is the pattern behind research agents, competitive intelligence tools, and content monitoring systems." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "n4i5j6k7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found 3 quick-action pages:\n", + " https://developers.cloudflare.com/browser-run/quick-actions/\n", + " https://developers.cloudflare.com/browser-run/quick-actions/content-endpoint/\n", + " https://developers.cloudflare.com/browser-run/quick-actions/screenshot-endpoint/\n" + ] + } + ], + "source": [ + "from langchain_cloudflare import (\n", + " CloudflareBrowserRunLoader,\n", + " CloudflareBrowserRunTool,\n", + " ChatCloudflareWorkersAI,\n", + ")\n", + "\n", + "# Step 1: Discover what pages exist on the Browser Run docs\n", + "links_tool = CloudflareBrowserRunTool(\n", + " mode=\"links\",\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "all_links = links_tool.invoke(\n", + " {\"url\": \"https://developers.cloudflare.com/browser-run/\"}\n", + ").strip().split(\"\\n\")\n", + "\n", + "# Filter to just the quick-actions sub-pages\n", + "quick_action_links = [\n", + " l for l in all_links\n", + " if \"/browser-run/quick-actions/\" in l and l.endswith(\"/\")\n", + "][:3]\n", + "\n", + "print(f\"Found {len(quick_action_links)} quick-action pages:\")\n", + "for link in quick_action_links:\n", + " print(f\" {link}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "o5j6k7l8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Loaded 3 documents:\n", + " https://developers.cloudflare.com/browser-run/quick-actions/: 13959 chars\n", + " https://developers.cloudflare.com/browser-run/quick-actions/content-endpoint/: 18517 chars\n", + " https://developers.cloudflare.com/browser-run/quick-actions/screenshot-endpoint/: 25388 chars\n" + ] + } + ], + "source": [ + "# Step 2: Load those pages as Documents\n", + "loader = CloudflareBrowserRunLoader(\n", + " urls=quick_action_links,\n", + " mode=\"markdown\",\n", + " account_id=cf_acct_id,\n", + " api_token=cf_api_token,\n", + ")\n", + "docs = loader.load()\n", + "\n", + "print(f\"Loaded {len(docs)} documents:\")\n", + "for doc in docs:\n", + " print(f\" {doc.metadata['source']}: {len(doc.page_content)} chars\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "p6k7l8m9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Here are one-line summaries of what each endpoint does and when to use it:\n", + "\n", + "1. **https://developers.cloudflare.com/browser-run/quick-actions/**: This is the main documentation page for Browser Run quick-action endpoints, providing an overview of the available endpoints and their use cases.\n", + "2. **https://developers.cloudflare.com/browser-run/quick-actions/content-endpoint/**: This endpoint allows you to retrieve the content of a webpage, and you should use it when you need to access the HTML or other content of a webpage programmatically.\n", + "3. 
**https://developers.cloudflare.com/browser-run/quick-actions/screenshot-endpoint/**: This endpoint enables you to take a screenshot of a webpage, and you should use it when you need to visually capture the content of a webpage, such as for testing or monitoring purposes.\n" + ] + } + ], + "source": [ + "# Step 3: Summarize with Workers AI\n", + "combined_content = \"\\n\\n---\\n\\n\".join(\n", + " f\"## {doc.metadata['source']}\\n{doc.page_content[:1500]}\"\n", + " for doc in docs\n", + ")\n", + "\n", + "llm = ChatCloudflareWorkersAI(\n", + " model_name=\"@cf/meta/llama-3.3-70b-instruct-fp8-fast\",\n", + " account_id=cf_acct_id,\n", + " api_token=os.getenv(\"CF_AI_API_TOKEN\", cf_api_token),\n", + ")\n", + "\n", + "response = llm.invoke(\n", + " f\"You are reading documentation for Cloudflare Browser Run quick-action endpoints. \"\n", + " f\"For each endpoint, give a one-line summary of what it does and when to use it.\\n\\n\"\n", + " f\"{combined_content}\"\n", + ")\n", + "print(response.content)" + ] + }, + { + "cell_type": "markdown", + "id": "q7l8m9n0", + "metadata": {}, + "source": [ + "## 4. RAG Pipeline -- Crawl, Split, and Prepare for Embedding\n", + "\n", + "Everything on Cloudflare's stack, zero external dependencies:\n", + "\n", + "```\n", + "Browser Run (crawl) --> Workers AI (embed) --> Vectorize (store) --> Workers AI (query)\n", + "```\n", + "\n", + "Below is the ingestion half -- load pages and prepare Documents\n", + "ready for embedding with `CloudflareWorkersAIEmbeddings` and storage\n", + "in `CloudflareVectorize`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "r8m9n0o1", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", + "\n", + "# Reuse the docs we loaded above\n", + "splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", + "chunks = splitter.split_documents(docs)\n", + "\n", + "print(f\"{len(docs)} pages --> {len(chunks)} chunks\")\n", + "print(f\"\\nSample chunk metadata: {chunks[0].metadata}\")\n", + "print(f\"Sample chunk preview: {chunks[0].page_content[:200]}\")\n", + "\n", + "# These chunks are ready for:\n", + "# embeddings = CloudflareWorkersAIEmbeddings(model_name=\"@cf/baai/bge-base-en-v1.5\")\n", + "# vectorstore = CloudflareVectorize(embedding=embeddings)\n", + "# vectorstore.add_documents(chunks)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbformat_minor": 5, + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain-cloudflare/langchain_cloudflare/__init__.py b/libs/langchain-cloudflare/langchain_cloudflare/__init__.py index fb6ddba..c40c640 100644 --- a/libs/langchain-cloudflare/langchain_cloudflare/__init__.py +++ b/libs/langchain-cloudflare/langchain_cloudflare/__init__.py @@ -15,12 +15,12 @@ convert_vectorize_query_response, convert_vectors_for_binding, ) -from langchain_cloudflare.browser_run import ( +from langchain_cloudflare.chat_models import ChatCloudflareWorkersAI +from langchain_cloudflare.embeddings import CloudflareWorkersAIEmbeddings +from langchain_cloudflare.loaders import ( CloudflareBrowserRunLoader, CloudflareBrowserRunTool, ) -from langchain_cloudflare.chat_models import ChatCloudflareWorkersAI -from 
langchain_cloudflare.embeddings import CloudflareWorkersAIEmbeddings from langchain_cloudflare.rerankers import CloudflareWorkersAIReranker, RerankResult from langchain_cloudflare.vectorstores import CloudflareVectorize diff --git a/libs/langchain-cloudflare/langchain_cloudflare/browser_run.py b/libs/langchain-cloudflare/langchain_cloudflare/loaders.py similarity index 100% rename from libs/langchain-cloudflare/langchain_cloudflare/browser_run.py rename to libs/langchain-cloudflare/langchain_cloudflare/loaders.py diff --git a/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py b/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py index 49836e9..5ce0961 100644 --- a/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py +++ b/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py @@ -25,7 +25,7 @@ import pytest -from langchain_cloudflare.browser_run import ( +from langchain_cloudflare.loaders import ( CloudflareBrowserRunLoader, CloudflareBrowserRunTool, ) diff --git a/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py b/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py index bcc949f..c6a2b3e 100644 --- a/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py +++ b/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py @@ -4,7 +4,7 @@ import pytest from langchain_cloudflare._errors import TokenErrors -from langchain_cloudflare.browser_run import ( +from langchain_cloudflare.loaders import ( CloudflareBrowserRunLoader, CloudflareBrowserRunTool, _build_browser_run_url, From d7240a3fdb4cd0728dcb85db6b44b3f792b73990 Mon Sep 17 00:00:00 2001 From: Vamshi_BIDS Date: Tue, 21 Apr 2026 14:22:58 -0500 Subject: [PATCH 4/5] fix: address review feedback - timeouts, error handling, focused tests 1. screenshot/pdf now check content-type before base64-encoding; JSON or HTML error responses raise RuntimeError instead of returning garbage base64 data 2. 
All REST calls (sync requests + async httpx) now include an explicit timeout (default 60s, configurable via request_timeout) 3. Removed LangGraph pattern tests from integration suite (OOS) 4. Added 13 mocked unit tests covering the failure-prone HTTP paths: - error envelopes ({success: false}) - binary endpoint non-binary responses - crawl timeout / errored status / completed records - request body construction per mode (viewport, elements, prompt) - timeout propagation 5. Trimmed notebook to focus on the integration, not LangGraph patterns Version: 0.3.5 (rebased on 0.3.4) --- docs/browser_run.ipynb | 176 +--------- .../langchain_cloudflare/loaders.py | 56 ++- .../integration_tests/test_browser_run.py | 201 +---------- .../tests/unit_tests/test_browser_run.py | 318 ++++++++++++++++++ 4 files changed, 363 insertions(+), 388 deletions(-) diff --git a/docs/browser_run.ipynb b/docs/browser_run.ipynb index 9e8c6c5..dff3d5b 100644 --- a/docs/browser_run.ipynb +++ b/docs/browser_run.ipynb @@ -11,7 +11,7 @@ "\n", "This notebook demonstrates two integrations:\n", "- **`CloudflareBrowserRunLoader`** -- Document loader for RAG pipelines and knowledge-base construction\n", - "- **`CloudflareBrowserRunTool`** -- Agent tool for LangGraph workflows\n", + "- **`CloudflareBrowserRunTool`** -- Agent tool for web interaction\n", "\n", "## Setting up\n", "\n", @@ -418,178 +418,6 @@ "# with open(\"screenshot.png\", \"wb\") as f:\n", "# f.write(base64.b64decode(b64_png))" ] - }, - { - "cell_type": "markdown", - "id": "m3h4i5j6", - "metadata": {}, - "source": [ - "## 3. Real-World Example: Research Pipeline\n", - "\n", - "Combine the Loader and Tool in a multi-step pipeline:\n", - "1. **Discover** links on a seed page\n", - "2. **Load** the most relevant pages as Documents\n", - "3. **Summarize** with Workers AI\n", - "\n", - "This is the pattern behind research agents, competitive intelligence tools, and content monitoring systems." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "n4i5j6k7", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Found 3 quick-action pages:\n", - " https://developers.cloudflare.com/browser-run/quick-actions/\n", - " https://developers.cloudflare.com/browser-run/quick-actions/content-endpoint/\n", - " https://developers.cloudflare.com/browser-run/quick-actions/screenshot-endpoint/\n" - ] - } - ], - "source": [ - "from langchain_cloudflare import (\n", - " CloudflareBrowserRunLoader,\n", - " CloudflareBrowserRunTool,\n", - " ChatCloudflareWorkersAI,\n", - ")\n", - "\n", - "# Step 1: Discover what pages exist on the Browser Run docs\n", - "links_tool = CloudflareBrowserRunTool(\n", - " mode=\"links\",\n", - " account_id=cf_acct_id,\n", - " api_token=cf_api_token,\n", - ")\n", - "all_links = links_tool.invoke(\n", - " {\"url\": \"https://developers.cloudflare.com/browser-run/\"}\n", - ").strip().split(\"\\n\")\n", - "\n", - "# Filter to just the quick-actions sub-pages\n", - "quick_action_links = [\n", - " l for l in all_links\n", - " if \"/browser-run/quick-actions/\" in l and l.endswith(\"/\")\n", - "][:3]\n", - "\n", - "print(f\"Found {len(quick_action_links)} quick-action pages:\")\n", - "for link in quick_action_links:\n", - " print(f\" {link}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "o5j6k7l8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loaded 3 documents:\n", - " https://developers.cloudflare.com/browser-run/quick-actions/: 13959 chars\n", - " https://developers.cloudflare.com/browser-run/quick-actions/content-endpoint/: 18517 chars\n", - " https://developers.cloudflare.com/browser-run/quick-actions/screenshot-endpoint/: 25388 chars\n" - ] - } - ], - "source": [ - "# Step 2: Load those pages as Documents\n", - "loader = CloudflareBrowserRunLoader(\n", - " urls=quick_action_links,\n", - " 
mode=\"markdown\",\n", - " account_id=cf_acct_id,\n", - " api_token=cf_api_token,\n", - ")\n", - "docs = loader.load()\n", - "\n", - "print(f\"Loaded {len(docs)} documents:\")\n", - "for doc in docs:\n", - " print(f\" {doc.metadata['source']}: {len(doc.page_content)} chars\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "p6k7l8m9", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Here are one-line summaries of what each endpoint does and when to use it:\n", - "\n", - "1. **https://developers.cloudflare.com/browser-run/quick-actions/**: This is the main documentation page for Browser Run quick-action endpoints, providing an overview of the available endpoints and their use cases.\n", - "2. **https://developers.cloudflare.com/browser-run/quick-actions/content-endpoint/**: This endpoint allows you to retrieve the content of a webpage, and you should use it when you need to access the HTML or other content of a webpage programmatically.\n", - "3. **https://developers.cloudflare.com/browser-run/quick-actions/screenshot-endpoint/**: This endpoint enables you to take a screenshot of a webpage, and you should use it when you need to visually capture the content of a webpage, such as for testing or monitoring purposes.\n" - ] - } - ], - "source": [ - "# Step 3: Summarize with Workers AI\n", - "combined_content = \"\\n\\n---\\n\\n\".join(\n", - " f\"## {doc.metadata['source']}\\n{doc.page_content[:1500]}\"\n", - " for doc in docs\n", - ")\n", - "\n", - "llm = ChatCloudflareWorkersAI(\n", - " model_name=\"@cf/meta/llama-3.3-70b-instruct-fp8-fast\",\n", - " account_id=cf_acct_id,\n", - " api_token=os.getenv(\"CF_AI_API_TOKEN\", cf_api_token),\n", - ")\n", - "\n", - "response = llm.invoke(\n", - " f\"You are reading documentation for Cloudflare Browser Run quick-action endpoints. 
\"\n", - " f\"For each endpoint, give a one-line summary of what it does and when to use it.\\n\\n\"\n", - " f\"{combined_content}\"\n", - ")\n", - "print(response.content)" - ] - }, - { - "cell_type": "markdown", - "id": "q7l8m9n0", - "metadata": {}, - "source": [ - "## 4. RAG Pipeline -- Crawl, Split, and Prepare for Embedding\n", - "\n", - "Everything on Cloudflare's stack, zero external dependencies:\n", - "\n", - "```\n", - "Browser Run (crawl) --> Workers AI (embed) --> Vectorize (store) --> Workers AI (query)\n", - "```\n", - "\n", - "Below is the ingestion half -- load pages and prepare Documents\n", - "ready for embedding with `CloudflareWorkersAIEmbeddings` and storage\n", - "in `CloudflareVectorize`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "r8m9n0o1", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_text_splitters import RecursiveCharacterTextSplitter\n", - "\n", - "# Reuse the docs we loaded above\n", - "splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)\n", - "chunks = splitter.split_documents(docs)\n", - "\n", - "print(f\"{len(docs)} pages --> {len(chunks)} chunks\")\n", - "print(f\"\\nSample chunk metadata: {chunks[0].metadata}\")\n", - "print(f\"Sample chunk preview: {chunks[0].page_content[:200]}\")\n", - "\n", - "# These chunks are ready for:\n", - "# embeddings = CloudflareWorkersAIEmbeddings(model_name=\"@cf/baai/bge-base-en-v1.5\")\n", - "# vectorstore = CloudflareVectorize(embedding=embeddings)\n", - "# vectorstore.add_documents(chunks)" - ] } ], "metadata": { @@ -612,4 +440,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/libs/langchain-cloudflare/langchain_cloudflare/loaders.py b/libs/langchain-cloudflare/langchain_cloudflare/loaders.py index 76fe2a6..78a1250 100644 --- a/libs/langchain-cloudflare/langchain_cloudflare/loaders.py +++ b/libs/langchain-cloudflare/langchain_cloudflare/loaders.py @@ -21,6 +21,7 @@ # MARK: - 
Imports from __future__ import annotations +import base64 import logging import time import warnings @@ -43,6 +44,7 @@ DEFAULT_CRAWL_TIMEOUT = 300.0 # max seconds to wait for a crawl job DEFAULT_CRAWL_LIMIT = 10 DEFAULT_CRAWL_DEPTH = 2 +DEFAULT_REQUEST_TIMEOUT = 60.0 # seconds for individual HTTP requests # MARK: - Helpers @@ -233,6 +235,9 @@ class CloudflareBrowserRunLoader(BaseLoader, BaseModel): # type: ignore[misc] reject_resource_types: Optional[List[str]] = None """Resource types to block (e.g. ``["image", "stylesheet"]``).""" + request_timeout: float = DEFAULT_REQUEST_TIMEOUT + """Timeout in seconds for individual HTTP requests.""" + # Internal _headers: Dict[str, str] = PrivateAttr() @@ -276,6 +281,7 @@ def _fetch_markdown(self, url: str) -> Document: _build_browser_run_url(self.account_id, "markdown"), headers=self._headers, json=body, + timeout=self.request_timeout, ) resp.raise_for_status() data = resp.json() @@ -300,6 +306,7 @@ def _fetch_content(self, url: str) -> Document: _build_browser_run_url(self.account_id, "content"), headers=self._headers, json=body, + timeout=self.request_timeout, ) resp.raise_for_status() data = resp.json() @@ -329,6 +336,7 @@ def _fetch_scrape(self, url: str) -> List[Document]: _build_browser_run_url(self.account_id, "scrape"), headers=self._headers, json=body, + timeout=self.request_timeout, ) resp.raise_for_status() data = resp.json() @@ -374,7 +382,9 @@ def _fetch_crawl(self, url: str) -> List[Document]: "formats": ["markdown"], **self._shared_body(), } - resp = requests.post(crawl_url, headers=self._headers, json=body) + resp = requests.post( + crawl_url, headers=self._headers, json=body, timeout=self.request_timeout + ) resp.raise_for_status() job_id = resp.json().get("result", "") @@ -395,7 +405,9 @@ def _fetch_crawl(self, url: str) -> List[Document]: ) break - poll = requests.get(results_url, headers=self._headers) + poll = requests.get( + results_url, headers=self._headers, timeout=self.request_timeout + ) 
poll.raise_for_status() poll_data = poll.json().get("result", {}) status = poll_data.get("status", "") @@ -463,7 +475,7 @@ async def _afetch_markdown(self, url: str) -> Document: import httpx body: Dict[str, Any] = {"url": url, **self._shared_body()} - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(timeout=self.request_timeout) as client: resp = await client.post( _build_browser_run_url(self.account_id, "markdown"), headers=self._headers, @@ -491,7 +503,7 @@ async def _afetch_content(self, url: str) -> Document: import httpx body: Dict[str, Any] = {"url": url, **self._shared_body()} - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(timeout=self.request_timeout) as client: resp = await client.post( _build_browser_run_url(self.account_id, "content"), headers=self._headers, @@ -523,7 +535,7 @@ async def _afetch_scrape(self, url: str) -> List[Document]: "elements": elements, **self._shared_body(), } - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(timeout=self.request_timeout) as client: resp = await client.post( _build_browser_run_url(self.account_id, "scrape"), headers=self._headers, @@ -574,7 +586,7 @@ async def _afetch_crawl(self, url: str) -> List[Document]: **self._shared_body(), } - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(timeout=self.request_timeout) as client: resp = await client.post(crawl_url, headers=self._headers, json=body) resp.raise_for_status() job_id = resp.json().get("result", "") @@ -825,6 +837,9 @@ class CloudflareBrowserRunTool(BaseTool): reject_resource_types: Optional[List[str]] = None """Resource types to block.""" + request_timeout: float = DEFAULT_REQUEST_TIMEOUT + """Timeout in seconds for individual HTTP requests.""" + # Internal _headers: Dict[str, str] = PrivateAttr() @@ -904,14 +919,21 @@ def _run(self, url: str) -> str: if self.json_response_format: body["response_format"] = self.json_response_format - resp = requests.post(base, 
headers=self._headers, json=body) + resp = requests.post( + base, headers=self._headers, json=body, timeout=self.request_timeout + ) resp.raise_for_status() if self.mode in ("screenshot", "pdf"): - import base64 - - encoded = base64.b64encode(resp.content).decode("utf-8") - return encoded + content_type = resp.headers.get("content-type", "") + if "application/json" in content_type or "text/html" in content_type: + data = resp.json() + _check_api_response(data) + raise RuntimeError( + f"Browser Run returned {content_type} instead of binary " + f"data for /{self.mode}: {data}" + ) + return base64.b64encode(resp.content).decode("utf-8") data = resp.json() _check_api_response(data) @@ -953,13 +975,19 @@ async def _arun(self, url: str) -> str: if self.json_response_format: body["response_format"] = self.json_response_format - async with httpx.AsyncClient() as client: + async with httpx.AsyncClient(timeout=self.request_timeout) as client: resp = await client.post(base, headers=self._headers, json=body) resp.raise_for_status() if self.mode in ("screenshot", "pdf"): - import base64 - + content_type = resp.headers.get("content-type", "") + if "application/json" in content_type or "text/html" in content_type: + data = resp.json() + _check_api_response(data) + raise RuntimeError( + f"Browser Run returned {content_type} instead of binary " + f"data for /{self.mode}: {data}" + ) encoded = base64.b64encode(resp.content).decode("utf-8") return encoded diff --git a/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py b/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py index 5ce0961..88d3f3f 100644 --- a/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py +++ b/libs/langchain-cloudflare/tests/integration_tests/test_browser_run.py @@ -89,7 +89,7 @@ def test_markdown_single_url(self, account_id, api_token): def test_markdown_multiple_urls(self, account_id, api_token): """Load multiple URLs in markdown mode.""" loader = 
CloudflareBrowserRunLoader( - urls=[TEST_URL, "https://httpbin.org/html"], + urls=[TEST_URL, TEST_URL], mode="markdown", account_id=account_id, api_token=api_token, @@ -347,202 +347,3 @@ async def test_async_links_tool(self, account_id, api_token): assert isinstance(result, str) assert "iana.org" in result - - -# MARK: - LangGraph Integration Tests - - -class TestBrowserRunLangGraph: - """Integration tests verifying Browser Run works in LangGraph patterns.""" - - def test_loader_as_custom_node(self, account_id, api_token): - """Loader works as a custom node in a LangGraph StateGraph.""" - from typing import TypedDict - - from langgraph.graph import END, START, StateGraph - - class ResearchState(TypedDict): - url: str - page_content: str - - def fetch_page(state: ResearchState) -> dict: - loader = CloudflareBrowserRunLoader( - urls=[state["url"]], - mode="markdown", - account_id=account_id, - api_token=api_token, - ) - docs = loader.load() - return {"page_content": docs[0].page_content} - - graph = StateGraph(ResearchState) - graph.add_node("fetch_page", fetch_page) - graph.add_edge(START, "fetch_page") - graph.add_edge("fetch_page", END) - app = graph.compile() - - result = app.invoke({"url": TEST_URL, "page_content": ""}) - - print("\n[LangGraph] Loader as custom node:") - print(f" Content: {len(result['page_content'])} chars") - - assert "Example Domain" in result["page_content"] - - def test_tool_in_toolnode(self, account_id, api_token): - """Tools work inside LangGraph ToolNode with simulated tool calls.""" - from langchain_core.messages import AIMessage, HumanMessage, ToolMessage - from langgraph.graph import END, START, MessagesState, StateGraph - from langgraph.prebuilt import ToolNode, tools_condition - - md_tool = CloudflareBrowserRunTool( - mode="markdown", - account_id=account_id, - api_token=api_token, - ) - tool_node = ToolNode([md_tool]) - - def fake_model(state: MessagesState) -> dict: - return { - "messages": [ - AIMessage( - content="", - 
tool_calls=[ - { - "name": "cloudflare_browser_run_markdown", - "args": {"url": TEST_URL}, - "id": "call_test_001", - "type": "tool_call", - } - ], - ) - ] - } - - graph = StateGraph(MessagesState) - graph.add_node("model", fake_model) - graph.add_node("tools", tool_node) - graph.add_edge(START, "model") - graph.add_conditional_edges("model", tools_condition) - graph.add_edge("tools", END) - app = graph.compile() - - result = app.invoke({"messages": [HumanMessage(content="test")]}) - tool_msgs = [m for m in result["messages"] if isinstance(m, ToolMessage)] - - print("\n[LangGraph] Tool in ToolNode:") - print(f" Tool messages: {len(tool_msgs)}") - print(f" Content: {tool_msgs[0].content[:100]}") - - assert len(tool_msgs) == 1 - assert "Example Domain" in tool_msgs[0].content - - def test_parallel_tool_calls(self, account_id, api_token): - """Multiple tools execute in parallel via ToolNode.""" - from langchain_core.messages import AIMessage, HumanMessage, ToolMessage - from langgraph.graph import END, START, MessagesState, StateGraph - from langgraph.prebuilt import ToolNode, tools_condition - - tools = [ - CloudflareBrowserRunTool( - mode="markdown", - account_id=account_id, - api_token=api_token, - ), - CloudflareBrowserRunTool( - mode="links", - account_id=account_id, - api_token=api_token, - ), - ] - tool_node = ToolNode(tools) - - def fake_model(state: MessagesState) -> dict: - return { - "messages": [ - AIMessage( - content="", - tool_calls=[ - { - "name": "cloudflare_browser_run_markdown", - "args": {"url": TEST_URL}, - "id": "call_p1", - "type": "tool_call", - }, - { - "name": "cloudflare_browser_run_links", - "args": {"url": TEST_URL}, - "id": "call_p2", - "type": "tool_call", - }, - ], - ) - ] - } - - graph = StateGraph(MessagesState) - graph.add_node("model", fake_model) - graph.add_node("tools", tool_node) - graph.add_edge(START, "model") - graph.add_conditional_edges("model", tools_condition) - graph.add_edge("tools", END) - app = graph.compile() - - 
result = app.invoke({"messages": [HumanMessage(content="test")]}) - tool_msgs = [m for m in result["messages"] if isinstance(m, ToolMessage)] - - print("\n[LangGraph] Parallel tool calls:") - for msg in tool_msgs: - print(f" - {msg.name}: {len(msg.content)} chars") - - assert len(tool_msgs) == 2 - names = {m.name for m in tool_msgs} - assert "cloudflare_browser_run_markdown" in names - assert "cloudflare_browser_run_links" in names - - def test_parallel_nodes_with_loader(self, account_id, api_token): - """Loader and Tool run as parallel nodes in a DAG.""" - from typing import TypedDict - - from langgraph.graph import END, START, StateGraph - - class ParallelState(TypedDict): - url: str - page_content: str - links: list - - def fetch_page(state: ParallelState) -> dict: - loader = CloudflareBrowserRunLoader( - urls=[state["url"]], - mode="markdown", - account_id=account_id, - api_token=api_token, - ) - docs = loader.load() - return {"page_content": docs[0].page_content} - - def extract_links(state: ParallelState) -> dict: - tool = CloudflareBrowserRunTool( - mode="links", - account_id=account_id, - api_token=api_token, - ) - links = tool.invoke({"url": state["url"]}).strip().split("\n") - return {"links": links} - - graph = StateGraph(ParallelState) - graph.add_node("fetch_page", fetch_page) - graph.add_node("extract_links", extract_links) - graph.add_edge(START, "fetch_page") - graph.add_edge(START, "extract_links") - graph.add_edge("fetch_page", END) - graph.add_edge("extract_links", END) - app = graph.compile() - - result = app.invoke({"url": TEST_URL, "page_content": "", "links": []}) - - print("\n[LangGraph] Parallel nodes (Loader + Tool):") - print(f" Content: {len(result['page_content'])} chars") - print(f" Links: {result['links']}") - - assert "Example Domain" in result["page_content"] - assert len(result["links"]) >= 1 diff --git a/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py b/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py 
index c6a2b3e..abc75ee 100644 --- a/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py +++ b/libs/langchain-cloudflare/tests/unit_tests/test_browser_run.py @@ -379,3 +379,321 @@ def test_extra_fields_forbidden(self): api_token="tok", unknown_field="bad", ) + + +# MARK: - Mocked HTTP Behavior Tests + + +class TestErrorEnvelopes: + """Verify _check_api_response raises on success=false envelopes.""" + + def test_success_false_raises(self): + """API error envelope should raise RuntimeError.""" + from langchain_cloudflare.loaders import _check_api_response + + with pytest.raises(RuntimeError, match="Browser Run API error"): + _check_api_response( + {"success": False, "errors": [{"message": "bad request"}]} + ) + + def test_success_true_passes(self): + """Normal response should not raise.""" + from langchain_cloudflare.loaders import _check_api_response + + _check_api_response({"success": True, "result": "ok"}) + + def test_non_dict_passes(self): + """Non-dict response should not raise.""" + from langchain_cloudflare.loaders import _check_api_response + + _check_api_response("plain string") + _check_api_response(["a", "list"]) + + +class TestBinaryEndpointErrorHandling: + """Verify screenshot/pdf detect JSON error responses instead of blindly encoding.""" + + def test_screenshot_json_error_raises(self, monkeypatch: pytest.MonkeyPatch): + """Screenshot mode should raise when API returns JSON error.""" + from unittest.mock import MagicMock, patch + + tool = CloudflareBrowserRunTool( + mode="screenshot", + account_id="abc123", + api_token="tok", + ) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.headers = {"content-type": "application/json"} + mock_resp.json.return_value = { + "success": False, + "errors": [{"message": "invalid URL"}], + } + + with patch( + "langchain_cloudflare.loaders.requests.post", return_value=mock_resp + ): + with pytest.raises(RuntimeError, match="Browser Run"): + tool._run("https://example.com") + + 
def test_screenshot_html_error_raises(self): + """Screenshot mode should raise when API returns HTML error page.""" + from unittest.mock import MagicMock, patch + + tool = CloudflareBrowserRunTool( + mode="screenshot", + account_id="abc123", + api_token="tok", + ) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.headers = {"content-type": "text/html"} + mock_resp.json.return_value = {"success": True, "result": "error page"} + + with patch( + "langchain_cloudflare.loaders.requests.post", return_value=mock_resp + ): + with pytest.raises(RuntimeError, match="instead of binary"): + tool._run("https://example.com") + + def test_screenshot_binary_success(self): + """Screenshot mode should return base64 when API returns image.""" + from unittest.mock import MagicMock, patch + + tool = CloudflareBrowserRunTool( + mode="screenshot", + account_id="abc123", + api_token="tok", + ) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.headers = {"content-type": "image/png"} + mock_resp.content = b"\x89PNG\r\n\x1a\nfake" + + with patch( + "langchain_cloudflare.loaders.requests.post", return_value=mock_resp + ): + result = tool._run("https://example.com") + assert isinstance(result, str) + assert len(result) > 0 + + +class TestCrawlPolling: + """Verify crawl timeout, error status, and pagination handling.""" + + def test_crawl_timeout_warns(self): + """Crawl should warn and return partial results on timeout.""" + from unittest.mock import MagicMock, patch + + loader = CloudflareBrowserRunLoader( + urls=["https://example.com"], + mode="crawl", + crawl_timeout=0.1, + crawl_poll_interval=0.05, + account_id="abc123", + api_token="tok", + ) + + # Mock: POST /crawl returns job_id, GET always returns "processing" + mock_post = MagicMock() + mock_post.raise_for_status = MagicMock() + mock_post.json.return_value = {"result": "job-123"} + + mock_get = MagicMock() + mock_get.raise_for_status = MagicMock() + 
mock_get.json.return_value = {"result": {"status": "processing", "records": []}} + + with ( + patch("langchain_cloudflare.loaders.requests.post", return_value=mock_post), + patch("langchain_cloudflare.loaders.requests.get", return_value=mock_get), + ): + with pytest.warns(UserWarning, match="timed out"): + docs = loader.load() + + assert docs == [] + + def test_crawl_errored_status_stops(self): + """Crawl should stop polling when job status is errored.""" + from unittest.mock import MagicMock, patch + + loader = CloudflareBrowserRunLoader( + urls=["https://example.com"], + mode="crawl", + account_id="abc123", + api_token="tok", + ) + + mock_post = MagicMock() + mock_post.raise_for_status = MagicMock() + mock_post.json.return_value = {"result": "job-456"} + + mock_get = MagicMock() + mock_get.raise_for_status = MagicMock() + mock_get.json.return_value = {"result": {"status": "errored", "records": []}} + + with ( + patch("langchain_cloudflare.loaders.requests.post", return_value=mock_post), + patch("langchain_cloudflare.loaders.requests.get", return_value=mock_get), + ): + docs = loader.load() + + assert docs == [] + + def test_crawl_completed_with_records(self): + """Crawl should return Documents from completed records.""" + from unittest.mock import MagicMock, patch + + loader = CloudflareBrowserRunLoader( + urls=["https://example.com"], + mode="crawl", + account_id="abc123", + api_token="tok", + ) + + mock_post = MagicMock() + mock_post.raise_for_status = MagicMock() + mock_post.json.return_value = {"result": "job-789"} + + mock_get = MagicMock() + mock_get.raise_for_status = MagicMock() + mock_get.json.return_value = { + "result": { + "status": "completed", + "records": [ + { + "url": "https://example.com", + "status": "completed", + "markdown": "# Example\nHello world", + "metadata": {"title": "Example", "status": 200}, + }, + { + "url": "https://example.com/about", + "status": "completed", + "markdown": "# About\nAbout us", + "metadata": {"title": "About", 
"status": 200}, + }, + ], + } + } + + with ( + patch("langchain_cloudflare.loaders.requests.post", return_value=mock_post), + patch("langchain_cloudflare.loaders.requests.get", return_value=mock_get), + ): + docs = loader.load() + + assert len(docs) == 2 + assert docs[0].page_content == "# Example\nHello world" + assert docs[0].metadata["source"] == "https://example.com" + assert docs[0].metadata["title"] == "Example" + assert docs[1].metadata["source"] == "https://example.com/about" + + +class TestRequestBodyConstruction: + """Verify request bodies are constructed correctly per mode.""" + + def test_markdown_body(self): + """Markdown mode sends url + shared options.""" + from unittest.mock import MagicMock, patch + + loader = CloudflareBrowserRunLoader( + urls=["https://example.com"], + mode="markdown", + viewport={"width": 1920, "height": 1080}, + account_id="abc123", + api_token="tok", + ) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.return_value = {"success": True, "result": "# Hello"} + + with patch( + "langchain_cloudflare.loaders.requests.post", return_value=mock_resp + ) as mock_post: + loader.load() + + call_kwargs = mock_post.call_args + body = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") + assert body["url"] == "https://example.com" + assert body["viewport"] == {"width": 1920, "height": 1080} + + def test_scrape_body_includes_elements(self): + """Scrape mode sends elements in the request body.""" + from unittest.mock import MagicMock, patch + + loader = CloudflareBrowserRunLoader( + urls=["https://example.com"], + mode="scrape", + elements=[{"selector": "h1"}, {"selector": ".price"}], + account_id="abc123", + api_token="tok", + ) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.return_value = {"success": True, "result": []} + + with patch( + "langchain_cloudflare.loaders.requests.post", return_value=mock_resp + ) as mock_post: + loader.load() + + call_kwargs 
= mock_post.call_args + body = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") + assert body["elements"] == [{"selector": "h1"}, {"selector": ".price"}] + + def test_json_tool_body_includes_prompt_and_schema(self): + """JSON tool sends prompt and response_format in the body.""" + from unittest.mock import MagicMock, patch + + schema = {"type": "json_schema", "schema": {"type": "object"}} + tool = CloudflareBrowserRunTool( + mode="json", + json_prompt="Extract facts.", + json_response_format=schema, + account_id="abc123", + api_token="tok", + ) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.return_value = {"success": True, "result": {"key": "val"}} + + with patch( + "langchain_cloudflare.loaders.requests.post", return_value=mock_resp + ) as mock_post: + tool._run("https://example.com") + + call_kwargs = mock_post.call_args + body = call_kwargs.kwargs.get("json") or call_kwargs[1].get("json") + assert body["prompt"] == "Extract facts." + assert body["response_format"] == schema + + def test_loader_sends_timeout(self): + """All loader requests include the configured timeout.""" + from unittest.mock import MagicMock, patch + + loader = CloudflareBrowserRunLoader( + urls=["https://example.com"], + mode="markdown", + request_timeout=30.0, + account_id="abc123", + api_token="tok", + ) + + mock_resp = MagicMock() + mock_resp.raise_for_status = MagicMock() + mock_resp.json.return_value = {"success": True, "result": "# Hello"} + + with patch( + "langchain_cloudflare.loaders.requests.post", return_value=mock_resp + ) as mock_post: + loader.load() + + call_kwargs = mock_post.call_args + timeout = call_kwargs.kwargs.get("timeout") or call_kwargs[1].get("timeout") + assert timeout == 30.0 From bfb68b41dccb6f539595b1a613f5258d246c86d0 Mon Sep 17 00:00:00 2001 From: Vamshi_BIDS Date: Tue, 21 Apr 2026 14:27:04 -0500 Subject: [PATCH 5/5] fix: add missing timeout to crawl pagination GET request --- 
libs/langchain-cloudflare/langchain_cloudflare/loaders.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/libs/langchain-cloudflare/langchain_cloudflare/loaders.py b/libs/langchain-cloudflare/langchain_cloudflare/loaders.py index 78a1250..80e33a9 100644 --- a/libs/langchain-cloudflare/langchain_cloudflare/loaders.py +++ b/libs/langchain-cloudflare/langchain_cloudflare/loaders.py @@ -432,7 +432,12 @@ def _fetch_crawl(self, url: str) -> List[Document]: if cursor is not None: params["cursor"] = cursor - page_resp = requests.get(results_url, headers=self._headers, params=params) + page_resp = requests.get( + results_url, + headers=self._headers, + params=params, + timeout=self.request_timeout, + ) page_resp.raise_for_status() page_data = page_resp.json().get("result", {})