From 19a565f7514593a00431dfba332cfd9be02bb21f Mon Sep 17 00:00:00 2001
From: Vikrant-Khedkar
Date: Thu, 30 Apr 2026 14:34:20 +0530
Subject: [PATCH] feat!: rename MCP tools to match v2 docs canonical names (BREAKING)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Aligns the MCP surface to the public v2 API vocabulary at
docs.scrapegraphai.com/services/*. Hard rename, no aliases — bumped to v3.0.0.

Renames:
- smartscraper -> extract (POST /extract)
- searchscraper -> search (POST /search)
- smartcrawler_initiate -> crawl_start (POST /crawl)
- smartcrawler_fetch_results -> crawl_get_status (GET /crawl/:id)
- sgai_history -> history (GET /history)
- generate_schema -> schema (POST /schema)
- markdownify -> removed (use scrape with output_format="markdown")

Tool count: 18 -> 17. ScapeGraphClient also lost its dead markdownify and
duplicate-name shim methods (extract/search) that would have recursed
infinitely after the rename.

End-to-end verified locally via HTTP transport with X-API-Key:
- tools/list returns the 17 canonical names
- extract / search / scrape / history / credits all return live API data
- smartscraper / markdownify return "Unknown tool" as expected

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .agent/README.md                      |  18 +-
 .agent/system/mcp_protocol.md         |  40 ++--
 .agent/system/project_architecture.md |  70 +++---
 README.md                             |  67 +++---
 pyproject.toml                        |   2 +-
 server.json                           |   4 +-
 src/scrapegraph_mcp/server.py         | 311 +++++++++-----------------
 uv.lock                               |   2 +-
 8 files changed, 217 insertions(+), 297 deletions(-)

diff --git a/.agent/README.md b/.agent/README.md
index d7ace3c..fbb8a16 100644
--- a/.agent/README.md
+++ b/.agent/README.md
@@ -12,7 +12,7 @@ Complete system architecture documentation including:
- **Technology Stack** - Python 3.10+, FastMCP, httpx dependencies
- **Project Structure** - File organization and key files
- **Core Architecture** - MCP design, server architecture, patterns
-- **MCP Tools** - API v2 tools (markdownify, scrape, smartscraper, searchscraper, crawl, credits, history, monitor, …)
+- **MCP Tools** - API v3 tools (scrape, extract, search, crawl_start, crawl_get_status, schema, credits, history, monitor_*)
- **API Integration** - ScrapeGraphAI API endpoints and credit system
- **Deployment** - Smithery, Claude Desktop, Cursor, Docker setup
- **Recent Updates** - SmartCrawler integration and latest features
@@ -95,14 +95,14 @@ Complete Model Context Protocol integration documentation:

**...available tools and their parameters:**
- Read: [Project Architecture - MCP Tools](./system/project_architecture.md#mcp-tools)
-- Quick reference: see README “Available Tools” table (v2: + scrape, crawl_stop/resume, credits, sgai_history, monitor_*; removed sitemap, agentic_scrapper, *\_status tools)
+- Quick reference: see README “Available Tools” table (v2: + scrape, crawl_stop/resume, credits, history, monitor_*; removed sitemap, agentic_scrapper, *\_status tools)

**...error handling:**
- Read: [MCP Protocol - Error Handling](./system/mcp_protocol.md#error-handling)
- Pattern: Return `{"error": "message"}` instead of raising exceptions

**...how SmartCrawler works:**
-- Read: [Project Architecture - Tool #4 & #5](./system/project_architecture.md#4-smartcrawler_initiate)
-- Pattern: Initiate (async) → Poll fetch_results until complete
+- Read: [Project Architecture - Tool #4 & #5](./system/project_architecture.md#4-crawl_start)
+- Pattern: Initiate (async) → Poll `crawl_get_status` until complete

---
@@ -133,7 +133,7 @@ npx @modelcontextprotocol/inspector scrapegraph-mcp

**Manual Testing (stdio):**
```bash
-echo '{"jsonrpc":"2.0","method":"tools/call","params":{"name":"markdownify","arguments":{"website_url":"https://scrapegraphai.com"}},"id":1}' | scrapegraph-mcp
+echo '{"jsonrpc":"2.0","method":"tools/call","params":{"name":"scrape","arguments":{"website_url":"https://scrapegraphai.com","output_format":"markdown"}},"id":1}' | scrapegraph-mcp
-# (v2: same tool name; backend calls POST /scrape)
+# (v3: markdownify removed; scrape hits the same POST /scrape endpoint)
```

@@ -177,11 +177,11 @@ Quick reference to all MCP tools:

| Tool | Notes |
|------|--------|
-| `markdownify` / `scrape` | POST /scrape (v2) |
-| `smartscraper` | POST /extract; URL only |
-| `searchscraper` | POST /search; num_results 3–20 |
-| `smartcrawler_*`, `crawl_stop`, `crawl_resume` | POST/GET /crawl |
+| `scrape` | POST /scrape (v2) |
+| `extract` | POST /extract; URL only |
+| `search` | POST /search; num_results 1–20 |
+| `crawl_start`, `crawl_get_status`, `crawl_stop`, `crawl_resume` | POST/GET /crawl |
-| `credits`, `sgai_history` | GET /credits, /history |
+| `credits`, `history` | GET /credits, /history |
| `monitor_*` | /monitor namespace |

For detailed tool documentation, see [Project Architecture - MCP Tools](./system/project_architecture.md#mcp-tools).
@@ -229,7 +229,7 @@ For detailed tool documentation, see [Project Architecture - MCP Tools](./system

**Issue: SmartCrawler not returning results**
- **Cause:** Still processing (async operation)
-- **Solution:** Keep polling `smartcrawler_fetch_results()` until `status == "completed"`
+- **Solution:** Keep polling `crawl_get_status()` until `status == "completed"`

**Issue: Python version error**
- **Cause:** Python < 3.10
diff --git a/.agent/system/mcp_protocol.md b/.agent/system/mcp_protocol.md
index 822a1de..5868497 100644
--- a/.agent/system/mcp_protocol.md
+++ b/.agent/system/mcp_protocol.md
@@ -41,7 +41,7 @@ The **Model Context Protocol** (MCP) is an open standard that defines how AI assistants
- Functions exposed by the server
- Have typed parameters and return values
- Automatically discovered by AI assistants
-- **Examples:** `markdownify()`, `smartscraper()`
+- **Examples:** `scrape()`, `extract()`

**5. Resources**
- Data exposed by the server (optional)
@@ -104,7 +104,7 @@ mcp = FastMCP("ScapeGraph API MCP Server")

# Define tools with decorators
@mcp.tool()
-def markdownify(website_url: str) -> Dict[str, Any]:
+def scrape(website_url: str) -> Dict[str, Any]:
    """Convert a webpage to markdown."""
    # Implementation...
    return {"result": "..."}
@@ -135,7 +135,7 @@ mcp.run(transport="stdio")

**Example Flow:**
```
Client → Server (stdin):
-{"jsonrpc": "2.0", "method": "tools/call", "params": {"name": "markdownify", "arguments": {"website_url": "https://example.com"}}, "id": 1}
+{"jsonrpc": "2.0", "method": "tools/call", "params": {"name": "scrape", "arguments": {"website_url": "https://example.com"}}, "id": 1}

Server → Client (stdout):
{"jsonrpc": "2.0", "result": {"result": "# Example\n\nMarkdown content..."}, "id": 1}
@@ -151,7 +151,7 @@ MCP uses JSON-RPC 2.0 for message structure:
  "jsonrpc": "2.0",
  "method": "tools/call",
  "params": {
-    "name": "smartscraper",
+    "name": "extract",
    "arguments": {
      "user_prompt": "Extract product names",
      "website_url": "https://example.com"
@@ -199,7 +199,7 @@ Response:
  "result": {
    "tools": [
      {
-        "name": "markdownify",
+        "name": "scrape",
        "description": "Convert a webpage into clean, formatted markdown.",
        "inputSchema": {
          "type": "object",
@@ -232,12 +232,30 @@ Response:

Each tool exposed by the server has a schema that defines its parameters and return type.
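+
+To eyeball the generated schemas without wiring up a client, you can dump them
+straight from the FastMCP instance (a sketch: `get_tools()` and `Tool.parameters`
+match FastMCP 2.x; verify against your installed version):
+
+```python
+import asyncio
+
+from scrapegraph_mcp.server import mcp
+
+async def dump_schemas() -> None:
+    tools = await mcp.get_tools()        # mapping: tool name -> Tool object
+    for name, tool in sorted(tools.items()):
+        print(name, tool.parameters)     # input schema, as JSON Schema
+
+asyncio.run(dump_schemas())
+```
+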
-### Example: `markdownify` Tool +### Example: `scrape` Tool **Python Definition:** ```python @mcp.tool() -def markdownify(website_url: str) -> Dict[str, Any]: +def scrape(website_url: str) -> Dict[str, Any]: """ Convert a webpage into clean, formatted markdown. @@ -253,7 +253,7 @@ def markdownify(website_url: str) -> Dict[str, Any]: **Generated MCP Schema:** ```json { - "name": "markdownify", + "name": "scrape", "description": "Convert a webpage into clean, formatted markdown.", "inputSchema": { "type": "object", @@ -275,12 +275,12 @@ def markdownify(website_url: str) -> Dict[str, Any]: - Python `Dict[str, Any]` → JSON Schema `"type": "object"` - Python `Optional[str]` → JSON Schema `"type": ["string", "null"]` -### Example: `smartscraper` Tool (with optional parameters) +### Example: `extract` Tool (with optional parameters) **Python Definition:** ```python @mcp.tool() -def smartscraper( +def extract( user_prompt: str, website_url: str, number_of_scrolls: int = None, @@ -293,7 +293,7 @@ def smartscraper( **Generated MCP Schema:** ```json { - "name": "smartscraper", + "name": "extract", "description": "Extract structured data from a webpage using AI.", "inputSchema": { "type": "object", @@ -458,8 +458,8 @@ AI: "I wasn't able to convert the webpage because your ScrapeGraphAI account has User: "What are the main features of ScrapeGraphAI?" Claude (internal): -1. Determines that markdownify tool could help -2. Calls: markdownify("https://scrapegraphai.com") +1. Determines that scrape tool could help +2. Calls: scrape("https://scrapegraphai.com") 3. Receives markdown content 4. Analyzes content 5. Responds to user @@ -518,7 +518,7 @@ async def main(): # Call a tool result = await session.call_tool( - "markdownify", + "scrape", arguments={"website_url": "https://example.com"} ) print(f"Result: {result}") @@ -543,7 +543,7 @@ else: Currently, the server does not implement tool versioning. All tools are v1 implicitly. **Future Consideration:** -- Add version to tool names: `smartscraper_v2()` +- Add version to tool names: `extract_v2()` - Maintain backward compatibility with deprecated tools - Use MCP metadata for version info @@ -552,11 +552,11 @@ Currently, the server does not implement tool versioning. All tools are v1 impli MCP supports streaming results for long-running operations. This could be useful for SmartCrawler: **Current Approach (polling):** -1. Call `smartcrawler_initiate()` → get `request_id` -2. Repeatedly call `smartcrawler_fetch_results(request_id)` until complete +1. Call `crawl_start()` → get `request_id` +2. Repeatedly call `crawl_get_status(request_id)` until complete **Potential Streaming Approach:** -1. Call `smartcrawler_initiate()` → server keeps connection open +1. Call `crawl_start()` → server keeps connection open 2. Server streams progress updates: `{"status": "processing", "pages": 10}` 3. Server sends final result: `{"status": "completed", "results": [...]}` @@ -618,8 +618,8 @@ logging.basicConfig( logger = logging.getLogger(__name__) @mcp.tool() -def markdownify(website_url: str) -> Dict[str, Any]: - logger.info(f"markdownify called with URL: {website_url}") +def scrape(website_url: str) -> Dict[str, Any]: + logger.info(f"scrape called with URL: {website_url}") # ... 
```
diff --git a/.agent/system/project_architecture.md b/.agent/system/project_architecture.md
index 94e5657..3340cd9 100644
--- a/.agent/system/project_architecture.md
+++ b/.agent/system/project_architecture.md
@@ -1,7 +1,7 @@
# ScrapeGraph MCP Server - Project Architecture

**Last Updated:** April 2026
-**Version:** 2.0.0
+**Version:** 3.0.0

## Table of Contents
- [System Overview](#system-overview)
@@ -20,9 +20,9 @@

The ScrapeGraph MCP Server is a production-ready [Model Context Protocol](https://modelcontextprotocol.io/introduction) (MCP) server that provides seamless integration between AI assistants (like Claude, Cursor, etc.) and the [ScrapeGraphAI API](https://scrapegraphai.com). This server enables language models to leverage advanced AI-powered web scraping capabilities with enterprise-grade reliability.

**Key Capabilities (API v2):**
-- **Scrape** (`markdownify`, `scrape`) — POST `/v2/scrape`
-- **Extract** (`smartscraper`) — POST `/v2/extract` (URL-only)
-- **Search** (`searchscraper`) — POST `/v2/search`
+- **Scrape** (`scrape`) — POST `/v2/scrape`
+- **Extract** (`extract`) — POST `/v2/extract` (URL-only)
+- **Search** (`search`) — POST `/v2/search`
- **Crawl** — POST/GET `/v2/crawl` (+ stop/resume); markdown/html crawl only
- **Monitor, credits, history** — `/v2/monitor`, `/credits`, `/history`

@@ -188,9 +188,9 @@ The server follows a simple, single-file architecture:

The server exposes many `@mcp.tool()` handlers (see repository `README.md` for the full table). The detailed subsections below still use **v1-style endpoint names** in several places; treat them as illustrative and prefer the v2 mapping in **API Integration**.

-**v2 tool names:** `markdownify`, `scrape`, `smartscraper`, `searchscraper`, `smartcrawler_initiate`, `smartcrawler_fetch_results`, `crawl_stop`, `crawl_resume`, `credits`, `sgai_history`, `monitor_create`, `monitor_list`, `monitor_get`, `monitor_pause`, `monitor_resume`, `monitor_delete`, `monitor_activity`.
+**v3 tool names:** `scrape`, `extract`, `search`, `crawl_start`, `crawl_get_status`, `crawl_stop`, `crawl_resume`, `schema`, `credits`, `history`, `monitor_create`, `monitor_list`, `monitor_get`, `monitor_pause`, `monitor_resume`, `monitor_delete`, `monitor_activity`.

-### 1. `markdownify(website_url: str)`
+### 1. `scrape(website_url: str)`

**Purpose:** Convert a webpage into clean, formatted markdown

@@ -214,16 +214,16 @@ The server exposes many `@mcp.tool()` handlers (see repository `README.md` for t

**Example Usage (from AI):**
```
"Convert https://scrapegraphai.com to markdown"
-→ AI calls: markdownify("https://scrapegraphai.com")
+→ AI calls: scrape("https://scrapegraphai.com")
```

-**API Endpoint:** `POST /v1/markdownify`
+**API Endpoint:** `POST /v1/scrape`

**Credits:** 2 credits per request

---

-### 2. `smartscraper(user_prompt: str, website_url: str, number_of_scrolls: int = None, markdown_only: bool = None)`
+### 2. 
`extract(user_prompt: str, website_url: str, number_of_scrolls: int = None, markdown_only: bool = None)` **Purpose:** Extract structured data from a webpage using AI @@ -246,19 +246,19 @@ The server exposes many `@mcp.tool()` handlers (see repository `README.md` for t **Example Usage:** ``` "Extract all product names and prices from https://example.com/products" -→ AI calls: smartscraper( +→ AI calls: extract( user_prompt="Extract product names and prices", website_url="https://example.com/products" ) ``` -**API Endpoint:** `POST /v1/smartscraper` +**API Endpoint:** `POST /v1/extract` **Credits:** 10 credits (base) + 1 credit per scroll + additional charges --- -### 3. `searchscraper(user_prompt: str, num_results: int = None, number_of_scrolls: int = None, time_range: str = None)` +### 3. `search(user_prompt: str, num_results: int = None, number_of_scrolls: int = None, time_range: str = None)` **Purpose:** Perform AI-powered web searches with structured results @@ -284,20 +284,20 @@ The server exposes many `@mcp.tool()` handlers (see repository `README.md` for t **Example Usage:** ``` "Research the latest AI developments in 2025" -→ AI calls: searchscraper( +→ AI calls: search( user_prompt="Latest AI developments in 2025", num_results=5, time_range="past_week" ) ``` -**API Endpoint:** `POST /v1/searchscraper` +**API Endpoint:** `POST /v1/search` **Credits:** Variable (3-20 websites × 10 credits per website) --- -### 4. `smartcrawler_initiate(url: str, prompt: str = None, extraction_mode: str = "ai", depth: int = None, max_pages: int = None, same_domain_only: bool = None)` +### 4. `crawl_start(url: str, prompt: str = None, extraction_mode: str = "ai", depth: int = None, max_pages: int = None, same_domain_only: bool = None)` **Purpose:** Initiate intelligent multi-page web crawling (asynchronous) @@ -320,7 +320,7 @@ The server exposes many `@mcp.tool()` handlers (see repository `README.md` for t **Example Usage:** ``` "Crawl https://docs.python.org and extract all function signatures" -→ AI calls: smartcrawler_initiate( +→ AI calls: crawl_start( url="https://docs.python.org", prompt="Extract function signatures and descriptions", extraction_mode="ai", @@ -333,16 +333,16 @@ The server exposes many `@mcp.tool()` handlers (see repository `README.md` for t **Credits:** 100 credits (base) + 10 credits per page (AI mode) or 2 credits per page (markdown mode) -**Note:** This is an asynchronous operation. Use `smartcrawler_fetch_results()` to retrieve results. +**Note:** This is an asynchronous operation. Use `crawl_get_status()` to retrieve results. --- -### 5. `smartcrawler_fetch_results(request_id: str)` +### 5. 
`crawl_get_status(request_id: str)`

**Purpose:** Fetch the results of a SmartCrawler operation

**Parameters:**
-- `request_id` (str) - The request ID returned by `smartcrawler_initiate()`
+- `request_id` (str) - The request ID returned by `crawl_start()`

**Returns (while processing):**
```json
{
@@ -369,7 +369,7 @@ The server exposes many `@mcp.tool()` handlers (see repository `README.md` for t

**Example Usage:**
```
AI: "Check the status of crawl request abc-123"
-→ AI calls: smartcrawler_fetch_results("abc-123")
+→ AI calls: crawl_get_status("abc-123")

If status is "processing":
→ AI: "Still processing, 15/50 pages completed"

If status is "completed":
@@ -401,15 +401,16 @@

| Endpoint | Method | MCP tools (typical) |
|----------|--------|---------------------|
-| `/scrape` | POST | `markdownify`, `scrape` |
-| `/extract` | POST | `smartscraper` |
-| `/search` | POST | `searchscraper` |
-| `/crawl` | POST | `smartcrawler_initiate` |
-| `/crawl/{id}` | GET | `smartcrawler_fetch_results` |
+| `/scrape` | POST | `scrape` |
+| `/extract` | POST | `extract` |
+| `/search` | POST | `search` |
+| `/crawl` | POST | `crawl_start` |
+| `/crawl/{id}` | GET | `crawl_get_status` |
| `/crawl/{id}/stop` | POST | `crawl_stop` |
| `/crawl/{id}/resume` | POST | `crawl_resume` |
+| `/schema` | POST | `schema` |
| `/credits` | GET | `credits` |
-| `/history` | GET | `sgai_history` |
+| `/history` | GET | `history` |
| `/monitor` | POST, GET | `monitor_create`, `monitor_list` |
| `/monitor/{id}` | GET, DELETE | `monitor_get`, `monitor_delete` |
| `/monitor/{id}/pause` | POST | `monitor_pause` |
@@ -596,8 +596,8 @@

### October 2025

**SmartCrawler Integration (Latest):**
-- Added `smartcrawler_initiate()` tool for multi-page crawling
-- Added `smartcrawler_fetch_results()` tool for async result retrieval
+- Added `smartcrawler_initiate()` (now `crawl_start()`) for multi-page crawling
+- Added `smartcrawler_fetch_results()` (now `crawl_get_status()`) for async result retrieval
- Support for AI extraction mode (10 credits/page) and markdown mode (2 credits/page)
- Configurable depth, max_pages, and same_domain_only parameters
- Enhanced error handling for extraction mode validation
@@ -679,19 +679,19 @@ mypy src/

### Manual Testing

-**Test markdownify:**
+**Test scrape:**
```bash
-echo '{"method":"tools/call","params":{"name":"markdownify","arguments":{"website_url":"https://scrapegraphai.com"}}}' | scrapegraph-mcp
+echo '{"method":"tools/call","params":{"name":"scrape","arguments":{"website_url":"https://scrapegraphai.com"}}}' | scrapegraph-mcp
```

-**Test smartscraper:**
+**Test extract:**
```bash
-echo '{"method":"tools/call","params":{"name":"smartscraper","arguments":{"user_prompt":"Extract main features","website_url":"https://scrapegraphai.com"}}}' | scrapegraph-mcp
+echo '{"method":"tools/call","params":{"name":"extract","arguments":{"user_prompt":"Extract main features","website_url":"https://scrapegraphai.com"}}}' | scrapegraph-mcp
```

-**Test searchscraper:**
+**Test search:**
```bash
-echo '{"method":"tools/call","params":{"name":"searchscraper","arguments":{"user_prompt":"Latest AI news"}}}' | scrapegraph-mcp
+echo '{"method":"tools/call","params":{"name":"search","arguments":{"user_prompt":"Latest AI news"}}}' | scrapegraph-mcp
```

### Integration Testing
@@ -735,7 +735,7 @@

**Issue: SmartCrawler not returning results**
- **Cause:** Still processing (async operation)
-- **Solution:** Keep polling `smartcrawler_fetch_results()` until `status == "completed"`
+- **Solution:** Keep polling 
`crawl_get_status()` until `status == "completed"`

---

diff --git a/README.md b/README.md
index e7b24a3..8c8f680 100644
--- a/README.md
+++ b/README.md
@@ -40,14 +40,29 @@ This MCP server targets **ScrapeGraph API v2** (`https://v2-api.scrapegraphai.co

## Key Features

-- **Scrape & extract**: `markdownify` / `scrape` (POST /scrape), `smartscraper` (POST /extract, URL only)
-- **Search**: `searchscraper` (POST /search; `num_results` clamped 3–20)
-- **Crawl**: Async multi-page crawl in **markdown** or **html** only; `crawl_stop` / `crawl_resume`
+- **Scrape & extract**: `scrape` (POST /scrape, multi-format), `extract` (POST /extract, URL + prompt)
+- **Search**: `search` (POST /search; `num_results` clamped 1–20)
+- **Crawl**: Async multi-page crawl with `crawl_start` / `crawl_get_status` / `crawl_stop` / `crawl_resume`
+- **Schema**: `schema` (POST /schema) — generate or augment a JSON Schema from a prompt
- **Monitors**: Scheduled jobs via `monitor_create`, `monitor_list`, `monitor_get`, pause/resume/delete, `monitor_activity` (paginated tick history)
-- **Account**: `credits`, `sgai_history`
+- **Account**: `credits`, `history`
- **Easy integration**: Claude Desktop, Cursor, Smithery, HTTP transport
- **Developer docs**: `.agent/` folder

+## Migration: v2 → v3
+
+v3 renames every MCP tool whose name diverged from the v2 API docs. **Hard rename, no aliases.**
+
+| v2 (old) | v3 (new) |
+|---|---|
+| `smartscraper` | `extract` |
+| `searchscraper` | `search` |
+| `smartcrawler_initiate` | `crawl_start` |
+| `smartcrawler_fetch_results` | `crawl_get_status` |
+| `sgai_history` | `history` |
+| `generate_schema` | `schema` |
+| `markdownify` | **removed** — use `scrape` with `output_format="markdown"` |
+
## Quick Start

### 1. Get Your API Key
@@ -73,19 +88,19 @@ That's it! The server is now available to your AI assistant.

| Tool | Role |
|------|------|
-| `markdownify` | POST /scrape (markdown) |
-| `scrape` | POST /scrape (`output_format`: markdown, html, screenshot, branding) |
-| `smartscraper` | POST /extract (requires `website_url`; no inline HTML/markdown body on v2) |
-| `searchscraper` | POST /search (`num_results` 3–20; `time_range` / `number_of_scrolls` ignored on v2) |
-| `smartcrawler_initiate` | POST /crawl — `extraction_mode` **`markdown`** or **`html`** (default markdown). No AI crawl across pages. 
| 
-| `smartcrawler_fetch_results` | GET /crawl/:id |
+| `scrape` | POST /scrape (`output_format`: markdown, html, screenshot, branding, links, images, summary) |
+| `extract` | POST /extract (requires `website_url` + `user_prompt`; optional `output_schema`) |
+| `search` | POST /search (`num_results` 1–20; optional `country`, `output_schema`; `time_range` accepted but not applied on v2) |
+| `crawl_start` | POST /crawl — `extraction_mode` markdown / html / links / images / summary / branding / screenshot |
+| `crawl_get_status` | GET /crawl/:id (poll until `status: completed`) |
| `crawl_stop`, `crawl_resume` | POST /crawl/:id/stop \| resume |
+| `schema` | POST /schema (generate or augment a JSON Schema from a prompt) |
| `credits` | GET /credits |
-| `sgai_history` | GET /history |
+| `history` | GET /history (paginated, `service` filter) |
| `monitor_create`, `monitor_list`, `monitor_get`, `monitor_pause`, `monitor_resume`, `monitor_delete` | /monitor API |
| `monitor_activity` | GET /monitor/:id/activity (paginated tick history: `id`, `createdAt`, `status`, `changed`, `elapsedMs`, `diffs`) |

-**Removed vs older MCP releases:** `sitemap`, `agentic_scrapper`, `markdownify_status`, `smartscraper_status` (no v2 endpoints).
+**Removed:** `sitemap`, `agentic_scrapper`, async-status polling, and (in v3) `markdownify` — use `scrape` with `output_format="markdown"`.

## Setup Instructions

@@ -386,7 +401,7 @@ root_agent = LlmAgent(
            timeout=300.0,)
        ),
        # Optional: Filter which tools from the MCP server are exposed
-        # tool_filter=['markdownify', 'smartscraper', 'searchscraper']
+        # tool_filter=['scrape', 'extract', 'search']
    )
],
)
@@ -403,7 +418,7 @@ root_agent = LlmAgent(
- By default, all registered MCP tools are exposed to the agent (see [Available Tools](#available-tools))
- Use `tool_filter` to limit which tools are available:
  ```python
-  tool_filter=['markdownify', 'smartscraper', 'searchscraper']
+  tool_filter=['scrape', 'extract', 'search']
  ```

**API Key Configuration:**
@@ -430,26 +445,25 @@ The server enables sophisticated queries across various scraping scenarios:

### Single Page Scraping
-- **Markdownify**: "Convert the ScrapeGraph documentation page to markdown"
-- **SmartScraper**: "Extract all product names, prices, and ratings from this e-commerce page"
-- **SmartScraper with scrolling**: "Scrape this infinite scroll page with 5 scrolls and extract all items"
+- **Scrape (markdown)**: "Convert the ScrapeGraph documentation page to markdown"
+- **Extract**: "Extract all product names, prices, and ratings from this e-commerce page"
+- **Extract with scrolling**: "Scrape this infinite scroll page with 5 scrolls and extract all items"
- **Basic Scrape**: "Fetch the HTML content of this JavaScript-heavy page with full rendering"

### Search and Research
-- **SearchScraper**: "Research and summarize recent developments in AI-powered web scraping"
-- **SearchScraper**: "Search for the top 5 articles about machine learning frameworks and extract key insights"
-- **SearchScraper**: "Find recent news about GPT-4 and provide a structured summary"
-- **SearchScraper**: v2 does not apply `time_range`; phrase queries to bias recency in natural language instead
+- **Search**: "Research and summarize recent developments in AI-powered web scraping"
+- **Search**: "Search for the top 5 articles about machine learning frameworks and extract key insights"
+- **Search**: "Find recent news about GPT-4 and provide a structured summary"
+- **Search**: v2 does not apply `time_range`; phrase queries to bias recency in natural language instead

### Website analysis
-- Use **`smartcrawler_initiate`** (markdown/html) 
plus **`smartcrawler_fetch_results`** to map and capture multi-page content; there is no separate **sitemap** tool on v2.
+- Use **`crawl_start`** plus **`crawl_get_status`** to map and capture multi-page content; there is no separate **sitemap** tool on v2.

### Multi-page crawling
-- **SmartCrawler (markdown/html)**: "Crawl the blog in markdown mode and poll until complete"
-- For structured fields per page, run **`smartscraper`** on individual URLs (or **`monitor_create`** on a schedule)
+- **Crawl**: "Crawl the blog in markdown mode and poll until complete"
+- For structured fields per page, run **`extract`** on individual URLs (or **`monitor_create`** on a schedule)

### Monitors and account
- **Monitor**: "Run this extract prompt on https://example.com every day at 9am" (`monitor_create` with interval)
-- **Credits / history**: `credits`, `sgai_history`
+- **Credits / history**: `credits`, `history`
-- **Agentic Scraper**: "Execute a complex workflow: login, navigate to reports, download data, and extract summary statistics"

## Error Handling
@@ -487,9 +502,9 @@ This ensures proper execution in the Windows environment.

- **Cause**: Insufficient credits
- **Solution**: Add credits to your ScrapeGraph account

-**SmartCrawler not returning results**
+**Crawl not returning results**
- **Cause**: Still processing (asynchronous operation)
-- **Solution**: Keep polling `smartcrawler_fetch_results()` until status is "completed"
+- **Solution**: Keep polling `crawl_get_status()` until status is "completed"

**Tools not appearing in Claude Desktop**
- **Cause**: Server not starting or configuration error
diff --git a/pyproject.toml b/pyproject.toml
index 344e3ce..f53d699 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "scrapegraph-mcp"
-version = "2.0.0"
+version = "3.0.0"
description = "MCP server for ScapeGraph API integration"
license = {text = "MIT"}
readme = "README.md"
diff --git a/server.json b/server.json
index d16eccc..7a5c766 100644
--- a/server.json
+++ b/server.json
@@ -6,12 +6,12 @@
    "url": "https://github.com/ScrapeGraphAI/scrapegraph-mcp",
    "source": "github"
  },
-  "version": "2.0.0",
+  "version": "3.0.0",
  "packages": [
    {
      "registryType": "pypi",
      "identifier": "scrapegraph-mcp",
-      "version": "2.0.0",
+      "version": "3.0.0",
      "transport": {
        "type": "stdio"
      },
diff --git a/src/scrapegraph_mcp/server.py b/src/scrapegraph_mcp/server.py
index 99dcf58..b0bd59e 100644
--- a/src/scrapegraph_mcp/server.py
+++ b/src/scrapegraph_mcp/server.py
@@ -3,17 +3,16 @@ MCP server for ScapeGraph API integration (API v2). 
Aligned with scrapegraph-py v2 ([ScrapeGraphAI/scrapegraph-py#84](https://github.com/ScrapeGraphAI/scrapegraph-py/pull/84)):
-- markdownify: Page content via POST /scrape (markdown by default)
-- smartscraper: Structured extraction via POST /extract (url + prompt; schema optional)
-- searchscraper: Web search via POST /search (supports numResults, schema, prompt,
+- extract: Structured extraction via POST /extract (url + prompt; schema optional)
+- search: Web search via POST /search (supports numResults, schema, prompt,
locationGeoCode, timeRange, format/mode)
-- smartcrawler_initiate / smartcrawler_fetch_results: Async crawl via /crawl
+- crawl_start / crawl_get_status: Async crawl via /crawl
(formats: markdown, html, links, images, summary, branding, screenshot)
- crawl_stop / crawl_resume: Control running crawl jobs
- scrape: Format-specific fetch (markdown, html, screenshot,
branding, links, images, summary) — emitted as v2 `formats[]` entries
-- generate_schema: JSON schema generation via POST /schema
-- credits / sgai_history: Account usage and request history (page/limit/service)
+- schema: JSON schema generation via POST /schema
+- credits / history: Account usage and request history (page/limit/service)
- monitor_*: Scheduled extraction jobs. `prompt`+`output_schema` are wrapped into
a v2 `{type: "json", ...}` format entry; `webhook_url` is supported.

@@ -21,7 +21,20 @@
locationGeoCode, maxDepth, maxPages, maxLinksPerPage, allowExternal,
includePatterns, excludePatterns, contentTypes, webhookUrl, contentType).

-Removed on v2 (no API equivalent): sitemap, agentic_scrapper, markdownify_status, smartscraper_status.
+Removed on v2 (no API equivalent): sitemap, agentic_scrapper, async-status polling endpoints.
+Renamed in v3 to match the v2 docs: smartscraper→extract, searchscraper→search,
+smartcrawler_initiate→crawl_start, smartcrawler_fetch_results→crawl_get_status,
+sgai_history→history, generate_schema→schema. markdownify removed (use scrape
+with output_format="markdown").
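+
+Client-side sketch of the renamed surface (MCP Python SDK ClientSession, as in
+.agent/system/mcp_protocol.md; illustrative only, result unwrapping omitted):
+
+    result = await session.call_tool(
+        "extract",  # v2 name: "smartscraper"
+        arguments={"user_prompt": "Extract the title",
+                   "website_url": "https://example.com"},
+    )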
Environment variables (match scrapegraph-py v2): - SGAI_API_URL (default https://v2-api.scrapegraphai.com/api) — base URL override @@ -259,25 +262,6 @@ def scrape_v2( body["fetchConfig"] = fetch_config_dict return self._request("POST", "/scrape", json_body=body) - def markdownify( - self, - website_url: str, - mode: Optional[str] = None, - stealth: Optional[bool] = None, - headers: Optional[Dict[str, str]] = None, - cookies: Optional[Dict[str, str]] = None, - country: Optional[str] = None, - timeout: Optional[int] = None, - wait: Optional[int] = None, - scrolls: Optional[int] = None, - mock: Optional[bool] = None, - ) -> Dict[str, Any]: - fc = self._fetch_config( - mode=mode, stealth=stealth, timeout=timeout, wait=wait, headers=headers, - cookies=cookies, country=country, scrolls=scrolls, mock=mock, - ) - return self.scrape_v2(website_url, "markdown", fetch_config_dict=fc) - def extract( self, user_prompt: str, @@ -307,16 +291,7 @@ def extract( body["fetchConfig"] = fetch_config_dict return self._request("POST", "/extract", json_body=body) - def smartscraper( - self, - user_prompt: str, - website_url: str, - output_schema: Optional[Dict[str, Any]] = None, - fetch_config_dict: Optional[Dict[str, Any]] = None, - ) -> Dict[str, Any]: - return self.extract(user_prompt, website_url, output_schema, fetch_config_dict) - - def search_api( + def search( self, query: str, num_results: Optional[int] = None, @@ -349,31 +324,6 @@ def search_api( body["fetchConfig"] = fetch_config_dict return self._request("POST", "/search", json_body=body) - def searchscraper( - self, - user_prompt: str, - num_results: Optional[int] = None, - output_schema: Optional[Dict[str, Any]] = None, - *, - country: Optional[str] = None, - prompt: Optional[str] = None, - search_format: str = "markdown", - search_mode: str = "prune", - time_range: Optional[str] = None, - fetch_config_dict: Optional[Dict[str, Any]] = None, - ) -> Dict[str, Any]: - return self.search_api( - user_prompt, - num_results=num_results, - output_schema=output_schema, - country=country, - prompt=prompt, - search_format=search_format, - search_mode=search_mode, - time_range=time_range, - fetch_config_dict=fetch_config_dict, - ) - def scrape( self, website_url: str, @@ -443,7 +393,7 @@ def crawl_start( body["fetchConfig"] = fetch_config_dict return self._request("POST", "/crawl", json_body=body) - def smartcrawler_fetch_results(self, request_id: str) -> Dict[str, Any]: + def crawl_get_status(self, request_id: str) -> Dict[str, Any]: return self._request("GET", f"/crawl/{request_id}") def crawl_stop(self, crawl_id: str) -> Dict[str, Any]: @@ -646,22 +596,21 @@ def web_scraping_guide() -> str: See [scrapegraph-py#84](https://github.com/ScrapeGraphAI/scrapegraph-py/pull/84) for the upstream SDK migration. 
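+
+Migrating a v2 `markdownify` call (client-side sketch; `session` is an open MCP
+`ClientSession` as in `.agent/system/mcp_protocol.md`):
+
+```python
+url = "https://docs.example.com"
+# v2: await session.call_tool("markdownify", arguments={"website_url": url})
+result = await session.call_tool(
+    "scrape",
+    arguments={"website_url": url, "output_format": "markdown"},
+)
+```
+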
## Core tools -- **markdownify** — `POST /scrape` (markdown output) - **scrape** — `POST /scrape` (markdown, html, screenshot, branding, links, images, summary) -- **smartscraper** — `POST /extract` (url + prompt, optional schema/mode/contentType) -- **searchscraper** — `POST /search` (query + numResults 1–20; optional schema+prompt, locationGeoCode, timeRange) -- **smartcrawler_initiate** / **smartcrawler_fetch_results** — `POST/GET /crawl` +- **extract** — `POST /extract` (url + prompt, optional schema/mode/contentType) +- **search** — `POST /search` (query + numResults 1–20; optional schema+prompt, locationGeoCode, timeRange) +- **crawl_start** / **crawl_get_status** — `POST/GET /crawl` (formats: markdown, html, links, images, summary, branding, screenshot; maxDepth / maxPages / maxLinksPerPage / allowExternal) - **crawl_stop** / **crawl_resume** — control a running job -- **generate_schema** — `POST /schema` +- **schema** — `POST /schema` - **credits** — `GET /credits` -- **sgai_history** — `GET /history` (page/limit/service) +- **history** — `GET /history` (page/limit/service) - **monitor_*** — scheduled jobs (`POST/GET/DELETE /monitor`, pause/resume, optional webhook_url) ## Best practices -1. Use **markdownify** or **scrape** before **smartscraper** when you only need readable text. -2. Multi-page **AI** extraction: run **smartscraper** per URL, or use **monitor_create** on a schedule. -3. Poll **smartcrawler_fetch_results** until the crawl finishes. +1. Use **scrape** with `output_format="markdown"` before **extract** when you only need readable text. +2. Multi-page **AI** extraction: run **extract** per URL, or use **monitor_create** on a schedule. +3. Poll **crawl_get_status** until the crawl finishes. 4. Override API host with env **SGAI_API_URL** if needed (default `https://v2-api.scrapegraphai.com/api`). """ @@ -677,38 +626,39 @@ def quick_start_examples() -> str: ### Extract structured data (single URL) ``` -Tool: smartscraper +Tool: extract website_url: https://example.com/product/1 user_prompt: "Extract name, price, and availability" ``` ### Markdown snapshot ``` -Tool: markdownify +Tool: scrape website_url: https://docs.example.com +output_format: "markdown" ``` ### Search ``` -Tool: searchscraper +Tool: search user_prompt: "Latest Python 3.12 release highlights" num_results: 5 ``` ### Multi-page crawl (markdown/html only) ``` -Tool: smartcrawler_initiate +Tool: crawl_start url: https://blog.example.com extraction_mode: "markdown" max_pages: 15 depth: 2 ``` -Then poll `smartcrawler_fetch_results` with the returned `id`. +Then poll `crawl_get_status` with the returned `id`. 
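+
+A minimal polling sketch (client-side, using the MCP Python SDK pattern from
+`.agent/system/mcp_protocol.md`; result unwrapping is simplified, and the
+`structuredContent` field and status key should be checked against the live
+SDK and API response):
+
+```python
+import asyncio
+
+async def wait_for_crawl(session, crawl_id: str, delay: float = 2.0) -> dict:
+    """Poll crawl_get_status until the crawl reports completed."""
+    while True:
+        res = await session.call_tool(
+            "crawl_get_status", arguments={"request_id": crawl_id}
+        )
+        data = res.structuredContent or {}  # structured payload, if provided
+        if data.get("status") == "completed":
+            return data
+        await asyncio.sleep(delay)
+```
+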
### Credits and history
```
Tool: credits
-Tool: sgai_history
+Tool: history
limit: 10
```

@@ -728,13 +678,13 @@ def api_status() -> str:

-- **MCP package version**: 2.0.0 (matches [scrapegraph-py#84](https://github.com/ScrapeGraphAI/scrapegraph-py/pull/84) API surface)
+- **MCP package version**: 3.0.0 (matches the [scrapegraph-py#84](https://github.com/ScrapeGraphAI/scrapegraph-py/pull/84) API surface)
- **Default API base**: `https://v2-api.scrapegraphai.com/api` (override with `SGAI_API_URL`)
-- **Auth headers**: `SGAI-APIKEY`, `X-SDK-Version: scrapegraph-mcp@2.0.0`
+- **Auth headers**: `SGAI-APIKEY`, `X-SDK-Version: scrapegraph-mcp@3.0.0`

## Tools
-markdownify, scrape, smartscraper, searchscraper, smartcrawler_initiate, smartcrawler_fetch_results, crawl_stop, crawl_resume, generate_schema, credits, sgai_history, monitor_create, monitor_list, monitor_get, monitor_pause, monitor_resume, monitor_delete, monitor_activity
+scrape, extract, search, crawl_start, crawl_get_status, crawl_stop, crawl_resume, schema, credits, history, monitor_create, monitor_list, monitor_get, monitor_pause, monitor_resume, monitor_delete, monitor_activity

## Removed vs legacy MCP
-sitemap, agentic_scrapper, markdownify_status, smartscraper_status — not available on API v2.
+sitemap, agentic_scrapper, async-status polling — not available on API v2. markdownify (v2) removed in v3 (use scrape with output_format="markdown").

Credit costs are determined by the ScrapeGraphAI API; use **credits** to check balance.
"""
@@ -752,13 +702,13 @@ def common_use_cases() -> str:

## 🛍️ E-commerce Data Extraction

### Product Information Scraping
-**Tool**: smartscraper
+**Tool**: extract
**Input**: Product page URL + "Extract name, price, description, rating, availability"
**Output**: Structured JSON with product details
**Credits**: 10 per page

### Price Monitoring
-**Tool**: smartcrawler_initiate (AI mode)
+**Tool**: crawl_start (markdown mode), then extract per page
**Input**: Product category page + price extraction prompt
**Output**: Structured price data across multiple products
**Credits**: 10 per page crawled

## 📰 Content & Research

### News Article Extraction
-**Tool**: searchscraper
+**Tool**: search
**Input**: "Latest news about [topic]" + num_results
**Output**: Article titles, summaries, sources, dates
**Credits**: 10 per website searched

### Documentation Conversion
-**Tool**: smartcrawler_initiate (markdown mode)
+**Tool**: crawl_start (markdown mode)
**Input**: Documentation site root URL
**Output**: Clean markdown files for all pages
**Credits**: 2 per page converted

## 🏢 Business Intelligence

### Contact Information Gathering
-**Tool**: smartscraper
+**Tool**: extract
**Input**: Company website + "Find contact details"
**Output**: Emails, phones, addresses, social media
**Credits**: 10 per page

### Competitor Analysis
-**Tool**: searchscraper + smartscraper combination
+**Tool**: search + extract combination
**Input**: Search for competitors + extract key metrics
**Output**: Structured competitive intelligence
**Credits**: Variable based on pages analyzed

## 🔍 Research & Analysis

### Academic Paper Research
-**Tool**: searchscraper
+**Tool**: search
**Input**: Research query + academic site focus
**Output**: Paper titles, abstracts, authors, citations
**Credits**: 10 per source website

### Market Research
-**Tool**: smartcrawler_initiate
+**Tool**: crawl_start
**Input**: Industry website + data extraction prompts
**Output**: Market trends, statistics, insights
**Credits**: 10 per page (AI mode)

@@ -814,7 +764,7 @@ def 
common_use_cases() -> str:
**Credits**: Variable based on complexity

### Multi-step Research Process
-**Workflow**: sitemap → smartcrawler_initiate → smartscraper
+**Workflow**: crawl_start → crawl_get_status → extract
**Input**: Target site + research objectives
**Output**: Comprehensive site analysis and data extraction
**Credits**: Cumulative based on tools used

@@ -855,10 +805,10 @@ def parameter_reference_guide() -> str:
    """
    return """# ScapeGraph MCP Parameter Reference Guide

-> **API v2 note:** This document still contains legacy v1-era tool names and parameters in places.
-> Trust the live tool schemas in the MCP client and the module docstring in `server.py` for v2.
-> New tools: `credits`, `sgai_history`, `crawl_stop`, `crawl_resume`, `monitor_*`. Removed: `sitemap`,
-> `agentic_scrapper`, `markdownify_status`, `smartscraper_status`.
+> **API v3 note:** This document still contains legacy v1-era tool names and parameters in places.
+> Trust the live tool schemas in the MCP client and the module docstring in `server.py` for v3.
+> v3 tools: `credits`, `history`, `crawl_stop`, `crawl_resume`, `monitor_*`. Removed: `sitemap`,
+> `agentic_scrapper`, async-status polling endpoints, `markdownify` (use `scrape` with output_format="markdown").

## 📋 Complete Parameter Documentation

@@ -869,7 +819,7 @@ def parameter_reference_guide() -> str:

## 🔧 Common Parameters

### URL Parameters
-**Used in**: markdownify, smartscraper, searchscraper, smartcrawler_initiate, scrape, monitor_*, and related v2 tools
+**Used in**: scrape, extract, search, crawl_start, monitor_*, and related v2 tools

#### `website_url` / `url`
- **Type**: `str` (required)
@@ -889,7 +839,7 @@ def parameter_reference_guide() -> str:

## 🤖 AI and Extraction Parameters

### `user_prompt`
-**Used in**: smartscraper, searchscraper, agentic_scrapper
+**Used in**: extract, search, agentic_scrapper

- **Type**: `str` (required)
- **Purpose**: Natural language instructions for AI extraction
@@ -904,7 +854,7 @@ def parameter_reference_guide() -> str:
- Use clear, descriptive language

### `output_schema`
-**Used in**: smartscraper, agentic_scrapper
+**Used in**: extract, agentic_scrapper

- **Type**: `Optional[Union[str, Dict[str, Any]]]`
- **Purpose**: Define expected output structure
@@ -945,7 +895,7 @@ def parameter_reference_guide() -> str:

## 🌐 Content Source Parameters

### `website_html`
-**Used in**: smartscraper
+**Used in**: extract

- **Type**: `Optional[str]`
- **Purpose**: Process local HTML content
@@ -957,7 +907,7 @@ def parameter_reference_guide() -> str:
- **Mutually Exclusive**: Cannot use with `website_url` or `website_markdown`

### `website_markdown`
-**Used in**: smartscraper
+**Used in**: extract

- **Type**: `Optional[str]`
- **Purpose**: Process local markdown content
@@ -973,7 +923,7 @@ def parameter_reference_guide() -> str:

## 📄 Pagination and Scrolling Parameters

### `number_of_scrolls`
-**Used in**: smartscraper, searchscraper
+**Used in**: extract, search

- **Type**: `Optional[int]`
- **Range**: 0-50 scrolls
@@ -990,7 +940,7 @@ def parameter_reference_guide() -> str:
- Consider site loading behavior

### `total_pages`
-**Used in**: smartscraper
+**Used in**: extract

- **Type**: `Optional[int]`
- **Range**: 1-100 pages
@@ -1011,7 +961,7 @@ def parameter_reference_guide() -> str:

## 🚀 Fetch/Proxy Parameters

### `mode`
-**Used in**: markdownify, scrape, smartscraper, smartcrawler_initiate, monitor_create
+**Used in**: scrape, extract, crawl_start, monitor_create

- **Type**: `Optional[str]`
- **Default**: `auto`
@@ -1025,7 +975,7 @@ def 
parameter_reference_guide() -> str:
- **Cost**: Same regardless of setting

### `stealth`
-**Used in**: markdownify, scrape, smartscraper, smartcrawler_initiate, monitor_create
+**Used in**: scrape, extract, crawl_start, monitor_create

- **Type**: `Optional[bool]`
- **Default**: `false`
@@ -1034,24 +984,24 @@ def parameter_reference_guide() -> str:
- **Combine with any mode**: e.g. `mode="js"` + `stealth=True` for JS rendering with residential proxy

### `timeout`
-**Used in**: markdownify, scrape, smartscraper, smartcrawler_initiate, monitor_create
+**Used in**: scrape, extract, crawl_start, monitor_create
- **Type**: `Optional[int]`
- **Range**: 1000-60000 milliseconds
- **Purpose**: Request timeout

### `wait`
-**Used in**: markdownify, scrape, smartscraper, smartcrawler_initiate, monitor_create
+**Used in**: scrape, extract, crawl_start, monitor_create
- **Type**: `Optional[int]`
- **Range**: 0-30000 milliseconds
- **Purpose**: Wait after page load before scraping

### `cookies`
-**Used in**: markdownify, scrape, smartscraper, smartcrawler_initiate, monitor_create
+**Used in**: scrape, extract, crawl_start, monitor_create
- **Type**: `Optional[Dict[str, str]]`
- **Purpose**: Cookies to send with the request

### `country`
-**Used in**: markdownify, scrape, smartscraper, smartcrawler_initiate, monitor_create
+**Used in**: scrape, extract, crawl_start, monitor_create
- **Type**: `Optional[str]`
- **Purpose**: Two-letter country code for geo-located requests (e.g. 'us')

@@ -1060,7 +1010,7 @@ def parameter_reference_guide() -> str:

## 🔄 Crawling Parameters

### `extraction_mode`
-**Used in**: smartcrawler_initiate
+**Used in**: crawl_start

- **Type**: `str`
- **Default**: `"markdown"`
@@ -1072,7 +1022,7 @@ def parameter_reference_guide() -> str:
- HTML: Full HTML preservation

### `depth`
-**Used in**: smartcrawler_initiate
+**Used in**: crawl_start

- **Type**: `Optional[int]`
- **Default**: Unlimited
@@ -1088,7 +1038,7 @@ def parameter_reference_guide() -> str:
- Consider site structure

### `max_pages`
-**Used in**: smartcrawler_initiate
+**Used in**: crawl_start

- **Type**: `Optional[int]`
- **Default**: Unlimited
@@ -1103,7 +1053,7 @@ def parameter_reference_guide() -> str:
- Markdown mode: `max_pages × 2` credits

### `same_domain_only`
-**Used in**: smartcrawler_initiate
+**Used in**: crawl_start

- **Type**: `Optional[bool]`
- **Default**: `true`
@@ -1121,7 +1071,7 @@ def parameter_reference_guide() -> str:

## 🔄 Search Parameters

### `num_results`
-**Used in**: searchscraper
+**Used in**: search

- **Type**: `Optional[int]`
- **Default**: 3 websites
@@ -1201,8 +1151,8 @@ def parameter_reference_guide() -> str:

| Tool | Base Cost | Additional Costs |
|------|-----------|------------------|
| `markdownify` | 2 credits | None |
-| `smartscraper` | 10 credits | +10 per additional page |
-| `searchscraper` | 30 credits (3 sites) | +10 per additional site |
+| `extract` | 10 credits | +10 per additional page |
+| `search` | 30 credits (3 sites) | +10 per additional site |
| `smartcrawler` | 2-10 credits/page | Depends on extraction mode |
| `scrape` | 1 credit | None |
| `sitemap` | 1 credit | None |
@@ -1238,25 +1188,25 @@ def parameter_reference_guide() -> str:

### For Simple Content Extraction
```
-Tool: markdownify or smartscraper
-Parameters: website_url, user_prompt (if smartscraper)
+Tool: scrape or extract
+Parameters: website_url, user_prompt (if extract)
```

### For Dynamic Content
```
-Tool: smartscraper or scrape
+Tool: extract or scrape
Parameters: mode="js" (add stealth=True if 
bot detection is present)
```

### For Multi-Page Content
```
-Tool: smartcrawler_initiate
+Tool: crawl_start
Parameters: max_pages, depth, extraction_mode
```

### For Research Tasks
```
-Tool: searchscraper
+Tool: search
Parameters: num_results, user_prompt
```

@@ -1298,9 +1248,13 @@ def tool_comparison_guide() -> str:

+> **v3 note:** This comparison still uses several legacy names. On v3,
+> `markdownify` is `scrape` with `output_format="markdown"`, and `sitemap` /
+> `agentic_scrapper` have no API v2 endpoints (see the migration table in README.md).
+
| Need | Recommended Tool | Alternative | Credits |
|------|------------------|-------------|---------|
| Convert page to markdown | `markdownify` | `scrape` + manual | 2 |
-| Extract specific data | `smartscraper` | `agentic_scrapper` | 10 |
-| Search web for info | `searchscraper` | Multiple `smartscraper` | 30 |
-| Crawl multiple pages | `smartcrawler_initiate` | Loop `smartscraper` | 2-10/page |
+| Extract specific data | `extract` | `agentic_scrapper` | 10 |
+| Search web for info | `search` | Multiple `extract` | 30 |
+| Crawl multiple pages | `crawl_start` | Loop `extract` | 2-10/page |
| Get raw page content | `scrape` | `markdownify` | 1 |
| Map site structure | `sitemap` | Manual discovery | 1 |
| Complex automation | `agentic_scrapper` | Custom scripting | Variable |
@@ -1315,21 +1265,21 @@ def tool_comparison_guide() -> str:
- **Use markdownify when**: You need readable content
- **Use scrape when**: You need full HTML or custom parsing

-#### smartscraper vs agentic_scrapper
-- **smartscraper**: Single-page AI extraction
+#### extract vs agentic_scrapper
+- **extract**: Single-page AI extraction
- **agentic_scrapper**: Multi-step automated workflows
-- **Use smartscraper when**: Simple data extraction from one page
+- **Use extract when**: Simple data extraction from one page
- **Use agentic_scrapper when**: Complex navigation required

### Scale & Automation

#### Single Page Tools
-- `markdownify`, `smartscraper`, `scrape`, `sitemap`
+- `markdownify`, `extract`, `scrape`, `sitemap`
- **Pros**: Fast, predictable costs, simple
- **Cons**: Manual iteration for multiple pages

#### Multi-Page Tools
-- `smartcrawler_initiate`, `searchscraper`, `agentic_scrapper`
+- `crawl_start`, `search`, `agentic_scrapper`
- **Pros**: Automated scale, comprehensive results
- **Cons**: Higher costs, longer processing times

@@ -1341,11 +1291,11 @@ def tool_comparison_guide() -> str:
- `sitemap`: Site structure

#### Medium Cost (10 credits)
-- `smartscraper`: AI data extraction
-- `searchscraper`: Per website searched
+- `extract`: AI data extraction
+- `search`: Per website searched

#### Variable Cost
-- `smartcrawler_initiate`: 2-10 credits per page
+- `crawl_start`: 2-10 credits per page
- `agentic_scrapper`: Depends on complexity

## 🚀 Performance Characteristics

### Speed (Fastest to Slowest)
1. **scrape**: 2-5 seconds
2. **sitemap**: 3-8 seconds
3. **markdownify**: 5-15 seconds
-4. **smartscraper**: 15-45 seconds
+4. **extract**: 15-45 seconds
-5. **searchscraper**: 30-90 seconds
+5. **search**: 30-90 seconds
6. **smartcrawler**: 1-5 minutes (async)
7. 
**agentic_scrapper**: 2-10 minutes ### Reliability - **Highest**: `scrape`, `sitemap`, `markdownify` -- **High**: `smartscraper`, `searchscraper` +- **High**: `extract`, `search` - **Variable**: `smartcrawler`, `agentic_scrapper` (depends on site complexity) ## 🎨 Output Format Comparison ### Structured Data -- **smartscraper**: JSON with extracted fields -- **searchscraper**: JSON with search results +- **extract**: JSON with extracted fields +- **search**: JSON with search results - **agentic_scrapper**: Custom schema support ### Content Formats @@ -1377,8 +1327,8 @@ def tool_comparison_guide() -> str: - **sitemap**: URL list/structure ### Async Operations -- **smartcrawler_initiate**: Returns request ID -- **smartcrawler_fetch_results**: Returns final data +- **crawl_start**: Returns request ID +- **crawl_get_status**: Returns final data - All others: Immediate response ## 🛠️ Integration Patterns @@ -1386,20 +1336,20 @@ def tool_comparison_guide() -> str: ### Simple Workflows ``` URL → markdownify → Markdown content -URL → smartscraper → Structured data -Query → searchscraper → Research results +URL → extract → Structured data +Query → search → Research results ``` ### Complex Workflows ``` -URL → sitemap → smartcrawler_initiate → smartcrawler_fetch_results +URL → sitemap → crawl_start → crawl_get_status URL → agentic_scrapper (with steps) → Complex extracted data -Query → searchscraper → smartscraper (on results) → Detailed analysis +Query → search → extract (on results) → Detailed analysis ``` ### Hybrid Approaches ``` -URL → scrape (check if JS needed) → smartscraper (extract data) +URL → scrape (check if JS needed) → extract (extract data) URL → sitemap (map structure) → smartcrawler (batch process) ``` @@ -1410,12 +1360,12 @@ def tool_comparison_guide() -> str: - ✅ Converting documentation/articles - ✅ Cost is a primary concern -**Choose smartscraper when:** +**Choose extract when:** - ✅ Need specific data extracted - ✅ Working with single pages - ✅ Want AI-powered extraction -**Choose searchscraper when:** +**Choose search when:** - ✅ Need to find information across web - ✅ Research-oriented tasks - ✅ Don't have specific URLs @@ -1432,54 +1382,9 @@ def tool_comparison_guide() -> str: """ -# Add tool for markdownify -@mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": True}) -def markdownify( - website_url: str, - ctx: Context, - mode: Optional[Literal["auto", "fast", "js"]] = None, - stealth: Optional[bool] = None, - headers: Optional[Dict[str, str]] = None, - cookies: Optional[Dict[str, str]] = None, - country: Optional[str] = None, - timeout: Optional[int] = None, - wait: Optional[int] = None, - scrolls: Optional[int] = None, - mock: Optional[bool] = None, -) -> Dict[str, Any]: - """ - Convert a webpage into clean, formatted markdown (API v2 POST /scrape). - - Args: - website_url: URL to convert (must include http:// or https://). - mode: Fetch/proxy mode controlling how the page is retrieved. - - auto: Automatically selects the best provider chain (default). - - fast: Direct HTTP fetch via impit (fastest, no JS). - - js: Headless browser rendering for JavaScript-heavy pages. - stealth: Use residential proxies to bypass bot detection (+5 credits). - headers: Custom HTTP headers to send with the request. - cookies: Cookies to send with the request. - country: Two-letter country code for geo-located requests (e.g. 'us'). - timeout: Request timeout in milliseconds (1000-60000). 
- wait: Milliseconds to wait after page load before scraping (0-30000). - scrolls: Number of scrolls to perform (0-100). - mock: Use mock mode for testing (no credits consumed). - """ - try: - api_key = get_api_key(ctx) - client = ScapeGraphClient(api_key) - return client.markdownify( - website_url=website_url, mode=mode, stealth=stealth, headers=headers, - cookies=cookies, country=country, timeout=timeout, wait=wait, - scrolls=scrolls, mock=mock, - ) - except Exception as e: - return {"error": str(e)} - - -# Add tool for smartscraper +# Add tool for extract @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": True}) -def smartscraper( +def extract( user_prompt: str, website_url: str, ctx: Context, @@ -1548,7 +1453,7 @@ def smartscraper( cookies=cookies, country=country, scrolls=scrolls, mock=mock, ) - return client.smartscraper( + return client.extract( user_prompt=user_prompt, website_url=website_url, output_schema=normalized_schema, @@ -1558,9 +1463,9 @@ def smartscraper( return {"error": str(e)} -# Add tool for searchscraper +# Add tool for search @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": False}) -def searchscraper( +def search( user_prompt: str, ctx: Context, num_results: Optional[int] = None, @@ -1638,7 +1543,7 @@ def searchscraper( cookies=cookies, country=country, scrolls=scrolls, mock=mock, ) - return client.searchscraper( + return client.search( user_prompt, num_results=num_results, output_schema=normalized_schema, @@ -1653,9 +1558,9 @@ def searchscraper( return {"error": str(e)} -# Add tool for SmartCrawler initiation +# Add tool for crawl_start @mcp.tool(annotations={"readOnlyHint": False, "destructiveHint": False, "idempotentHint": False}) -def smartcrawler_initiate( +def crawl_start( url: str, ctx: Context, extraction_mode: Literal[ @@ -1681,7 +1586,7 @@ def smartcrawler_initiate( """ Start an asynchronous multi-page crawl (API v2 POST /crawl). - Poll smartcrawler_fetch_results with the returned id. + Poll crawl_get_status with the returned id. Args: url: Starting URL (http/https). @@ -1738,9 +1643,9 @@ def smartcrawler_initiate( return {"error": str(e)} -# Add tool for fetching SmartCrawler results +# Add tool for crawl_get_status @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": True}) -def smartcrawler_fetch_results(request_id: str, ctx: Context) -> Dict[str, Any]: +def crawl_get_status(request_id: str, ctx: Context) -> Dict[str, Any]: """ Retrieve the results of an asynchronous SmartCrawler operation. @@ -1750,7 +1655,7 @@ def smartcrawler_fetch_results(request_id: str, ctx: Context) -> Dict[str, Any]: Read-only operation that safely retrieves results without side effects. Args: - request_id: The unique request ID returned by smartcrawler_initiate. Use this to retrieve the crawling results. Keep polling until status is 'completed'. Example: 'req_abc123xyz' + request_id: The unique request ID returned by crawl_start. Use this to retrieve the crawling results. Keep polling until status is 'completed'. 
Example: 'req_abc123xyz' Returns: Dictionary containing: @@ -1762,7 +1667,7 @@ def smartcrawler_fetch_results(request_id: str, ctx: Context) -> Dict[str, Any]: try: api_key = get_api_key(ctx) client = ScapeGraphClient(api_key) - return client.smartcrawler_fetch_results(request_id) + return client.crawl_get_status(request_id) except Exception as e: return {"error": str(e)} @@ -1801,7 +1706,7 @@ def credits(ctx: Context) -> Dict[str, Any]: @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": False}) -def sgai_history( +def history( ctx: Context, service: Optional[str] = None, page: Optional[int] = None, @@ -1922,7 +1827,7 @@ def monitor_create( @mcp.tool(annotations={"readOnlyHint": True, "destructiveHint": False, "idempotentHint": False}) -def generate_schema( +def schema( prompt: str, ctx: Context, existing_schema: Optional[ diff --git a/uv.lock b/uv.lock index 42752df..e015242 100644 --- a/uv.lock +++ b/uv.lock @@ -1339,7 +1339,7 @@ wheels = [ [[package]] name = "scrapegraph-mcp" -version = "1.0.1" +version = "3.0.0" source = { editable = "." } dependencies = [ { name = "fastmcp" },