From 7504628d6556e413599a2bc0fa16513270db9885 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 15 Apr 2026 17:14:48 +0200
Subject: [PATCH 1/2] feat: migrate CLI to scrapegraph-js v2 API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Full migration of just-scrape to the v2 API via scrapegraph-js PR #13
(pinned to head 096c110). The CLI surface, commands, and flags all move
to v2 semantics, and a single src/commands.ts holds every endpoint.

## CLI surface
- extract, search, scrape, markdownify, crawl, monitor, history, credits
- monitor gains a full lifecycle: create, list, get, update, delete, pause,
  resume, and activity (paginated tick history with --limit / --cursor)
- scrape supports 8 formats (markdown, html, screenshot, branding, links,
  images, summary, json) plus multi-format via comma-separated -f
- crawl uses the v2 formats array and supports multi-format
- search gains --country, --time-range, --format
- Removed: agentic-scraper, sitemap, validate (dropped from the API), and
  generate-schema (dropped from the CLI, still available in the SDK)
- --stealth is now a separate boolean; fetch mode is --mode auto|fast|js

## Environment
- New default base URL: https://api.scrapegraphai.com/api/v2
- SGAI_TIMEOUT (was SGAI_TIMEOUT_S); legacy SGAI_TIMEOUT_S and
  JUST_SCRAPE_TIMEOUT_S are bridged transparently
- SGAI_API_KEY, SGAI_API_URL, SGAI_DEBUG honored; JUST_SCRAPE_* still bridged

## Refactor
- 11 per-endpoint files under src/commands/ merged into one src/commands.ts
  with named exports; cli.ts imports them statically
- tsup output: single dist/cli.mjs (~30 KB) instead of 8 chunks
- biome override scoped to src/commands.ts
- scrapegraph-js ships without a prebuilt dist/, so CI builds it in-place
  after install

Version bumped 0.2.1 → 1.0.0 to track SDK v2.0.0.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 .github/workflows/ci.yml        |   4 +
 README.md                       | 323 ++++++------
 biome.json                      |   2 +-
 bun.lock                        |   6 +-
 package.json                    |   4 +-
 skills/just-scrape/SKILL.md     | 221 +++---
 src/cli.ts                      |  29 +-
 src/commands.ts                 | 873 ++++++++++++++++++++++++++++++++
 src/commands/.gitkeep           |   0
 src/commands/agentic-scraper.ts |  51 --
 src/commands/crawl.ts           |  62 ---
 src/commands/credits.ts         |  25 -
 src/commands/generate-schema.ts |  37 --
 src/commands/history.ts         | 146 ------
 src/commands/markdownify.ts     |  40 --
 src/commands/scrape.ts          |  40 --
 src/commands/search-scraper.ts  |  52 --
 src/commands/sitemap.ts         |  31 --
 src/commands/smart-scraper.ts   |  57 ---
 src/commands/validate.ts        |  25 -
 src/lib/client.ts               |   9 +
 src/lib/env.ts                  |   9 +-
 src/utils/banner.ts             |   6 +-
 tests/smoke.test.ts             |  16 +-
 24 files changed, 1166 insertions(+), 902 deletions(-)
 create mode 100644 src/commands.ts
 delete mode 100644 src/commands/.gitkeep
 delete mode 100644 src/commands/agentic-scraper.ts
 delete mode 100644 src/commands/crawl.ts
 delete mode 100644 src/commands/credits.ts
 delete mode 100644 src/commands/generate-schema.ts
 delete mode 100644 src/commands/history.ts
 delete mode 100644 src/commands/markdownify.ts
 delete mode 100644 src/commands/scrape.ts
 delete mode 100644 src/commands/search-scraper.ts
 delete mode 100644 src/commands/sitemap.ts
 delete mode 100644 src/commands/smart-scraper.ts
 delete mode 100644 src/commands/validate.ts
 create mode 100644 src/lib/client.ts
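Worth calling out before the diff: the comma-separated `-f` flag is what powers the multi-format story across scrape, crawl, and monitor. A condensed sketch of the mapping, lifted from `scrapeCommand` in the new src/commands.ts below (the `json` format additionally carries a prompt and optional schema, omitted here):

```ts
// How `-f markdown,links,images` becomes a v2 `formats` array.
// markdown/html additionally carry an HTML extraction mode; the other
// formats (screenshot, branding, links, images, summary) take no options.
type HtmlMode = "normal" | "reader" | "prune";

function buildFormats(raw: string, htmlMode: HtmlMode = "normal") {
  return raw
    .split(",")
    .map((f) => f.trim())
    .filter(Boolean)
    .map((f) =>
      f === "markdown" || f === "html" ? { type: f, mode: htmlMode } : { type: f },
    );
}

// buildFormats("markdown,links")
//   -> [{ type: "markdown", mode: "normal" }, { type: "links" }]
```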
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b404665..516fb29 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,6 +14,9 @@ jobs:
       - uses: actions/checkout@v4
       - uses: oven-sh/setup-bun@v2
       - run: bun install
+      # scrapegraph-js is pinned to a GitHub commit (not yet on npm) and ships
+      # without a prebuilt dist/ — build it in-place so module resolution works.
+      - run: cd node_modules/scrapegraph-js && bun install && bun run build
       - run: bun test

   lint:
@@ -23,4 +26,5 @@
       - uses: actions/checkout@v4
       - uses: oven-sh/setup-bun@v2
       - run: bun install
+      - run: cd node_modules/scrapegraph-js && bun install && bun run build
       - run: bun run check
diff --git a/README.md b/README.md
index 5871dea..3dc72b1 100644
--- a/README.md
+++ b/README.md
@@ -5,40 +5,20 @@ Made with love by the [ScrapeGraphAI team](https://scrapegraphai.com) 💜
 
 ![Demo Video](/assets/demo.gif)
 
-Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, and crawling.
+Command-line interface for [ScrapeGraph AI](https://scrapegraphai.com) — AI-powered web scraping, data extraction, search, crawling, and monitoring. Uses the **v2 API**.
 
 ## Project Structure
 
 ```
 just-scrape/
-├── docs/                    # API response docs per endpoint
-│   ├── smartscraper.md
-│   ├── searchscraper.md
-│   ├── markdownify.md
-│   ├── crawl.md
-│   ├── scrape.md
-│   ├── agenticscraper.md
-│   ├── generate-schema.md
-│   ├── sitemap.md
-│   └── credits.md
 ├── src/
 │   ├── cli.ts               # Entry point, citty main command + subcommands
+│   ├── commands.ts          # All endpoint commands in one file (extract, search, scrape, markdownify, crawl, monitor, history, credits)
 │   ├── lib/
+│   │   ├── client.ts        # API key resolver
 │   │   ├── env.ts           # Env config (API key, JUST_SCRAPE_* → SGAI_* bridge)
 │   │   ├── folders.ts       # API key resolution + interactive prompt
 │   │   └── log.ts           # Logger factory + syntax-highlighted JSON output
-│   ├── commands/
-│   │   ├── smart-scraper.ts
-│   │   ├── search-scraper.ts
-│   │   ├── markdownify.ts
-│   │   ├── crawl.ts
-│   │   ├── sitemap.ts
-│   │   ├── scrape.ts
-│   │   ├── agentic-scraper.ts
-│   │   ├── generate-schema.ts
-│   │   ├── history.ts
-│   │   ├── credits.ts
-│   │   └── validate.ts
 │   └── utils/
 │       └── banner.ts        # ASCII banner + version from package.json
 ├── dist/                    # Build output (git-ignored)
@@ -90,264 +70,260 @@ Four ways to provide it (checked in order):
 
 | Variable | Description | Default |
 |---|---|---|
 | `SGAI_API_KEY` | ScrapeGraph API key | — |
-| `JUST_SCRAPE_API_URL` | Override API base URL | `https://api.scrapegraphai.com/v1` |
-| `JUST_SCRAPE_TIMEOUT_S` | Request/polling timeout in seconds | `120` |
-| `JUST_SCRAPE_DEBUG` | Set to `1` to enable debug logging to stderr | `0` |
+| `SGAI_API_URL` | Override API base URL | `https://api.scrapegraphai.com/api/v2` |
+| `SGAI_TIMEOUT` | Request timeout in seconds | `120` |
+| `SGAI_DEBUG` | Set to `1` to log requests/responses | — |
+
+Legacy variables are still bridged transparently: `JUST_SCRAPE_API_URL` → `SGAI_API_URL`, `JUST_SCRAPE_TIMEOUT_S` / `SGAI_TIMEOUT_S` → `SGAI_TIMEOUT`, `JUST_SCRAPE_DEBUG` → `SGAI_DEBUG`.
 
 ## JSON Mode (`--json`)
 
 All commands support `--json` for machine-readable output. When set, banner, spinners, and interactive prompts are suppressed — only minified JSON on stdout (saves tokens when piped to AI agents).
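Since `--json` guarantees a single minified JSON document on stdout, the output is as easy to consume from a script as from `jq`. A hypothetical Node consumer (not part of this patch; assumes `just-scrape` is on `PATH`):

```ts
import { execFileSync } from "node:child_process";

// With --json, stdout carries nothing but the payload (banner, spinners,
// and prompts are suppressed), so the output parses directly.
function justScrapeJson(args: string[]): unknown {
  const stdout = execFileSync("just-scrape", [...args, "--json"], {
    encoding: "utf8",
  });
  return JSON.parse(stdout);
}

// const credits = justScrapeJson(["credits"]);
```

From the shell, the same payloads pipe straight into `jq`: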
 ```bash
-just-scrape credits --json | jq '.remaining_credits'
-just-scrape smart-scraper https://example.com -p "Extract data" --json > result.json
-just-scrape history smartscraper --json | jq '.requests[].status'
+just-scrape credits --json | jq '.remaining'
+just-scrape extract https://example.com -p "Extract data" --json > result.json
+just-scrape history scrape --json | jq '.[].status'
 ```
 
 ---
 
-## Smart Scraper
+## Extract
 
-Extract structured data from any URL using AI. [docs](https://docs.scrapegraphai.com/services/smartscraper)
+Extract structured data from any URL using AI. [docs](https://docs.scrapegraphai.com/api-reference/extract)
 
 ### Usage
 
 ```bash
-just-scrape smart-scraper <url> -p <prompt>                # Extract data with AI
-just-scrape smart-scraper <url> -p <prompt> --schema <json>   # Enforce output schema
-just-scrape smart-scraper <url> -p <prompt> --scrolls <n>     # Infinite scroll (0-100)
-just-scrape smart-scraper <url> -p <prompt> --pages <n>       # Multi-page (1-100)
-just-scrape smart-scraper <url> -p <prompt> --stealth         # Anti-bot bypass (+4 credits)
-just-scrape smart-scraper <url> -p <prompt> --cookies <json> --headers <json>
-just-scrape smart-scraper <url> -p <prompt> --plain-text      # Plain text instead of JSON
+just-scrape extract <url> -p <prompt>                      # Extract data with AI
+just-scrape extract <url> -p <prompt> --schema <json>      # Enforce output schema
+just-scrape extract <url> -p <prompt> --mode <mode>        # HTML mode: normal, reader, prune
+just-scrape extract <url> -p <prompt> --scrolls <n>        # Infinite scroll (0-100)
+just-scrape extract <url> -p <prompt> --stealth            # Anti-bot bypass
+just-scrape extract <url> -p <prompt> --cookies <json> --headers <json>
+just-scrape extract <url> -p <prompt> --country <code>     # Geo-targeting
 ```
 
 ### Examples
 
 ```bash
 # Extract product listings from an e-commerce page
-just-scrape smart-scraper https://store.example.com/shoes -p "Extract all product names, prices, and ratings"
+just-scrape extract https://store.example.com/shoes -p "Extract all product names, prices, and ratings"
 
 # Extract with a strict schema, scrolling to load more content
-just-scrape smart-scraper https://news.example.com -p "Get all article headlines and dates" \
+just-scrape extract https://news.example.com -p "Get all article headlines and dates" \
   --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \
   --scrolls 5
 
-# Scrape a JS-heavy SPA behind anti-bot protection
-just-scrape smart-scraper https://app.example.com/dashboard -p "Extract user stats" \
+# Scrape a JS-heavy SPA with stealth mode
+just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \
   --stealth
 ```
 
-## Search Scraper
+## Search
 
-Search the web and extract structured data from results. [docs](https://docs.scrapegraphai.com/services/searchscraper)
+Search the web and extract structured data from results. [docs](https://docs.scrapegraphai.com/api-reference/search)
 
 ### Usage
 
 ```bash
-just-scrape search-scraper <query>                    # AI-powered web search
-just-scrape search-scraper <query> --num-results <n>  # Sources to scrape (3-20, default 3)
-just-scrape search-scraper <query> --no-extraction    # Markdown only (2 credits vs 10)
-just-scrape search-scraper <query> --schema <json>    # Enforce output schema
-just-scrape search-scraper <query> --stealth --headers <json>
+just-scrape search <query>                            # AI-powered web search
+just-scrape search <query> --num-results <n>          # Sources to scrape (1-20, default 3)
+just-scrape search <query> -p <prompt>                # Extraction prompt for results
+just-scrape search <query> --schema <json>            # Enforce output schema (requires -p)
+just-scrape search <query> --country <code>           # Geo-target search (e.g. 'us', 'de', 'jp')
+just-scrape search <query> --time-range <range>       # past_hour | past_24_hours | past_week | past_month | past_year
+just-scrape search <query> --format <format>          # Result format (default markdown)
+just-scrape search <query> --headers <json>
 ```
 
 ### Examples
 
 ```bash
 # Research a topic across multiple sources
-just-scrape search-scraper "What are the best Python web frameworks in 2025?" --num-results 10
+just-scrape search "What are the best Python web frameworks in 2025?" --num-results 10
 
-# Get raw markdown from search results (cheaper)
-just-scrape search-scraper "React vs Vue comparison" --no-extraction --num-results 5
+# Recent news only, scoped to Germany
+just-scrape search "EU AI act latest news" --time-range past_week --country de
 
 # Structured output with schema
-just-scrape search-scraper "Top 5 cloud providers pricing" \
+just-scrape search "Top 5 cloud providers pricing" \
+  -p "Extract provider name and free tier details" \
   --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}'
 ```
 
-## Markdownify
+## Scrape
 
-Convert any webpage to clean markdown. [docs](https://docs.scrapegraphai.com/services/markdownify)
+Scrape content from a URL in one or more formats. The v2 API supports **8 formats**: `markdown`, `html`, `screenshot`, `branding`, `links`, `images`, `summary`, `json`. [docs](https://docs.scrapegraphai.com/api-reference/scrape)
 
 ### Usage
 
 ```bash
-just-scrape markdownify <url>                   # Convert to markdown
-just-scrape markdownify <url> --stealth         # Anti-bot bypass (+4 credits)
-just-scrape markdownify <url> --headers <json>  # Custom headers
+just-scrape scrape <url>                            # Markdown (default)
+just-scrape scrape <url> -f html                    # Raw HTML
+just-scrape scrape <url> -f screenshot              # Page screenshot
+just-scrape scrape <url> -f branding                # Branding (logos, colors, fonts)
+just-scrape scrape <url> -f links                   # Extracted links
+just-scrape scrape <url> -f images                  # Extracted images
+just-scrape scrape <url> -f summary                 # AI-generated page summary
+just-scrape scrape <url> -f json -p <prompt>        # Structured JSON via prompt
+just-scrape scrape <url> -f markdown,links,images   # Multi-format (comma-separated)
+just-scrape scrape <url> --html-mode reader         # normal (default), reader, or prune
+just-scrape scrape <url> --scrolls <n>              # Infinite scroll (0-100)
+just-scrape scrape <url> -m js --stealth            # Anti-bot bypass
+just-scrape scrape <url> --country <code>           # Geo-targeting
 ```
 
 ### Examples
 
 ```bash
-# Convert a blog post to markdown
-just-scrape markdownify https://blog.example.com/my-article
-
-# Convert a JS-rendered page behind Cloudflare
-just-scrape markdownify https://protected.example.com --stealth
-
-# Pipe markdown to a file
-just-scrape markdownify https://docs.example.com/api --json | jq -r '.result' > api-docs.md
-```
-
-## Crawl
-
-Crawl multiple pages and extract data from each. [docs](https://docs.scrapegraphai.com/services/smartcrawler)
-
-### Usage
-
-```bash
-just-scrape crawl <url> -p <prompt>                    # Crawl + extract
-just-scrape crawl <url> -p <prompt> --max-pages <n>    # Max pages (default 10)
-just-scrape crawl <url> -p <prompt> --depth <n>        # Crawl depth (default 1)
-just-scrape crawl <url> --no-extraction --max-pages <n>  # Markdown only (2 credits/page)
-just-scrape crawl <url> -p <prompt> --schema <json>    # Enforce output schema
-just-scrape crawl <url> -p <prompt> --rules <json>     # Crawl rules (include_paths, same_domain)
-just-scrape crawl <url> -p <prompt> --no-sitemap       # Skip sitemap discovery
-just-scrape crawl <url> -p <prompt> --stealth          # Anti-bot bypass
-```
 
 ### Examples
 
+# Markdown of a page
+just-scrape scrape https://example.com
 
-```bash
-# Crawl a docs site and extract all code examples
-just-scrape crawl https://docs.example.com -p "Extract all code snippets with their language" \
-  --max-pages 20 --depth 3
+# Raw HTML with reader-mode extraction
+just-scrape scrape https://blog.example.com -f html --html-mode reader
 
-# Crawl only blog pages, skip everything else
-just-scrape crawl https://example.com -p "Extract article titles and summaries" \
-  --rules '{"include_paths":["/blog/*"],"same_domain":true}' --max-pages 50
+# Multi-format: markdown + links + images in a single call
+just-scrape scrape https://example.com -f markdown,links,images
 
-# Get raw markdown from all pages (no AI extraction, cheaper)
-just-scrape crawl https://example.com --no-extraction --max-pages 10
+# Structured JSON output with a prompt
+just-scrape scrape https://store.example.com -f json -p "Extract product name and price"
+
+# Scrape with stealth mode and geo-targeting
+just-scrape scrape https://store.example.com --stealth --country DE
 ```
 
-## Sitemap
+## Markdownify
 
-Get all URLs from a website's sitemap. [docs](https://docs.scrapegraphai.com/services/sitemap)
+Convert any webpage to clean markdown (convenience wrapper for `scrape --format markdown`). [docs](https://docs.scrapegraphai.com/api-reference/scrape)
 
 ### Usage
 
 ```bash
-just-scrape sitemap <url>
+just-scrape markdownify <url>                   # Convert to markdown
+just-scrape markdownify <url> -m js --stealth   # Anti-bot bypass
+just-scrape markdownify <url> --headers <json>  # Custom headers
 ```
 
 ### Examples
 
 ```bash
-# List all pages on a site
-just-scrape sitemap https://example.com
+# Convert a blog post to markdown
+just-scrape markdownify https://blog.example.com/my-article
+
+# Convert a JS-rendered page behind Cloudflare
+just-scrape markdownify https://protected.example.com -m js --stealth
 
-# Pipe URLs to another tool
-just-scrape sitemap https://example.com --json | jq -r '.urls[]'
+# Pipe markdown to a file
+just-scrape markdownify https://docs.example.com/api --json | jq -r '.results.markdown.data[0]' > api-docs.md
 ```
 
-## Scrape
+## Crawl
 
-Get raw HTML content from a URL. [docs](https://docs.scrapegraphai.com/services/scrape)
+Crawl multiple pages. The CLI starts the crawl and polls until completion. Supports the same format options as scrape. [docs](https://docs.scrapegraphai.com/api-reference/crawl)
 
 ### Usage
 
 ```bash
-just-scrape scrape <url>                        # Raw HTML
-just-scrape scrape <url> --stealth              # Anti-bot bypass (+4 credits)
-just-scrape scrape <url> --branding             # Extract branding (+2 credits)
-just-scrape scrape <url> --country-code <code>  # Geo-targeting
+just-scrape crawl <url>                          # Crawl with defaults
+just-scrape crawl <url> --max-pages <n>          # Max pages (default 50)
+just-scrape crawl <url> --max-depth <n>          # Crawl depth (default 2)
+just-scrape crawl <url> --max-links-per-page <n> # Links per page (default 10)
+just-scrape crawl <url> --allow-external         # Allow external domains
+just-scrape crawl <url> -f html                  # Page format (default markdown)
+just-scrape crawl <url> -f markdown,links        # Multi-format (comma-separated)
+just-scrape crawl <url> -m js --stealth          # Anti-bot bypass
 ```
 
 ### Examples
 
 ```bash
-# Get raw HTML of a page
-just-scrape scrape https://example.com
+# Crawl a docs site
+just-scrape crawl https://docs.example.com --max-pages 20 --max-depth 3
 
-# Scrape a geo-restricted page with anti-bot bypass
-just-scrape scrape https://store.example.com --stealth --country-code DE
+# Crawl staying within domain
+just-scrape crawl https://example.com --max-pages 50
 
-# Extract branding info (logos, colors, fonts)
-just-scrape scrape https://example.com --branding
+# Get crawl results as JSON
+just-scrape crawl https://example.com --json --max-pages 10
 ```
 
-## Agentic Scraper
+## Monitor
 
-Browser automation with AI — login, click, navigate, fill forms. [docs](https://docs.scrapegraphai.com/services/agenticscraper)
+Create and manage page-change monitors. Monitors periodically scrape a URL and detect changes. [docs](https://docs.scrapegraphai.com/api-reference/monitor)
 
 ### Usage
 
 ```bash
-just-scrape agentic-scraper <url> -s <steps>             # Run browser steps
-just-scrape agentic-scraper <url> -s <steps> --ai-extraction -p <prompt>
-just-scrape agentic-scraper <url> -s <steps> --schema <json>
-just-scrape agentic-scraper <url> -s <steps> --use-session  # Persist browser session
+just-scrape monitor create --url <url> --interval <interval>  # Create a monitor
+just-scrape monitor create --url <url> --interval 1h --name "My Monitor"
+just-scrape monitor create --url <url> --interval 30m --webhook-url <url>
+just-scrape monitor create --url <url> --interval 1d -f markdown,screenshot
+just-scrape monitor list                                 # List all monitors
+just-scrape monitor get --id <id>                        # Get monitor details
+just-scrape monitor update --id <id> --interval 2h       # Update interval
+just-scrape monitor pause --id <id>                      # Pause a monitor
+just-scrape monitor resume --id <id>                     # Resume a paused monitor
+just-scrape monitor delete --id <id>                     # Delete a monitor
+just-scrape monitor activity --id <id>                   # Paginated tick history
+just-scrape monitor activity --id <id> --limit 50        # Ticks per page (max 100)
+just-scrape monitor activity --id <id> --cursor <cursor> # Paginate with a cursor
 ```
 
 ### Examples
 
 ```bash
-# Log in and extract dashboard data
-just-scrape agentic-scraper https://app.example.com/login \
-  -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \
-  --ai-extraction -p "Extract all dashboard metrics"
-
-# Navigate through a multi-step form
-just-scrape agentic-scraper https://example.com/wizard \
-  -s "Click Next,Select Premium plan,Fill name with John,Click Submit"
-
-# Persistent session across multiple runs
-just-scrape agentic-scraper https://app.example.com \
-  -s "Click Settings" --use-session
-```
+# Monitor a pricing page every hour
+just-scrape monitor create --url https://store.example.com/pricing --interval 1h
 
-## Generate Schema
+# Monitor with webhook notification
+just-scrape monitor create --url https://example.com \
+  --interval 30m --webhook-url https://hooks.example.com/notify
-Generate a JSON schema from a natural language description.
+# Monitor markdown + screenshot changes daily
+just-scrape monitor create --url https://example.com \
+  --interval 1d -f markdown,screenshot --name "Daily check"
 
-### Usage
+# List all monitors
+just-scrape monitor list
 
-```bash
-just-scrape generate-schema <description>              # AI generates a schema
-just-scrape generate-schema <description> --existing-schema <json>
-```
-
-### Examples
-
-```bash
-# Generate a schema for product data
-just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array"
+# Pause and resume
+just-scrape monitor pause --id abc123
+just-scrape monitor resume --id abc123
 
-# Refine an existing schema
-just-scrape generate-schema "Add an availability field" \
-  --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}'
+# Inspect recent ticks (checks the monitor performed) with their diffs
+just-scrape monitor activity --id abc123 --limit 20
+just-scrape monitor activity --id abc123 --json | jq '.ticks[] | select(.hasChanges == true)'
 ```
 
 ## History
 
-Browse request history for any service. Interactive by default — arrow keys to navigate, select to view details, "Load more" for infinite scroll.
+Browse request history. Interactive by default — arrow keys to navigate, select to view details, "Load more" for pagination. Service filter is optional.
 
 ### Usage
 
 ```bash
-just-scrape history <service>               # Interactive browser
-just-scrape history <service> <request-id>  # Fetch specific request
-just-scrape history <service> --page <n>        # Start from page (default 1)
-just-scrape history <service> --page-size <n>   # Results per page (default 10, max 100)
-just-scrape history <service> --json            # Raw JSON (pipeable)
+just-scrape history                         # All history (interactive)
+just-scrape history <service>               # Filter by service
+just-scrape history <service> <request-id>  # Fetch specific request by ID
+just-scrape history --page <n>              # Start from page (default 1)
+just-scrape history --page-size <n>         # Results per page (default 20, max 100)
+just-scrape history --json                  # Raw JSON (pipeable)
 ```
 
-Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap`
+Services: `scrape`, `extract`, `schema`, `search`, `monitor`, `crawl`
 
 ### Examples
 
 ```bash
-# Browse your smart-scraper history interactively
-just-scrape history smartscraper
+# Browse your extract history interactively
+just-scrape history extract
 
 # Jump to a specific request by ID
-just-scrape history smartscraper abc123-def456-7890
+just-scrape history scrape abc123-def456-7890
 
-# Export crawl history as JSON
-just-scrape history crawl --json --page-size 100 | jq '.requests[] | {id: .request_id, status}'
+# Export all history as JSON
+just-scrape history --json --page-size 100 | jq '.[].status'
 ```
 
 ## Credits
 
@@ -356,18 +332,29 @@ Check your credit balance.
 
 ```bash
 just-scrape credits
-just-scrape credits --json | jq '.remaining_credits'
+just-scrape credits --json | jq '.remaining'
 ```
 
-## Validate
+---
 
-Validate your API key (health check).
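The cursor-based `monitor activity` pagination also composes naturally in a script. A sketch against the SDK (the `monitor.activity(apiKey, id, { limit, cursor })` call is exactly what the CLI makes in src/commands.ts below; the `{ ticks, nextCursor }` response shape is an assumption for illustration):

```ts
import { monitor } from "scrapegraph-js";

// Walk a monitor's entire tick history by following the pagination cursor.
async function allTicks(apiKey: string, id: string): Promise<unknown[]> {
  const ticks: unknown[] = [];
  let cursor: string | undefined;
  do {
    const page = await monitor.activity(apiKey, id, { limit: 100, cursor });
    // Field names below are assumed; adjust to the actual v2 response.
    const data = page.data as { ticks?: unknown[]; nextCursor?: string } | null;
    ticks.push(...(data?.ticks ?? []));
    cursor = data?.nextCursor;
  } while (cursor);
  return ticks;
}
```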
+## Migration from v0.2.x -```bash -just-scrape validate -``` +Commands have been renamed to match the v2 API: ---- +| Old command | New command | Notes | +|---|---|---| +| `smart-scraper` | `extract` | Renamed | +| `search-scraper` | `search` | Renamed | +| `markdownify` | `markdownify` | Now wraps `scrape --format markdown` | +| `scrape` | `scrape` | Gains `--format` (markdown, html, screenshot, branding, links, images, summary, json), multi-format via comma, `--html-mode`, `--scrolls`, `--prompt`, `--schema` | +| `crawl` | `crawl` | Now uses `formats` array like scrape, supports multi-format | +| `search` | `search` | New options: `--country`, `--time-range`, `--format` | +| β€” | `monitor` | **New**: create, list, get, update, delete, pause, resume page-change monitors | +| `--stealth` flag | `--stealth` | Separate boolean flag; fetch mode is now `auto`, `fast`, or `js` | +| `agentic-scraper` | β€” | Removed from API | +| `generate-schema` | β€” | Removed from CLI (still available in SDK) | +| `sitemap` | β€” | Removed from API | +| `validate` | β€” | Removed from API | ## Contributing @@ -392,7 +379,7 @@ bun run dev --help | CLI Framework | **citty** (unjs) | | Prompts | **@clack/prompts** | | Styling | **chalk** v5 (ESM) | -| SDK | **scrapegraph-js** | +| SDK | **scrapegraph-js** v2 | | Env | **dotenv** | | Lint / Format | **Biome** | | Target | **Node.js 22+**, ESM-only | diff --git a/biome.json b/biome.json index cf09862..831603c 100644 --- a/biome.json +++ b/biome.json @@ -16,7 +16,7 @@ }, "overrides": [ { - "include": ["tests/**"], + "include": ["tests/**", "src/commands.ts"], "linter": { "rules": { "suspicious": { diff --git a/bun.lock b/bun.lock index 5a7bd89..5b4c862 100644 --- a/bun.lock +++ b/bun.lock @@ -9,7 +9,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "^1.0.0", + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#096c110", }, "devDependencies": { "@biomejs/biome": "^1.9.4", @@ -229,7 +229,7 @@ "rollup": ["rollup@4.57.1", "", { "dependencies": { "@types/estree": "1.0.8" }, "optionalDependencies": { "@rollup/rollup-android-arm-eabi": "4.57.1", "@rollup/rollup-android-arm64": "4.57.1", "@rollup/rollup-darwin-arm64": "4.57.1", "@rollup/rollup-darwin-x64": "4.57.1", "@rollup/rollup-freebsd-arm64": "4.57.1", "@rollup/rollup-freebsd-x64": "4.57.1", "@rollup/rollup-linux-arm-gnueabihf": "4.57.1", "@rollup/rollup-linux-arm-musleabihf": "4.57.1", "@rollup/rollup-linux-arm64-gnu": "4.57.1", "@rollup/rollup-linux-arm64-musl": "4.57.1", "@rollup/rollup-linux-loong64-gnu": "4.57.1", "@rollup/rollup-linux-loong64-musl": "4.57.1", "@rollup/rollup-linux-ppc64-gnu": "4.57.1", "@rollup/rollup-linux-ppc64-musl": "4.57.1", "@rollup/rollup-linux-riscv64-gnu": "4.57.1", "@rollup/rollup-linux-riscv64-musl": "4.57.1", "@rollup/rollup-linux-s390x-gnu": "4.57.1", "@rollup/rollup-linux-x64-gnu": "4.57.1", "@rollup/rollup-linux-x64-musl": "4.57.1", "@rollup/rollup-openbsd-x64": "4.57.1", "@rollup/rollup-openharmony-arm64": "4.57.1", "@rollup/rollup-win32-arm64-msvc": "4.57.1", "@rollup/rollup-win32-ia32-msvc": "4.57.1", "@rollup/rollup-win32-x64-gnu": "4.57.1", "@rollup/rollup-win32-x64-msvc": "4.57.1", "fsevents": "~2.3.2" }, "bin": { "rollup": "dist/bin/rollup" } }, "sha512-oQL6lgK3e2QZeQ7gcgIkS2YZPg5slw37hYufJ3edKlfQSGGm8ICoxswK15ntSzF/a8+h7ekRy7k7oWc3BQ7y8A=="], - "scrapegraph-js": ["scrapegraph-js@1.0.0", "", {}, "sha512-eQn8/HRfJHjCoj2yia5yHWQTYUae/bYNhLEx00ZXF+GLKpgUJT0OCGUQM13WGSX5cgw9onz5EiaDJDbzcbeYtQ=="], + "scrapegraph-js": 
["scrapegraph-js@github:ScrapeGraphAI/scrapegraph-js#096c110", { "dependencies": { "zod": "^4.3.6" } }, "ScrapeGraphAI-scrapegraph-js-096c110"], "sisteransi": ["sisteransi@1.0.5", "", {}, "sha512-bLGGlR1QxBcynn2d5YmDX4MGjlZvy2MRBDRNHLJ8VI6l6+9FUiyTFNJ0IveOSP0bcXgVDPRcfGqA0pjaqUpfVg=="], @@ -256,5 +256,7 @@ "ufo": ["ufo@1.6.3", "", {}, "sha512-yDJTmhydvl5lJzBmy/hyOAA0d+aqCBuwl818haVdYCRrWV84o7YyeVm4QlVHStqNrrJSTb6jKuFAVqAFsr+K3Q=="], "undici-types": ["undici-types@6.21.0", "", {}, "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ=="], + + "zod": ["zod@4.3.6", "", {}, "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg=="], } } diff --git a/package.json b/package.json index 55c9e7b..c3d5793 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "just-scrape", - "version": "0.2.1", + "version": "1.0.0", "description": "ScrapeGraph AI CLI tool", "type": "module", "main": "dist/cli.mjs", @@ -28,7 +28,7 @@ "chalk": "^5.4.1", "citty": "^0.1.6", "dotenv": "^17.2.4", - "scrapegraph-js": "^1.0.0" + "scrapegraph-js": "github:ScrapeGraphAI/scrapegraph-js#096c110" }, "devDependencies": { "@biomejs/biome": "^1.9.4", diff --git a/skills/just-scrape/SKILL.md b/skills/just-scrape/SKILL.md index 18acea8..d9a3b57 100644 --- a/skills/just-scrape/SKILL.md +++ b/skills/just-scrape/SKILL.md @@ -1,6 +1,6 @@ --- name: just-scrape -description: "CLI tool for AI-powered web scraping, data extraction, search, and crawling via ScrapeGraph AI. Use when the user needs to scrape websites, extract structured data from URLs, convert pages to markdown, crawl multi-page sites, search the web for information, automate browser interactions (login, click, fill forms), get raw HTML, discover sitemaps, or generate JSON schemas. Triggers on tasks involving: (1) extracting data from websites, (2) web scraping or crawling, (3) converting webpages to markdown, (4) AI-powered web search with extraction, (5) browser automation, (6) generating output schemas for scraping. The CLI is just-scrape (npm package just-scrape)." +description: "CLI tool for AI-powered web scraping, data extraction, search, and crawling via ScrapeGraph AI v2 API. Use when the user needs to scrape websites, extract structured data from URLs, convert pages to markdown, crawl multi-page sites, or search the web for information. Triggers on tasks involving: (1) extracting data from websites, (2) web scraping or crawling, (3) converting webpages to markdown, (4) AI-powered web search with extraction. The CLI is just-scrape (npm package just-scrape)." 
---
 
 # Web Scraping with just-scrape
 
@@ -30,191 +30,132 @@ API key resolution order: `SGAI_API_KEY` env var → `.env` file → `~/.scrapeg
 
 | Need | Command |
 |---|---|
-| Extract structured data from a known URL | `smart-scraper` |
-| Search the web and extract from results | `search-scraper` |
+| Extract structured data from a known URL | `extract` |
+| Search the web and extract from results | `search` |
+| Scrape a page (markdown, html, screenshot, branding) | `scrape` |
 | Convert a page to clean markdown | `markdownify` |
 | Crawl multiple pages from a site | `crawl` |
-| Get raw HTML | `scrape` |
-| Automate browser actions (login, click, fill) | `agentic-scraper` |
-| Generate a JSON schema from description | `generate-schema` |
-| Get all URLs from a sitemap | `sitemap` |
 | Check credit balance | `credits` |
 | Browse past requests | `history` |
-| Validate API key | `validate` |
 
 ## Common Flags
 
 All commands support `--json` for machine-readable output (suppresses banner, spinners, prompts).
 
 Scraping commands share these optional flags:
-- `--stealth` — bypass anti-bot detection (+4 credits)
+- `--mode <mode>` / `-m <mode>` — fetch mode: `auto` (default), `fast`, `js`
+- `--stealth` — enable stealth mode (anti-bot bypass)
 - `--headers <json>` — custom HTTP headers as JSON string
 - `--schema <json>` — enforce output JSON schema
+- `--country <code>` — ISO country code for geo-targeting
 
 ## Commands
 
-### Smart Scraper
+### Extract
 
 Extract structured data from any URL using AI.
 
 ```bash
-just-scrape smart-scraper <url> -p <prompt>
-just-scrape smart-scraper <url> -p <prompt> --schema <json>
-just-scrape smart-scraper <url> -p <prompt> --scrolls <n>   # infinite scroll (0-100)
-just-scrape smart-scraper <url> -p <prompt> --pages <n>     # multi-page (1-100)
-just-scrape smart-scraper <url> -p <prompt> --stealth       # anti-bot (+4 credits)
-just-scrape smart-scraper <url> -p <prompt> --cookies <json> --headers <json>
-just-scrape smart-scraper <url> -p <prompt> --plain-text
+just-scrape extract <url> -p <prompt>
+just-scrape extract <url> -p <prompt> --schema <json>
+just-scrape extract <url> -p <prompt> --scrolls <n>     # infinite scroll (0-100)
+just-scrape extract <url> -p <prompt> --stealth         # anti-bot bypass
+just-scrape extract <url> -p <prompt> --cookies <json> --headers <json>
+just-scrape extract <url> -p <prompt> --country <code>  # geo-targeting
 ```
 
 ```bash
 # E-commerce extraction
-just-scrape smart-scraper https://store.example.com/shoes -p "Extract all product names, prices, and ratings"
+just-scrape extract https://store.example.com/shoes -p "Extract all product names, prices, and ratings"
 
 # Strict schema + scrolling
-just-scrape smart-scraper https://news.example.com -p "Get headlines and dates" \
+just-scrape extract https://news.example.com -p "Get headlines and dates" \
   --schema '{"type":"object","properties":{"articles":{"type":"array","items":{"type":"object","properties":{"title":{"type":"string"},"date":{"type":"string"}}}}}}' \
   --scrolls 5
 
 # JS-heavy SPA behind anti-bot
-just-scrape smart-scraper https://app.example.com/dashboard -p "Extract user stats" \
+just-scrape extract https://app.example.com/dashboard -p "Extract user stats" \
   --stealth
 ```
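The same extraction through the SDK directly, essentially what `extractCommand` in src/commands.ts builds from these flags (a sketch; `schema` and `fetchConfig` are optional):

```ts
import { extract } from "scrapegraph-js";

// The request extractCommand assembles from -p/--scrolls/--stealth,
// reduced to essentials (see src/commands.ts in this patch).
const apiKey = process.env.SGAI_API_KEY ?? "";
const result = await extract(apiKey, {
  url: "https://news.example.com",
  prompt: "Get all article headlines and dates",
  fetchConfig: { scrolls: 5, stealth: true },
});
console.log(result.data);
```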
-### Search Scraper
+### Search
 
 Search the web and extract structured data from results.
 
 ```bash
-just-scrape search-scraper <query>
-just-scrape search-scraper <query> --num-results <n>  # sources to scrape (3-20, default 3)
-just-scrape search-scraper <query> --no-extraction    # markdown only (2 credits vs 10)
-just-scrape search-scraper <query> --schema <json>
-just-scrape search-scraper <query> --stealth --headers <json>
+just-scrape search <query>
+just-scrape search <query> --num-results <n>  # sources to scrape (1-20, default 3)
+just-scrape search <query> -p <prompt>        # extraction prompt
+just-scrape search <query> --schema <json>
+just-scrape search <query> --headers <json>
 ```
 
 ```bash
 # Research across sources
-just-scrape search-scraper "Best Python web frameworks in 2025" --num-results 10
-
-# Cheap markdown-only
-just-scrape search-scraper "React vs Vue comparison" --no-extraction --num-results 5
+just-scrape search "Best Python web frameworks in 2025" --num-results 10
 
 # Structured output
-just-scrape search-scraper "Top 5 cloud providers pricing" \
+just-scrape search "Top 5 cloud providers pricing" \
   --schema '{"type":"object","properties":{"providers":{"type":"array","items":{"type":"object","properties":{"name":{"type":"string"},"free_tier":{"type":"string"}}}}}}'
 ```
 
 ### Scrape
 
-Get raw HTML content from a URL.
+Scrape content from a URL in various formats.
 
 ```bash
-just-scrape scrape <url>
-just-scrape scrape <url> --stealth              # +4 credits
-just-scrape scrape <url> --branding             # extract logos/colors/fonts (+2 credits)
-just-scrape scrape <url> --country-code <code>
+just-scrape scrape <url>                   # markdown (default)
+just-scrape scrape <url> -f html           # raw HTML
+just-scrape scrape <url> -f screenshot     # screenshot
+just-scrape scrape <url> -f branding       # extract branding info
+just-scrape scrape <url> -m js --stealth   # anti-bot bypass
+just-scrape scrape <url> --country <code>  # geo-targeting
 ```
 
 ```bash
 just-scrape scrape https://example.com
-just-scrape scrape https://store.example.com --stealth --country-code DE
-just-scrape scrape https://example.com --branding
+just-scrape scrape https://example.com -f html
+just-scrape scrape https://store.example.com -m js --stealth --country DE
+just-scrape scrape https://example.com -f branding
 ```
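For reference, the request the scrape command builds from these flags, condensed from `scrapeCommand` in src/commands.ts (a sketch; format and fetch options shown are the ones the CLI maps):

```ts
import { scrape } from "scrapegraph-js";

// `-f markdown,links -m js --stealth --html-mode reader` as an SDK call.
const apiKey = process.env.SGAI_API_KEY ?? "";
const result = await scrape(apiKey, {
  url: "https://example.com",
  formats: [
    { type: "markdown", mode: "reader" }, // --html-mode reader
    { type: "links" },
  ],
  fetchConfig: { mode: "js", stealth: true }, // -m js --stealth
});
console.log(result.data);
```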
-### Agentic Scraper
+### Markdownify
 
-Browser automation with AI — login, click, navigate, fill forms. Steps are comma-separated strings.
+Convert any webpage to clean markdown (convenience wrapper for `scrape --format markdown`).
 
 ```bash
-just-scrape agentic-scraper <url> -s <steps>
-just-scrape agentic-scraper <url> -s <steps> --ai-extraction -p <prompt>
-just-scrape agentic-scraper <url> -s <steps> --schema <json>
-just-scrape agentic-scraper <url> -s <steps> --use-session  # persist browser session
+just-scrape markdownify <url>
+just-scrape markdownify <url> -m js --stealth
+just-scrape markdownify <url> --headers <json>
 ```
 
 ```bash
-# Login + extract dashboard
-just-scrape agentic-scraper https://app.example.com/login \
-  -s "Fill email with user@test.com,Fill password with secret,Click Sign In" \
-  --ai-extraction -p "Extract all dashboard metrics"
-
-# Multi-step form
-just-scrape agentic-scraper https://example.com/wizard \
-  -s "Click Next,Select Premium plan,Fill name with John,Click Submit"
-
-# Persistent session across runs
-just-scrape agentic-scraper https://app.example.com \
-  -s "Click Settings" --use-session
+just-scrape markdownify https://blog.example.com/my-article
+just-scrape markdownify https://protected.example.com -m js --stealth
+just-scrape markdownify https://docs.example.com/api --json | jq -r '.results.markdown.data[0]' > api-docs.md
 ```
 
-### Generate Schema
+### Crawl
 
-Generate a JSON schema from a natural language description.
+Crawl multiple pages. The CLI starts the crawl and polls until completion.
 
 ```bash
-just-scrape generate-schema <description>
-just-scrape generate-schema <description> --existing-schema <json>
+just-scrape crawl <url>
+just-scrape crawl <url> --max-pages <n>            # default 50
+just-scrape crawl <url> --max-depth <n>            # default 2
+just-scrape crawl <url> --max-links-per-page <n>   # default 10
+just-scrape crawl <url> --allow-external           # allow external domains
+just-scrape crawl <url> -m js --stealth            # anti-bot bypass
 ```
 
 ```bash
-just-scrape generate-schema "E-commerce product with name, price, ratings, and reviews array"
-
-# Refine an existing schema
-just-scrape generate-schema "Add an availability field" \
-  --existing-schema '{"type":"object","properties":{"name":{"type":"string"},"price":{"type":"number"}}}'
-```
-
-### Sitemap
+# Crawl docs site
+just-scrape crawl https://docs.example.com --max-pages 20 --max-depth 3
 
-Get all URLs from a website's sitemap.
+# Crawl staying within domain
+just-scrape crawl https://example.com --max-pages 50
 
-```bash
-just-scrape sitemap <url>
-just-scrape sitemap https://example.com --json | jq -r '.urls[]'
+# Get crawl results as JSON
+just-scrape crawl https://example.com --json --max-pages 10
 ```
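What "starts the crawl and polls until completion" means in practice, condensed from `crawlCommand` in src/commands.ts (a sketch of the same loop):

```ts
import { crawl } from "scrapegraph-js";

// Start the job, then poll every 3 s until it reaches a terminal status
// (completed, failed, or deleted), exactly as the CLI does.
const apiKey = process.env.SGAI_API_KEY ?? "";
const job = await crawl.start(apiKey, {
  url: "https://docs.example.com",
  formats: [{ type: "markdown", mode: "normal" }],
  maxPages: 20,
});
const { id } = job.data as { id: string };

for (;;) {
  await new Promise((r) => setTimeout(r, 3000));
  const res = await crawl.get(apiKey, id);
  const status = (res.data as { status: string }).status;
  if (status === "completed" || status === "failed" || status === "deleted") {
    console.log(res.data);
    break;
  }
}
```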
 
 ### History
 
@@ -225,67 +166,59 @@ Browse request history. Interactive by default (arrow keys to navigate, select t
 just-scrape history                         # interactive browser
 just-scrape history <service> <request-id>  # specific request
 just-scrape history <service> --page <n>
-just-scrape history <service> --page-size <n>  # max 100
+just-scrape history <service> --page-size <n>  # default 20, max 100
 just-scrape history <service> --json
 ```
 
-Services: `markdownify`, `smartscraper`, `searchscraper`, `scrape`, `crawl`, `agentic-scraper`, `sitemap`
+Services: `scrape`, `extract`, `schema`, `search`, `monitor`, `crawl`
 
 ```bash
-just-scrape history smartscraper
-just-scrape history crawl --json --page-size 100 | jq '.requests[] | {id: .request_id, status}'
+just-scrape history extract
+just-scrape history crawl --json --page-size 100 | jq '.[].status'
 ```
 
-### Credits & Validate
+### Credits
 
 ```bash
 just-scrape credits
-just-scrape credits --json | jq '.remaining_credits'
-just-scrape validate
+just-scrape credits --json | jq '.remaining'
 ```
 
 ## Common Patterns
 
-### Generate schema then scrape with it
-
-```bash
-just-scrape generate-schema "Product with name, price, and reviews" --json | jq '.schema' > schema.json
-just-scrape smart-scraper https://store.example.com -p "Extract products" --schema "$(cat schema.json)"
-```
-
 ### Pipe JSON for scripting
 
 ```bash
-just-scrape sitemap https://example.com --json | jq -r '.urls[]' | while read url; do
-  just-scrape smart-scraper "$url" -p "Extract title" --json >> results.jsonl
-done
+just-scrape extract https://example.com -p "Extract all links" --json | jq '.data'
 ```
 
 ### Protected sites
 
 ```bash
 # JS-heavy SPA behind Cloudflare
-just-scrape smart-scraper https://protected.example.com -p "Extract data" --stealth
+just-scrape extract https://protected.example.com -p "Extract data" --stealth
 
 # With custom cookies/headers
-just-scrape smart-scraper https://example.com -p "Extract data" \
+just-scrape extract https://example.com -p "Extract data" \
   --cookies '{"session":"abc123"}' --headers '{"Authorization":"Bearer token"}'
 ```
 
-## Credit Costs
+## Fetch Modes
 
-| Feature | Extra Credits |
+| Mode | Description |
 |---|---|
-| `--stealth` | +4 per request |
-| `--branding` (scrape only) | +2 |
-| `search-scraper` extraction | 10 per request |
-| `search-scraper --no-extraction` | 2 per request |
-| `crawl --no-extraction` | 2 per page |
+| `auto` | Automatic selection (default) |
+| `fast` | Fastest, no JS rendering |
+| `js` | Full JS rendering |
+
+Stealth (anti-bot bypass) is a separate `--stealth` boolean and can be combined with any mode.
 
 ## Environment Variables
 
 ```bash
 SGAI_API_KEY=sgai-...  # API key
-JUST_SCRAPE_TIMEOUT_S=300  # Request timeout in seconds (default 120)
-JUST_SCRAPE_DEBUG=1        # Debug logging to stderr
+SGAI_API_URL=...       # Override API base URL (default: https://api.scrapegraphai.com/api/v2)
+SGAI_TIMEOUT=300       # Request timeout in seconds (default 120)
+SGAI_DEBUG=1           # Log requests/responses
 ```
+
+Legacy variables (`JUST_SCRAPE_API_URL`, `JUST_SCRAPE_TIMEOUT_S`, `JUST_SCRAPE_DEBUG`) are still bridged.
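The bridge itself lives in src/lib/env.ts, which this patch touches but does not show; its documented behavior amounts to a copy-if-unset mapping. A minimal sketch under that assumption (variable names are the ones listed above; the exact implementation may differ):

```ts
// Copy each legacy variable to its SGAI_* successor unless the new
// name is already set, so old environments keep working transparently.
const BRIDGE: Record<string, string> = {
  JUST_SCRAPE_API_URL: "SGAI_API_URL",
  JUST_SCRAPE_TIMEOUT_S: "SGAI_TIMEOUT",
  SGAI_TIMEOUT_S: "SGAI_TIMEOUT",
  JUST_SCRAPE_DEBUG: "SGAI_DEBUG",
};

for (const [legacy, current] of Object.entries(BRIDGE)) {
  if (process.env[legacy] && !process.env[current]) {
    process.env[current] = process.env[legacy];
  }
}
```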
diff --git a/src/cli.ts b/src/cli.ts index 483a94c..e2d2400 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1,6 +1,16 @@ import "dotenv/config"; import "./lib/env.js"; import { defineCommand, runMain } from "citty"; +import { + crawlCommand, + creditsCommand, + extractCommand, + historyCommand, + markdownifyCommand, + monitorCommand, + scrapeCommand, + searchCommand, +} from "./commands.js"; import { getVersion, showBanner } from "./utils/banner.js"; showBanner(); @@ -12,17 +22,14 @@ const main = defineCommand({ description: "ScrapeGraph AI CLI tool", }, subCommands: { - "smart-scraper": () => import("./commands/smart-scraper.js").then((m) => m.default), - "search-scraper": () => import("./commands/search-scraper.js").then((m) => m.default), - markdownify: () => import("./commands/markdownify.js").then((m) => m.default), - crawl: () => import("./commands/crawl.js").then((m) => m.default), - sitemap: () => import("./commands/sitemap.js").then((m) => m.default), - scrape: () => import("./commands/scrape.js").then((m) => m.default), - "agentic-scraper": () => import("./commands/agentic-scraper.js").then((m) => m.default), - "generate-schema": () => import("./commands/generate-schema.js").then((m) => m.default), - history: () => import("./commands/history.js").then((m) => m.default), - credits: () => import("./commands/credits.js").then((m) => m.default), - validate: () => import("./commands/validate.js").then((m) => m.default), + extract: extractCommand, + search: searchCommand, + scrape: scrapeCommand, + markdownify: markdownifyCommand, + crawl: crawlCommand, + monitor: monitorCommand, + history: historyCommand, + credits: creditsCommand, }, }); diff --git a/src/commands.ts b/src/commands.ts new file mode 100644 index 0000000..2ea4851 --- /dev/null +++ b/src/commands.ts @@ -0,0 +1,873 @@ +import * as p from "@clack/prompts"; +import chalk from "chalk"; +import { defineCommand } from "citty"; +import { crawl, extract, getCredits, history, monitor, scrape, search } from "scrapegraph-js"; +import { getApiKey } from "./lib/client.js"; +import * as log from "./lib/log.js"; + +// --------------------------------------------------------------------------- +// extract +// --------------------------------------------------------------------------- + +export const extractCommand = defineCommand({ + meta: { + name: "extract", + description: "Extract structured data from a URL using AI", + }, + args: { + url: { + type: "positional", + description: "Website URL to scrape", + required: true, + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt", + required: true, + }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + mode: { type: "string", description: "HTML processing mode: normal (default), reader, prune" }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, + stealth: { type: "boolean", description: "Enable stealth mode" }, + cookies: { type: "string", description: "Cookies as JSON object string" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/extract"); + const apiKey = await getApiKey(!!args.json); + + const fetchConfig: Record = {}; + if (args.scrolls) fetchConfig.scrolls = 
Number(args.scrolls); + if (args.stealth) fetchConfig.stealth = true; + if (args.cookies) fetchConfig.cookies = JSON.parse(args.cookies); + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); + if (args.country) fetchConfig.country = args.country; + + const params: Record = { + url: args.url, + prompt: args.prompt, + }; + if (args.schema) params.schema = JSON.parse(args.schema); + if (args.mode) params.mode = args.mode; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; + + out.start("Extracting"); + try { + const result = await extract(apiKey, params as any); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + }, +}); + +// --------------------------------------------------------------------------- +// search +// --------------------------------------------------------------------------- + +export const searchCommand = defineCommand({ + meta: { + name: "search", + description: "Search the web and extract data with AI", + }, + args: { + query: { + type: "positional", + description: "Search query", + required: true, + }, + prompt: { + type: "string", + alias: "p", + description: "Extraction prompt for search results", + }, + "num-results": { + type: "string", + description: "Number of websites to scrape (1-20, default 3)", + }, + schema: { type: "string", description: "Output JSON schema (as JSON string)" }, + country: { + type: "string", + description: "Country code for geo-targeted search (e.g. 'us', 'de', 'jp')", + }, + "time-range": { + type: "string", + description: + "Filter results by recency: past_hour, past_24_hours, past_week, past_month, past_year", + }, + format: { + type: "string", + description: "Result format: markdown (default) or html", + }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/search"); + const apiKey = await getApiKey(!!args.json); + + const params: Record = { query: args.query }; + if (args["num-results"]) params.numResults = Number(args["num-results"]); + if (args.schema) params.schema = JSON.parse(args.schema); + if (args.prompt) params.prompt = args.prompt; + if (args.country) params.country = args.country; + if (args["time-range"]) params.timeRange = args["time-range"]; + if (args.format) params.format = args.format; + if (args.headers) params.fetchConfig = { headers: JSON.parse(args.headers) }; + + out.start("Searching"); + try { + const result = await search(apiKey, params as any); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? 
err.message : String(err)); + } + }, +}); + +// --------------------------------------------------------------------------- +// scrape +// --------------------------------------------------------------------------- + +const SCRAPE_FORMATS = [ + "markdown", + "html", + "screenshot", + "branding", + "links", + "images", + "summary", + "json", +] as const; +type ScrapeFormat = (typeof SCRAPE_FORMATS)[number]; + +export const scrapeCommand = defineCommand({ + meta: { + name: "scrape", + description: + "Scrape content from a URL (markdown, html, screenshot, branding, links, images, summary, json)", + }, + args: { + url: { + type: "positional", + description: "Website URL to scrape", + required: true, + }, + format: { + type: "string", + alias: "f", + description: `Output format: ${SCRAPE_FORMATS.join(", ")} (default: markdown). Comma-separate for multi-format output.`, + }, + prompt: { + type: "string", + alias: "p", + description: "Prompt for json format (required when --format includes json)", + }, + schema: { + type: "string", + description: "Schema for json format (JSON string)", + }, + mode: { + type: "string", + alias: "m", + description: "Fetch mode: auto (default), fast, js", + }, + stealth: { type: "boolean", description: "Enable stealth mode" }, + "html-mode": { + type: "string", + description: "HTML/markdown extraction mode: normal (default), reader, prune", + }, + scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" }, + country: { type: "string", description: "ISO country code for geo-targeting" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); + const apiKey = await getApiKey(!!args.json); + + const fetchConfig: Record = {}; + if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; + if (args.scrolls) fetchConfig.scrolls = Number(args.scrolls); + if (args.country) fetchConfig.country = args.country; + + const requestedFormats = (args.format ?? "markdown") + .split(",") + .map((f) => f.trim()) + .filter(Boolean) as ScrapeFormat[]; + const htmlMode = (args["html-mode"] as "normal" | "reader" | "prune" | undefined) ?? "normal"; + + const formats = requestedFormats.map((f) => { + switch (f) { + case "markdown": + return { type: "markdown" as const, mode: htmlMode }; + case "html": + return { type: "html" as const, mode: htmlMode }; + case "screenshot": + return { type: "screenshot" as const }; + case "branding": + return { type: "branding" as const }; + case "links": + return { type: "links" as const }; + case "images": + return { type: "images" as const }; + case "summary": + return { type: "summary" as const }; + case "json": { + if (!args.prompt) { + out.error("--prompt is required when --format includes json"); + return { type: "json" as const, prompt: "" }; + } + return { + type: "json" as const, + prompt: args.prompt, + schema: args.schema ? JSON.parse(args.schema) : undefined, + mode: htmlMode, + }; + } + default: + out.error(`Unknown format: ${f}. 
Valid: ${SCRAPE_FORMATS.join(", ")}`); + return { type: "markdown" as const, mode: htmlMode }; + } + }); + + const params: Record = { url: args.url, formats }; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; + + out.start("Scraping"); + try { + const result = await scrape(apiKey, params as any); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + }, +}); + +// --------------------------------------------------------------------------- +// markdownify β€” convenience wrapper for `scrape --format markdown` +// --------------------------------------------------------------------------- + +export const markdownifyCommand = defineCommand({ + meta: { + name: "markdownify", + description: "Convert a webpage to clean markdown", + }, + args: { + url: { + type: "positional", + description: "Website URL to convert", + required: true, + }, + mode: { type: "string", alias: "m", description: "Fetch mode: auto (default), fast, js" }, + stealth: { type: "boolean", description: "Enable stealth mode" }, + headers: { type: "string", description: "Custom headers as JSON object string" }, + json: { type: "boolean", description: "Output raw JSON (pipeable)" }, + }, + run: async ({ args }) => { + const out = log.create(!!args.json); + out.docs("https://docs.scrapegraphai.com/api-reference/scrape"); + const apiKey = await getApiKey(!!args.json); + + const fetchConfig: Record = {}; + if (args.mode) fetchConfig.mode = args.mode; + if (args.stealth) fetchConfig.stealth = true; + if (args.headers) fetchConfig.headers = JSON.parse(args.headers); + + const params: Record = { + url: args.url, + formats: [{ type: "markdown", mode: "normal" }], + }; + if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig; + + out.start("Converting to markdown"); + try { + const result = await scrape(apiKey, params as any); + out.stop(result.elapsedMs); + out.result(result.data); + } catch (err) { + out.stop(0); + out.error(err instanceof Error ? err.message : String(err)); + } + }, +}); + +// --------------------------------------------------------------------------- +// crawl β€” starts a job then polls until completion +// --------------------------------------------------------------------------- + +const CRAWL_POLL_INTERVAL_MS = 3000; + +export const crawlCommand = defineCommand({ + meta: { + name: "crawl", + description: "Crawl and extract data from multiple pages", + }, + args: { + url: { + type: "positional", + description: "Starting URL to crawl", + required: true, + }, + "max-pages": { type: "string", description: "Maximum pages to crawl (default 50)" }, + "max-depth": { type: "string", description: "Crawl depth (default 2)" }, + "max-links-per-page": { type: "string", description: "Max links per page (default 10)" }, + "allow-external": { type: "boolean", description: "Allow crawling external domains" }, + format: { + type: "string", + alias: "f", + description: + "Page format: markdown (default), html, screenshot, branding, links, images, summary. 
Comma-separate for multi-format.",
+    },
+    mode: {
+      type: "string",
+      alias: "m",
+      description: "Fetch mode: auto (default), fast, js",
+    },
+    stealth: { type: "boolean", description: "Enable stealth mode" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const out = log.create(!!args.json);
+    out.docs("https://docs.scrapegraphai.com/api-reference/crawl");
+    const apiKey = await getApiKey(!!args.json);
+
+    const requestedFormats = (args.format ?? "markdown")
+      .split(",")
+      .map((f) => f.trim())
+      .filter(Boolean);
+
+    const formats = requestedFormats.map((f) => {
+      if (f === "markdown" || f === "html")
+        return { type: f as "markdown" | "html", mode: "normal" as const };
+      return { type: f };
+    });
+
+    const params: Record<string, unknown> = { url: args.url, formats };
+    if (args["max-pages"]) params.maxPages = Number(args["max-pages"]);
+    if (args["max-depth"]) params.maxDepth = Number(args["max-depth"]);
+    if (args["max-links-per-page"]) params.maxLinksPerPage = Number(args["max-links-per-page"]);
+    if (args["allow-external"]) params.allowExternal = true;
+
+    const fetchConfig: Record<string, unknown> = {};
+    if (args.mode) fetchConfig.mode = args.mode;
+    if (args.stealth) fetchConfig.stealth = true;
+    if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig;
+
+    out.start("Crawling");
+    try {
+      const job = await crawl.start(apiKey, params as any);
+      const jobData = job.data as { id: string; status: string } | null;
+
+      if (!jobData?.id) {
+        out.stop(job.elapsedMs);
+        out.result(job.data);
+        return;
+      }
+
+      while (true) {
+        await new Promise((r) => setTimeout(r, CRAWL_POLL_INTERVAL_MS));
+        const status = await crawl.get(apiKey, jobData.id);
+        const statusData = status.data as { status: string; [key: string]: unknown } | null;
+        out.poll(statusData?.status ?? "unknown");
+
+        if (
+          statusData?.status === "completed" ||
+          statusData?.status === "failed" ||
+          statusData?.status === "deleted"
+        ) {
+          out.stop(job.elapsedMs + status.elapsedMs);
+          out.result(status.data);
+          return;
+        }
+      }
+    } catch (err) {
+      out.stop(0);
+      out.error(err instanceof Error ? err.message : String(err));
+    }
+  },
+});
+
+// ---------------------------------------------------------------------------
+// monitor — create, list, get, update, delete, pause, resume, activity
+// ---------------------------------------------------------------------------
+
+const MONITOR_ACTIONS = [
+  "create",
+  "list",
+  "get",
+  "update",
+  "delete",
+  "pause",
+  "resume",
+  "activity",
+] as const;
+type MonitorAction = (typeof MONITOR_ACTIONS)[number];
+
+const MONITOR_FORMATS = [
+  "markdown",
+  "html",
+  "screenshot",
+  "branding",
+  "links",
+  "images",
+  "summary",
+  "json",
+] as const;
+
+export const monitorCommand = defineCommand({
+  meta: {
+    name: "monitor",
+    description: "Create and manage page-change monitors",
+  },
+  args: {
+    action: {
+      type: "positional",
+      description: `Action: ${MONITOR_ACTIONS.join(", ")}`,
+      required: true,
+    },
+    url: {
+      type: "string",
+      description: "URL to monitor (for create)",
+    },
+    id: {
+      type: "string",
+      description: "Monitor ID (for get, update, delete, pause, resume, activity)",
+    },
+    name: {
+      type: "string",
+      description: "Monitor name",
+    },
+    interval: {
+      type: "string",
+      description: "Check interval (e.g. '1h', '30m', '1d') — required for create",
+    },
+    format: {
+      type: "string",
+      alias: "f",
+      description: `Formats to track: ${MONITOR_FORMATS.join(", ")} (default: markdown). Comma-separate for multi-format.`,
+    },
+    "webhook-url": {
+      type: "string",
+      description: "Webhook URL to notify on changes",
+    },
+    mode: {
+      type: "string",
+      alias: "m",
+      description: "Fetch mode: auto (default), fast, js",
+    },
+    stealth: { type: "boolean", description: "Enable stealth mode" },
+    limit: {
+      type: "string",
+      description: "Ticks per page for activity (max 100)",
+    },
+    cursor: {
+      type: "string",
+      description: "Pagination cursor for activity",
+    },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const out = log.create(!!args.json);
+    const apiKey = await getApiKey(!!args.json);
+    const action = args.action as MonitorAction;
+
+    const buildFormats = (raw: string) =>
+      raw
+        .split(",")
+        .map((f) => f.trim())
+        .filter(Boolean)
+        .map((f) => {
+          if (f === "markdown" || f === "html") return { type: f, mode: "normal" as const };
+          return { type: f };
+        });
+
+    switch (action) {
+      case "create": {
+        if (!args.url) {
+          out.error("--url is required for create");
+          return;
+        }
+        if (!args.interval) {
+          out.error("--interval is required for create");
+          return;
+        }
+
+        const params: Record<string, unknown> = {
+          url: args.url,
+          interval: args.interval,
+          formats: buildFormats(args.format ?? "markdown"),
+        };
+        if (args.name) params.name = args.name;
+        if (args["webhook-url"]) params.webhookUrl = args["webhook-url"];
+
+        const fetchConfig: Record<string, unknown> = {};
+        if (args.mode) fetchConfig.mode = args.mode;
+        if (args.stealth) fetchConfig.stealth = true;
+        if (Object.keys(fetchConfig).length > 0) params.fetchConfig = fetchConfig;
+
+        out.start("Creating monitor");
+        try {
+          const result = await monitor.create(apiKey, params as any);
+          out.stop(result.elapsedMs);
+          out.result(result.data);
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      case "list": {
+        out.start("Fetching monitors");
+        try {
+          const result = await monitor.list(apiKey);
+          out.stop(result.elapsedMs);
+
+          if (args.json) {
+            out.result(result.data);
+            return;
+          }
+
+          const monitors = result.data as Array<Record<string, unknown>> | null;
+          if (!monitors?.length) {
+            p.log.warning("No monitors found.");
+            return;
+          }
+
+          for (const m of monitors) {
+            const status = String(m.status ?? "");
+            const color = status === "active" ? chalk.green : chalk.yellow;
+            p.log.info(
+              `${chalk.dim(String(m.cronId ?? m.scheduleId ?? ""))} ${color(status)} ${String((m.config as Record<string, unknown>)?.url ?? "")} ${chalk.dim(String(m.interval ?? ""))}`,
+            );
+          }
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      case "get": {
+        if (!args.id) {
+          out.error("--id is required for get");
+          return;
+        }
+        out.start("Fetching monitor");
+        try {
+          const result = await monitor.get(apiKey, args.id);
+          out.stop(result.elapsedMs);
+          out.result(result.data);
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      case "update": {
+        if (!args.id) {
+          out.error("--id is required for update");
+          return;
+        }
+        const params: Record<string, unknown> = {};
+        if (args.name) params.name = args.name;
+        if (args.interval) params.interval = args.interval;
+        if (args["webhook-url"]) params.webhookUrl = args["webhook-url"];
+        if (args.format) params.formats = buildFormats(args.format);
+
+        out.start("Updating monitor");
+        try {
+          const result = await monitor.update(apiKey, args.id, params as any);
+          out.stop(result.elapsedMs);
+          out.result(result.data);
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      case "delete": {
+        if (!args.id) {
+          out.error("--id is required for delete");
+          return;
+        }
+        out.start("Deleting monitor");
+        try {
+          const result = await monitor.delete(apiKey, args.id);
+          out.stop(result.elapsedMs);
+          out.result(result.data);
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      case "pause": {
+        if (!args.id) {
+          out.error("--id is required for pause");
+          return;
+        }
+        out.start("Pausing monitor");
+        try {
+          const result = await monitor.pause(apiKey, args.id);
+          out.stop(result.elapsedMs);
+          out.result(result.data);
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      case "resume": {
+        if (!args.id) {
+          out.error("--id is required for resume");
+          return;
+        }
+        out.start("Resuming monitor");
+        try {
+          const result = await monitor.resume(apiKey, args.id);
+          out.stop(result.elapsedMs);
+          out.result(result.data);
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      case "activity": {
+        if (!args.id) {
+          out.error("--id is required for activity");
+          return;
+        }
+        const params: { limit?: number; cursor?: string } = {};
+        if (args.limit) params.limit = Number(args.limit);
+        if (args.cursor) params.cursor = args.cursor;
+
+        out.start("Fetching monitor activity");
+        try {
+          const result = await monitor.activity(apiKey, args.id, params);
+          out.stop(result.elapsedMs);
+          out.result(result.data);
+        } catch (err) {
+          out.stop(0);
+          out.error(err instanceof Error ? err.message : String(err));
+        }
+        break;
+      }
+
+      default:
+        out.error(`Unknown action: ${action}. Valid: ${MONITOR_ACTIONS.join(", ")}`);
+    }
+  },
+});
+
+// ---------------------------------------------------------------------------
+// history — interactive browser with pagination, or direct by request-id
+// ---------------------------------------------------------------------------
+
+const HISTORY_SERVICES = ["scrape", "extract", "schema", "search", "monitor", "crawl"] as const;
+const HISTORY_LOAD_MORE = "__load_more__";
+
+type HistoryRow = Record<string, unknown>;
+
+function historyId(row: HistoryRow): string {
+  return String(row.id ?? "unknown");
+}
+
+function historyLabel(row: HistoryRow): string {
+  const id = historyId(row);
+  const short = id.length > 12 ? `${id.slice(0, 12)}...` : id;
+  const status = String(row.status ?? "—");
+
+  const params = row.params as Record<string, unknown> | undefined;
+  const url = String(params?.url ?? params?.query ?? "");
+  const urlShort = url.length > 50 ? `${url.slice(0, 49)}...` : url;
+
+  const color =
+    status === "completed" ? chalk.green : status === "failed" ? chalk.red : chalk.yellow;
+
+  return `${chalk.dim(short)} ${color(status)} ${urlShort}`;
+}
+
+function historyHint(row: HistoryRow): string {
+  const ts = row.createdAt;
+  if (!ts) return "";
+  const d = new Date(String(ts));
+  return Number.isNaN(d.getTime()) ? String(ts) : d.toLocaleString();
+}
+
+export const historyCommand = defineCommand({
+  meta: {
+    name: "history",
+    description: "View request history for a service",
+  },
+  args: {
+    service: {
+      type: "positional",
+      description: `Service name (${HISTORY_SERVICES.join(", ")})`,
+      required: false,
+    },
+    page: { type: "string", description: "Page number (default: 1)" },
+    "page-size": { type: "string", description: "Results per page (default: 20, max: 100)" },
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const quiet = !!args.json;
+    const out = log.create(quiet);
+    const apiKey = await getApiKey(quiet);
+    const service = args.service as (typeof HISTORY_SERVICES)[number] | undefined;
+    const requestId = (args as { _: string[] })._.at(1);
+    const limit = args["page-size"] ? Number(args["page-size"]) : 20;
+    let page = args.page ? Number(args.page) : 1;
+
+    const fetchPage = async (pg: number) => {
+      const params: Record<string, unknown> = { page: pg, limit };
+      if (service) params.service = service;
+      const r = await history.list(apiKey, params as any);
+      const d = r.data as {
+        data?: HistoryRow[];
+        pagination?: { page: number; limit: number; total: number };
+      } | null;
+      const rows = d?.data ?? [];
+      const total = d?.pagination?.total ?? 0;
+      return {
+        rows,
+        hasMore: total > pg * limit,
+        ms: r.elapsedMs,
+      };
+    };
+
+    if (quiet && !requestId) {
+      try {
+        const { rows } = await fetchPage(page);
+        out.result(rows);
+      } catch (err) {
+        out.error(err instanceof Error ? err.message : String(err));
+      }
+      return;
+    }
+
+    if (requestId) {
+      try {
+        const result = await history.get(apiKey, requestId);
+        out.result(result.data);
+      } catch (err) {
+        out.error(err instanceof Error ? err.message : String(err));
+      }
+      return;
+    }
+
+    out.start(`Fetching ${service ?? "all"} history`);
+    try {
+      const first = await fetchPage(page);
+      out.stop(first.ms);
+
+      if (first.rows.length === 0) {
+        p.log.warning("No history found.");
+        return;
+      }
+
+      const allRows = [...first.rows];
+      let hasMore = first.hasMore;
+
+      while (true) {
+        const options = allRows.map((row) => ({
+          value: historyId(row),
+          label: historyLabel(row),
+          hint: historyHint(row),
+        }));
+
+        if (hasMore) {
+          options.push({
+            value: HISTORY_LOAD_MORE,
+            label: chalk.blue.bold("↓ Load more…"),
+            hint: `page ${page + 1}`,
+          });
+        }
+
+        const selected = await p.select({
+          message: `${allRows.length} requests — select one to view`,
+          options,
+          maxItems: 15,
+        });
+
+        if (p.isCancel(selected)) {
+          p.cancel("Cancelled");
+          return;
+        }
+
+        if (selected === HISTORY_LOAD_MORE) {
+          page++;
+          const ls = p.spinner();
+          ls.start(`Loading page ${page}`);
+          const next = await fetchPage(page);
+          ls.stop("Done");
+
+          if (next.rows.length === 0) {
+            hasMore = false;
+            p.log.warning("No more results.");
+            continue;
+          }
+
+          allRows.push(...next.rows);
+          hasMore = next.hasMore;
+          continue;
+        }
+
+        const match = allRows.find((r) => historyId(r) === selected);
+        if (match) out.result(match);
+
+        const back = await p.confirm({ message: "Back to list?" });
+        if (p.isCancel(back) || !back) return;
+      }
+    } catch (err) {
+      out.error(err instanceof Error ? err.message : String(err));
+    }
+  },
+});
+
+// ---------------------------------------------------------------------------
+// credits
+// ---------------------------------------------------------------------------
+
+export const creditsCommand = defineCommand({
+  meta: {
+    name: "credits",
+    description: "Check your credit balance",
+  },
+  args: {
+    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
+  },
+  run: async ({ args }) => {
+    const out = log.create(!!args.json);
+    const apiKey = await getApiKey(!!args.json);
+
+    out.start("Fetching credits");
+    try {
+      const result = await getCredits(apiKey);
+      out.stop(result.elapsedMs);
+      out.result(result.data);
+    } catch (err) {
+      out.stop(0);
+      out.error(err instanceof Error ? err.message : String(err));
+    }
+  },
+});
diff --git a/src/commands/.gitkeep b/src/commands/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/src/commands/agentic-scraper.ts b/src/commands/agentic-scraper.ts
deleted file mode 100644
index 67e9b5b..0000000
--- a/src/commands/agentic-scraper.ts
+++ /dev/null
@@ -1,51 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "agentic-scraper",
-    description: "Browser automation with AI (login, click, navigate, fill forms)",
-  },
-  args: {
-    url: {
-      type: "positional",
-      description: "Starting URL",
-      required: true,
-    },
-    steps: {
-      type: "string",
-      alias: "s",
-      description: 'Comma-separated browser steps (e.g. "Click login,Fill email with x")',
-    },
-    prompt: {
-      type: "string",
-      alias: "p",
-      description: "Extraction prompt (used with --ai-extraction)",
-    },
-    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
-    "ai-extraction": { type: "boolean", description: "Enable AI extraction after steps" },
-    "use-session": { type: "boolean", description: "Persist browser session across requests" },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    out.docs("https://docs.scrapegraphai.com/services/agenticscraper");
-    const key = await resolveApiKey(!!args.json);
-
-    const steps = args.steps ? args.steps.split(",").map((s) => s.trim()) : [];
-    const params: scrapegraphai.AgenticScraperParams = { url: args.url, steps };
-    if (args.prompt) params.user_prompt = args.prompt;
-    if (args.schema) params.output_schema = JSON.parse(args.schema);
-    if (args["ai-extraction"]) params.ai_extraction = true;
-    if (args["use-session"]) params.use_session = true;
-
-    out.start("Running browser automation");
-    const result = await scrapegraphai.agenticScraper(key, params);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/crawl.ts b/src/commands/crawl.ts
deleted file mode 100644
index d55101d..0000000
--- a/src/commands/crawl.ts
+++ /dev/null
@@ -1,62 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "crawl",
-    description: "Crawl and extract data from multiple pages",
-  },
-  args: {
-    url: {
-      type: "positional",
-      description: "Starting URL to crawl",
-      required: true,
-    },
-    prompt: {
-      type: "string",
-      alias: "p",
-      description: "Extraction prompt (required when extraction mode is on)",
-    },
-    "no-extraction": {
-      type: "boolean",
-      description: "Return markdown only (2 credits/page instead of 10)",
-    },
-    "max-pages": { type: "string", description: "Maximum pages to crawl (default 10)" },
-    depth: { type: "string", description: "Crawl depth (default 1)" },
-    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
-    rules: { type: "string", description: "Crawl rules as JSON object string" },
-    "no-sitemap": { type: "boolean", description: "Disable sitemap-based URL discovery" },
-    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    out.docs("https://docs.scrapegraphai.com/services/smartcrawler");
-    const key = await resolveApiKey(!!args.json);
-
-    const base: Record<string, unknown> = { url: args.url };
-    if (args["max-pages"]) base.max_pages = Number(args["max-pages"]);
-    if (args.depth) base.depth = Number(args.depth);
-    if (args.rules) base.rules = JSON.parse(args.rules);
-    if (args["no-sitemap"]) base.sitemap = false;
-    if (args.stealth) base.stealth = true;
-
-    if (args["no-extraction"]) {
-      base.extraction_mode = false;
-    } else {
-      if (args.prompt) base.prompt = args.prompt;
-      if (args.schema) base.schema = JSON.parse(args.schema);
-    }
-
-    const params = base as scrapegraphai.CrawlParams;
-
-    out.start("Crawling");
-    const result = await scrapegraphai.crawl(key, params, out.poll);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/credits.ts b/src/commands/credits.ts
deleted file mode 100644
index 0d7b75f..0000000
--- a/src/commands/credits.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "credits",
-    description: "Check your credit balance",
-  },
-  args: {
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    const key = await resolveApiKey(!!args.json);
-
-    out.start("Fetching credits");
-    const result = await scrapegraphai.getCredits(key);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/generate-schema.ts b/src/commands/generate-schema.ts
deleted file mode 100644
index 8d77e57..0000000
--- a/src/commands/generate-schema.ts
+++ /dev/null
@@ -1,37 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "generate-schema",
-    description: "Generate a JSON schema from a natural language prompt",
-  },
-  args: {
-    prompt: {
-      type: "positional",
-      description: "Describe the schema you need",
-      required: true,
-    },
-    "existing-schema": {
-      type: "string",
-      description: "Existing schema to modify (as JSON string)",
-    },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    const key = await resolveApiKey(!!args.json);
-
-    const params: scrapegraphai.GenerateSchemaParams = { user_prompt: args.prompt };
-    if (args["existing-schema"]) params.existing_schema = JSON.parse(args["existing-schema"]);
-
-    out.start("Generating schema");
-    const result = await scrapegraphai.generateSchema(key, params);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/history.ts b/src/commands/history.ts
deleted file mode 100644
index 99ab59e..0000000
--- a/src/commands/history.ts
+++ /dev/null
@@ -1,146 +0,0 @@
-import * as p from "@clack/prompts";
-import chalk from "chalk";
-import { defineCommand } from "citty";
-import { HISTORY_SERVICES } from "scrapegraph-js";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-const VALID = HISTORY_SERVICES.join(", ");
-const LOAD_MORE = "__load_more__";
-
-function getId(row: Record<string, unknown>): string {
-  return String(row.request_id ?? row.crawl_id ?? row.id ?? "unknown");
-}
-
-function label(row: Record<string, unknown>): string {
-  const id = getId(row);
-  const short = id.length > 12 ? `${id.slice(0, 12)}…` : id;
-  const status = String(row.status ?? "—");
-  const url = String(row.website_url ?? row.url ?? row.user_prompt ?? "");
-  const urlShort = url.length > 50 ? `${url.slice(0, 49)}…` : url;
-
-  const color =
-    status === "completed" || status === "done"
-      ? chalk.green
-      : status === "failed"
-        ? chalk.red
-        : chalk.yellow;
-
-  return `${chalk.dim(short)} ${color(status)} ${urlShort}`;
-}
-
-function hint(row: Record<string, unknown>): string {
-  const ts = row.created_at ?? row.timestamp ?? row.updated_at;
-  if (!ts) return "";
-  const d = new Date(String(ts));
-  return Number.isNaN(d.getTime()) ? String(ts) : d.toLocaleString();
-}
-
-export default defineCommand({
-  meta: {
-    name: "history",
-    description: "View request history for a service",
-  },
-  args: {
-    service: {
-      type: "positional",
-      description: `Service name (${VALID})`,
-      required: true,
-    },
-    page: { type: "string", description: "Page number (default: 1)" },
-    "page-size": { type: "string", description: "Results per page (default: 10, max: 100)" },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const quiet = !!args.json;
-    const out = log.create(quiet);
-    const key = await resolveApiKey(quiet);
-    const service = args.service as scrapegraphai.HistoryParams["service"];
-    const requestId = (args as { _: string[] })._.at(1);
-    const pageSize = args["page-size"] ? Number(args["page-size"]) : 10;
-    let page = args.page ? Number(args.page) : 1;
-
-    const fetchPage = async (pg: number) => {
-      const r = await scrapegraphai.history(key, { service, page: pg, page_size: pageSize });
-      if (r.status === "error") out.error(r.error);
-      const d = r.data as { requests: Record<string, unknown>[]; next_key?: string };
-      return { rows: d.requests ?? [], hasMore: !!d.next_key, ms: r.elapsedMs };
-    };
-
-    if (quiet || requestId) {
-      const { rows } = await fetchPage(page);
-      if (requestId) {
-        const match = rows.find((r) => getId(r) === requestId);
-        if (!match) out.error(`Request ${requestId} not found on page ${page}`);
-        out.result(match);
-        return;
-      }
-      out.result(rows);
-      return;
-    }
-
-    out.start(`Fetching ${service} history`);
-    const first = await fetchPage(page);
-    out.stop(first.ms);
-
-    if (first.rows.length === 0) {
-      p.log.warning("No history found.");
-      return;
-    }
-
-    const allRows = [...first.rows];
-    let hasMore = first.hasMore;
-
-    while (true) {
-      const options = allRows.map((row) => ({
-        value: getId(row),
-        label: label(row),
-        hint: hint(row),
-      }));
-
-      if (hasMore) {
-        options.push({
-          value: LOAD_MORE,
-          label: chalk.blue.bold("↓ Load more…"),
-          hint: `page ${page + 1}`,
-        });
-      }
-
-      const selected = await p.select({
-        message: `${allRows.length} requests — select one to view`,
-        options,
-        maxItems: 15,
-      });
-
-      if (p.isCancel(selected)) {
-        p.cancel("Cancelled");
-        return;
-      }
-
-      if (selected === LOAD_MORE) {
-        page++;
-        const ls = p.spinner();
-        ls.start(`Loading page ${page}`);
-        const next = await fetchPage(page);
-        ls.stop("Done");
-
-        if (next.rows.length === 0) {
-          hasMore = false;
-          p.log.warning("No more results.");
-          continue;
-        }
-
-        allRows.push(...next.rows);
-        hasMore = next.hasMore;
-        continue;
-      }
-
-      const match = allRows.find((r) => getId(r) === selected);
-      if (match) out.result(match);
-
-      const back = await p.confirm({ message: "Back to list?" });
-      if (p.isCancel(back) || !back) return;
-    }
-  },
-});
diff --git a/src/commands/markdownify.ts b/src/commands/markdownify.ts
deleted file mode 100644
index ccfc494..0000000
--- a/src/commands/markdownify.ts
+++ /dev/null
@@ -1,40 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "markdownify",
-    description: "Convert a webpage to clean markdown",
-  },
-  args: {
-    url: {
-      type: "positional",
-      description: "Website URL to convert",
-      required: true,
-    },
-    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
-    headers: { type: "string", description: "Custom headers as JSON object string" },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    out.docs("https://docs.scrapegraphai.com/services/markdownify");
-    const key = await resolveApiKey(!!args.json);
-
-    const params: scrapegraphai.MarkdownifyParams = {
-      website_url: args.url,
-    };
-
-    if (args.stealth) params.stealth = true;
-    if (args.headers) params.headers = JSON.parse(args.headers);
-
-    out.start("Converting to markdown");
-    const result = await scrapegraphai.markdownify(key, params);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/scrape.ts b/src/commands/scrape.ts
deleted file mode 100644
index b0517eb..0000000
--- a/src/commands/scrape.ts
+++ /dev/null
@@ -1,40 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "scrape",
-    description: "Get raw HTML content from a URL",
-  },
-  args: {
-    url: {
-      type: "positional",
-      description: "Website URL to scrape",
-      required: true,
-    },
-    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
-    branding: { type: "boolean", description: "Extract branding info (+2 credits)" },
-    "country-code": { type: "string", description: "ISO country code for geo-targeting" },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    out.docs("https://docs.scrapegraphai.com/services/scrape");
-    const key = await resolveApiKey(!!args.json);
-
-    const params: scrapegraphai.ScrapeParams = { website_url: args.url };
-
-    if (args.stealth) params.stealth = true;
-    if (args.branding) params.branding = true;
-    if (args["country-code"]) params.country_code = args["country-code"];
-
-    out.start("Scraping");
-    const result = await scrapegraphai.scrape(key, params);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/search-scraper.ts b/src/commands/search-scraper.ts
deleted file mode 100644
index 041e32c..0000000
--- a/src/commands/search-scraper.ts
+++ /dev/null
@@ -1,52 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "search-scraper",
-    description: "Search the web and extract data with AI",
-  },
-  args: {
-    prompt: {
-      type: "positional",
-      description: "Search query and extraction instructions",
-      required: true,
-    },
-    "num-results": {
-      type: "string",
-      description: "Number of websites to scrape (3-20, default 3)",
-    },
-    "no-extraction": {
-      type: "boolean",
-      description: "Return markdown only (2 credits/site instead of 10)",
-    },
-    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
-    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
-    headers: { type: "string", description: "Custom headers as JSON object string" },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    out.docs("https://docs.scrapegraphai.com/services/searchscraper");
-    const key = await resolveApiKey(!!args.json);
-
-    const params: scrapegraphai.SearchScraperParams = {
-      user_prompt: args.prompt,
-    };
-
-    if (args["num-results"]) params.num_results = Number(args["num-results"]);
-    if (args["no-extraction"]) params.extraction_mode = false;
-    if (args.schema) params.output_schema = JSON.parse(args.schema);
-    if (args.stealth) params.stealth = true;
-    if (args.headers) params.headers = JSON.parse(args.headers);
-
-    out.start("Searching");
-    const result = await scrapegraphai.searchScraper(key, params);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/sitemap.ts b/src/commands/sitemap.ts
deleted file mode 100644
index 2120b16..0000000
--- a/src/commands/sitemap.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "sitemap",
-    description: "Get all URLs from a website's sitemap",
-  },
-  args: {
-    url: {
-      type: "positional",
-      description: "Website URL",
-      required: true,
-    },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    out.docs("https://docs.scrapegraphai.com/services/sitemap");
-    const key = await resolveApiKey(!!args.json);
-
-    out.start("Fetching sitemap");
-    const result = await scrapegraphai.sitemap(key, { website_url: args.url });
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/smart-scraper.ts b/src/commands/smart-scraper.ts
deleted file mode 100644
index be3d2a4..0000000
--- a/src/commands/smart-scraper.ts
+++ /dev/null
@@ -1,57 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "smart-scraper",
-    description: "Extract structured data from a URL using AI",
-  },
-  args: {
-    url: {
-      type: "positional",
-      description: "Website URL to scrape",
-      required: true,
-    },
-    prompt: {
-      type: "string",
-      alias: "p",
-      description: "Extraction prompt",
-      required: true,
-    },
-    schema: { type: "string", description: "Output JSON schema (as JSON string)" },
-    scrolls: { type: "string", description: "Number of infinite scrolls (0-100)" },
-    pages: { type: "string", description: "Total pages to scrape (1-100)" },
-    stealth: { type: "boolean", description: "Bypass bot detection (+4 credits)" },
-    cookies: { type: "string", description: "Cookies as JSON object string" },
-    headers: { type: "string", description: "Custom headers as JSON object string" },
-    "plain-text": { type: "boolean", description: "Return plain text instead of JSON" },
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    out.docs("https://docs.scrapegraphai.com/services/smartscraper");
-    const key = await resolveApiKey(!!args.json);
-
-    const params: scrapegraphai.SmartScraperParams = {
-      website_url: args.url,
-      user_prompt: args.prompt,
-    };
-
-    if (args.schema) params.output_schema = JSON.parse(args.schema);
-    if (args.scrolls) params.number_of_scrolls = Number(args.scrolls);
-    if (args.pages) params.total_pages = Number(args.pages);
-    if (args.stealth) params.stealth = true;
-    if (args.cookies) params.cookies = JSON.parse(args.cookies);
-    if (args.headers) params.headers = JSON.parse(args.headers);
-    if (args["plain-text"]) params.plain_text = true;
-
-    out.start("Scraping");
-    const result = await scrapegraphai.smartScraper(key, params);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/commands/validate.ts b/src/commands/validate.ts
deleted file mode 100644
index dd2c81d..0000000
--- a/src/commands/validate.ts
+++ /dev/null
@@ -1,25 +0,0 @@
-import { defineCommand } from "citty";
-import * as scrapegraphai from "scrapegraph-js";
-import { resolveApiKey } from "../lib/folders.js";
-import * as log from "../lib/log.js";
-
-export default defineCommand({
-  meta: {
-    name: "validate",
-    description: "Validate your API key (health check)",
-  },
-  args: {
-    json: { type: "boolean", description: "Output raw JSON (pipeable)" },
-  },
-  run: async ({ args }) => {
-    const out = log.create(!!args.json);
-    const key = await resolveApiKey(!!args.json);
-
-    out.start("Checking API health");
-    const result = await scrapegraphai.checkHealth(key);
-    out.stop(result.elapsedMs);
-
-    if (result.data) out.result(result.data);
-    else out.error(result.error);
-  },
-});
diff --git a/src/lib/client.ts b/src/lib/client.ts
new file mode 100644
index 0000000..dc1b11b
--- /dev/null
+++ b/src/lib/client.ts
@@ -0,0 +1,9 @@
+import { resolveApiKey } from "./folders.js";
+
+let cached: string | null = null;
+
+export async function getApiKey(quiet = false): Promise<string> {
+  if (cached) return cached;
+  cached = await resolveApiKey(quiet);
+  return cached;
+}
diff --git a/src/lib/env.ts b/src/lib/env.ts
index 8777ab0..6384fd0 100644
--- a/src/lib/env.ts
+++ b/src/lib/env.ts
@@ -10,8 +10,13 @@ if (process.env.JUST_SCRAPE_API_URL && !process.env.SGAI_API_URL)
 if (process.env.JUST_SCRAPE_DEBUG === "1" && !process.env.SGAI_DEBUG)
   process.env.SGAI_DEBUG = "1";
 
-if (process.env.JUST_SCRAPE_TIMEOUT_S && !process.env.SGAI_TIMEOUT_S)
-  process.env.SGAI_TIMEOUT_S = process.env.JUST_SCRAPE_TIMEOUT_S;
+// Bridge legacy JUST_SCRAPE_TIMEOUT_S and SGAI_TIMEOUT_S to the new SGAI_TIMEOUT var
+// (renamed in scrapegraph-js v2; see scrapegraph-js PR #13 / commit 2eba148).
+if (process.env.JUST_SCRAPE_TIMEOUT_S && !process.env.SGAI_TIMEOUT)
+  process.env.SGAI_TIMEOUT = process.env.JUST_SCRAPE_TIMEOUT_S;
+
+if (process.env.SGAI_TIMEOUT_S && !process.env.SGAI_TIMEOUT)
+  process.env.SGAI_TIMEOUT = process.env.SGAI_TIMEOUT_S;
 
 function loadConfigFile(): Record<string, string> {
   if (!existsSync(CONFIG_PATH)) return {};
diff --git a/src/utils/banner.ts b/src/utils/banner.ts
index 66c6386..8ba47a8 100644
--- a/src/utils/banner.ts
+++ b/src/utils/banner.ts
@@ -30,8 +30,10 @@ export function showBanner() {
   console.log(text);
   console.log(chalk.hex(BANNER_COLOR)(TAGLINE));
   console.log(chalk.hex(BANNER_COLOR)(`v${getVersion()}`));
-  if (process.env.JUST_SCRAPE_API_URL) {
-    console.log(chalk.yellow(`→ Custom API: ${process.env.JUST_SCRAPE_API_URL}`));
+  if (process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL) {
+    console.log(
+      chalk.yellow(`→ Custom API: ${process.env.SGAI_API_URL || process.env.JUST_SCRAPE_API_URL}`),
+    );
   }
   console.log();
 }
diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts
index e2fab44..40ba725 100644
--- a/tests/smoke.test.ts
+++ b/tests/smoke.test.ts
@@ -1,7 +1,15 @@
 import { expect, test } from "bun:test";
-import { HISTORY_SERVICES, smartScraper } from "scrapegraph-js";
+import { scrapegraphai } from "scrapegraph-js";
 
-test("sdk exports are available", () => {
-  expect(typeof smartScraper).toBe("function");
-  expect(HISTORY_SERVICES.length).toBeGreaterThan(0);
+test("sdk v2 factory is callable and exposes expected methods", () => {
+  expect(typeof scrapegraphai).toBe("function");
+
+  const client = scrapegraphai({ apiKey: "sgai-test" });
+  expect(typeof client.scrape).toBe("function");
+  expect(typeof client.extract).toBe("function");
+  expect(typeof client.search).toBe("function");
+  expect(typeof client.credits).toBe("function");
+  expect(typeof client.history).toBe("function");
+  expect(typeof client.crawl.start).toBe("function");
+  expect(typeof client.crawl.status).toBe("function");
 });

From 30cda2ea42802992447320fdedbe74e394557ac5 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Wed, 15 Apr 2026 17:28:50 +0200
Subject: [PATCH 2/2] fix(tests): use correct ScrapeGraphAI export name

The SDK v2 exports the factory as ScrapeGraphAI (capital) and the crawl
status method is crawl.get, not crawl.status. The previous names caused
CI to fail with "Export named 'scrapegraphai' not found".
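
For reference, a minimal sketch of the corrected v2 client surface. Only
the export name and the method names are what the smoke test actually
asserts; the call signatures below are assumptions carried over from the
module-level helpers in src/commands.ts (crawl.start(apiKey, params) /
crawl.get(apiKey, id)), and the URL is illustrative:

    import { ScrapeGraphAI } from "scrapegraph-js";

    const client = ScrapeGraphAI({ apiKey: "sgai-test" });

    // crawl is a nested resource: start a job, then poll it with
    // crawl.get (not crawl.status). Parameter shapes are assumed here,
    // mirroring the v2 formats array used by the CLI's crawl command.
    const job = await client.crawl.start({
      url: "https://example.com",
      formats: [{ type: "markdown", mode: "normal" }],
    });
    const status = await client.crawl.get(job.data.id);

    // history is likewise split into list/get in v2.
    const recent = await client.history.list({ page: 1, limit: 20 });
    console.log(status.data, recent.data);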
Co-Authored-By: Claude Opus 4.6 (1M context)
---
 tests/smoke.test.ts | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tests/smoke.test.ts b/tests/smoke.test.ts
index 40ba725..46c426d 100644
--- a/tests/smoke.test.ts
+++ b/tests/smoke.test.ts
@@ -1,15 +1,16 @@
 import { expect, test } from "bun:test";
-import { scrapegraphai } from "scrapegraph-js";
+import { ScrapeGraphAI } from "scrapegraph-js";
 
 test("sdk v2 factory is callable and exposes expected methods", () => {
-  expect(typeof scrapegraphai).toBe("function");
+  expect(typeof ScrapeGraphAI).toBe("function");
 
-  const client = scrapegraphai({ apiKey: "sgai-test" });
+  const client = ScrapeGraphAI({ apiKey: "sgai-test" });
   expect(typeof client.scrape).toBe("function");
   expect(typeof client.extract).toBe("function");
   expect(typeof client.search).toBe("function");
   expect(typeof client.credits).toBe("function");
-  expect(typeof client.history).toBe("function");
+  expect(typeof client.history.list).toBe("function");
+  expect(typeof client.history.get).toBe("function");
   expect(typeof client.crawl.start).toBe("function");
-  expect(typeof client.crawl.status).toBe("function");
+  expect(typeof client.crawl.get).toBe("function");
 });