From c67dcaaa50834bc3ef28766e68f0621b753adbd7 Mon Sep 17 00:00:00 2001 From: kalil0321 Date: Sat, 28 Mar 2026 15:38:07 +0100 Subject: [PATCH 1/3] Add fetch-api-shoe-data template (scrape shoe data via Fetch API) --- package.json | 3 + typescript/fetch-api-shoe-data/.env.example | 2 + typescript/fetch-api-shoe-data/README.md | 59 +++++++++++ typescript/fetch-api-shoe-data/index.ts | 104 ++++++++++++++++++++ typescript/fetch-api-shoe-data/package.json | 16 +++ 5 files changed, 184 insertions(+) create mode 100644 typescript/fetch-api-shoe-data/.env.example create mode 100644 typescript/fetch-api-shoe-data/README.md create mode 100644 typescript/fetch-api-shoe-data/index.ts create mode 100644 typescript/fetch-api-shoe-data/package.json diff --git a/package.json b/package.json index 45ed7ac4..633f322d 100644 --- a/package.json +++ b/package.json @@ -39,5 +39,8 @@ "uvx ruff check --fix", "uvx ruff format" ] + }, + "dependencies": { + "playwright": "^1.58.2" } } diff --git a/typescript/fetch-api-shoe-data/.env.example b/typescript/fetch-api-shoe-data/.env.example new file mode 100644 index 00000000..f1797331 --- /dev/null +++ b/typescript/fetch-api-shoe-data/.env.example @@ -0,0 +1,2 @@ +# Browserbase credentials - get these from https://www.browserbase.com/settings +BROWSERBASE_API_KEY=your_browserbase_api_key diff --git a/typescript/fetch-api-shoe-data/README.md b/typescript/fetch-api-shoe-data/README.md new file mode 100644 index 00000000..0065c913 --- /dev/null +++ b/typescript/fetch-api-shoe-data/README.md @@ -0,0 +1,59 @@ +# Fetch API Shoe Data + +## AT A GLANCE + +- Goal: demonstrate scraping websites that block standard HTTP requests using Browserbase's Fetch API. +- No browser session needed — the Fetch API is a lightweight HTTP request routed through Browserbase's infrastructure. +- Faster and cheaper than spinning up a full browser session for server-rendered pages. 
+- Shows a side-by-side comparison: standard HTTP request (403 blocked, even with Chrome headers) vs Browserbase Fetch API (200 OK with full content). + Docs → https://docs.browserbase.com/features/fetch + +## GLOSSARY + +- Fetch API: Browserbase's lightweight HTTP fetch — bypasses basic bot detection without spinning up a full browser session + Docs → https://docs.browserbase.com/features/fetch +- Server-rendered page: a page where the HTML returned by the server already contains the content (no JavaScript needed) +- Bot detection: techniques websites use to block automated requests (IP reputation, TLS fingerprinting, header analysis) + +## QUICKSTART + +1. cd typescript/fetch-api-shoe-data +2. pnpm install +3. cp .env.example .env +4. Add BROWSERBASE_API_KEY to .env (no project ID needed for Fetch API) +5. pnpm start + +## EXPECTED OUTPUT + +- **Step 1**: Standard HTTP request with full Chrome headers — gets blocked with 403 +- **Step 2**: Browserbase Fetch API returns 200 with full HTML, parses and displays 10 sneaker listings with name, price, and URL + +## COMMON PITFALLS + +- "Cannot find module": ensure all dependencies are installed (`pnpm install`) +- Missing credentials: verify .env contains BROWSERBASE_API_KEY +- 502 error: Browserbase Fetch API has a 1MB response limit — use a browser session for larger pages +- Empty results: the regex parser expects the current HTML structure — if the site changes their markup, update `parseSneakers()` +- JS-rendered pages: the Fetch API does not execute JavaScript — for SPAs, use a browser session instead + +## USE CASES + +- Scraping server-rendered pages that block all standard HTTP requests +- Lightweight price monitoring without the cost of a full browser session +- Quick data collection where JavaScript execution isn't needed +- Building data pipelines that need to bypass bot detection cheaply + +## NEXT STEPS + +- Swap the target URL for any server-rendered site that blocks standard requests +- Add proxy 
geolocation to access region-specific content +- Combine with a browser session fallback for pages that need JS (see smart-fetch-scraper template) + +## HELPFUL RESOURCES + +📚 Stagehand Docs: https://docs.stagehand.dev/v3/first-steps/introduction +🎮 Browserbase: https://www.browserbase.com +💡 Try it out: https://www.browserbase.com/playground +🔧 Templates: https://www.browserbase.com/templates +📧 Need help? support@browserbase.com +💬 Discord: http://stagehand.dev/discord diff --git a/typescript/fetch-api-shoe-data/index.ts b/typescript/fetch-api-shoe-data/index.ts new file mode 100644 index 00000000..7c6abcfd --- /dev/null +++ b/typescript/fetch-api-shoe-data/index.ts @@ -0,0 +1,104 @@ +// Fetch API Scraping - See README.md for full documentation +// +// Some websites block plain HTTP requests (curl, fetch with spoofed headers) +// but allow requests through Browserbase's Fetch API. No browser session +// needed — just a lightweight HTTP request through Browserbase's infrastructure. +// +// This template scrapes StockX sneaker listings, which block standard HTTP +// requests with a 403 but return full HTML through Browserbase's Fetch API. 
+ +import "dotenv/config"; +import Browserbase from "@browserbasehq/sdk"; + +// ============= CONFIGURATION ============= +const TARGET_URL = "https://stockx.com/sneakers"; +const NUM_PRODUCTS = 10; +// ========================================= + +interface Sneaker { + name: string; + price: string; + url: string; +} + +// Parse sneaker listings from StockX's server-rendered HTML +function parseSneakers(html: string, limit: number): Sneaker[] { + const sneakers: Sneaker[] = []; + + // StockX renders product links with slugs, followed by price in the card + const pattern = + /href="\/((?:air-|nike-|adidas-|jordan-|new-balance-|yeezy-|vans-|asics-|puma-|a-bathing-)[a-z0-9-]+)"[^>]*>[\s\S]*?(\$\d+)/g; + + let match; + while ((match = pattern.exec(html)) !== null && sneakers.length < limit) { + const slug = match[1]; + sneakers.push({ + name: slug + .split("-") + .map((w) => w.charAt(0).toUpperCase() + w.slice(1)) + .join(" "), + price: match[2], + url: `https://stockx.com/${slug}`, + }); + } + + return sneakers; +} + +async function main(): Promise { + console.log("Fetch API Scraping — StockX Sneakers"); + console.log(); + + // Step 1: Show that standard HTTP requests get blocked + // StockX returns 403 even with full Chrome headers. 
+ // Try: curl -s -o /dev/null -w "%{http_code}" https://stockx.com/sneakers → 403 + console.log("--- Step 1: Standard HTTP request (with Chrome headers) ---"); + try { + const response = await fetch(TARGET_URL, { + headers: { + "User-Agent": + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", + Accept: "text/html", + "Sec-Fetch-Dest": "document", + "Sec-Fetch-Mode": "navigate", + }, + redirect: "follow", + }); + console.log(`Status: ${response.status}`); + console.log("→ Blocked — even with real Chrome headers.\n"); + } catch (err) { + console.log(`Failed: ${err}\n`); + } + + // Step 2: Browserbase Fetch API — bypasses bot detection, no session needed + console.log("--- Step 2: Browserbase Fetch API ---"); + const bb = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY! }); + + const result = await bb.fetchAPI.create({ + url: TARGET_URL, + allowRedirects: true, + }); + + console.log(`Status: ${result.statusCode}`); + console.log(`Content: ${result.content.length} chars`); + + const sneakers = parseSneakers(result.content, NUM_PRODUCTS); + console.log(`\nTop ${sneakers.length} sneakers:\n`); + + for (const s of sneakers) { + console.log(` ${s.name}`); + console.log(` ${s.price} — ${s.url}\n`); + } +} + +main().catch((err) => { + console.error("Error:", err); + console.error("\nCommon issues:"); + console.error(" - Check .env file has BROWSERBASE_API_KEY"); + console.error(" - Browserbase Fetch API has a 1MB response limit"); + console.error( + " - Fetch API does not execute JavaScript — for JS-rendered pages, use a browser session", + ); + console.error("Docs: https://docs.browserbase.com"); + process.exit(1); +}); diff --git a/typescript/fetch-api-shoe-data/package.json b/typescript/fetch-api-shoe-data/package.json new file mode 100644 index 00000000..5d9f8f50 --- /dev/null +++ b/typescript/fetch-api-shoe-data/package.json @@ -0,0 +1,16 @@ +{ + "name": "fetch-api-shoe-data-template", + 
"type": "module", + "scripts": { + "build": "tsc", + "start": "tsx index.ts" + }, + "dependencies": { + "@browserbasehq/sdk": "latest", + "dotenv": "^16.4.7" + }, + "devDependencies": { + "tsx": "^4.19.2", + "typescript": "^5.0.0" + } +} From 6c54eb06bf8d3fffe1b5ea32b6d99436c1cbbd2d Mon Sep 17 00:00:00 2001 From: kalil0321 Date: Sat, 28 Mar 2026 21:58:28 +0100 Subject: [PATCH 2/3] fix:fetch-api-shoe-data --- package.json | 3 --- typescript/fetch-api-shoe-data/index.ts | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/package.json b/package.json index 633f322d..45ed7ac4 100644 --- a/package.json +++ b/package.json @@ -39,8 +39,5 @@ "uvx ruff check --fix", "uvx ruff format" ] - }, - "dependencies": { - "playwright": "^1.58.2" } } diff --git a/typescript/fetch-api-shoe-data/index.ts b/typescript/fetch-api-shoe-data/index.ts index 7c6abcfd..7837196c 100644 --- a/typescript/fetch-api-shoe-data/index.ts +++ b/typescript/fetch-api-shoe-data/index.ts @@ -27,7 +27,7 @@ function parseSneakers(html: string, limit: number): Sneaker[] { // StockX renders product links with slugs, followed by price in the card const pattern = - /href="\/((?:air-|nike-|adidas-|jordan-|new-balance-|yeezy-|vans-|asics-|puma-|a-bathing-)[a-z0-9-]+)"[^>]*>[\s\S]*?(\$\d+)/g; + /href="\/((?:air-|nike-|adidas-|jordan-|new-balance-|yeezy-|vans-|asics-|puma-|a-bathing-)[a-z0-9-]+)"[^>]*>[\s\S]*?(\$[\d,]+(?:\.\d{2})?)/g; let match; while ((match = pattern.exec(html)) !== null && sneakers.length < limit) { From b7c8254b430a23013105b2c01cc1869ff5dee635 Mon Sep 17 00:00:00 2001 From: kalil0321 Date: Sun, 29 Mar 2026 22:44:15 +0200 Subject: [PATCH 3/3] refactor: clean up fetch-api-shoe-data template comments and hardcoded URLs --- typescript/fetch-api-shoe-data/index.ts | 28 +++++++++++++------------ 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/typescript/fetch-api-shoe-data/index.ts b/typescript/fetch-api-shoe-data/index.ts index 7837196c..6cce4514 100644 --- 
a/typescript/fetch-api-shoe-data/index.ts +++ b/typescript/fetch-api-shoe-data/index.ts @@ -4,14 +4,15 @@ // but allow requests through Browserbase's Fetch API. No browser session // needed — just a lightweight HTTP request through Browserbase's infrastructure. // -// This template scrapes StockX sneaker listings, which block standard HTTP +// This template gets sneaker listings from sites that block standard HTTP // requests with a 403 but return full HTML through Browserbase's Fetch API. import "dotenv/config"; import Browserbase from "@browserbasehq/sdk"; // ============= CONFIGURATION ============= -const TARGET_URL = "https://stockx.com/sneakers"; +const BASE_URL = "https://stockx.com"; +const TARGET_URL = `${BASE_URL}/sneakers`; const NUM_PRODUCTS = 10; // ========================================= @@ -21,11 +22,10 @@ interface Sneaker { url: string; } -// Parse sneaker listings from StockX's server-rendered HTML +// Parse sneaker listings (name, price, URL) from the server-rendered HTML function parseSneakers(html: string, limit: number): Sneaker[] { const sneakers: Sneaker[] = []; - // StockX renders product links with slugs, followed by price in the card const pattern = /href="\/((?:air-|nike-|adidas-|jordan-|new-balance-|yeezy-|vans-|asics-|puma-|a-bathing-)[a-z0-9-]+)"[^>]*>[\s\S]*?(\$[\d,]+(?:\.\d{2})?)/g; @@ -38,7 +38,7 @@ function parseSneakers(html: string, limit: number): Sneaker[] { .map((w) => w.charAt(0).toUpperCase() + w.slice(1)) .join(" "), price: match[2], - url: `https://stockx.com/${slug}`, + url: `${BASE_URL}/${slug}`, }); } @@ -46,13 +46,11 @@ function parseSneakers(html: string, limit: number): Sneaker[] { } async function main(): Promise { - console.log("Fetch API Scraping — StockX Sneakers"); + console.log(`Fetch API Scraping — ${TARGET_URL}`); console.log(); - // Step 1: Show that standard HTTP requests get blocked - // StockX returns 403 even with full Chrome headers. 
- // Try: curl -s -o /dev/null -w "%{http_code}" https://stockx.com/sneakers → 403 - console.log("--- Step 1: Standard HTTP request (with Chrome headers) ---"); + // Expected to return 403 even with full Chrome headers; any other status is reported as not blocked. + console.log("--- Standard HTTP request (with Chrome headers) ---"); try { const response = await fetch(TARGET_URL, { headers: { @@ -65,13 +63,17 @@ async function main(): Promise { redirect: "follow", }); console.log(`Status: ${response.status}`); - console.log("→ Blocked — even with real Chrome headers.\n"); + + if (response.status === 403) { + console.log("→ Blocked — even with real Chrome headers.\n"); + } else { + console.log("→ Not blocked — the standard request was allowed through.\n"); + } } catch (err) { console.log(`Failed: ${err}\n`); } - // Step 2: Browserbase Fetch API — bypasses bot detection, no session needed - console.log("--- Step 2: Browserbase Fetch API ---"); + console.log("--- With Browserbase Fetch API ---"); const bb = new Browserbase({ apiKey: process.env.BROWSERBASE_API_KEY! }); const result = await bb.fetchAPI.create({