diff --git a/typescript/fetch-api-shoe-data/.env.example b/typescript/fetch-api-shoe-data/.env.example new file mode 100644 index 00000000..f1797331 --- /dev/null +++ b/typescript/fetch-api-shoe-data/.env.example @@ -0,0 +1,2 @@ +# Browserbase credentials - get these from https://www.browserbase.com/settings +BROWSERBASE_API_KEY=your_browserbase_api_key diff --git a/typescript/fetch-api-shoe-data/README.md b/typescript/fetch-api-shoe-data/README.md new file mode 100644 index 00000000..0065c913 --- /dev/null +++ b/typescript/fetch-api-shoe-data/README.md @@ -0,0 +1,59 @@ +# Fetch API Shoe Data + +## AT A GLANCE + +- Goal: demonstrate scraping websites that block standard HTTP requests using Browserbase's Fetch API. +- No browser session needed — the Fetch API is a lightweight HTTP request routed through Browserbase's infrastructure. +- Faster and cheaper than spinning up a full browser session for server-rendered pages. +- Shows a side-by-side comparison: standard HTTP request (403 blocked, even with Chrome headers) vs Browserbase Fetch API (200 OK with full content). + Docs → https://docs.browserbase.com/features/fetch + +## GLOSSARY + +- Fetch API: Browserbase's lightweight HTTP fetch — bypasses basic bot detection without spinning up a full browser session + Docs → https://docs.browserbase.com/features/fetch +- Server-rendered page: a page where the HTML returned by the server already contains the content (no JavaScript needed) +- Bot detection: techniques websites use to block automated requests (IP reputation, TLS fingerprinting, header analysis) + +## QUICKSTART + +1. cd typescript/fetch-api-shoe-data +2. pnpm install +3. cp .env.example .env +4. Add BROWSERBASE_API_KEY to .env (no project ID needed for Fetch API) +5. 
pnpm start + +## EXPECTED OUTPUT + +- **Step 1**: Standard HTTP request with full Chrome headers — gets blocked with 403 +- **Step 2**: Browserbase Fetch API returns 200 with full HTML, parses and displays 10 sneaker listings with name, price, and URL + +## COMMON PITFALLS + +- "Cannot find module": ensure all dependencies are installed (`pnpm install`) +- Missing credentials: verify .env contains BROWSERBASE_API_KEY +- 502 error: Browserbase Fetch API has a 1MB response limit — use a browser session for larger pages +- Empty results: the regex parser expects the current HTML structure — if the site changes their markup, update `parseSneakers()` +- JS-rendered pages: the Fetch API does not execute JavaScript — for SPAs, use a browser session instead + +## USE CASES + +- Scraping server-rendered pages that block all standard HTTP requests +- Lightweight price monitoring without the cost of a full browser session +- Quick data collection where JavaScript execution isn't needed +- Building data pipelines that need to bypass bot detection cheaply + +## NEXT STEPS + +- Swap the target URL for any server-rendered site that blocks standard requests +- Add proxy geolocation to access region-specific content +- Combine with a browser session fallback for pages that need JS (see smart-fetch-scraper template) + +## HELPFUL RESOURCES + +📚 Stagehand Docs: https://docs.stagehand.dev/v3/first-steps/introduction +🎮 Browserbase: https://www.browserbase.com +💡 Try it out: https://www.browserbase.com/playground +🔧 Templates: https://www.browserbase.com/templates +📧 Need help? 
// Fetch API Scraping - See README.md for full documentation
//
// Some websites block plain HTTP requests (curl, fetch with spoofed headers)
// but allow requests through Browserbase's Fetch API. No browser session
// needed — just a lightweight HTTP request through Browserbase's infrastructure.
//
// This template fetches StockX sneaker listings: the site answers standard HTTP
// requests with a 403 but returns full HTML through Browserbase's Fetch API.

import "dotenv/config";
import Browserbase from "@browserbasehq/sdk";

// ============= CONFIGURATION =============
const BASE_URL = "https://stockx.com";
const TARGET_URL = `${BASE_URL}/sneakers`;
const NUM_PRODUCTS = 10;
// =========================================

/** One parsed product listing. */
interface Sneaker {
  /** Human-readable name derived from the URL slug (title-cased words). */
  name: string;
  /** Price exactly as it appears in the HTML, e.g. "$1,234" or "$99.50". */
  price: string;
  /** Absolute product URL (BASE_URL + "/" + slug). */
  url: string;
}

// Anchor tags whose href is a root-relative product slug starting with one of
// the known brand prefixes. Capture group 1 is the slug without the leading "/".
const PRODUCT_LINK_PATTERN =
  /href="\/((?:air-|nike-|adidas-|jordan-|new-balance-|yeezy-|vans-|asics-|puma-|a-bathing-)[a-z0-9-]+)"[^>]*>/g;

// A dollar amount with optional thousands separators and optional cents.
const PRICE_PATTERN = /\$[\d,]+(?:\.\d{2})?/;

/** Turns a URL slug such as "nike-dunk-low" into "Nike Dunk Low". */
function slugToName(slug: string): string {
  return slug
    .split("-")
    .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
    .join(" ");
}

/**
 * Extracts sneaker listings from raw listing-page HTML.
 *
 * Each product link is paired with the first price that appears after it and
 * before the link to the next *different* product. Scoping the price search
 * this way keeps a product that has no price from being assigned the price of
 * the product that follows it. Every slug is reported at most once, even when
 * the markup links to the same product several times.
 *
 * @param html  Raw HTML of the listing page.
 * @param limit Maximum number of listings to return; values <= 0 yield [].
 * @returns Up to `limit` listings, in document order.
 */
function parseSneakers(html: string, limit: number): Sneaker[] {
  const sneakers: Sneaker[] = [];
  if (limit <= 0) return sneakers;

  const links = [...html.matchAll(PRODUCT_LINK_PATTERN)];
  const seenSlugs = new Set<string>();

  for (const [i, link] of links.entries()) {
    if (sneakers.length >= limit) break;

    const slug = link[1];
    if (!slug || seenSlugs.has(slug)) continue;

    // The price must sit between this link and the next different product;
    // repeated links to the same slug are treated as part of the same listing.
    const nextProduct = links.slice(i + 1).find((other) => other[1] !== slug);
    const searchStart = (link.index ?? 0) + link[0].length;
    const searchEnd = nextProduct?.index ?? html.length;

    const priceMatch = PRICE_PATTERN.exec(html.slice(searchStart, searchEnd));
    if (priceMatch === null) continue;

    seenSlugs.add(slug);
    sneakers.push({
      name: slugToName(slug),
      price: priceMatch[0],
      url: `${BASE_URL}/${slug}`,
    });
  }

  return sneakers;
}

/**
 * Runs the side-by-side demo:
 *   1. a plain `fetch` with Chrome-like headers (expected to be blocked), then
 *   2. the same URL through Browserbase's Fetch API, parsing and printing the
 *      sneaker listings from the returned HTML.
 *
 * @throws Error when BROWSERBASE_API_KEY is not set, or when the Browserbase
 *         request itself fails (handled by the top-level `.catch`).
 */
async function main(): Promise<void> {
  console.log(`Fetch API Scraping — ${TARGET_URL}`);
  console.log();

  // Returns 403 even with full Chrome headers.
  console.log("--- Standard HTTP request (with Chrome headers) ---");
  try {
    const response = await fetch(TARGET_URL, {
      headers: {
        "User-Agent":
          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        Accept: "text/html",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
      },
      redirect: "follow",
    });
    console.log(`Status: ${response.status}`);

    if (response.status === 403) {
      console.log("→ Blocked — even with real Chrome headers.\n");
    } else {
      console.log("→ Not a 403 — the site's blocking behavior may have changed.\n");
    }

    // Only the status matters here; discard the body so the connection is released.
    await response.body?.cancel();
  } catch (err) {
    // This step is a best-effort comparison: report the failure and carry on.
    console.log(`Failed: ${String(err)}\n`);
  }

  console.log("--- With Browserbase Fetch API ---");
  const apiKey = process.env.BROWSERBASE_API_KEY;
  if (!apiKey) {
    throw new Error(
      "BROWSERBASE_API_KEY is not set — copy .env.example to .env and add your key",
    );
  }
  const bb = new Browserbase({ apiKey });

  const result = await bb.fetchAPI.create({
    url: TARGET_URL,
    allowRedirects: true,
  });

  console.log(`Status: ${result.statusCode}`);
  console.log(`Content: ${result.content.length} chars`);
  if (result.statusCode < 200 || result.statusCode >= 300) {
    console.log("→ Non-2xx response — the content below may not be the listing page.");
  }

  const sneakers = parseSneakers(result.content, NUM_PRODUCTS);
  if (sneakers.length === 0) {
    console.log(
      "\nNo sneakers parsed — the site markup may have changed; update parseSneakers().",
    );
    return;
  }

  console.log(`\nTop ${sneakers.length} sneakers:\n`);

  for (const sneaker of sneakers) {
    console.log(`  ${sneaker.name}`);
    console.log(`  ${sneaker.price} — ${sneaker.url}\n`);
  }
}

// Entry point: print troubleshooting hints and exit non-zero on any failure.
main().catch((err: unknown) => {
  console.error("Error:", err);
  console.error("\nCommon issues:");
  console.error("  - Check .env file has BROWSERBASE_API_KEY");
  console.error("  - Browserbase Fetch API has a 1MB response limit");
  console.error(
    "  - Fetch API does not execute JavaScript — for JS-rendered pages, use a browser session",
  );
  console.error("Docs: https://docs.browserbase.com");
  process.exit(1);
});
+ }, + "dependencies": { + "@browserbasehq/sdk": "latest", + "dotenv": "^16.4.7" + }, + "devDependencies": { + "tsx": "^4.19.2", + "typescript": "^5.0.0" + } +}