diff --git a/README.md b/README.md index 6d0022e..45bb863 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ await client.close(); - **Web Scraping** — Scrape any website using anti-bot detection bypass and proxy support - **Search Engine Results** — Google, Bing, and Yandex search with batch support - **Platform Scrapers** — Structured data collection from LinkedIn, Amazon, Instagram, TikTok, YouTube, Reddit, and more +- **Crawl API** — Crawl any URL(s) and get every output format (markdown, HTML, text) bundled per page - **Discover API** — AI-powered web search with intent-based relevance ranking - **Scraper Studio** — Trigger and fetch results from custom scrapers built in Bright Data's Scraper Studio - **Browser API** — CDP WebSocket URLs for connecting Playwright, Puppeteer, or Selenium to Bright Data's cloud browsers @@ -173,6 +174,28 @@ console.log(result.rowCount); **Available platforms:** `linkedin`, `amazon`, `instagram`, `tiktok`, `youtube`, `reddit`, `facebook`, `pinterest`, `chatGPT`, `digikey`, `perplexity` +### Crawl API + +Crawl one or more URLs and get every output format (markdown, HTML, text) bundled per page. + +```javascript +// Sync — single round-trip +const result = await client.crawler.crawl('https://example.com'); +console.log(result.data[0].markdown); + +// Batch +const batch = await client.crawler.crawl([ + 'https://example.com', + 'https://example.com/about', +]); +console.log(`${batch.pageCount} pages`); + +// Async — trigger, poll, download +const job = await client.crawler.trigger('https://example.com'); +const status = await client.crawler.status(job.snapshotId); +const downloaded = await client.crawler.download(job.snapshotId); +``` + ### Discover API AI-powered web search with relevance ranking based on intent.
@@ -401,24 +424,6 @@ try { **Error types:** `ValidationError`, `AuthenticationError`, `ZoneError`, `NetworkError`, `NetworkTimeoutError`, `TimeoutError`, `APIError`, `DataNotReadyError`, `FSError` -## Development - -```bash -git clone https://github.com/brightdata/bright-data-sdk-js.git -cd bright-data-sdk-js -npm install -npm run build:dev -``` - -## Commits conventions and releases - -We use [Semantic Release](https://github.com/semantic-release/semantic-release) for automated releases. Commit message conventions: -- `fix:` — triggers a **PATCH** release (`0.5.0` => `0.5.1`) -- `feat:` — triggers a **MINOR** release (`0.5.0` => `0.6.0`) -- `feat!:` or `BREAKING CHANGE:` in footer — triggers a **MAJOR** release (`0.5.0` => `1.0.0`) -- `docs:` — documentation only, no release -- `chore:` — general maintenance, no release - ## Support For any issues, contact [Bright Data support](https://brightdata.com/contact), or open an issue in this repository. diff --git a/src/api/crawler/index.ts b/src/api/crawler/index.ts new file mode 100644 index 0000000..e0ab8fb --- /dev/null +++ b/src/api/crawler/index.ts @@ -0,0 +1,7 @@ +export { CrawlerService } from './service'; +export { CrawlResult } from './result'; +export type { CrawlRecord, CrawlResultFields } from './result'; + +// CrawlJob is an alias for ScrapeJob — the snapshot-job wrapper is generic. +// Re-exported under the crawler name so porters from Python keep the same vocabulary. 
+export { ScrapeJob as CrawlJob } from '../scrape/job'; diff --git a/src/api/crawler/result.ts b/src/api/crawler/result.ts new file mode 100644 index 0000000..3f657b7 --- /dev/null +++ b/src/api/crawler/result.ts @@ -0,0 +1,42 @@ +import { BaseResult, type BaseResultFields } from '../../models/result'; + +export interface CrawlRecord { + url?: string; + markdown?: string; + html2text?: string; + page_html?: string; + [key: string]: unknown; +} + +export interface CrawlResultFields extends BaseResultFields { + pageCount?: number | null; + snapshotId?: string | null; +} + +export class CrawlResult extends BaseResult { + readonly pageCount: number | null; + readonly snapshotId: string | null; + + constructor(fields: CrawlResultFields) { + super(fields); + this.pageCount = fields.pageCount ?? null; + this.snapshotId = fields.snapshotId ?? null; + } + + override toJSON(): Record<string, unknown> { + return { + ...super.toJSON(), + pageCount: this.pageCount, + snapshotId: this.snapshotId, + }; + } + + override toString(): string { + const base = super.toString(); + const sid = this.snapshotId + ? ` snapshot_id=${this.snapshotId.slice(0, 12)}...` + : ''; + const pages = this.pageCount != null ?
` pages=${this.pageCount}` : ''; + return `<CrawlResult ${base}${sid}${pages}>`; + } +} diff --git a/src/api/crawler/service.ts b/src/api/crawler/service.ts new file mode 100644 index 0000000..ace783f --- /dev/null +++ b/src/api/crawler/service.ts @@ -0,0 +1,179 @@ +import { API_ENDPOINT } from '../../utils/constants'; +import { Transport, assertResponse } from '../../core/transport'; +import { parseResponse } from '../../utils/misc'; +import { getLogger } from '../../utils/logger'; +import { assertSchema } from '../../schemas/utils'; +import { + CrawlInputSchema, + CrawlOptionsSchema, + CrawlDownloadOptionsSchema, + type CrawlOptions, + type CrawlDownloadOptions, +} from '../../schemas/crawler'; +import { SnapshotMetaResponseSchema } from '../../schemas/responses'; +import { ScrapeJob } from '../scrape/job'; +import { CrawlResult, type CrawlRecord } from './result'; +import type { SnapshotOperations } from '../../types/datasets'; + +const DATASET_ID = 'gd_m6gjtfmeh43we6cqc'; +const PLATFORM = 'crawler'; + +export class CrawlerService { + private transport: Transport; + private snapshotOps: SnapshotOperations; + private logger = getLogger('crawler'); + + constructor(opts: { transport: Transport; snapshotOps: SnapshotOperations }) { + this.transport = opts.transport; + this.snapshotOps = opts.snapshotOps; + } + + async crawl( + urls: string | string[], + opts: CrawlOptions = {}, + ): Promise<CrawlResult> { + const safeUrls = assertSchema(CrawlInputSchema, urls, 'crawler.crawl.urls'); + const safeOpts = assertSchema(CrawlOptionsSchema, opts, 'crawler.crawl.opts'); + const urlList = Array.isArray(safeUrls) ? safeUrls : [safeUrls]; + + this.logger.info(`crawl: ${urlList.length} url(s)`); + const triggerSentAt = new Date(); + + try { + const response = await this.transport.request( + API_ENDPOINT.SCRAPE_SYNC, + { + method: 'POST', + query: { + dataset_id: DATASET_ID, + notify: 'false', + include_errors: safeOpts.includeErrors ?
'true' : 'false', + }, + body: JSON.stringify({ + input: urlList.map((url) => ({ url })), + }), + }, + ); + + const text = await assertResponse(response); + const records = parseRecords(text); + return new CrawlResult({ + success: true, + data: records, + pageCount: records.length, + triggerSentAt, + dataFetchedAt: new Date(), + }); + } catch (e: unknown) { + return new CrawlResult({ + success: false, + error: (e as Error).message, + triggerSentAt, + dataFetchedAt: new Date(), + }); + } + } + + async trigger( + urls: string | string[], + opts: CrawlOptions = {}, + ): Promise<ScrapeJob> { + const safeUrls = assertSchema( + CrawlInputSchema, + urls, + 'crawler.trigger.urls', + ); + const safeOpts = assertSchema( + CrawlOptionsSchema, + opts, + 'crawler.trigger.opts', + ); + const urlList = Array.isArray(safeUrls) ? safeUrls : [safeUrls]; + + this.logger.info(`trigger: ${urlList.length} url(s)`); + + const response = await this.transport.request(API_ENDPOINT.SCRAPE_ASYNC, { + method: 'POST', + query: { + dataset_id: DATASET_ID, + notify: 'false', + include_errors: safeOpts.includeErrors ?
'true' : 'false', + }, + body: JSON.stringify({ input: urlList.map((url) => ({ url })) }), + }); + + const text = await assertResponse(response); + const meta = parseResponse( + text, + SnapshotMetaResponseSchema, + 'crawler.trigger', + ); + return new ScrapeJob(meta.snapshot_id, this.snapshotOps, { + platform: PLATFORM, + }); + } + + async status(snapshotId: string): Promise<string> { + const meta = await this.snapshotOps.getStatus(snapshotId); + return meta.status; + } + + async download( + snapshotId: string, + opts: CrawlDownloadOptions = {}, + ): Promise<CrawlResult> { + const safeOpts = assertSchema( + CrawlDownloadOptionsSchema, + opts, + 'crawler.download.opts', + ); + const job = new ScrapeJob(snapshotId, this.snapshotOps, { + platform: PLATFORM, + }); + const scrapeResult = await job.toResult({ + pollInterval: safeOpts.pollInterval, + pollTimeout: safeOpts.pollTimeout, + }); + + return new CrawlResult({ + success: scrapeResult.success, + data: (scrapeResult.data as CrawlRecord[] | null) ?? [], + pageCount: scrapeResult.rowCount, + snapshotId: scrapeResult.snapshotId, + triggerSentAt: scrapeResult.triggerSentAt, + dataFetchedAt: scrapeResult.dataFetchedAt, + error: scrapeResult.error, + }); + } +} + +function parseRecords(text: string): CrawlRecord[] { + const trimmed = text.trim(); + if (!trimmed) return []; + + try { + const parsed = JSON.parse(trimmed) as unknown; + if (Array.isArray(parsed)) return parsed.filter(isRecord); + if (isRecord(parsed)) return [parsed]; + return []; + } catch { + return trimmed + .split('\n') + .map((l) => l.trim()) + .filter(Boolean) + .flatMap((line) => { + try { + const x = JSON.parse(line) as unknown; + if (Array.isArray(x)) return x.filter(isRecord); + if (isRecord(x)) return [x]; + return []; + } catch { + return []; + } + }); + } +} + +function isRecord(x: unknown): x is CrawlRecord { + return typeof x === 'object' && x !== null && !Array.isArray(x); +} diff --git a/src/client.ts b/src/client.ts index 2f57f67..1a80f16 100644 ---
a/src/client.ts +++ b/src/client.ts @@ -9,6 +9,8 @@ import type { DiscoverJob } from './api/discover/job'; import type { DiscoverOptions } from './schemas/discover'; import { ScraperStudioService } from './api/scraperstudio/service'; import { BrowserService } from './api/browser/service'; +import { CrawlerService } from './api/crawler/service'; +import { SnapshotAPI } from './api/scrape/snapshot'; import { setup as setupLogger, getLogger } from './utils/logger'; import { DEFAULT_WEB_UNLOCKER_ZONE, @@ -95,6 +97,7 @@ export class bdclient { declare datasets: DatasetsClient; declare scraperStudio: ScraperStudioService; declare browser: BrowserService; + declare crawler: CrawlerService; constructor(options?: BdClientOptions) { const opt = assertSchema( @@ -190,6 +193,14 @@ export class bdclient { port: opt.browserPort, }); }); + + defineLazy(this, 'crawler', () => { + const snapshotOps = new SnapshotAPI({ transport: this.transport }); + return new CrawlerService({ + transport: this.transport, + snapshotOps, + }); + }); } private get scrapeAPI(): ScrapeAPI { diff --git a/src/index.ts b/src/index.ts index 516cfeb..4794ad8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -28,6 +28,13 @@ export { DiscoverJob } from './api/discover/job'; export type { DiscoverResultItem, DiscoverResultFields } from './api/discover/result'; export type { DiscoverPollOptions } from './api/discover/job'; +// ── Crawler ────────────────────────────────────────────────────── +export { CrawlerService } from './api/crawler/service'; +export { CrawlResult } from './api/crawler/result'; +export { ScrapeJob as CrawlJob } from './api/scrape/job'; +export type { CrawlRecord, CrawlResultFields } from './api/crawler/result'; +export type { CrawlOptions, CrawlDownloadOptions } from './schemas/crawler'; + // ── Scraper Studio ────────────────────────────────────────────── export { ScraperStudioService } from './api/scraperstudio/service'; export { ScraperStudioJob } from './api/scraperstudio/job'; diff 
--git a/src/schemas/crawler.ts b/src/schemas/crawler.ts new file mode 100644 index 0000000..5d9fa9d --- /dev/null +++ b/src/schemas/crawler.ts @@ -0,0 +1,16 @@ +import { z } from 'zod'; +import { URLParamSchema } from './client'; + +export const CrawlInputSchema = URLParamSchema; + +export const CrawlOptionsSchema = z.object({ + includeErrors: z.boolean().default(true), +}); + +export const CrawlDownloadOptionsSchema = z.object({ + pollInterval: z.number().positive().optional(), + pollTimeout: z.number().positive().optional(), +}); + +export type CrawlOptions = z.input<typeof CrawlOptionsSchema>; +export type CrawlDownloadOptions = z.input<typeof CrawlDownloadOptionsSchema>; diff --git a/src/schemas/shared.ts b/src/schemas/shared.ts index c59e8e2..d2e2db4 100644 --- a/src/schemas/shared.ts +++ b/src/schemas/shared.ts @@ -1,3 +1,4 @@ +import path from 'node:path'; import { z } from 'zod'; export const ZoneNameSchema = z @@ -16,7 +17,12 @@ export const ZoneNameSchema = z message: 'zone name cannot end with an underscore', }); +// Reduce to the final path segment before stripping reserved characters so +// path-traversal sequences (../, absolute paths, alt-separators) cannot escape +// the working directory once getAbsAndEnsureDir → path.resolve runs downstream. +// path.basename only splits on the platform's native separator, so the regex +// must still strip backslashes for POSIX hosts receiving Windows-shaped input.
export const FilenameSchema = z .string() .min(1) - .transform((v) => v.replace(/[<>:"\\|?*]/g, '_')); + .transform((v) => path.basename(v).replace(/[<>:"\\|?*]/g, '_')); diff --git a/tests/crawler.test.ts b/tests/crawler.test.ts new file mode 100644 index 0000000..4366abc --- /dev/null +++ b/tests/crawler.test.ts @@ -0,0 +1,431 @@ +import type { Dispatcher } from 'undici'; +import { describe, it, expect, test, vi, beforeEach } from 'vitest'; +import type { Mock } from 'vitest'; +import { CrawlerService } from '../src/api/crawler/service'; +import { CrawlResult } from '../src/api/crawler/result'; +import { ScrapeJob } from '../src/api/scrape/job'; +import { Transport } from '../src/core/transport'; +import { ValidationError } from '../src/utils/errors'; +import { assertSchema } from '../src/schemas/utils'; +import { + CrawlInputSchema, + CrawlOptionsSchema, + CrawlDownloadOptionsSchema, +} from '../src/schemas/crawler'; +import type { SnapshotOperations } from '../src/types/datasets'; + +// ── Mocks ──────────────────────────────────────────────────────── + +const mockTransport = { + request: vi.fn(), + stream: vi.fn(), +} as unknown as Transport; + +function mockRequest(statusCode: number, body: string) { + vi.mocked(mockTransport.request).mockResolvedValue({ + statusCode, + headers: {}, + trailers: {}, + opaque: null, + context: {}, + body: { text: () => Promise.resolve(body) }, + } as unknown as Dispatcher.ResponseData); +} + +function createMockSnapshotOps(): SnapshotOperations { + return { + getStatus: vi.fn(), + fetch: vi.fn(), + download: vi.fn(), + cancel: vi.fn(), + }; +} + +// ── Schemas ────────────────────────────────────────────────────── + +describe('CrawlInputSchema', () => { + it('accepts a URL string', () => { + expect(assertSchema(CrawlInputSchema, 'https://example.com')).toBe( + 'https://example.com', + ); + }); + + it('accepts a non-empty list of URLs', () => { + expect( + assertSchema(CrawlInputSchema, [ + 'https://example.com', + 
'https://example.com/about', + ]), + ).toEqual(['https://example.com', 'https://example.com/about']); + }); + + it('rejects an empty list', () => { + expect(() => assertSchema(CrawlInputSchema, [])).toThrow( + ValidationError, + ); + }); + + it('rejects a non-URL string', () => { + expect(() => assertSchema(CrawlInputSchema, 'not-a-url')).toThrow( + ValidationError, + ); + }); +}); + +describe('CrawlOptionsSchema', () => { + it('defaults includeErrors to true', () => { + expect(assertSchema(CrawlOptionsSchema, {})).toEqual({ + includeErrors: true, + }); + }); + + it('accepts explicit includeErrors: false', () => { + expect( + assertSchema(CrawlOptionsSchema, { includeErrors: false }), + ).toEqual({ includeErrors: false }); + }); + + it('rejects non-boolean includeErrors', () => { + expect(() => + assertSchema(CrawlOptionsSchema, { + includeErrors: 'yes' as unknown as boolean, + }), + ).toThrow(ValidationError); + }); +}); + +describe('CrawlDownloadOptionsSchema', () => { + it('accepts positive numbers', () => { + expect( + assertSchema(CrawlDownloadOptionsSchema, { + pollInterval: 100, + pollTimeout: 5000, + }), + ).toEqual({ pollInterval: 100, pollTimeout: 5000 }); + }); + + it('accepts empty options', () => { + expect(assertSchema(CrawlDownloadOptionsSchema, {})).toEqual({}); + }); + + it('rejects zero pollInterval', () => { + expect(() => + assertSchema(CrawlDownloadOptionsSchema, { pollInterval: 0 }), + ).toThrow(ValidationError); + }); + + it('rejects negative pollTimeout', () => { + expect(() => + assertSchema(CrawlDownloadOptionsSchema, { pollTimeout: -1 }), + ).toThrow(ValidationError); + }); +}); + +// ── CrawlResult class ──────────────────────────────────────────── + +describe('CrawlResult', () => { + it('toJSON includes pageCount and snapshotId', () => { + const r = new CrawlResult({ + success: true, + data: [{ url: 'https://example.com' }], + pageCount: 1, + snapshotId: 's_abc', + }); + const json = r.toJSON(); + expect(json.pageCount).toBe(1); + 
expect(json.snapshotId).toBe('s_abc'); + expect(json.success).toBe(true); + }); + + it('toJSON yields null fields when omitted', () => { + const r = new CrawlResult({ success: false, error: 'boom' }); + const json = r.toJSON(); + expect(json.pageCount).toBeNull(); + expect(json.snapshotId).toBeNull(); + expect(json.error).toBe('boom'); + }); + + it('toString includes pages and truncated snapshot_id', () => { + const r = new CrawlResult({ + success: true, + pageCount: 2, + snapshotId: 's_1234567890abcdef', + }); + const str = r.toString(); + expect(str).toContain('pages=2'); + expect(str).toContain('snapshot_id=s_1234567890'); + expect(str).toContain('CrawlResult'); + }); + + it('toString omits page/snapshot suffixes when null', () => { + const r = new CrawlResult({ success: false, error: 'x' }); + const str = r.toString(); + expect(str).not.toContain('pages='); + expect(str).not.toContain('snapshot_id='); + }); + + it('elapsedMs works when both timestamps are set', () => { + const t0 = new Date(1_000_000); + const t1 = new Date(1_001_500); + const r = new CrawlResult({ + success: true, + triggerSentAt: t0, + dataFetchedAt: t1, + }); + expect(r.elapsedMs()).toBe(1500); + }); +}); + +// ── crawl() — sync inline ──────────────────────────────────────── + +describe('CrawlerService.crawl', () => { + let service: CrawlerService; + let snapshotOps: SnapshotOperations; + + beforeEach(() => { + vi.clearAllMocks(); + snapshotOps = createMockSnapshotOps(); + service = new CrawlerService({ + transport: mockTransport, + snapshotOps, + }); + }); + + test('single URL — wraps single record into CrawlResult', async () => { + mockRequest( + 200, + JSON.stringify({ url: 'https://example.com', markdown: '# Hi' }), + ); + + const r = await service.crawl('https://example.com'); + expect(r).toBeInstanceOf(CrawlResult); + expect(r.success).toBe(true); + expect(r.pageCount).toBe(1); + expect(r.data).toHaveLength(1); + expect(r.data?.[0]?.url).toBe('https://example.com'); + 
expect(r.data?.[0]?.markdown).toBe('# Hi'); + }); + + test('multi URL — pageCount matches array length', async () => { + mockRequest( + 200, + JSON.stringify([ + { url: 'https://example.com', markdown: 'a' }, + { url: 'https://example.com/about', markdown: 'b' }, + ]), + ); + + const r = await service.crawl([ + 'https://example.com', + 'https://example.com/about', + ]); + expect(r.success).toBe(true); + expect(r.pageCount).toBe(2); + expect(r.data).toHaveLength(2); + }); + + test('HTTP 500 — wrapped into CrawlResult(success=false), no throw', async () => { + mockRequest(500, 'upstream error'); + + const r = await service.crawl('https://example.com'); + expect(r.success).toBe(false); + expect(r.error).toBeTruthy(); + expect(r.pageCount).toBeNull(); + }); + + test('network error — wrapped into CrawlResult(success=false), no throw', async () => { + vi.mocked(mockTransport.request).mockRejectedValue( + new Error('socket hang up'), + ); + + const r = await service.crawl('https://example.com'); + expect(r.success).toBe(false); + expect(r.error).toContain('socket hang up'); + }); + + test('validation error on URL — throws ValidationError', async () => { + await expect(service.crawl('not-a-url')).rejects.toThrow( + ValidationError, + ); + }); + + test('NDJSON response body — parsed correctly', async () => { + const body = [ + JSON.stringify({ url: 'https://example.com/a', markdown: 'a' }), + JSON.stringify({ url: 'https://example.com/b', markdown: 'b' }), + ].join('\n'); + mockRequest(200, body); + + const r = await service.crawl([ + 'https://example.com/a', + 'https://example.com/b', + ]); + expect(r.success).toBe(true); + expect(r.pageCount).toBe(2); + expect(r.data?.[1]?.url).toBe('https://example.com/b'); + }); + + test('sends correct dataset_id, body shape, and include_errors flag', async () => { + mockRequest(200, JSON.stringify([{ url: 'https://example.com' }])); + + await service.crawl(['https://example.com', 'https://example.com/1'], { + includeErrors: false, + 
}); + + const call = vi.mocked(mockTransport.request).mock.calls[0]; + const url = call[0]; + const opts = call[1] as Record<string, unknown>; + expect(url).toContain('/datasets/v3/scrape'); + expect(opts.method).toBe('POST'); + expect(opts.query).toEqual({ + dataset_id: 'gd_m6gjtfmeh43we6cqc', + notify: 'false', + include_errors: 'false', + }); + const body = JSON.parse(opts.body as string) as { + input: { url: string }[]; + }; + expect(body.input).toEqual([ + { url: 'https://example.com' }, + { url: 'https://example.com/1' }, + ]); + }); +}); + +// ── trigger() — async start ────────────────────────────────────── + +describe('CrawlerService.trigger', () => { + let service: CrawlerService; + let snapshotOps: SnapshotOperations; + + beforeEach(() => { + vi.clearAllMocks(); + snapshotOps = createMockSnapshotOps(); + service = new CrawlerService({ + transport: mockTransport, + snapshotOps, + }); + }); + + test('returns ScrapeJob with snapshotId and platform=crawler', async () => { + mockRequest(200, JSON.stringify({ snapshot_id: 's_xyz' })); + + const job = await service.trigger('https://example.com'); + expect(job).toBeInstanceOf(ScrapeJob); + expect(job.snapshotId).toBe('s_xyz'); + expect(job.platform).toBe('crawler'); + }); + + test('hits the trigger endpoint with the right dataset_id', async () => { + mockRequest(200, JSON.stringify({ snapshot_id: 's_xyz' })); + + await service.trigger(['https://example.com']); + + const call = vi.mocked(mockTransport.request).mock.calls[0]; + expect(call[0]).toContain('/datasets/v3/trigger'); + expect( + (call[1] as { query: Record<string, string> }).query.dataset_id, + ).toBe('gd_m6gjtfmeh43we6cqc'); + }); + + test('throws on HTTP 500', async () => { + mockRequest(500, 'upstream error'); + await expect(service.trigger('https://example.com')).rejects.toThrow(); + }); + + test('throws when response is missing snapshot_id', async () => { + mockRequest(200, JSON.stringify({})); + await expect(service.trigger('https://example.com')).rejects.toThrow(); + }); + +
test('throws ValidationError on bad URL', async () => { + await expect(service.trigger('not-a-url')).rejects.toThrow( + ValidationError, + ); + }); +}); + +// ── status() ───────────────────────────────────────────────────── + +describe('CrawlerService.status', () => { + let service: CrawlerService; + let snapshotOps: SnapshotOperations; + + beforeEach(() => { + snapshotOps = createMockSnapshotOps(); + service = new CrawlerService({ + transport: mockTransport, + snapshotOps, + }); + }); + + test('returns the upstream status string', async () => { + (snapshotOps.getStatus as Mock).mockResolvedValueOnce({ + status: 'running', + }); + const s = await service.status('s_abc'); + expect(s).toBe('running'); + expect(snapshotOps.getStatus).toHaveBeenCalledWith('s_abc'); + }); + + test('propagates upstream errors', async () => { + (snapshotOps.getStatus as Mock).mockRejectedValueOnce( + new Error('upstream'), + ); + await expect(service.status('s_abc')).rejects.toThrow('upstream'); + }); +}); + +// ── download() — poll + fetch via ScrapeJob.toResult ───────────── + +describe('CrawlerService.download', () => { + let service: CrawlerService; + let snapshotOps: SnapshotOperations; + + beforeEach(() => { + snapshotOps = createMockSnapshotOps(); + service = new CrawlerService({ + transport: mockTransport, + snapshotOps, + }); + }); + + test('returns CrawlResult(success=true) once snapshot is ready', async () => { + (snapshotOps.getStatus as Mock) + .mockResolvedValueOnce({ status: 'running' }) + .mockResolvedValueOnce({ status: 'ready' }); + (snapshotOps.fetch as Mock).mockResolvedValueOnce([ + { url: 'https://example.com', markdown: '# Hi' }, + ]); + + const r = await service.download('s_abc', { + pollInterval: 10, + pollTimeout: 5000, + }); + + expect(r).toBeInstanceOf(CrawlResult); + expect(r.success).toBe(true); + expect(r.snapshotId).toBe('s_abc'); + expect(r.pageCount).toBe(1); + expect(r.data?.[0]?.url).toBe('https://example.com'); + }); + + test('returns 
CrawlResult(success=false) on polling timeout', async () => { + (snapshotOps.getStatus as Mock).mockResolvedValue({ status: 'running' }); + + const r = await service.download('s_abc', { + pollInterval: 20, + pollTimeout: 60, + }); + + expect(r.success).toBe(false); + expect(r.error).toBeTruthy(); + }); + + test('rejects negative pollInterval before doing any HTTP', async () => { + await expect( + service.download('s_abc', { pollInterval: -1 }), + ).rejects.toThrow(ValidationError); + expect(snapshotOps.getStatus).not.toHaveBeenCalled(); + }); +}); diff --git a/tests/files.test.ts b/tests/files.test.ts new file mode 100644 index 0000000..40022ea --- /dev/null +++ b/tests/files.test.ts @@ -0,0 +1,136 @@ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import path from 'node:path'; +import fs from 'node:fs/promises'; +import os from 'node:os'; +import { assertSchema } from '../src/schemas/utils'; +import { FilenameSchema } from '../src/schemas/shared'; +import { ValidationError } from '../src/utils/errors'; +import { bdclient } from '../src/client'; + +describe('FilenameSchema — path traversal protection (CWE-22)', () => { + it('strips parent-directory traversal sequences', () => { + expect(assertSchema(FilenameSchema, '../../../etc/passwd')).toBe('passwd'); + expect(assertSchema(FilenameSchema, '../etc/passwd')).toBe('passwd'); + expect( + assertSchema( + FilenameSchema, + '../../../../../../../../../../tmp/pwned.txt', + ), + ).toBe('pwned.txt'); + }); + + it('reduces POSIX absolute paths to the basename', () => { + expect(assertSchema(FilenameSchema, '/tmp/pwned.txt')).toBe('pwned.txt'); + expect(assertSchema(FilenameSchema, '/etc/passwd')).toBe('passwd'); + expect(assertSchema(FilenameSchema, '/a/b/c/d/e/f.json')).toBe('f.json'); + }); + + it('reduces nested relative paths to the basename', () => { + expect(assertSchema(FilenameSchema, 'output/data.json')).toBe('data.json'); + expect(assertSchema(FilenameSchema, 
'a/b/c/file.txt')).toBe('file.txt'); + }); + + it('strips Windows-style separators on POSIX hosts (regex fallback)', () => { + // path.basename on POSIX does not split on '\\', so the regex must. + const result = assertSchema( + FilenameSchema, + '..\\..\\Windows\\System32\\drivers\\etc\\hosts', + ); + expect(result).not.toContain('\\'); + expect(result).not.toContain('/'); + }); + + it('preserves legitimate basenames untouched', () => { + expect(assertSchema(FilenameSchema, 'output.json')).toBe('output.json'); + expect(assertSchema(FilenameSchema, 'my-data_2026.txt')).toBe( + 'my-data_2026.txt', + ); + expect(assertSchema(FilenameSchema, 'snapshot.csv')).toBe('snapshot.csv'); + }); + + it('still strips Windows-reserved characters', () => { + expect(assertSchema(FilenameSchema, 'a<b>c:d"e|f?g*h.txt')).toBe( + 'a_b_c_d_e_f_g_h.txt', + ); + }); + + it('rejects empty input', () => { + expect(() => assertSchema(FilenameSchema, '')).toThrow(ValidationError); + }); +}); + +describe('saveResults — path traversal protection', () => { + let client: bdclient; + let originalCwd: string; + let tmpDir: string; + + beforeEach(async () => { + originalCwd = process.cwd(); + // realpath: on macOS, os.tmpdir() returns /var/folders/... but + // path.resolve sees through the /private/var symlink, so canonicalize + // here so equality checks against resolved paths work cross-platform.
+ tmpDir = await fs.realpath( + await fs.mkdtemp(path.join(os.tmpdir(), 'brd-sdk-test-')), + ); + process.chdir(tmpDir); + client = new bdclient({ apiKey: 'test-key-1234567890' }); + }); + + afterEach(async () => { + await client.close(); + process.chdir(originalCwd); + await fs.rm(tmpDir, { recursive: true, force: true }); + }); + + it('keeps a ../../../tmp/ payload inside the working directory', async () => { + const sentinel = `brd-pwn-test-${Date.now()}.txt`; + const malicious = `../../../../../../../../../../tmp/${sentinel}`; + const escapeTarget = path.join('/tmp', sentinel); + + // Ensure no pre-existing sentinel from a previous run. + await fs.unlink(escapeTarget).catch(() => {}); + + const saved = await client.saveResults('payload', { + filename: malicious, + format: 'txt', + }); + + // The escape target must not have been created. + await expect(fs.stat(escapeTarget)).rejects.toThrow(); + + // The actual write landed inside our isolated tmpDir. + expect(saved.startsWith(tmpDir + path.sep)).toBe(true); + expect(path.basename(saved)).toBe(sentinel); + + const content = await fs.readFile(saved, 'utf8'); + expect(content).toBe('payload'); + }); + + it('reduces absolute /tmp paths to a basename in the working directory', async () => { + const sentinel = `absolute-pwn-${Date.now()}.txt`; + const malicious = `/tmp/${sentinel}`; + const escapeTarget = `/tmp/${sentinel}`; + + await fs.unlink(escapeTarget).catch(() => {}); + + const saved = await client.saveResults('payload', { + filename: malicious, + format: 'txt', + }); + + await expect(fs.stat(escapeTarget)).rejects.toThrow(); + expect(saved.startsWith(tmpDir + path.sep)).toBe(true); + expect(path.basename(saved)).toBe(sentinel); + }); + + it('writes legitimate basenames at the expected location', async () => { + const name = `output-${Date.now()}.txt`; + const saved = await client.saveResults('hello', { + filename: name, + format: 'txt', + }); + + expect(saved).toBe(path.join(tmpDir, name)); + expect(await 
fs.readFile(saved, 'utf8')).toBe('hello'); + }); +}); diff --git a/tests/integration/crawler.test.ts b/tests/integration/crawler.test.ts new file mode 100644 index 0000000..6284f21 --- /dev/null +++ b/tests/integration/crawler.test.ts @@ -0,0 +1,66 @@ +import 'dotenv/config'; +import { describe, test, expect, beforeAll, afterAll } from 'vitest'; +import { bdclient } from '../../src/index'; +import { CrawlResult } from '../../src/api/crawler/result'; +import { ScrapeJob } from '../../src/api/scrape/job'; + +const API_KEY = process.env.BRIGHTDATA_API_TOKEN; + +describe.skipIf(!API_KEY)('Crawler (real API)', () => { + let client: bdclient; + + beforeAll(() => { + client = new bdclient({ + apiKey: API_KEY, + autoCreateZones: false, + }); + }); + + afterAll(async () => { + await client?.close(); + }); + + test('crawl single URL — returns CrawlResult with one record containing url and markdown', async () => { + const result = await client.crawler.crawl('https://example.com'); + + expect(result).toBeInstanceOf(CrawlResult); + expect(result.success).toBe(true); + expect(result.error).toBeNull(); + expect(result.pageCount).toBe(1); + expect(result.data).toHaveLength(1); + + const record = result.data?.[0]; + expect(record?.url).toContain('example.com'); + expect(typeof record?.markdown).toBe('string'); + }, 60_000); + + test('crawl batch — pageCount matches input length', async () => { + const urls = ['https://example.com', 'https://example.com/about']; + const result = await client.crawler.crawl(urls); + + expect(result.success).toBe(true); + expect(result.pageCount).toBe(urls.length); + expect(result.data).toHaveLength(urls.length); + }, 120_000); + + test('trigger + status + download round-trip', async () => { + const job = await client.crawler.trigger('https://example.com'); + expect(job).toBeInstanceOf(ScrapeJob); + expect(job.snapshotId).toBeTruthy(); + expect(job.platform).toBe('crawler'); + + const status = await client.crawler.status(job.snapshotId); + 
expect(typeof status).toBe('string'); + + const result = await client.crawler.download(job.snapshotId, { + pollInterval: 5_000, + pollTimeout: 480_000, + }); + + expect(result).toBeInstanceOf(CrawlResult); + expect(result.success).toBe(true); + expect(result.snapshotId).toBe(job.snapshotId); + expect(result.pageCount).toBeGreaterThan(0); + expect(result.data?.[0]?.url).toContain('example.com'); + }, 540_000); +});