Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 23 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ await client.close();
- **Web Scraping** — Scrape any website using anti-bot detection bypass and proxy support
- **Search Engine Results** — Google, Bing, and Yandex search with batch support
- **Platform Scrapers** — Structured data collection from LinkedIn, Amazon, Instagram, TikTok, YouTube, Reddit, and more
- **Crawl API** — Crawl any URL(s) and get every output format (markdown, HTML, text) bundled per page
- **Discover API** — AI-powered web search with intent-based relevance ranking
- **Scraper Studio** — Trigger and fetch results from custom scrapers built in Bright Data's Scraper Studio
- **Browser API** — CDP WebSocket URLs for connecting Playwright, Puppeteer, or Selenium to Bright Data's cloud browsers
Expand Down Expand Up @@ -173,6 +174,28 @@ console.log(result.rowCount);

**Available platforms:** `linkedin`, `amazon`, `instagram`, `tiktok`, `youtube`, `reddit`, `facebook`, `pinterest`, `chatGPT`, `digikey`, `perplexity`

### Crawl API

Crawl one or more URLs and get every output format (markdown, HTML, text) bundled per page.

```javascript
// Sync — single round-trip
const result = await client.crawler.crawl('https://example.com');
console.log(result.data[0].markdown);

// Batch
const batch = await client.crawler.crawl([
  'https://example.com',
  'https://example.com/about',
]);
console.log(`${batch.pageCount} pages`);

// Async — trigger, poll, download
const job = await client.crawler.trigger('https://example.com');
const status = await client.crawler.status(job.snapshotId);
const downloaded = await client.crawler.download(job.snapshotId);
```

### Discover API

AI-powered web search with relevance ranking based on intent.
Expand Down Expand Up @@ -401,24 +424,6 @@ try {

**Error types:** `ValidationError`, `AuthenticationError`, `ZoneError`, `NetworkError`, `NetworkTimeoutError`, `TimeoutError`, `APIError`, `DataNotReadyError`, `FSError`

## Development

```bash
git clone https://github.com/brightdata/bright-data-sdk-js.git
cd bright-data-sdk-js
npm install
npm run build:dev
```

## Commits conventions and releases

We use [Semantic Release](https://github.com/semantic-release/semantic-release) for automated releases. Commit message conventions:
- `fix:` — triggers a **PATCH** release (`0.5.0` => `0.5.1`)
- `feat:` — triggers a **MINOR** release (`0.5.0` => `0.6.0`)
- `feat!:` or `BREAKING CHANGE:` in footer — triggers a **MAJOR** release (`0.5.0` => `1.0.0`)
- `docs:` — documentation only, no release
- `chore:` — general maintenance, no release

## Support

For any issues, contact [Bright Data support](https://brightdata.com/contact), or open an issue in this repository.
Expand Down
7 changes: 7 additions & 0 deletions src/api/crawler/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
export { CrawlerService } from './service';
export { CrawlResult } from './result';
export type { CrawlRecord, CrawlResultFields } from './result';

// CrawlJob is an alias for ScrapeJob — the snapshot-job wrapper is generic.
// Re-exported under the crawler name so porters from Python keep the same vocabulary.
export { ScrapeJob as CrawlJob } from '../scrape/job';
42 changes: 42 additions & 0 deletions src/api/crawler/result.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import { BaseResult, type BaseResultFields } from '../../models/result';

/**
 * Shape of a single record produced by a crawl.
 *
 * Every named field is optional, and the index signature lets any additional
 * keys through as `unknown`, so callers must narrow before using extra fields.
 */
export interface CrawlRecord {
// NOTE(review): presumably the URL of the crawled page — confirm against the API response.
url?: string;
// NOTE(review): presumably the page rendered as Markdown — confirm.
markdown?: string;
// NOTE(review): presumably a plain-text rendering of the page HTML — confirm.
html2text?: string;
// NOTE(review): presumably the raw page HTML — confirm.
page_html?: string;
// Any other key present on the record is preserved untyped.
[key: string]: unknown;
}

/**
 * Constructor input for `CrawlResult`: the base result fields (with
 * `CrawlRecord[]` as the data payload) plus two crawl-specific extras.
 * Both extras may be omitted or passed as `null`.
 */
export interface CrawlResultFields extends BaseResultFields<CrawlRecord[]> {
// Number of page records in the result.
pageCount?: number | null;
// Identifier of the snapshot the records were downloaded from, when there is one.
snapshotId?: string | null;
}

/**
 * Result wrapper for crawl operations.
 *
 * Adds the page count and the originating snapshot id on top of the generic
 * `BaseResult` fields; both are normalised to `null` when not supplied.
 */
export class CrawlResult extends BaseResult<CrawlRecord[]> {
  readonly pageCount: number | null;
  readonly snapshotId: string | null;

  /**
   * @param fields Base result fields plus the optional `pageCount` and
   *   `snapshotId`; a missing or `null` value is stored as `null`.
   */
  constructor(fields: CrawlResultFields) {
    super(fields);
    const { pageCount = null, snapshotId = null } = fields;
    this.pageCount = pageCount;
    this.snapshotId = snapshotId;
  }

  /**
   * Serialises the result: everything the base class emits, with
   * `pageCount` and `snapshotId` set from this instance.
   */
  override toJSON(): Record<string, unknown> {
    const serialized: Record<string, unknown> = { ...super.toJSON() };
    serialized.pageCount = this.pageCount;
    serialized.snapshotId = this.snapshotId;
    return serialized;
  }

  /**
   * Human-readable one-line summary: the base summary followed by the page
   * count (when known) and the first 12 characters of the snapshot id
   * (when present).
   */
  override toString(): string {
    const segments: string[] = [super.toString()];
    if (this.pageCount != null) {
      segments.push(` pages=${this.pageCount}`);
    }
    if (this.snapshotId) {
      segments.push(` snapshot_id=${this.snapshotId.slice(0, 12)}...`);
    }
    return `<CrawlResult ${segments.join('')}>`;
  }
}
179 changes: 179 additions & 0 deletions src/api/crawler/service.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import { API_ENDPOINT } from '../../utils/constants';
import { Transport, assertResponse } from '../../core/transport';
import { parseResponse } from '../../utils/misc';
import { getLogger } from '../../utils/logger';
import { assertSchema } from '../../schemas/utils';
import {
CrawlInputSchema,
CrawlOptionsSchema,
CrawlDownloadOptionsSchema,
type CrawlOptions,
type CrawlDownloadOptions,
} from '../../schemas/crawler';
import { SnapshotMetaResponseSchema } from '../../schemas/responses';
import { ScrapeJob } from '../scrape/job';
import { CrawlResult, type CrawlRecord } from './result';
import type { SnapshotOperations } from '../../types/datasets';

// Dataset identifier sent as the `dataset_id` query parameter on every crawl
// and trigger request issued by this module.
const DATASET_ID = 'gd_m6gjtfmeh43we6cqc';
// Platform label attached to the ScrapeJob instances this module creates.
const PLATFORM = 'crawler';

/**
 * Service exposing the crawl operations: a synchronous `crawl`, and an
 * asynchronous `trigger` / `status` / `download` flow built on snapshots.
 */
export class CrawlerService {
  private transport: Transport;
  private snapshotOps: SnapshotOperations;
  private logger = getLogger('crawler');

  /**
   * @param opts.transport Transport used to issue the crawl requests.
   * @param opts.snapshotOps Snapshot operations used for status checks and
   *   handed to the jobs this service creates.
   */
  constructor(opts: { transport: Transport; snapshotOps: SnapshotOperations }) {
    this.transport = opts.transport;
    this.snapshotOps = opts.snapshotOps;
  }

  /**
   * Crawls one or more URLs through the synchronous endpoint.
   *
   * Input validation runs before the `try` block, so whatever `assertSchema`
   * throws propagates to the caller. Any failure while sending the request or
   * reading the response is captured instead and reported as a `CrawlResult`
   * with `success: false`.
   *
   * @param urls A single URL or a list of URLs.
   * @param opts Crawl options (see `CrawlOptionsSchema`).
   * @returns The parsed page records, or a failed result carrying the error message.
   */
  async crawl(
    urls: string | string[],
    opts: CrawlOptions = {},
  ): Promise<CrawlResult> {
    const safeUrls = assertSchema(CrawlInputSchema, urls, 'crawler.crawl.urls');
    const safeOpts = assertSchema(CrawlOptionsSchema, opts, 'crawler.crawl.opts');
    const urlList = Array.isArray(safeUrls) ? safeUrls : [safeUrls];

    this.logger.info(`crawl: ${urlList.length} url(s)`);
    const triggerSentAt = new Date();

    try {
      const text = await this.postInput(
        API_ENDPOINT.SCRAPE_SYNC,
        urlList,
        safeOpts.includeErrors,
      );
      const records = parseRecords(text);
      return new CrawlResult({
        success: true,
        data: records,
        pageCount: records.length,
        triggerSentAt,
        dataFetchedAt: new Date(),
      });
    } catch (e: unknown) {
      // A thrown value is not guaranteed to be an Error; narrow it so the
      // failed result always carries a string message rather than `undefined`.
      const message = e instanceof Error ? e.message : String(e);
      return new CrawlResult({
        success: false,
        error: message,
        triggerSentAt,
        dataFetchedAt: new Date(),
      });
    }
  }

  /**
   * Starts an asynchronous crawl and returns a job handle for its snapshot.
   *
   * Unlike `crawl`, nothing is caught here: validation, transport and
   * response-parsing failures all propagate to the caller.
   *
   * @param urls A single URL or a list of URLs.
   * @param opts Crawl options (see `CrawlOptionsSchema`).
   * @returns A `ScrapeJob` bound to the snapshot id returned by the API.
   */
  async trigger(
    urls: string | string[],
    opts: CrawlOptions = {},
  ): Promise<ScrapeJob> {
    const safeUrls = assertSchema(
      CrawlInputSchema,
      urls,
      'crawler.trigger.urls',
    );
    const safeOpts = assertSchema(
      CrawlOptionsSchema,
      opts,
      'crawler.trigger.opts',
    );
    const urlList = Array.isArray(safeUrls) ? safeUrls : [safeUrls];

    this.logger.info(`trigger: ${urlList.length} url(s)`);

    const text = await this.postInput(
      API_ENDPOINT.SCRAPE_ASYNC,
      urlList,
      safeOpts.includeErrors,
    );
    const meta = parseResponse(
      text,
      SnapshotMetaResponseSchema,
      'crawler.trigger',
    );
    return new ScrapeJob(meta.snapshot_id, this.snapshotOps, {
      platform: PLATFORM,
    });
  }

  /**
   * Looks up the current status of a snapshot.
   *
   * @param snapshotId Snapshot identifier, e.g. the one carried by a triggered job.
   * @returns The `status` field reported by the snapshot operations.
   */
  async status(snapshotId: string): Promise<string> {
    const meta = await this.snapshotOps.getStatus(snapshotId);
    return meta.status;
  }

  /**
   * Resolves a snapshot into a `CrawlResult` by delegating to
   * `ScrapeJob.toResult` and re-labelling its fields.
   *
   * @param snapshotId Snapshot identifier to resolve.
   * @param opts Optional `pollInterval` / `pollTimeout`, forwarded unchanged.
   * @returns The job outcome as a `CrawlResult`; `data` falls back to an
   *   empty list when the job produced none.
   */
  async download(
    snapshotId: string,
    opts: CrawlDownloadOptions = {},
  ): Promise<CrawlResult> {
    const safeOpts = assertSchema(
      CrawlDownloadOptionsSchema,
      opts,
      'crawler.download.opts',
    );
    const job = new ScrapeJob(snapshotId, this.snapshotOps, {
      platform: PLATFORM,
    });
    const scrapeResult = await job.toResult({
      pollInterval: safeOpts.pollInterval,
      pollTimeout: safeOpts.pollTimeout,
    });

    return new CrawlResult({
      success: scrapeResult.success,
      data: (scrapeResult.data as CrawlRecord[] | null) ?? [],
      pageCount: scrapeResult.rowCount,
      snapshotId: scrapeResult.snapshotId,
      triggerSentAt: scrapeResult.triggerSentAt,
      dataFetchedAt: scrapeResult.dataFetchedAt,
      error: scrapeResult.error,
    });
  }

  /**
   * Sends the URL list to the given endpoint and returns the response text.
   * Shared by `crawl` and `trigger`, which differ only in the endpoint used.
   */
  private async postInput(
    endpoint:
      | typeof API_ENDPOINT.SCRAPE_SYNC
      | typeof API_ENDPOINT.SCRAPE_ASYNC,
    urlList: string[],
    includeErrors: boolean,
  ): Promise<string> {
    const response = await this.transport.request(endpoint, {
      method: 'POST',
      query: {
        dataset_id: DATASET_ID,
        notify: 'false',
        include_errors: includeErrors ? 'true' : 'false',
      },
      // Each URL is wrapped as its own `{ url }` input entry.
      body: JSON.stringify({ input: urlList.map((url) => ({ url })) }),
    });
    return assertResponse(response);
  }
}

/**
 * Extracts crawl records from a response body.
 *
 * The body is first decoded as one JSON document (an array of records or a
 * single record). If that fails it is treated as newline-delimited JSON:
 * each non-blank line is decoded on its own and lines that do not parse are
 * skipped. Values that are not plain objects are discarded in both modes.
 *
 * @param text Raw response text.
 * @returns The records found, or an empty list for blank input.
 */
function parseRecords(text: string): CrawlRecord[] {
  const payload = text.trim();
  if (payload.length === 0) return [];

  // Turns one decoded JSON value into zero or more records.
  const toRecords = (value: unknown): CrawlRecord[] => {
    if (Array.isArray(value)) return value.filter(isRecord);
    return isRecord(value) ? [value] : [];
  };

  try {
    return toRecords(JSON.parse(payload) as unknown);
  } catch {
    // Not a single JSON document: fall back to line-by-line decoding.
    const collected: CrawlRecord[] = [];
    for (const rawLine of payload.split('\n')) {
      const line = rawLine.trim();
      if (!line) continue;
      try {
        for (const record of toRecords(JSON.parse(line) as unknown)) {
          collected.push(record);
        }
      } catch {
        // Best effort: a malformed line contributes no records.
      }
    }
    return collected;
  }
}

/**
 * Type guard for a crawl record: accepts any non-null, non-array value
 * whose `typeof` is `'object'`.
 */
function isRecord(x: unknown): x is CrawlRecord {
  if (x === null || Array.isArray(x)) return false;
  return typeof x === 'object';
}
11 changes: 11 additions & 0 deletions src/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ import type { DiscoverJob } from './api/discover/job';
import type { DiscoverOptions } from './schemas/discover';
import { ScraperStudioService } from './api/scraperstudio/service';
import { BrowserService } from './api/browser/service';
import { CrawlerService } from './api/crawler/service';
import { SnapshotAPI } from './api/scrape/snapshot';
import { setup as setupLogger, getLogger } from './utils/logger';
import {
DEFAULT_WEB_UNLOCKER_ZONE,
Expand Down Expand Up @@ -95,6 +97,7 @@ export class bdclient {
declare datasets: DatasetsClient;
declare scraperStudio: ScraperStudioService;
declare browser: BrowserService;
declare crawler: CrawlerService;

constructor(options?: BdClientOptions) {
const opt = assertSchema(
Expand Down Expand Up @@ -190,6 +193,14 @@ export class bdclient {
port: opt.browserPort,
});
});

defineLazy(this, 'crawler', () => {
const snapshotOps = new SnapshotAPI({ transport: this.transport });
return new CrawlerService({
transport: this.transport,
snapshotOps,
});
});
}

private get scrapeAPI(): ScrapeAPI {
Expand Down
7 changes: 7 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ export { DiscoverJob } from './api/discover/job';
export type { DiscoverResultItem, DiscoverResultFields } from './api/discover/result';
export type { DiscoverPollOptions } from './api/discover/job';

// ── Crawler ──────────────────────────────────────────────────────
export { CrawlerService } from './api/crawler/service';
export { CrawlResult } from './api/crawler/result';
export { ScrapeJob as CrawlJob } from './api/scrape/job';
export type { CrawlRecord, CrawlResultFields } from './api/crawler/result';
export type { CrawlOptions, CrawlDownloadOptions } from './schemas/crawler';

// ── Scraper Studio ──────────────────────────────────────────────
export { ScraperStudioService } from './api/scraperstudio/service';
export { ScraperStudioJob } from './api/scraperstudio/job';
Expand Down
16 changes: 16 additions & 0 deletions src/schemas/crawler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import { z } from 'zod';
import { URLParamSchema } from './client';

// URL input accepted by the crawler; reuses the client-wide URL parameter schema.
// NOTE(review): callers treat the validated value as either one URL or an
// array of URLs — confirm against URLParamSchema.
export const CrawlInputSchema = URLParamSchema;

// Options shared by the crawl and trigger calls.
export const CrawlOptionsSchema = z.object({
// Forwarded as the `include_errors` query flag; enabled when omitted.
includeErrors: z.boolean().default(true),
});

// Polling controls for downloading a snapshot; both are optional and must be
// strictly positive when given.
// NOTE(review): units are not stated here — confirm against the job poller.
export const CrawlDownloadOptionsSchema = z.object({
pollInterval: z.number().positive().optional(),
pollTimeout: z.number().positive().optional(),
});

// Caller-facing option types use the schema *input* shape, so fields that
// carry a default (such as `includeErrors`) may be left out.
export type CrawlOptions = z.input<typeof CrawlOptionsSchema>;
export type CrawlDownloadOptions = z.input<typeof CrawlDownloadOptionsSchema>;
8 changes: 7 additions & 1 deletion src/schemas/shared.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import path from 'node:path';
import { z } from 'zod';

export const ZoneNameSchema = z
Expand All @@ -16,7 +17,12 @@ export const ZoneNameSchema = z
message: 'zone name cannot end with an underscore',
});

// Reduce to the final path segment before stripping reserved characters so
// path-traversal sequences (../, absolute paths, alt-separators) cannot escape
// the working directory once getAbsAndEnsureDir → path.resolve runs downstream.
// path.basename only splits on the platform's native separator, so the regex
// must still strip backslashes for POSIX hosts receiving Windows-shaped input.
export const FilenameSchema = z
.string()
.min(1)
.transform((v) => v.replace(/[<>:"\\|?*]/g, '_'));
.transform((v) => path.basename(v).replace(/[<>:"\\|?*]/g, '_'));
Loading