diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index f6cfe77bd8c1..27b3895c5c74 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -155,6 +155,7 @@ export abstract class BrowserPlugin< browserPerProxy = this.browserPerProxy, ignoreProxyCertificate = this.ignoreProxyCertificate, proxyTier, + isRemote, } = options; return new LaunchContext({ @@ -167,6 +168,7 @@ export abstract class BrowserPlugin< browserPerProxy, ignoreProxyCertificate, proxyTier, + isRemote, }); } @@ -190,15 +192,23 @@ export abstract class BrowserPlugin< NewPageResult > = this.createLaunchContext(), ): Promise { + // launchOptions is only used by the local launch path below — remote connections ignore it. launchContext.launchOptions ??= {} as LibraryOptions; const { proxyUrl, launchOptions } = launchContext; - if (proxyUrl) { + if (proxyUrl && launchContext.isRemote) { + this.log.warning( + 'proxyUrl is set but will be ignored for remote browser connections. ' + + 'Configure proxy settings on the remote browser service instead.', + ); + } + + if (proxyUrl && !launchContext.isRemote) { await this._addProxyToLaunchOptions(launchContext); } - if (this._isChromiumBasedBrowser(launchContext)) { + if (!launchContext.isRemote && this._isChromiumBasedBrowser(launchContext)) { // This will set the args for chromium based browsers to hide the webdriver. 
(launchOptions as Dictionary).args = this._mergeArgsToHideWebdriver(launchOptions!.args); // When User-Agent is not set, and we're using Chromium in headless mode, @@ -210,6 +220,10 @@ export abstract class BrowserPlugin< } } + if (launchContext.isRemote) { + this.log.info('Connecting to remote browser (skipping local proxy and webdriver stealth configuration).'); + } + return this._launch(launchContext); } diff --git a/packages/browser-pool/src/launch-context.ts b/packages/browser-pool/src/launch-context.ts index 9ae847634b51..47029904a8f9 100644 --- a/packages/browser-pool/src/launch-context.ts +++ b/packages/browser-pool/src/launch-context.ts @@ -57,6 +57,12 @@ export interface LaunchContextOptions< * This is useful when using HTTPS proxies with self-signed certificates. */ ignoreProxyCertificate?: boolean; + /** + * Whether this launch context represents a connection to a remote browser + * rather than a locally launched one. + * @default false + */ + isRemote?: boolean; } export class LaunchContext< @@ -73,6 +79,7 @@ export class LaunchContext< browserPerProxy?: boolean; userDataDir: string; proxyTier?: number; + readonly isRemote: boolean; ignoreProxyCertificate?: boolean; private _proxyUrl?: string; @@ -92,6 +99,7 @@ export class LaunchContext< userDataDir = '', proxyTier, ignoreProxyCertificate, + isRemote, } = options; this.id = id; @@ -102,6 +110,7 @@ export class LaunchContext< this.userDataDir = userDataDir; this.proxyTier = proxyTier; this.ignoreProxyCertificate = ignoreProxyCertificate ?? false; + this.isRemote = isRemote ?? 
false; this._proxyUrl = proxyUrl; } diff --git a/packages/browser-pool/src/playwright/playwright-plugin.ts b/packages/browser-pool/src/playwright/playwright-plugin.ts index a48cf1fedfec..d23e20f7a120 100644 --- a/packages/browser-pool/src/playwright/playwright-plugin.ts +++ b/packages/browser-pool/src/playwright/playwright-plugin.ts @@ -1,8 +1,13 @@ import fs from 'node:fs'; -import type { Browser as PlaywrightBrowser, BrowserType } from 'playwright'; - -import { BrowserPlugin } from '../abstract-classes/browser-plugin.js'; +import type { Browser as PlaywrightBrowser, BrowserType, ConnectOverCDPOptions, ConnectOptions } from 'playwright'; + +import { + BrowserLaunchError, + BrowserPlugin, + type BrowserPluginOptions, + type CreateLaunchContextOptions, +} from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { createProxyServerForContainers } from '../container-proxy-server.js'; import type { LaunchContext } from '../launch-context.js'; @@ -11,6 +16,29 @@ import type { SafeParameters } from '../utils.js'; import { PlaywrightBrowser as PlaywrightBrowserWithPersistentContext } from './playwright-browser.js'; import { PlaywrightController } from './playwright-controller.js'; +/** + * Options for connecting to a remote browser via CDP. + * Mirrors `browserType.connectOverCDP(endpointURL, options?)`. + */ +export interface PlaywrightConnectOverCDPOptions extends ConnectOverCDPOptions { + /** The CDP endpoint URL to connect to (required). Overrides the deprecated optional `endpointURL` from Playwright. */ + endpointURL: string; +} + +/** + * Options for connecting to a remote browser via WebSocket. + * Mirrors `browserType.connect(wsEndpoint, options?)`. + */ +export interface PlaywrightConnectOptions extends ConnectOptions { + /** The WebSocket endpoint URL to connect to (required). 
*/ + wsEndpoint: string; +} + +export interface PlaywrightPluginOptions extends BrowserPluginOptions[0]> { + connectOptions?: PlaywrightConnectOptions; + connectOverCDPOptions?: PlaywrightConnectOverCDPOptions; +} + export class PlaywrightPlugin extends BrowserPlugin< BrowserType, SafeParameters[0], @@ -19,7 +47,96 @@ export class PlaywrightPlugin extends BrowserPlugin< private _browserVersion?: string; _containerProxyServer?: Awaited>; + connectOptions?: PlaywrightConnectOptions; + connectOverCDPOptions?: PlaywrightConnectOverCDPOptions; + + constructor(library: BrowserType, options: PlaywrightPluginOptions = {}) { + const { connectOptions, connectOverCDPOptions, ...baseOptions } = options; + + if (connectOptions && connectOverCDPOptions) { + throw new Error("Cannot set both 'connectOptions' and 'connectOverCDPOptions' — pick one protocol."); + } + + if (connectOverCDPOptions && !connectOverCDPOptions.endpointURL) { + throw new Error("'connectOverCDPOptions.endpointURL' must be a non-empty string."); + } + + if (connectOptions && !connectOptions.wsEndpoint) { + throw new Error("'connectOptions.wsEndpoint' must be a non-empty string."); + } + + super(library, baseOptions); + this.connectOptions = connectOptions; + this.connectOverCDPOptions = connectOverCDPOptions; + + // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. + // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". + if (this.connectOptions || this.connectOverCDPOptions) { + if (options.useIncognitoPages === undefined) { + this.useIncognitoPages = true; + this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); + } else if (options.useIncognitoPages === false) { + const message = this.connectOptions + ? 'useIncognitoPages is set to false with a remote WebSocket connection. 
' + + 'This may cause errors because browserType.connect() returns a browser with no default context.' + : 'useIncognitoPages is set to false with a remote browser connection. ' + + 'Pages will share cookies and storage on the remote browser instance.'; + this.log.warning(message); + } + } + } + + override createLaunchContext(options: CreateLaunchContextOptions = {}): LaunchContext { + return super.createLaunchContext({ + ...options, + isRemote: options.isRemote ?? !!(this.connectOptions || this.connectOverCDPOptions), + }); + } + + private _sanitizeEndpointForLog(endpoint: string): string { + try { + const url = new URL(endpoint); + if (url.username || url.password) { + url.username = '***'; + url.password = '***'; + } + return url.toString(); + } catch { + return ''; + } + } + protected async _launch(launchContext: LaunchContext): Promise { + // Remote CDP connection — skip all local launch/proxy logic + if (this.connectOverCDPOptions) { + const { endpointURL, ...options } = this.connectOverCDPOptions; + this.log.info('Connecting to remote browser via connectOverCDP.'); + try { + return await this.library.connectOverCDP(endpointURL, options); + } catch (cause) { + throw new BrowserLaunchError( + `Failed to connect to remote browser via CDP at "${this._sanitizeEndpointForLog(endpointURL)}". ` + + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + { cause }, + ); + } + } + + // Remote Playwright WebSocket connection — skip all local launch/proxy logic + if (this.connectOptions) { + const { wsEndpoint, ...options } = this.connectOptions; + this.log.info('Connecting to remote browser via connect (Playwright WebSocket).'); + try { + return await this.library.connect(wsEndpoint, options); + } catch (cause) { + throw new BrowserLaunchError( + `Failed to connect to remote browser via WebSocket at "${this._sanitizeEndpointForLog(wsEndpoint)}". 
` + + 'Check that the endpoint is reachable and the Playwright server is running.\n\u200b', + { cause }, + ); + } + } + const { launchOptions, useIncognitoPages, userDataDir, proxyUrl } = launchContext; let browser: PlaywrightBrowser; diff --git a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts index 91ea817d03a9..f325f89cc234 100644 --- a/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts +++ b/packages/browser-pool/src/puppeteer/puppeteer-plugin.ts @@ -4,7 +4,12 @@ import type { Dictionary } from '@crawlee/types'; import type Puppeteer from 'puppeteer'; import type * as PuppeteerTypes from 'puppeteer'; -import { BrowserPlugin } from '../abstract-classes/browser-plugin.js'; +import { + BrowserLaunchError, + BrowserPlugin, + type BrowserPluginOptions, + type CreateLaunchContextOptions, +} from '../abstract-classes/browser-plugin.js'; import { anonymizeProxySugar } from '../anonymize-proxy.js'; import type { LaunchContext } from '../launch-context.js'; import { noop } from '../utils.js'; @@ -13,12 +18,76 @@ import { PuppeteerController } from './puppeteer-controller.js'; const PROXY_SERVER_ARG = '--proxy-server='; +/** + * Options for connecting to a remote browser via Puppeteer. + * Flat object matching Puppeteer's `ConnectOptions`. 
+ */ +export type PuppeteerConnectOverCDPOptions = Parameters<(typeof Puppeteer)['connect']>[0]; + +export interface PuppeteerPluginOptions extends BrowserPluginOptions { + connectOverCDPOptions?: PuppeteerConnectOverCDPOptions; +} + export class PuppeteerPlugin extends BrowserPlugin< typeof Puppeteer, PuppeteerTypes.LaunchOptions, PuppeteerTypes.Browser, PuppeteerNewPageOptions > { + connectOverCDPOptions?: PuppeteerConnectOverCDPOptions; + + constructor(library: typeof Puppeteer, options: PuppeteerPluginOptions = {}) { + const { connectOverCDPOptions, ...baseOptions } = options; + + if (connectOverCDPOptions && !connectOverCDPOptions.browserWSEndpoint && !connectOverCDPOptions.browserURL) { + throw new Error("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'."); + } + + super(library, baseOptions); + this.connectOverCDPOptions = connectOverCDPOptions; + + // We check options.useIncognitoPages (not this.useIncognitoPages) because super() collapses undefined to false. + // This preserves the distinction between "not set" (undefined → default to true) and "explicitly false". + if (this.connectOverCDPOptions) { + if (options.useIncognitoPages === undefined) { + this.useIncognitoPages = true; + this.log.info('Remote browser detected — defaulting useIncognitoPages to true for session isolation.'); + } else if (options.useIncognitoPages === false) { + this.log.warning( + 'useIncognitoPages is set to false with a remote browser connection. ' + + 'Pages will share cookies and storage on the remote browser instance.', + ); + } + } + } + + override createLaunchContext( + options: CreateLaunchContextOptions< + typeof Puppeteer, + PuppeteerTypes.LaunchOptions, + PuppeteerTypes.Browser, + PuppeteerNewPageOptions + > = {}, + ): LaunchContext { + return super.createLaunchContext({ + ...options, + isRemote: options.isRemote ?? 
!!this.connectOverCDPOptions, + }); + } + + private _sanitizeEndpointForLog(endpoint: string): string { + try { + const url = new URL(endpoint); + if (url.username || url.password) { + url.username = '***'; + url.password = '***'; + } + return url.toString(); + } catch { + return ''; + } + } + protected async _launch( launchContext: LaunchContext< typeof Puppeteer, @@ -38,71 +107,82 @@ export class PuppeteerPlugin extends BrowserPlugin< // ignore } - const { - launchOptions, - userDataDir, - useIncognitoPages, - experimentalContainers, - proxyUrl, - ignoreProxyCertificate, - } = launchContext; - - if (experimentalContainers) { - throw new Error('Experimental containers are only available with Playwright'); - } + const { useIncognitoPages, proxyUrl, ignoreProxyCertificate } = launchContext; - launchOptions!.userDataDir = launchOptions!.userDataDir ?? userDataDir; + let browser: PuppeteerTypes.Browser; - if (launchOptions!.headless === false) { - if (Array.isArray(launchOptions!.args)) { - launchOptions!.args.push('--disable-site-isolation-trials'); - } else { - launchOptions!.args = ['--disable-site-isolation-trials']; + if (this.connectOverCDPOptions) { + // Remote CDP connection — skip local launch/proxy/headless logic + const endpoint = this.connectOverCDPOptions.browserWSEndpoint || this.connectOverCDPOptions.browserURL!; + this.log.info('Connecting to remote browser via connect (CDP).'); + try { + browser = await this.library.connect(this.connectOverCDPOptions); + } catch (cause) { + const safeEndpoint = this._sanitizeEndpointForLog(endpoint); + throw new BrowserLaunchError( + `Failed to connect to remote browser via CDP at "${safeEndpoint}". 
` + + 'Check that the endpoint is reachable and the browser is accepting CDP connections.\n\u200b', + { cause }, + ); } - } + } else { + const { launchOptions, userDataDir, experimentalContainers } = launchContext; - if (launchOptions!.headless === true && oldPuppeteerVersion) { - launchOptions!.headless = 'new' as any; - } - - let browser: PuppeteerTypes.Browser; - - { - const [anonymizedProxyUrl, close] = await anonymizeProxySugar(proxyUrl, undefined, undefined, { - ignoreProxyCertificate: launchContext.ignoreProxyCertificate, - }); + if (experimentalContainers) { + throw new Error('Experimental containers are only available with Playwright'); + } - if (proxyUrl) { - const proxyArg = `${PROXY_SERVER_ARG}${anonymizedProxyUrl ?? proxyUrl}`; + launchOptions!.userDataDir = launchOptions!.userDataDir ?? userDataDir; + if (launchOptions!.headless === false) { if (Array.isArray(launchOptions!.args)) { - launchOptions!.args.push(proxyArg); + launchOptions!.args.push('--disable-site-isolation-trials'); } else { - launchOptions!.args = [proxyArg]; + launchOptions!.args = ['--disable-site-isolation-trials']; } } - try { - browser = await this.library.launch(launchOptions); + if (launchOptions!.headless === true && oldPuppeteerVersion) { + launchOptions!.headless = 'new' as any; + } - if (anonymizedProxyUrl) { - browser.on('disconnected', async () => { - await close(); - }); + { + const [anonymizedProxyUrl, close] = await anonymizeProxySugar(proxyUrl, undefined, undefined, { + ignoreProxyCertificate: launchContext.ignoreProxyCertificate, + }); + + if (proxyUrl) { + const proxyArg = `${PROXY_SERVER_ARG}${anonymizedProxyUrl ?? 
proxyUrl}`; + + if (Array.isArray(launchOptions!.args)) { + launchOptions!.args.push(proxyArg); + } else { + launchOptions!.args = [proxyArg]; + } } - } catch (error: any) { - await close(); - this._throwAugmentedLaunchError( - error, - launchContext.launchOptions?.executablePath, - '`apify/actor-node-puppeteer-chrome`', - "Try installing a browser, if it's missing, by running `npx @puppeteer/browsers install chromium --path [path]` and pointing `executablePath` to the downloaded executable (https://pptr.dev/browsers-api)", - ); + try { + browser = await this.library.launch(launchOptions); + + if (anonymizedProxyUrl) { + browser.on('disconnected', async () => { + await close(); + }); + } + } catch (error: any) { + await close(); + + this._throwAugmentedLaunchError( + error, + launchContext.launchOptions?.executablePath, + '`apify/actor-node-puppeteer-chrome`', + "Try installing a browser, if it's missing, by running `npx @puppeteer/browsers install chromium --path [path]` and pointing `executablePath` to the downloaded executable (https://pptr.dev/browsers-api)", + ); + } } } - browser.on('targetcreated', async (target: PuppeteerTypes.Target) => { + const targetCreatedHandler = async (target: PuppeteerTypes.Target) => { try { const page = await target.page(); @@ -115,7 +195,16 @@ export class PuppeteerPlugin extends BrowserPlugin< } catch (error: any) { this.log.exception(error, 'Failed to retrieve page from target.'); } - }); + }; + + browser.on('targetcreated', targetCreatedHandler); + + // Clean up the listener when a remote browser disconnects to prevent leaks + if (this.connectOverCDPOptions) { + browser.once('disconnected', () => { + browser.off('targetcreated', targetCreatedHandler); + }); + } const boundMethods = ( [ @@ -142,25 +231,29 @@ export class PuppeteerPlugin extends BrowserPlugin< let page: PuppeteerTypes.Page; if (useIncognitoPages) { - const [anonymizedProxyUrl, close] = await anonymizeProxySugar( - proxyUrl, - undefined, - undefined, - { 
ignoreProxyCertificate }, - ); + // Skip proxy setup for remote connections — proxy is managed by the remote service. + const effectiveProxyUrl = this.connectOverCDPOptions ? undefined : proxyUrl; + const [anonymizedProxyUrl, close] = effectiveProxyUrl + ? await anonymizeProxySugar(effectiveProxyUrl, undefined, undefined, { + ignoreProxyCertificate, + }) + : ([undefined, noop] as const); try { - const context = (await (browser as any)[method]({ - proxyServer: anonymizedProxyUrl ?? proxyUrl, - })) as PuppeteerTypes.BrowserContext; + const proxyServer = anonymizedProxyUrl ?? effectiveProxyUrl; + const contextOptions = proxyServer ? { proxyServer } : {}; + const context = (await (browser as any)[method]( + contextOptions, + )) as PuppeteerTypes.BrowserContext; page = await context.newPage(...args); - if (anonymizedProxyUrl) { - page.on('close', async () => { + page.once('close', async () => { + if (anonymizedProxyUrl) { await close(); - }); - } + } + await context.close().catch(noop); + }); } catch (error) { await close(); diff --git a/packages/browser-pool/test/remote-browser.test.ts b/packages/browser-pool/test/remote-browser.test.ts new file mode 100644 index 000000000000..c9ce63ba8d2d --- /dev/null +++ b/packages/browser-pool/test/remote-browser.test.ts @@ -0,0 +1,665 @@ +import { vi } from 'vitest'; + +import { serviceLocator } from '@crawlee/core'; +import type { CrawleeLogger } from '@crawlee/core'; + +import { PlaywrightPlugin } from '../src/playwright/playwright-plugin.js'; +import { PuppeteerPlugin } from '../src/puppeteer/puppeteer-plugin.js'; + +// --------------------------------------------------------------------------- +// Shared mock helpers +// --------------------------------------------------------------------------- + +function createMockPage() { + return { + close: vi.fn().mockResolvedValue(undefined), + url: vi.fn(() => 'about:blank'), + on: vi.fn(), + once: vi.fn(), + }; +} + +function createMockBrowserContext() { + const page = 
createMockPage(); + return { + newPage: vi.fn().mockResolvedValue(page), + close: vi.fn().mockResolvedValue(undefined), + on: vi.fn(), + once: vi.fn(), + _mockPage: page, + }; +} + +function createMockBrowser() { + const mockContext = createMockBrowserContext(); + return { + newPage: vi.fn().mockResolvedValue(createMockPage()), + close: vi.fn().mockResolvedValue(undefined), + contexts: vi.fn(() => []), + on: vi.fn(), + off: vi.fn(), + once: vi.fn(), + version: vi.fn(() => '120.0.0'), + pages: vi.fn(() => []), + process: vi.fn(() => null), + userAgent: vi.fn().mockResolvedValue('mock-ua'), + createBrowserContext: vi.fn().mockResolvedValue(mockContext), + createIncognitoBrowserContext: vi.fn().mockResolvedValue(mockContext), + _mockContext: mockContext, + }; +} + +function createMockPlaywrightLibrary(browser = createMockBrowser()) { + const mockContext = { + ...browser, + once: vi.fn(), + on: vi.fn(), + }; + return { + launch: vi.fn().mockResolvedValue(browser), + connect: vi.fn().mockResolvedValue(browser), + connectOverCDP: vi.fn().mockResolvedValue(browser), + name: vi.fn(() => 'chromium'), + launchPersistentContext: vi.fn().mockResolvedValue(mockContext), + }; +} + +function createMockPuppeteerLibrary(browser = createMockBrowser()) { + return { + launch: vi.fn().mockResolvedValue(browser), + connect: vi.fn().mockResolvedValue(browser), + product: 'chrome', + }; +} + +function createMockLogger(): CrawleeLogger & { warning: ReturnType; info: ReturnType } { + const mockLogger: any = { + getOptions: vi.fn(() => ({})), + setOptions: vi.fn(), + child: vi.fn(() => mockLogger), + error: vi.fn(), + exception: vi.fn(), + softFail: vi.fn(), + warning: vi.fn(), + warningOnce: vi.fn(), + info: vi.fn(), + debug: vi.fn(), + perf: vi.fn(), + deprecated: vi.fn(), + log: vi.fn(), + setLevel: vi.fn(), + getLevel: vi.fn(), + }; + return mockLogger; +} + +// --------------------------------------------------------------------------- +// Tests +// 
--------------------------------------------------------------------------- + +describe('Remote browser — PlaywrightPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + // --- Connection routing --------------------------------------------------- + + describe('connection routing', () => { + test('connectOverCDPOptions → calls connectOverCDP, not launch', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connectOverCDP).toHaveBeenCalledTimes(1); + expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', {}); + expect(lib.launch).not.toHaveBeenCalled(); + expect(lib.connect).not.toHaveBeenCalled(); + }); + + test('connectOptions → calls connect, not launch', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledTimes(1); + expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', {}); + expect(lib.launch).not.toHaveBeenCalled(); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); + }); + + test('no connect options → calls launch', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.launch).toHaveBeenCalledTimes(1); + expect(lib.connect).not.toHaveBeenCalled(); + expect(lib.connectOverCDP).not.toHaveBeenCalled(); + }); + + test('passes extra options through to connectOverCDP', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new 
PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { + endpointURL: 'http://remote:9222', + timeout: 5000, + headers: { 'x-token': 'abc' }, + }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connectOverCDP).toHaveBeenCalledWith('http://remote:9222', { + timeout: 5000, + headers: { 'x-token': 'abc' }, + }); + }); + + test('passes extra options through to connect', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOptions: { + wsEndpoint: 'ws://remote:3000', + timeout: 3000, + headers: { Authorization: 'Bearer xyz' }, + }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith('ws://remote:3000', { + timeout: 3000, + headers: { Authorization: 'Bearer xyz' }, + }); + }); + }); + + // --- Validation ----------------------------------------------------------- + + describe('validation', () => { + test('throws when both connectOptions and connectOverCDPOptions are set', () => { + const lib = createMockPlaywrightLibrary(); + + expect( + () => + new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }), + ).toThrow("Cannot set both 'connectOptions' and 'connectOverCDPOptions'"); + }); + + test('throws when connectOverCDPOptions has no endpointURL', () => { + const lib = createMockPlaywrightLibrary(); + + expect( + () => + new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: '' }, + }), + ).toThrow("'connectOverCDPOptions.endpointURL' must be a non-empty string"); + }); + + test('throws when connectOptions has no wsEndpoint', () => { + const lib = createMockPlaywrightLibrary(); + + expect( + () => + new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: '' }, + }), + ).toThrow("'connectOptions.wsEndpoint' must be a non-empty string"); + }); + }); + + // 
--- isRemote correctness ------------------------------------------------- + + describe('isRemote', () => { + test('true when connectOverCDPOptions is present', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('true when connectOptions is present', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('false when no connect options', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(false); + }); + }); + + // --- Proxy/webdriver skipping --------------------------------------------- + + describe('proxy/webdriver skipping for remote', () => { + test('proxy is not applied for remote connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + // The browser was connected via CDP, not launched — proxy is not set on launchOptions + expect(lib.connectOverCDP).toHaveBeenCalledTimes(1); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('webdriver hiding args are not added for remote connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = plugin.createLaunchContext(); + 
await plugin.launch(ctx); + + // The original args should be untouched — no webdriver stealth flag injected + expect(ctx.launchOptions?.args).toEqual(['--custom-flag']); + expect(ctx.launchOptions?.args).not.toContain('--disable-blink-features=AutomationControlled'); + }); + + test('webdriver hiding args ARE added for local chromium connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(ctx.launchOptions?.args).toContain('--disable-blink-features=AutomationControlled'); + expect(ctx.launchOptions?.args).toContain('--custom-flag'); + }); + + test('proxy is applied for local connections', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.launch).toHaveBeenCalledTimes(1); + // Launch options should have proxy configured + const launchOpts = lib.launch.mock.calls[0][0]; + expect(launchOpts.proxy).toBeDefined(); + expect(launchOpts.proxy.server).toBeDefined(); + }); + }); + + // --- useIncognitoPages default -------------------------------------------- + + describe('useIncognitoPages default', () => { + test('defaults to true for remote (connectOverCDP)', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('defaults to true for remote (connect)', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 'ws://remote:3000' }, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('explicit false 
preserved for remote', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + useIncognitoPages: false, + }); + + expect(plugin.useIncognitoPages).toBe(false); + }); + + test('explicit true preserved for remote', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + useIncognitoPages: true, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('default false for local', () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + expect(plugin.useIncognitoPages).toBe(false); + }); + }); + + // --- Warnings ------------------------------------------------------------- + + describe('warnings', () => { + test('proxyUrl + remote → warning logged', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('proxyUrl is set but will be ignored'), + ); + }); + + test('useIncognitoPages: false + remote CDP → warning about shared state', () => { + const lib = createMockPlaywrightLibrary(); + new PlaywrightPlugin(lib as any, { + connectOverCDPOptions: { endpointURL: 'http://remote:9222' }, + useIncognitoPages: false, + }); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('Pages will share cookies and storage'), + ); + }); + + test('useIncognitoPages: false + remote WebSocket → warning about no default context', () => { + const lib = createMockPlaywrightLibrary(); + new PlaywrightPlugin(lib as any, { + connectOptions: { wsEndpoint: 
'ws://remote:3000' }, + useIncognitoPages: false, + }); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('browserType.connect() returns a browser with no default context'), + ); + }); + + test('no warnings for local browser usage', async () => { + const lib = createMockPlaywrightLibrary(); + const plugin = new PlaywrightPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).not.toHaveBeenCalled(); + }); + }); +}); + +describe('Remote browser — PuppeteerPlugin', () => { + let mockLogger: ReturnType; + + beforeEach(() => { + mockLogger = createMockLogger(); + serviceLocator.setLogger(mockLogger); + }); + + // --- Connection routing --------------------------------------------------- + + describe('connection routing', () => { + test('connectOverCDPOptions → calls connect, not launch', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledTimes(1); + expect(lib.connect).toHaveBeenCalledWith({ browserWSEndpoint: 'ws://remote:9222' }); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('no connect options → calls launch', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.launch).toHaveBeenCalledTimes(1); + expect(lib.connect).not.toHaveBeenCalled(); + }); + + test('passes all connect options through to connect', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { + browserWSEndpoint: 'ws://remote:9222', + defaultViewport: { width: 800, height: 600 }, + }, + }); + + const ctx = 
plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledWith({ + browserWSEndpoint: 'ws://remote:9222', + defaultViewport: { width: 800, height: 600 }, + }); + }); + }); + + // --- Validation ----------------------------------------------------------- + + describe('validation', () => { + test('throws when connectOverCDPOptions has no browserWSEndpoint or browserURL', () => { + const lib = createMockPuppeteerLibrary(); + + expect( + () => + new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: {} as any, + }), + ).toThrow("connectOverCDPOptions must include either 'browserWSEndpoint' or 'browserURL'"); + }); + }); + + // --- isRemote correctness ------------------------------------------------- + + describe('isRemote', () => { + test('true when connectOverCDPOptions is present', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + }); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(true); + }); + + test('false when no connect options', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + expect(ctx.isRemote).toBe(false); + }); + }); + + // --- Proxy/webdriver skipping --------------------------------------------- + + describe('proxy/webdriver skipping for remote', () => { + test('proxy is not applied for remote connections', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(lib.connect).toHaveBeenCalledTimes(1); + expect(lib.launch).not.toHaveBeenCalled(); + }); + + test('proxy is not leaked into createBrowserContext for 
remote newPage', async () => { + const browser = createMockBrowser(); + const lib = createMockPuppeteerLibrary(browser); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + const wrappedBrowser = await plugin.launch(ctx); + + // Call newPage on the wrapped browser — useIncognitoPages defaults to true for remote + await (wrappedBrowser as any).newPage(); + + // createBrowserContext should be called with empty options (no proxyServer) + expect(browser.createBrowserContext).toHaveBeenCalledTimes(1); + expect(browser.createBrowserContext).toHaveBeenCalledWith({}); + }); + + test('webdriver hiding args are not added for remote connections', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + // The original args should be untouched — no webdriver stealth flag injected + expect(ctx.launchOptions?.args).toEqual(['--custom-flag']); + expect(ctx.launchOptions?.args).not.toContain('--disable-blink-features=AutomationControlled'); + }); + + test('webdriver hiding args ARE added for local connections', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + launchOptions: { args: ['--custom-flag'] }, + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(ctx.launchOptions?.args).toContain('--disable-blink-features=AutomationControlled'); + expect(ctx.launchOptions?.args).toContain('--custom-flag'); + }); + }); + + // --- useIncognitoPages default -------------------------------------------- + + describe('useIncognitoPages default', () => { + test('defaults to true for remote', 
() => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('explicit false preserved for remote', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + useIncognitoPages: false, + }); + + expect(plugin.useIncognitoPages).toBe(false); + }); + + test('explicit true preserved for remote', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + useIncognitoPages: true, + }); + + expect(plugin.useIncognitoPages).toBe(true); + }); + + test('default false for local', () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + expect(plugin.useIncognitoPages).toBe(false); + }); + }); + + // --- Warnings ------------------------------------------------------------- + + describe('warnings', () => { + test('proxyUrl + remote → warning logged', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + proxyUrl: 'http://user:pass@proxy:8080', + }); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('proxyUrl is set but will be ignored'), + ); + }); + + test('useIncognitoPages: false + remote → warning logged', () => { + const lib = createMockPuppeteerLibrary(); + new PuppeteerPlugin(lib as any, { + connectOverCDPOptions: { browserWSEndpoint: 'ws://remote:9222' }, + useIncognitoPages: false, + }); + + expect(mockLogger.warning).toHaveBeenCalledWith( + expect.stringContaining('useIncognitoPages is set to 
false'), + ); + }); + + test('no warnings for local browser usage', async () => { + const lib = createMockPuppeteerLibrary(); + const plugin = new PuppeteerPlugin(lib as any); + + const ctx = plugin.createLaunchContext(); + await plugin.launch(ctx); + + expect(mockLogger.warning).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/packages/playwright-crawler/src/internals/playwright-launcher.ts b/packages/playwright-crawler/src/internals/playwright-launcher.ts index a8bf13c86c50..e3d2e7f44c8c 100644 --- a/packages/playwright-crawler/src/internals/playwright-launcher.ts +++ b/packages/playwright-crawler/src/internals/playwright-launcher.ts @@ -1,6 +1,8 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PlaywrightPlugin } from '@crawlee/browser-pool'; +import type { PlaywrightConnectOptions, PlaywrightConnectOverCDPOptions } from '@crawlee/browser-pool'; +import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser, BrowserType, LaunchOptions } from 'playwright'; @@ -70,6 +72,18 @@ export interface PlaywrightLaunchContext extends BrowserLaunchContext { ...BrowserLauncher.optionsShape, launcher: ow.optional.object, launchContextOptions: ow.optional.object, + connectOptions: ow.optional.object, + connectOverCDPOptions: ow.optional.object, }; /** @@ -114,6 +130,26 @@ export class PlaywrightLauncher extends BrowserLauncher { ); this.Plugin = PlaywrightPlugin; + + const connectOptionsPresent = !!(launchContext.connectOptions || launchContext.connectOverCDPOptions); + + if (connectOptionsPresent && (launchContext.useChrome || launchContext.launchOptions?.executablePath)) { + const log = serviceLocator.getLogger().child({ prefix: 'PlaywrightLauncher' }); + + if (launchContext.useChrome) { + log.warning( + 'useChrome is set but will be ignored for remote browser connections. 
' + + 'The remote service controls which browser binary is used.', + ); + } + + if (launchContext.launchOptions?.executablePath) { + log.warning( + 'executablePath is set but will be ignored for remote browser connections. ' + + 'The remote service controls which browser binary is used.', + ); + } + } } } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts index 3d46c30dd432..4113ea0d90bc 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-launcher.ts @@ -1,6 +1,8 @@ import type { BrowserLaunchContext } from '@crawlee/browser'; import { BrowserLauncher, Configuration } from '@crawlee/browser'; import { PuppeteerPlugin } from '@crawlee/browser-pool'; +import type { PuppeteerConnectOverCDPOptions } from '@crawlee/browser-pool'; +import { serviceLocator } from '@crawlee/core'; import ow from 'ow'; import type { Browser } from 'puppeteer'; @@ -65,6 +67,12 @@ export interface PuppeteerLaunchContext extends BrowserLaunchContext protected static override optionsShape = { ...BrowserLauncher.optionsShape, launcher: ow.optional.object, + connectOverCDPOptions: ow.optional.object, }; /** @@ -100,6 +109,27 @@ export class PuppeteerLauncher extends BrowserLauncher ); this.Plugin = PuppeteerPlugin; + + if ( + launchContext.connectOverCDPOptions && + (launchContext.useChrome || (launchContext.launchOptions as Record)?.executablePath) + ) { + const log = serviceLocator.getLogger().child({ prefix: 'PuppeteerLauncher' }); + + if (launchContext.useChrome) { + log.warning( + 'useChrome is set but will be ignored for remote browser connections. ' + + 'The remote service controls which browser binary is used.', + ); + } + + if ((launchContext.launchOptions as Record)?.executablePath) { + log.warning( + 'executablePath is set but will be ignored for remote browser connections. 
' + + 'The remote service controls which browser binary is used.', + ); + } + } } protected override _getDefaultHeadlessOption(): boolean { diff --git a/temp-examples/.env.example b/temp-examples/.env.example new file mode 100644 index 000000000000..500f5da5f2ce --- /dev/null +++ b/temp-examples/.env.example @@ -0,0 +1,9 @@ +BROWSERBASE_API_KEY= +BROWSERBASE_PROJECT_ID= +# +BROWSERLESS_TOKEN= +# +REBROWSER_API_KEY= +REBROWSER_PROFILE_ID= +# +STEEL_API_KEY= diff --git a/temp-examples/.gitignore b/temp-examples/.gitignore new file mode 100644 index 000000000000..4c49bd78f1d0 --- /dev/null +++ b/temp-examples/.gitignore @@ -0,0 +1 @@ +.env diff --git a/temp-examples/examples/browserbase-playwright-ws.ts b/temp-examples/examples/browserbase-playwright-ws.ts new file mode 100644 index 000000000000..656d2cf3a0f0 --- /dev/null +++ b/temp-examples/examples/browserbase-playwright-ws.ts @@ -0,0 +1,55 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +// Browserbase requires two env variables: +// - BROWSERBASE_API_KEY: Your API key for authentication +// - BROWSERBASE_PROJECT_ID: The project to create sessions in +const apiKey = process.env.BROWSERBASE_API_KEY; +const projectId = process.env.BROWSERBASE_PROJECT_ID; + +if (!apiKey) { + throw new Error('BROWSERBASE_API_KEY env variable is required'); +} + +if (!projectId) { + throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); +} + +// Step 1: Create a Browserbase session via REST API. +// This returns a session ID that we use to construct the WebSocket URL. +// You have 5 minutes to connect before the session terminates. 
+const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { + 'x-bb-api-key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ projectId }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Browserbase session: ${session.id}`); + +// Step 2: Connect to the session using Playwright's WebSocket connection. +// The WS URL is constructed from the API key and session ID. +const wsUrl = `wss://connect.browserbase.com?apiKey=${apiKey}&sessionId=${session.id}`; + +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: wsUrl, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserbase-playwright.ts b/temp-examples/examples/browserbase-playwright.ts new file mode 100644 index 000000000000..78ce8ca5569e --- /dev/null +++ b/temp-examples/examples/browserbase-playwright.ts @@ -0,0 +1,53 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +// Browserbase requires two env variables: +// - BROWSERBASE_API_KEY: Your API key for authentication +// - BROWSERBASE_PROJECT_ID: The project to create sessions in +const apiKey = process.env.BROWSERBASE_API_KEY; +const projectId = process.env.BROWSERBASE_PROJECT_ID; + +if (!apiKey) { + throw new Error('BROWSERBASE_API_KEY env variable is required'); +} + +if (!projectId) { + throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); +} + +// Step 1: Create a Browserbase session via REST API. +// This returns a connectUrl that we can use with Playwright's connectOverCDP. +// You have 5 minutes to connect before the session terminates. 
+const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { + 'x-bb-api-key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ projectId }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Browserbase session: ${session.id}`); + +// Step 2: Connect to the session using Playwright's CDP connection. +// The connectUrl from the session response is used as the CDP endpoint. +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: session.connectUrl, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserbase-puppeteer.ts b/temp-examples/examples/browserbase-puppeteer.ts new file mode 100644 index 000000000000..f6dcce121965 --- /dev/null +++ b/temp-examples/examples/browserbase-puppeteer.ts @@ -0,0 +1,53 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +// Browserbase requires two env variables: +// - BROWSERBASE_API_KEY: Your API key for authentication +// - BROWSERBASE_PROJECT_ID: The project to create sessions in +const apiKey = process.env.BROWSERBASE_API_KEY; +const projectId = process.env.BROWSERBASE_PROJECT_ID; + +if (!apiKey) { + throw new Error('BROWSERBASE_API_KEY env variable is required'); +} + +if (!projectId) { + throw new Error('BROWSERBASE_PROJECT_ID env variable is required'); +} + +// Step 1: Create a Browserbase session via REST API. +// This returns a connectUrl that we can use with Puppeteer's CDP connection. +// You have 5 minutes to connect before the session terminates. 
+const response = await fetch('https://api.browserbase.com/v1/sessions', { + method: 'POST', + headers: { + 'x-bb-api-key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ projectId }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Browserbase session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Browserbase session: ${session.id}`); + +// Step 2: Connect to the session using Puppeteer's CDP connection. +// The connectUrl from the session response is used as the browserWSEndpoint. +const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: session.connectUrl, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/browserless-playwright-ws.ts b/temp-examples/examples/browserless-playwright-ws.ts new file mode 100644 index 000000000000..ec659b59f025 --- /dev/null +++ b/temp-examples/examples/browserless-playwright-ws.ts @@ -0,0 +1,29 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN; + +if (!token) { + throw new Error('BROWSERLESS_TOKEN env variable is required'); +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: `wss://production-sfo.browserless.io/chromium/playwright?token=${token}`, + }, + }, + async requestHandler({ page, request, enqueueLinks }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + + await enqueueLinks({ + globs: ['https://www.crawlee.dev/**'], + limit: 5, + }); + }, + maxRequestsPerCrawl: 10, +}); + +await crawler.run(['https://www.crawlee.dev']); diff --git a/temp-examples/examples/browserless-playwright.ts 
b/temp-examples/examples/browserless-playwright.ts new file mode 100644 index 000000000000..ca7712c62ed0 --- /dev/null +++ b/temp-examples/examples/browserless-playwright.ts @@ -0,0 +1,29 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN; + +if (!token) { + throw new Error('BROWSERLESS_TOKEN env variable is required'); +} + +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: `wss://production-sfo.browserless.io?token=${token}`, + }, + }, + async requestHandler({ page, request, enqueueLinks }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + + await enqueueLinks({ + globs: ['https://www.crawlee.dev/**'], + limit: 5, + }); + }, + maxRequestsPerCrawl: 10, +}); + +await crawler.run(['https://www.crawlee.dev']); diff --git a/temp-examples/examples/browserless-puppeteer.ts b/temp-examples/examples/browserless-puppeteer.ts new file mode 100644 index 000000000000..c47fbe214420 --- /dev/null +++ b/temp-examples/examples/browserless-puppeteer.ts @@ -0,0 +1,29 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +const token = process.env.BROWSERLESS_TOKEN; + +if (!token) { + throw new Error('BROWSERLESS_TOKEN env variable is required'); +} + +const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: `wss://production-sfo.browserless.io?token=${token}`, + }, + }, + async requestHandler({ page, request, enqueueLinks }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + + await enqueueLinks({ + globs: ['https://www.crawlee.dev/**'], + limit: 5, + }); + }, + maxRequestsPerCrawl: 10, +}); + +await crawler.run(['https://www.crawlee.dev']); diff --git a/temp-examples/examples/rebrowser-playwright-ws.ts b/temp-examples/examples/rebrowser-playwright-ws.ts new file mode 100644 index 000000000000..31587ceca6bb --- 
/dev/null +++ b/temp-examples/examples/rebrowser-playwright-ws.ts @@ -0,0 +1,50 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.REBROWSER_API_KEY; +const profileId = process.env.REBROWSER_PROFILE_ID; + +if (!apiKey) { + throw new Error('REBROWSER_API_KEY env variable is required'); +} + +// Step 1: Start a Rebrowser run via REST API. +// This gives you a dedicated WebSocket endpoint for the session. +// You can optionally specify a profileId and proxyUrl for advanced control (only profileId is wired up below). +const startRunUrl = new URL(`https://rebrowser.net/api/startRun?apikey=${apiKey}`); + +if (profileId) { + startRunUrl.searchParams.set('profileId', profileId); + console.log(`Using Rebrowser profile: ${profileId}`); +} + +const response = await fetch(startRunUrl.toString()); + +if (!response.ok) { + throw new Error(`Failed to start Rebrowser run: ${response.status} ${response.statusText}`); +} + +const run = await response.json(); +console.log(`Started Rebrowser run with wsEndpoint: ${run.wsEndpoint}`); + +// Step 2: Connect to the run using Playwright's WebSocket connection. +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: run.wsEndpoint, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Step 3 (not implemented in this example): finish the run to stop billing. +// Rebrowser recommends an explicit finishRun call to avoid idle billing. +// The browser disconnects automatically after the crawl, but calling finishRun +// ensures the run is cleanly terminated on Rebrowser's side.
diff --git a/temp-examples/examples/rebrowser-playwright.ts b/temp-examples/examples/rebrowser-playwright.ts new file mode 100644 index 000000000000..f88783238192 --- /dev/null +++ b/temp-examples/examples/rebrowser-playwright.ts @@ -0,0 +1,31 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.REBROWSER_API_KEY; + +if (!apiKey) { + throw new Error('REBROWSER_API_KEY env variable is required'); +} + +// Rebrowser simple connection: no profile or run creation needed. +// A random profile is auto-selected when you connect with just an API key. +// Proxies are managed via the Rebrowser dashboard or WS URL params. +const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: `wss://api.rebrowser.net?apikey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. +// With Crawlee, the browser disconnects automatically after the crawl finishes, +// which should end the run. For explicit control, use the REST API finishRun endpoint. diff --git a/temp-examples/examples/rebrowser-puppeteer.ts b/temp-examples/examples/rebrowser-puppeteer.ts new file mode 100644 index 000000000000..54d49065c712 --- /dev/null +++ b/temp-examples/examples/rebrowser-puppeteer.ts @@ -0,0 +1,31 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +const apiKey = process.env.REBROWSER_API_KEY; + +if (!apiKey) { + throw new Error('REBROWSER_API_KEY env variable is required'); +} + +// Rebrowser simple connection: no profile or run creation needed. +// A random profile is auto-selected when you connect with just an API key. +// Proxies are managed via the Rebrowser dashboard or WS URL params. 
+const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: `wss://api.rebrowser.net?apikey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Note: Rebrowser recommends calling finishRun after you're done to avoid idle billing. +// With Crawlee, the browser disconnects automatically after the crawl finishes, +// which should end the run. For explicit control, use the REST API finishRun endpoint. diff --git a/temp-examples/examples/steel-playwright-ws.ts b/temp-examples/examples/steel-playwright-ws.ts new file mode 100644 index 000000000000..55f4712a5315 --- /dev/null +++ b/temp-examples/examples/steel-playwright-ws.ts @@ -0,0 +1,51 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.STEEL_API_KEY; + +if (!apiKey) { + throw new Error('STEEL_API_KEY env variable is required'); +} + +// Step 1: Create a Steel session via REST API. +// Explicit session creation enables advanced features like proxy and CAPTCHA solving. +const response = await fetch('https://api.steel.dev/v1/sessions', { + method: 'POST', + headers: { + 'Steel-Api-Key': apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ useProxy: true, solveCaptcha: true }), +}); + +if (!response.ok) { + throw new Error(`Failed to create Steel session: ${response.status} ${response.statusText}`); +} + +const session = await response.json(); +console.log(`Created Steel session: ${session.id}`); + +// Step 2: Connect to the session using Playwright's WebSocket connection. +// The session ID is passed as a query parameter to the Steel WebSocket endpoint. 
+const crawler = new PlaywrightCrawler({ + launchContext: { + connectOptions: { + wsEndpoint: `wss://connect.steel.dev?apiKey=${apiKey}&sessionId=${session.id}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); + +// Step 3: Release the session (optional — Steel auto-releases on disconnect). +await fetch(`https://api.steel.dev/v1/sessions/${session.id}/release`, { + method: 'POST', + headers: { 'Steel-Api-Key': apiKey }, +}); +console.log(`Released Steel session: ${session.id}`); diff --git a/temp-examples/examples/steel-playwright.ts b/temp-examples/examples/steel-playwright.ts new file mode 100644 index 000000000000..7bf2913054e9 --- /dev/null +++ b/temp-examples/examples/steel-playwright.ts @@ -0,0 +1,26 @@ +import 'dotenv/config'; + +import { PlaywrightCrawler } from 'crawlee'; + +const apiKey = process.env.STEEL_API_KEY; + +if (!apiKey) { + throw new Error('STEEL_API_KEY env variable is required'); +} + +// Steel direct connection: no session creation needed. +// A session is auto-created when you connect and auto-released on disconnect. 
+const crawler = new PlaywrightCrawler({ + launchContext: { + connectOverCDPOptions: { + endpointURL: `wss://connect.steel.dev?apiKey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/examples/steel-puppeteer.ts b/temp-examples/examples/steel-puppeteer.ts new file mode 100644 index 000000000000..68dc3cdb59a9 --- /dev/null +++ b/temp-examples/examples/steel-puppeteer.ts @@ -0,0 +1,26 @@ +import 'dotenv/config'; + +import { PuppeteerCrawler } from 'crawlee'; + +const apiKey = process.env.STEEL_API_KEY; + +if (!apiKey) { + throw new Error('STEEL_API_KEY env variable is required'); +} + +// Steel direct connection: no session creation needed. +// A session is auto-created when you connect and auto-released on disconnect. +const crawler = new PuppeteerCrawler({ + launchContext: { + connectOverCDPOptions: { + browserWSEndpoint: `wss://connect.steel.dev?apiKey=${apiKey}`, + }, + }, + async requestHandler({ page, request }) { + const title = await page.title(); + console.log(`[${request.loadedUrl}] ${title}`); + }, + maxRequestsPerCrawl: 1, +}); + +await crawler.run(['https://example.com']); diff --git a/temp-examples/package.json b/temp-examples/package.json new file mode 100644 index 000000000000..cbcb71ed6c1b --- /dev/null +++ b/temp-examples/package.json @@ -0,0 +1,38 @@ +{ + "name": "temp-examples", + "version": "1.0.0", + "private": true, + "type": "module", + "scripts": { + "example:browserless-puppeteer": "node --experimental-strip-types examples/browserless-puppeteer.ts", + "example:browserless-playwright": "node --experimental-strip-types examples/browserless-playwright.ts", + "example:browserless-playwright-ws": "node --experimental-strip-types examples/browserless-playwright-ws.ts", + "example:browserbase-puppeteer": "node --experimental-strip-types 
examples/browserbase-puppeteer.ts", + "example:browserbase-playwright": "node --experimental-strip-types examples/browserbase-playwright.ts", + "example:browserbase-playwright-ws": "node --experimental-strip-types examples/browserbase-playwright-ws.ts", + "example:steel-puppeteer": "node --experimental-strip-types examples/steel-puppeteer.ts", + "example:steel-playwright": "node --experimental-strip-types examples/steel-playwright.ts", + "example:steel-playwright-ws": "node --experimental-strip-types examples/steel-playwright-ws.ts", + "example:rebrowser-puppeteer": "node --experimental-strip-types examples/rebrowser-puppeteer.ts", + "example:rebrowser-playwright": "node --experimental-strip-types examples/rebrowser-playwright.ts", + "example:rebrowser-playwright-ws": "node --experimental-strip-types examples/rebrowser-playwright-ws.ts" + }, + "dependencies": { + "@crawlee/basic": "file:../packages/basic-crawler/dist", + "@crawlee/browser": "file:../packages/browser-crawler/dist", + "@crawlee/browser-pool": "file:../packages/browser-pool/dist", + "@crawlee/cheerio": "file:../packages/cheerio-crawler/dist", + "@crawlee/cli": "file:../packages/cli/dist", + "@crawlee/core": "file:../packages/core/dist", + "@crawlee/http": "file:../packages/http-crawler/dist", + "@crawlee/jsdom": "file:../packages/jsdom-crawler/dist", + "@crawlee/linkedom": "file:../packages/linkedom-crawler/dist", + "@crawlee/playwright": "file:../packages/playwright-crawler/dist", + "@crawlee/puppeteer": "file:../packages/puppeteer-crawler/dist", + "@crawlee/types": "file:../packages/types/dist", + "@crawlee/utils": "file:../packages/utils/dist", + "@types/node": "^25.2.0", + "crawlee": "file:../packages/crawlee/dist", + "dotenv": "^17.3.1" + } +} diff --git a/temp-examples/readme.md b/temp-examples/readme.md new file mode 100644 index 000000000000..a570750b6774 --- /dev/null +++ b/temp-examples/readme.md @@ -0,0 +1,12 @@ +# How to start + +``` +# in the repository root +nr clean +nr build + +cd temp-examples +npm
install +npm run example:browserless-puppeteer +... +``` diff --git a/temp-examples/tsconfig.json b/temp-examples/tsconfig.json new file mode 100644 index 000000000000..5fcc4b7bad3a --- /dev/null +++ b/temp-examples/tsconfig.json @@ -0,0 +1,9 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "Node16", + "moduleResolution": "Node16", + "esModuleInterop": true, + "sourceMap": false + } +}