From 452f0f6e12c5229f55a9ff60b0b98157e1c4af39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Thu, 23 Apr 2026 14:48:02 +0200 Subject: [PATCH 01/11] refactor!: remove `tieredProxyUrls` support The feature has seen little adoption, and the tier rotation bled into APIs that otherwise had no business knowing about it (`ProxyConfiguration` pulling a `Request`, `newProxyInfo` returning a tier, browser controllers tracking a tier, etc.). In v4 the main rotation unit is the `Session`, so proxy-level tier rotation is redundant. Closes #3597 --- docs/guides/proxy_management.mdx | 27 --- .../src/internals/browser-crawler.ts | 1 - .../abstract-classes/browser-controller.ts | 8 +- .../src/abstract-classes/browser-plugin.ts | 2 - packages/browser-pool/src/browser-pool.ts | 23 +-- packages/browser-pool/src/launch-context.ts | 4 - packages/core/src/proxy_configuration.ts | 172 +----------------- packages/types/src/session.ts | 5 - test/core/proxy_configuration.test.ts | 119 +----------- 9 files changed, 14 insertions(+), 347 deletions(-) diff --git a/docs/guides/proxy_management.mdx b/docs/guides/proxy_management.mdx index 70dce2f72f9b..640f68e7f079 100644 --- a/docs/guides/proxy_management.mdx +++ b/docs/guides/proxy_management.mdx @@ -99,33 +99,6 @@ The `sessionId` parameter is always provided and allows us to differentiate betw The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. -### Tiered proxies - -You can also provide a list of proxy tiers to the `ProxyConfiguration` class. This is useful when you want to switch between different proxies automatically based on the blocking behavior of the website. - -:::warning - -Note that the `tieredProxyUrls` option requires `ProxyConfiguration` to be used from a crawler instance ([see below](#crawler-integration)). - -Using this configuration through the `newUrl` calls will not yield the expected results. - -::: - -```javascript -const proxyConfiguration = new ProxyConfiguration({ - tieredProxyUrls: [ - [null], // At first, we try to connect without a proxy - ['http://okay-proxy.com'], - ['http://slightly-better-proxy.com', 'http://slightly-better-proxy-2.com'], - ['http://very-good-and-expensive-proxy.com'], - ] -}); -``` - -This configuration will start with no proxy, then switch to `http://okay-proxy.com` if Crawlee recognizes we're getting blocked by the target website. If that proxy is also blocked, we will switch to one of the `slightly-better-proxy` URLs. If those are blocked, we will switch to the `very-good-and-expensive-proxy.com` URL. - -Crawlee also periodically probes lower tier proxies to see if they are unblocked, and if they are, it will switch back to them. - ## Crawler integration `ProxyConfiguration` integrates seamlessly into `HttpCrawler`, `CheerioCrawler`, `JSDOMCrawler`, `PlaywrightCrawler` and `PuppeteerCrawler`. 
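A minimal integration sketch (the proxy URLs are placeholders; any of the crawler classes above works the same way):

```typescript
import { CheerioCrawler, ProxyConfiguration } from 'crawlee';

const proxyConfiguration = new ProxyConfiguration({
    proxyUrls: ['http://proxy-1.example.com:8000', 'http://proxy-2.example.com:8000'],
});

const crawler = new CheerioCrawler({
    proxyConfiguration,
    requestHandler: async ({ request, proxyInfo, log }) => {
        // `proxyInfo` describes the proxy that was picked for this request.
        log.info(`Fetched ${request.url} via ${proxyInfo?.url}`);
    },
});

await crawler.run(['https://crawlee.dev']);
```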
diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 3405ab3bdb91..61025a5f89d7 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -461,7 +461,6 @@ export abstract class BrowserCrawler< const proxyInfo = crawlingContext.session.proxyInfo; newPageOptions.proxyUrl = proxyInfo?.url; - newPageOptions.proxyTier = proxyInfo?.proxyTier; newPageOptions.ignoreTlsErrors = proxyInfo?.ignoreTlsErrors; } diff --git a/packages/browser-pool/src/abstract-classes/browser-controller.ts b/packages/browser-pool/src/abstract-classes/browser-controller.ts index 078fcd889a99..60a982ca3b17 100644 --- a/packages/browser-pool/src/abstract-classes/browser-controller.ts +++ b/packages/browser-pool/src/abstract-classes/browser-controller.ts @@ -58,13 +58,7 @@ export abstract class BrowserController< launchContext: LaunchContext = undefined!; /** - * The proxy tier tied to this browser controller. - * `undefined` if no tiered proxy is used. - */ - proxyTier?: number; - - /** - * The proxy URL used by the browser controller. This is set every time the browser controller uses proxy (even the tiered one). + * The proxy URL used by the browser controller. * `undefined` if no proxy is used */ proxyUrl?: string; diff --git a/packages/browser-pool/src/abstract-classes/browser-plugin.ts b/packages/browser-pool/src/abstract-classes/browser-plugin.ts index f6cfe77bd8c1..8d090a8939b5 100644 --- a/packages/browser-pool/src/abstract-classes/browser-plugin.ts +++ b/packages/browser-pool/src/abstract-classes/browser-plugin.ts @@ -154,7 +154,6 @@ export abstract class BrowserPlugin< userDataDir = this.userDataDir, browserPerProxy = this.browserPerProxy, ignoreProxyCertificate = this.ignoreProxyCertificate, - proxyTier, } = options; return new LaunchContext({ @@ -166,7 +165,6 @@ export abstract class BrowserPlugin< userDataDir, browserPerProxy, ignoreProxyCertificate, - proxyTier, }); } diff --git a/packages/browser-pool/src/browser-pool.ts b/packages/browser-pool/src/browser-pool.ts index 20c56ff70f95..d1b502a9eafd 100644 --- a/packages/browser-pool/src/browser-pool.ts +++ b/packages/browser-pool/src/browser-pool.ts @@ -1,4 +1,4 @@ -import { type CrawleeLogger, serviceLocator, type TieredProxy } from '@crawlee/core'; +import { type CrawleeLogger, serviceLocator } from '@crawlee/core'; import type { BrowserFingerprintWithHeaders } from 'fingerprint-generator'; import { FingerprintGenerator } from 'fingerprint-generator'; import { FingerprintInjector } from 'fingerprint-injector'; @@ -441,7 +441,6 @@ export class BrowserPool< pageOptions, browserPlugin = this._pickBrowserPlugin(), proxyUrl, - proxyTier, ignoreTlsErrors, } = options; @@ -455,12 +454,11 @@ export class BrowserPool< // Limiter is necessary - https://github.com/apify/crawlee/issues/1126 return this.limiter(async () => { - let browserController = this._pickBrowserWithFreeCapacity(browserPlugin, { proxyTier, proxyUrl }); + let browserController = this._pickBrowserWithFreeCapacity(browserPlugin, { proxyUrl }); if (!browserController) browserController = await this._launchBrowser(id, { browserPlugin, - proxyTier, proxyUrl, ignoreTlsErrors, }); @@ -704,7 +702,7 @@ export class BrowserPool< } private async _launchBrowser(pageId: string, options: InternalLaunchBrowserOptions) { - const { browserPlugin, launchOptions, proxyTier, proxyUrl, ignoreTlsErrors } = options; + const { browserPlugin, launchOptions, 
proxyUrl, ignoreTlsErrors } = options; const browserController = browserPlugin.createController() as BrowserControllerReturn; this.startingBrowserControllers.add(browserController); @@ -712,7 +710,6 @@ export class BrowserPool< const launchContext = browserPlugin.createLaunchContext({ id: pageId, launchOptions, - proxyTier, proxyUrl, }); @@ -740,7 +737,6 @@ export class BrowserPool< } this.log.debug('Launched new browser.', { id: browserController.id }); - browserController.proxyTier = proxyTier; browserController.proxyUrl = proxyUrl; try { @@ -777,20 +773,18 @@ export class BrowserPool< return this.browserPlugins[pluginIndex]; } - private _pickBrowserWithFreeCapacity(browserPlugin: BrowserPlugin, options?: Partial) { + private _pickBrowserWithFreeCapacity(browserPlugin: BrowserPlugin, options?: { proxyUrl?: string }) { return [...this.activeBrowserControllers].find((controller) => { const hasCapacity = controller.activePages < this.maxOpenPagesPerBrowser; const isCorrectPlugin = controller.browserPlugin === browserPlugin; const isSameProxyUrl = controller.proxyUrl === options?.proxyUrl; - const isCorrectProxyTier = controller.proxyTier === options?.proxyTier; return ( isCorrectPlugin && hasCapacity && - ((!controller.launchContext.browserPerProxy && !options?.proxyTier) || - (options?.proxyTier && isCorrectProxyTier) || + (!controller.launchContext.browserPerProxy || (options?.proxyUrl && isSameProxyUrl) || - (!options?.proxyUrl && !options?.proxyTier && !controller.proxyUrl && !controller.proxyTier)) + (!options?.proxyUrl && !controller.proxyUrl)) ); }); } @@ -908,10 +902,6 @@ export interface BrowserPoolNewPageOptions { browserPlugin: BP; launchOptions?: BP['launchOptions']; - proxyTier?: number; proxyUrl?: string; ignoreTlsErrors?: boolean; } diff --git a/packages/browser-pool/src/launch-context.ts b/packages/browser-pool/src/launch-context.ts index 9ae847634b51..b433f8cb20f7 100644 --- a/packages/browser-pool/src/launch-context.ts +++ b/packages/browser-pool/src/launch-context.ts @@ -51,7 +51,6 @@ export interface LaunchContextOptions< */ userDataDir?: string; proxyUrl?: string; - proxyTier?: number; /** * If set to `true`, TLS certificate errors from the upstream proxy will be ignored. * This is useful when using HTTPS proxies with self-signed certificates. @@ -72,7 +71,6 @@ export class LaunchContext< useIncognitoPages: boolean; browserPerProxy?: boolean; userDataDir: string; - proxyTier?: number; ignoreProxyCertificate?: boolean; private _proxyUrl?: string; @@ -90,7 +88,6 @@ export class LaunchContext< useIncognitoPages, browserPerProxy, userDataDir = '', - proxyTier, ignoreProxyCertificate, } = options; @@ -100,7 +97,6 @@ export class LaunchContext< this.browserPerProxy = browserPerProxy ?? false; this.useIncognitoPages = useIncognitoPages ?? false; this.userDataDir = userDataDir; - this.proxyTier = proxyTier; this.ignoreProxyCertificate = ignoreProxyCertificate ?? false; this._proxyUrl = proxyUrl; diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 154b59404057..5d8fa471871d 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -25,82 +25,10 @@ export interface ProxyConfigurationOptions { * This function is used to generate the URL when {@apilink ProxyConfiguration.newUrl} or {@apilink ProxyConfiguration.newProxyInfo} is called. */ newUrlFunction?: ProxyConfigurationFunction; - - /** - * An array of custom proxy URLs to be rotated stratified in tiers. 
- * This is a more advanced version of `proxyUrls` that allows you to define a hierarchy of proxy URLs - * If everything goes well, all the requests will be sent through the first proxy URL in the list. - * Whenever the crawler encounters a problem with the current proxy on the given domain, it will switch to the higher tier for this domain. - * The crawler probes lower-level proxies at intervals to check if it can make the tier downshift. - * - * This feature is useful when you have a set of proxies with different performance characteristics (speed, price, antibot performance etc.) and you want to use the best one for each domain. - * - * Use `null` as a proxy URL to disable the proxy for the given tier. - */ - tieredProxyUrls?: UrlList[]; -} - -export interface TieredProxy { - proxyUrl: string | null; - proxyTier?: number; } -interface TieredProxyOptions { +interface NewUrlOptions { request?: Request; - proxyTier?: number; -} - -/** - * Internal class for tracking the proxy tier history for a specific domain. - * - * Predicts the best proxy tier for the next request based on the error history for different proxy tiers. - */ -class ProxyTierTracker { - private histogram: number[]; - private currentTier: number; - - constructor(tieredProxyUrls: (string | null)[][]) { - this.histogram = tieredProxyUrls.map(() => 0); - this.currentTier = 0; - } - - /** - * Processes a single step of the algorithm and updates the current tier prediction based on the error history. - */ - private processStep(): void { - this.histogram.forEach((x, i) => { - if (this.currentTier === i) return; - if (x > 0) this.histogram[i]--; - }); - - const left = this.currentTier > 0 ? this.histogram[this.currentTier - 1] : Infinity; - const right = this.currentTier < this.histogram.length - 1 ? this.histogram[this.currentTier + 1] : Infinity; - - if (this.histogram[this.currentTier] > Math.min(left, right)) { - this.currentTier = left <= right ? this.currentTier - 1 : this.currentTier + 1; - } else if (this.histogram[this.currentTier] === left) { - this.currentTier--; - } - } - - /** - * Increases the error score for the given proxy tier. This raises the chance of picking a different proxy tier for the subsequent requests. - * - * The error score is increased by 10 for the given tier. This means that this tier will be disadvantaged for the next 10 requests (every new request prediction decreases the error score by 1). - * @param tier The proxy tier to mark as problematic. - */ - addError(tier: number) { - this.histogram[tier] += 10; - } - - /** - * Returns the best proxy tier for the next request based on the error history for different proxy tiers. - * @returns The proxy tier prediction - */ - predictTier() { - this.processStep(); - return this.currentTier; - } } /** @@ -135,11 +63,9 @@ export class ProxyConfiguration { isManInTheMiddle = false; protected nextCustomUrlIndex = 0; protected proxyUrls?: UrlList; - protected tieredProxyUrls?: UrlList[]; protected usedProxyUrls = new Map(); protected newUrlFunction?: ProxyConfigurationFunction; protected log = serviceLocator.getLogger().child({ prefix: 'ProxyConfiguration' }); - protected domainTiers = new Map(); /** * Creates a {@apilink ProxyConfiguration} instance based on the provided options. 
Proxy servers are used to prevent target websites from
@@ -168,21 +94,16 @@ export class ProxyConfiguration {
             ow.object.exactShape({
                 proxyUrls: ow.optional.array.nonEmpty.ofType(ow.any(ow.string.url, ow.null)),
                 newUrlFunction: ow.optional.function,
-                tieredProxyUrls: ow.optional.array.nonEmpty.ofType(
-                    ow.array.nonEmpty.ofType(ow.any(ow.string.url, ow.null)),
-                ),
             }),
         );
 
-        const { proxyUrls, newUrlFunction, tieredProxyUrls } = options;
+        const { proxyUrls, newUrlFunction } = options;
 
-        if ([proxyUrls, newUrlFunction, tieredProxyUrls].filter((x) => x).length > 1)
-            this._throwCannotCombineCustomMethods();
+        if (proxyUrls && newUrlFunction) this._throwCannotCombineCustomMethods();
         if (!proxyUrls && !newUrlFunction && validateRequired) this._throwNoOptionsProvided();
 
         this.proxyUrls = proxyUrls;
         this.newUrlFunction = newUrlFunction;
-        this.tieredProxyUrls = tieredProxyUrls;
     }
 
     /**
@@ -194,17 +115,8 @@ export class ProxyConfiguration {
      *
      * @return Represents information about used proxy and its configuration.
      */
-    async newProxyInfo(options?: TieredProxyOptions): Promise<ProxyInfo | undefined> {
-        let url: string | undefined;
-        let tier: number | undefined;
-        if (this.tieredProxyUrls) {
-            const { proxyUrl, proxyTier } = this._handleTieredUrl(options);
-            url = proxyUrl ?? undefined;
-            tier = proxyTier;
-        } else {
-            url = await this.newUrl(options);
-        }
-
+    async newProxyInfo(options?: NewUrlOptions): Promise<ProxyInfo | undefined> {
+        const url = await this.newUrl(options);
         if (!url) return undefined;
 
         const { username, password, port, hostname } = new URL(url);
@@ -215,92 +127,20 @@ export class ProxyConfiguration {
             password: decodeURIComponent(password),
             hostname,
             port: port!,
-            proxyTier: tier,
         };
     }
 
-    /**
-     * Given a request / proxy tier, this function returns a new proxy URL based on the provided configuration options.
-     * @param options Options for the tiered proxy rotation
-     * @returns An object with the proxy URL and the proxy tier used.
-     */
-    protected _handleTieredUrl(options?: TieredProxyOptions): TieredProxy {
-        if (!this.tieredProxyUrls) throw new Error('Tiered proxy URLs are not set');
-
-        if (!options || (!options?.request && options?.proxyTier === undefined)) {
-            const allProxyUrls = this.tieredProxyUrls.flat();
-            return {
-                proxyUrl: allProxyUrls[this.nextCustomUrlIndex++ % allProxyUrls.length],
-            };
-        }
-
-        let tierPrediction = options.proxyTier!;
-
-        if (typeof tierPrediction !== 'number') {
-            tierPrediction = this.predictProxyTier(options.request!)!;
-        }
-
-        const proxyTier = this.tieredProxyUrls![tierPrediction];
-
-        return {
-            proxyUrl: proxyTier[this.nextCustomUrlIndex++ % proxyTier.length],
-            proxyTier: tierPrediction,
-        };
-    }
-
-    /**
-     * Given a `Request` object, this function returns the tier of the proxy that should be used for the request.
-     *
-     * This returns `null` if `tieredProxyUrls` option is not set.
-     */
-    protected predictProxyTier(request: Request): number | null {
-        if (!this.tieredProxyUrls) return null;
-
-        const domain = new URL(request.url).hostname;
-        if (!this.domainTiers.has(domain)) {
-            this.domainTiers.set(domain, new ProxyTierTracker(this.tieredProxyUrls));
-        }
-
-        request.userData.__crawlee ??= {};
-
-        const tracker = this.domainTiers.get(domain)!;
-
-        if (typeof request.userData.__crawlee.lastProxyTier === 'number') {
-            tracker.addError(request.userData.__crawlee.lastProxyTier);
-        }
-
-        const tierPrediction = tracker.predictTier();
-
-        if (
-            typeof request.userData.__crawlee.lastProxyTier === 'number' &&
-            request.userData.__crawlee.lastProxyTier !== tierPrediction
-        ) {
-            this.log.debug(
-                `Changing proxy tier for domain "${domain}" from ${request.userData.__crawlee.lastProxyTier} to ${tierPrediction}.`,
-            );
-        }
-
-        request.userData.__crawlee.lastProxyTier = tierPrediction;
-        request.userData.__crawlee.forefront = true;
-
-        return tierPrediction;
-    }
-
     /**
      * Returns a new proxy URL based on provided configuration options.
      *
      * @return A string with a proxy URL, including authentication credentials and port number.
      * For example, `http://bob:password123@proxy.example.com:8000`
      */
-    async newUrl(options?: TieredProxyOptions): Promise<string | undefined> {
+    async newUrl(options?: NewUrlOptions): Promise<string | undefined> {
         if (this.newUrlFunction) {
             return (await this._callNewUrlFunction({ request: options?.request })) ?? undefined;
         }
 
-        if (this.tieredProxyUrls) {
-            return this._handleTieredUrl(options).proxyUrl ?? undefined;
-        }
-
         return this._handleProxyUrlsList() ?? undefined;
     }
 
diff --git a/packages/types/src/session.ts b/packages/types/src/session.ts
index 5427db68b2b3..9ff549ba22fa 100644
--- a/packages/types/src/session.ts
+++ b/packages/types/src/session.ts
@@ -56,11 +56,6 @@ export interface ProxyInfo {
      */
     port: number | string;
 
-    /**
-     * Proxy tier for the current proxy, if applicable (only for `tieredProxyUrls`).
-     */
-    proxyTier?: number;
-
     /**
      * When `true`, the proxy is likely intercepting HTTPS traffic and is able to view and modify its content.
* diff --git a/test/core/proxy_configuration.test.ts b/test/core/proxy_configuration.test.ts index ed16bb93c405..bd42904f65ff 100644 --- a/test/core/proxy_configuration.test.ts +++ b/test/core/proxy_configuration.test.ts @@ -1,4 +1,4 @@ -import { ProxyConfiguration, Request } from '@crawlee/core'; +import { ProxyConfiguration } from '@crawlee/core'; describe('ProxyConfiguration', () => { test('newUrl() should return proxy URL', async () => { @@ -174,121 +174,4 @@ describe('ProxyConfiguration', () => { } }); }); - - describe('with tieredProxyUrls', () => { - test('without Request rotates the urls uniformly', async () => { - const proxyConfiguration = new ProxyConfiguration({ - tieredProxyUrls: [ - ['http://proxy.com:1111', 'http://proxy.com:2222'], - ['http://proxy.com:3333', 'http://proxy.com:4444'], - ], - }); - - // @ts-expect-error protected property - const tieredProxyUrls = proxyConfiguration.tieredProxyUrls!; - expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][0]); - expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][1]); - expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[1][0]); - expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[1][1]); - expect(await proxyConfiguration.newUrl()).toEqual(tieredProxyUrls[0][0]); - }); - - test('rotating a request results in higher-level proxies', async () => { - const proxyConfiguration = new ProxyConfiguration({ - tieredProxyUrls: [['http://proxy.com:1111'], ['http://proxy.com:2222'], ['http://proxy.com:3333']], - }); - - const request = new Request({ - url: 'http://example.com', - }); - - // @ts-expect-error protected property - const tieredProxyUrls = proxyConfiguration.tieredProxyUrls!; - expect(await proxyConfiguration.newUrl({ request })).toEqual(tieredProxyUrls[0][0]); - expect(await proxyConfiguration.newUrl({ request })).toEqual(tieredProxyUrls[1][0]); - expect(await proxyConfiguration.newUrl({ request })).toEqual(tieredProxyUrls[2][0]); - - // we still get the same (higher) proxy tier even with a new request - const request2 = new Request({ - url: 'http://example.com/another-resource', - }); - - expect(await proxyConfiguration.newUrl({ request: request2 })).toEqual(tieredProxyUrls[2][0]); - }); - - test('upshifts and downshifts properly', async () => { - const tieredProxyUrls = [['http://proxy.com:1111'], ['http://proxy.com:2222'], ['http://proxy.com:3333']]; - - const proxyConfiguration = new ProxyConfiguration({ - tieredProxyUrls, - }); - - const request = new Request({ - url: 'http://example.com', - }); - - let gotToTheHighestProxy = false; - for (let i = 0; i < 10; i++) { - const lastProxyUrl = await proxyConfiguration.newUrl({ request }); - if (lastProxyUrl === tieredProxyUrls[2][0]) { - gotToTheHighestProxy = true; - break; - } - } - - expect(gotToTheHighestProxy).toBe(true); - - // Even the highest-tier proxies didn't help - we should try going down - let gotToTheLowestProxy = false; - - for (let i = 0; i < 20; i++) { - const lastProxyUrl = await proxyConfiguration.newUrl({ request }); - if (lastProxyUrl === tieredProxyUrls[0][0]) { - gotToTheLowestProxy = true; - break; - } - } - - expect(gotToTheLowestProxy).toBe(true); - }); - - test('successful requests make the proxy tier drop eventually', async () => { - const tieredProxyUrls = [['http://proxy.com:1111'], ['http://proxy.com:2222'], ['http://proxy.com:3333']]; - - const proxyConfiguration = new ProxyConfiguration({ - tieredProxyUrls, - }); - - const failingRequest = new Request({ - url: 'http://example.com', 
- }); - let gotToTheHighestProxy = false; - - for (let i = 0; i < 10; i++) { - const lastProxyUrl = await proxyConfiguration.newUrl({ request: failingRequest }); - - if (lastProxyUrl === tieredProxyUrls[2][0]) { - gotToTheHighestProxy = true; - break; - } - } - - expect(gotToTheHighestProxy).toBe(true); - - let gotToTheLowestProxy = false; - - for (let i = 0; i < 100; i++) { - const lastProxyUrl = await proxyConfiguration.newUrl({ - request: new Request({ url: `http://example.com/${i}` }), - }); - - if (lastProxyUrl === tieredProxyUrls[0][0]) { - gotToTheLowestProxy = true; - break; - } - } - - expect(gotToTheLowestProxy).toBe(true); - }); - }); }); From c37e970188301c5f633c7cb484e2450042c28084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Thu, 23 Apr 2026 20:08:44 +0200 Subject: [PATCH 02/11] docs: document `tieredProxyUrls` removal in v4 upgrading guide --- docs/upgrading/upgrading_v4.md | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md index 24d304107761..d6e0bcf55f27 100644 --- a/docs/upgrading/upgrading_v4.md +++ b/docs/upgrading/upgrading_v4.md @@ -126,6 +126,46 @@ const crawler = new BasicCrawler({ }); ``` +## `tieredProxyUrls` is removed from `ProxyConfiguration` + +The `tieredProxyUrls` option (along with `ProxyConfiguration.reportProxyError`, the `proxyTier` field on `ProxyInfo` and the `proxyTier` plumbing in `BrowserPool`) has been removed. The feature saw little adoption and the tier rotation bled into APIs that otherwise had no business knowing about proxy tiers. In v4 the `Session` is the main rotation unit — a session already carries its own proxy, cookies and error score, so the pool naturally rotates the whole fingerprint when a session gets retired on a block. + +If you need per-session proxy assignment, use a custom `createSessionFunction` on the `SessionPool` to stamp each `Session` with its own proxy URL. Retire the session on error so the pool creates a fresh one (with a fresh proxy draw) for the next request. Skip the `proxyConfiguration` option on the crawler — the session already carries its own proxy. + +```typescript +import { BasicCrawler, Session, SessionPool } from '@crawlee/core'; + +const proxyUrls = ['http://proxy-1.com', 'http://proxy-2.com', 'http://proxy-3.com']; + +const sessionPool = new SessionPool({ + createSessionFunction: async (pool) => { + const proxyUrl = proxyUrls[Math.floor(Math.random() * proxyUrls.length)]; + const { username, password, hostname, port } = new URL(proxyUrl); + + return new Session({ + sessionPool: pool, + proxyInfo: { + url: proxyUrl, + username: decodeURIComponent(username), + password: decodeURIComponent(password), + hostname, + port, + }, + }); + }, +}); + +const crawler = new BasicCrawler({ + sessionPool, + requestHandler: async ({ request, session, sendRequest }) => { + const response = await sendRequest({ url: request.url }); + if (response.status === 403) session!.retire(); + }, +}); +``` + +Any further tier/priority logic (weighted draws, sticky assignment, cooldown on failing pools, etc.) now lives in `createSessionFunction` rather than in `ProxyConfiguration`, where you have full control over it. + ## Remove `experimentalContainers` option This experimental option relied on an outdated manifest version for browser extensions, it is not possible to achieve this with the currently supported versions. 
From e65400e3c1b07ad80c5891ad3879beb819e6e2b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Thu, 23 Apr 2026 20:13:42 +0200 Subject: [PATCH 03/11] docs: tweak `tieredProxyUrls` migration example --- docs/upgrading/upgrading_v4.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md index d6e0bcf55f27..36219dc9981c 100644 --- a/docs/upgrading/upgrading_v4.md +++ b/docs/upgrading/upgrading_v4.md @@ -130,7 +130,7 @@ const crawler = new BasicCrawler({ The `tieredProxyUrls` option (along with `ProxyConfiguration.reportProxyError`, the `proxyTier` field on `ProxyInfo` and the `proxyTier` plumbing in `BrowserPool`) has been removed. The feature saw little adoption and the tier rotation bled into APIs that otherwise had no business knowing about proxy tiers. In v4 the `Session` is the main rotation unit — a session already carries its own proxy, cookies and error score, so the pool naturally rotates the whole fingerprint when a session gets retired on a block. -If you need per-session proxy assignment, use a custom `createSessionFunction` on the `SessionPool` to stamp each `Session` with its own proxy URL. Retire the session on error so the pool creates a fresh one (with a fresh proxy draw) for the next request. Skip the `proxyConfiguration` option on the crawler — the session already carries its own proxy. +If you need per-session proxy assignment, use a custom `createSessionFunction` on the `SessionPool` to stamp each `Session` with its own proxy URL. With `retryOnBlocked` enabled, blocked sessions are already retired for you — but `request.sessionId` (the pin that decides which session handles a given request) is not cleared automatically, so on the retry the crawler would try to fetch the now-missing session. Reassign it from an `errorHandler` to hand the retry over to a freshly drawn session (and, implicitly, a new proxy). Skip the `proxyConfiguration` option on the crawler — the session already carries its own proxy. ```typescript import { BasicCrawler, Session, SessionPool } from '@crawlee/core'; @@ -157,14 +157,17 @@ const sessionPool = new SessionPool({ const crawler = new BasicCrawler({ sessionPool, - requestHandler: async ({ request, session, sendRequest }) => { - const response = await sendRequest({ url: request.url }); - if (response.status === 403) session!.retire(); + retryOnBlocked: true, + requestHandler: async ({ request, sendRequest }) => { + await sendRequest({ url: request.url }); + }, + errorHandler: async ({ request }) => { + request.sessionId = (await sessionPool.getSession()).id; }, }); ``` -Any further tier/priority logic (weighted draws, sticky assignment, cooldown on failing pools, etc.) now lives in `createSessionFunction` rather than in `ProxyConfiguration`, where you have full control over it. +Any further tier/priority logic (weighted draws, sticky assignment, cooldown on failing pools, etc.) now lives in `createSessionFunction` and the `request.sessionId` reassignment rather than in `ProxyConfiguration`, where you have full control over it. 
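+
+For instance, a weighted draw is just a different way of picking the `proxyUrl` inside `createSessionFunction` - a sketch (the pools and weights below are made up):
+
+```typescript
+const weightedProxyUrls = [
+    { url: 'http://cheap-proxy.com', weight: 8 },
+    { url: 'http://expensive-proxy.com', weight: 2 },
+];
+
+const pickWeightedProxyUrl = () => {
+    const total = weightedProxyUrls.reduce((sum, entry) => sum + entry.weight, 0);
+    let roll = Math.random() * total;
+    for (const entry of weightedProxyUrls) {
+        roll -= entry.weight;
+        // Entries with a larger weight cover a larger slice of the [0, total) range.
+        if (roll < 0) return entry.url;
+    }
+    // Guard against floating-point edge cases.
+    return weightedProxyUrls[weightedProxyUrls.length - 1].url;
+};
+```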
## Remove `experimentalContainers` option From 06ed1c5c4381fe0b37292c6388e7f668ca628198 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Thu, 23 Apr 2026 20:19:13 +0200 Subject: [PATCH 04/11] docs: tweak `tieredProxyUrls` migration example --- docs/upgrading/upgrading_v4.md | 45 ++++++++++++++++------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md index 36219dc9981c..04d321dec129 100644 --- a/docs/upgrading/upgrading_v4.md +++ b/docs/upgrading/upgrading_v4.md @@ -130,30 +130,25 @@ const crawler = new BasicCrawler({ The `tieredProxyUrls` option (along with `ProxyConfiguration.reportProxyError`, the `proxyTier` field on `ProxyInfo` and the `proxyTier` plumbing in `BrowserPool`) has been removed. The feature saw little adoption and the tier rotation bled into APIs that otherwise had no business knowing about proxy tiers. In v4 the `Session` is the main rotation unit — a session already carries its own proxy, cookies and error score, so the pool naturally rotates the whole fingerprint when a session gets retired on a block. -If you need per-session proxy assignment, use a custom `createSessionFunction` on the `SessionPool` to stamp each `Session` with its own proxy URL. With `retryOnBlocked` enabled, blocked sessions are already retired for you — but `request.sessionId` (the pin that decides which session handles a given request) is not cleared automatically, so on the retry the crawler would try to fetch the now-missing session. Reassign it from an `errorHandler` to hand the retry over to a freshly drawn session (and, implicitly, a new proxy). Skip the `proxyConfiguration` option on the crawler — the session already carries its own proxy. +If you used tiers to escalate from a cheap proxy pool to a pricier one on blocks, you can emulate the same thing by pre-populating a `SessionPool` with named sessions — one per proxy tier — and flipping `request.sessionId` in an `errorHandler` to reassign the retry to the next tier. Skip the `proxyConfiguration` option on the crawler — the session already carries its own proxy. 
```typescript -import { BasicCrawler, Session, SessionPool } from '@crawlee/core'; - -const proxyUrls = ['http://proxy-1.com', 'http://proxy-2.com', 'http://proxy-3.com']; - -const sessionPool = new SessionPool({ - createSessionFunction: async (pool) => { - const proxyUrl = proxyUrls[Math.floor(Math.random() * proxyUrls.length)]; - const { username, password, hostname, port } = new URL(proxyUrl); - - return new Session({ - sessionPool: pool, - proxyInfo: { - url: proxyUrl, - username: decodeURIComponent(username), - password: decodeURIComponent(password), - hostname, - port, - }, - }); - }, -}); +import { BasicCrawler, SessionPool } from '@crawlee/core'; + +const proxyInfoFromUrl = (proxyUrl: string) => { + const { username, password, hostname, port } = new URL(proxyUrl); + return { + url: proxyUrl, + username: decodeURIComponent(username), + password: decodeURIComponent(password), + hostname, + port, + }; +}; + +const sessionPool = new SessionPool(); +await sessionPool.addSession({ id: 'basic', proxyInfo: proxyInfoFromUrl('http://cheap-proxy.com') }); +await sessionPool.addSession({ id: 'premium', proxyInfo: proxyInfoFromUrl('http://expensive-proxy.com') }); const crawler = new BasicCrawler({ sessionPool, @@ -162,12 +157,14 @@ const crawler = new BasicCrawler({ await sendRequest({ url: request.url }); }, errorHandler: async ({ request }) => { - request.sessionId = (await sessionPool.getSession()).id; + request.sessionId = 'premium'; }, }); + +await crawler.run([{ url: 'https://example.com', sessionId: 'basic' }]); ``` -Any further tier/priority logic (weighted draws, sticky assignment, cooldown on failing pools, etc.) now lives in `createSessionFunction` and the `request.sessionId` reassignment rather than in `ProxyConfiguration`, where you have full control over it. +Richer routing (more tiers, weighted draws, sticky assignment, cooldowns) can be expressed with additional named sessions and the logic you put in `errorHandler` — it's now just regular user code instead of a built-in. ## Remove `experimentalContainers` option From 05f76f317a2bbc1903004224c07c33549a5da8ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Fri, 24 Apr 2026 15:38:42 +0200 Subject: [PATCH 05/11] docs: update upgrading guide --- docs/upgrading/upgrading_v4.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md index 04d321dec129..e4f8ebd119ca 100644 --- a/docs/upgrading/upgrading_v4.md +++ b/docs/upgrading/upgrading_v4.md @@ -128,9 +128,9 @@ const crawler = new BasicCrawler({ ## `tieredProxyUrls` is removed from `ProxyConfiguration` -The `tieredProxyUrls` option (along with `ProxyConfiguration.reportProxyError`, the `proxyTier` field on `ProxyInfo` and the `proxyTier` plumbing in `BrowserPool`) has been removed. The feature saw little adoption and the tier rotation bled into APIs that otherwise had no business knowing about proxy tiers. In v4 the `Session` is the main rotation unit — a session already carries its own proxy, cookies and error score, so the pool naturally rotates the whole fingerprint when a session gets retired on a block. +The `tieredProxyUrls` option has been removed, together with the `proxyTier` field on `ProxyInfo` and the `proxyTier` plumbing in `BrowserPool`. In v4 the `Session` is the main rotation unit - a session already carries its own proxy, cookies and error score, so the pool rotates the whole fingerprint when a session gets retired on a block. 
-If you used tiers to escalate from a cheap proxy pool to a pricier one on blocks, you can emulate the same thing by pre-populating a `SessionPool` with named sessions — one per proxy tier — and flipping `request.sessionId` in an `errorHandler` to reassign the retry to the next tier. Skip the `proxyConfiguration` option on the crawler — the session already carries its own proxy. +If you used tiers to escalate from a cheap proxy pool to a pricier one on blocks, you can achieve the same behavior by pre-populating a `SessionPool` with named sessions - one per proxy tier - and flipping `request.sessionId` in an `errorHandler` to reassign the retry to the next tier. Skip the `proxyConfiguration` option on the crawler - the session already carries its own proxy. ```typescript import { BasicCrawler, SessionPool } from '@crawlee/core'; @@ -164,7 +164,7 @@ const crawler = new BasicCrawler({ await crawler.run([{ url: 'https://example.com', sessionId: 'basic' }]); ``` -Richer routing (more tiers, weighted draws, sticky assignment, cooldowns) can be expressed with additional named sessions and the logic you put in `errorHandler` — it's now just regular user code instead of a built-in. +More complex routing (more tiers, weighted draws, sticky assignment, cooldowns) can be expressed with additional named sessions and custom `errorHandler` logic. ## Remove `experimentalContainers` option From 77b484670cb498e41447bc94d7f2801a41ef84ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 27 Apr 2026 07:20:27 +0300 Subject: [PATCH 06/11] fix: apply PR suggestions --- docs/guides/proxy_management.mdx | 6 ++---- packages/core/src/proxy_configuration.ts | 8 ++++++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/guides/proxy_management.mdx b/docs/guides/proxy_management.mdx index 640f68e7f079..bc7253aa6ec9 100644 --- a/docs/guides/proxy_management.mdx +++ b/docs/guides/proxy_management.mdx @@ -83,7 +83,7 @@ The `ProxyConfiguration` class allows you to provide a custom function to pick a ```javascript const proxyConfiguration = new ProxyConfiguration({ - newUrlFunction: (sessionId, { request }) => { + newUrlFunction: ({ request } = {}) => { if (request?.url.includes('crawlee.dev')) { return null; // for crawlee.dev, we don't use a proxy } @@ -93,9 +93,7 @@ const proxyConfiguration = new ProxyConfiguration({ }); ``` -The `newUrlFunction` receives two parameters - `sessionId` and `options` - and returns a string containing the proxy URL. - -The `sessionId` parameter is always provided and allows us to differentiate between different sessions - e.g. when Crawlee recognizes your crawlers are being blocked, it will automatically create a new session with a different id. +The `newUrlFunction` receives a single optional `options` parameter and returns a string with the proxy URL (or `null` to skip the proxy for the current request). The `options` parameter is an object containing a `Request`, which is the request that will be made. Note that this object is not always available, for example when we are using the `newUrl` function directly. Your custom function should therefore not rely on the `request` object being present and provide a default behavior when it is not. 
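For instance, calling `newUrl` directly (outside a crawler) means no `request` is available, so whatever default your function returns is what gets used - a sketch:

```typescript
// No request context here - the function's default branch decides the proxy.
const proxyUrl = await proxyConfiguration.newUrl();
```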
diff --git a/packages/core/src/proxy_configuration.ts b/packages/core/src/proxy_configuration.ts index 5d8fa471871d..2c6f65078712 100644 --- a/packages/core/src/proxy_configuration.ts +++ b/packages/core/src/proxy_configuration.ts @@ -89,6 +89,14 @@ export class ProxyConfiguration { */ constructor(options: ProxyConfigurationOptions = {}) { const { validateRequired, ...rest } = options as Dictionary; + + if ('tieredProxyUrls' in rest) { + throw new Error( + 'The `tieredProxyUrls` option has been removed in Crawlee v4. ' + + 'See the v4 upgrading guide for the recommended migration to named sessions.', + ); + } + ow( rest, ow.object.exactShape({ From 77315f61aa6ad4145fb19621d73e6050c189ef9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 27 Apr 2026 12:17:03 +0300 Subject: [PATCH 07/11] feat: reuse `Sessions` correctly in `BasicCrawler` --- .../src/internals/basic-crawler.ts | 29 ++++++++++----- test/core/crawlers/basic_crawler.test.ts | 35 +++++++++++++++++++ 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 36514c1b8c8e..7e91e6bb7e64 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -21,7 +21,6 @@ import type { RequestTransform, RouterHandler, RouterRoutes, - Session, SkippedRequestCallback, Source, StatisticsOptions, @@ -59,6 +58,7 @@ import { Router, ServiceLocator, serviceLocator, + Session, SessionError, SessionPool, Statistics, @@ -873,7 +873,25 @@ export class BasicCrawler< ...statisticsOptions, }); - this.sessionPool = sessionPool ?? new SessionPool(); + if (sessionPool && proxyConfiguration) { + this.log.warning( + 'Both `sessionPool` and `proxyConfiguration` were provided to the crawler. ' + + 'The `proxyConfiguration` is ignored - sessions from the supplied pool keep whatever ' + + '`proxyInfo` they were created with. Configure proxies on the pool instead, ' + + 'e.g. via `addSession({ proxyInfo })` or a custom `createSessionFunction`.', + ); + } + + this.sessionPool = + sessionPool ?? + new SessionPool({ + createSessionFunction: async (pool, opts) => + new Session({ + proxyInfo: await this.proxyConfiguration?.newProxyInfo(), + ...opts?.sessionOptions, + sessionPool: pool, + }), + }); this.sessionPool.setMaxListeners(20); this.ownsSessionPool = !sessionPool; @@ -1116,12 +1134,7 @@ export class BasicCrawler< return existingSession; } - return await this.sessionPool!.newSession({ - proxyInfo: await this.proxyConfiguration?.newProxyInfo({ - request: request ?? 
undefined, - }), - maxUsageCount: 1, - }); + return await this.sessionPool!.getSession(); }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`, diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index cb239ed042c8..9d776c85eb9f 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -4,6 +4,7 @@ import http from 'node:http'; import type { AddressInfo } from 'node:net'; import type { EnqueueLinksOptions, ErrorHandler, RequestHandler, RequestOptions, Source } from '@crawlee/basic'; +import type { Session } from '@crawlee/basic'; import { BasicCrawler, Configuration, @@ -12,6 +13,7 @@ import { KeyValueStore, MissingRouteError, NonRetryableError, + ProxyConfiguration, Request, RequestList, RequestQueue, @@ -19,6 +21,7 @@ import { SessionPool, } from '@crawlee/basic'; import { RequestState } from '@crawlee/core'; +import type { ProxyInfo } from '@crawlee/types'; import type { Dictionary } from '@crawlee/utils'; import { RobotsTxtFile, sleep } from '@crawlee/utils'; import express from 'express'; @@ -1602,6 +1605,38 @@ describe('BasicCrawler', () => { }); }); + describe('proxyConfiguration', () => { + it('assigns a proxyInfo from the proxyConfiguration to each Session and exposes it on the context', async () => { + const proxyUrls = [0, 1, 2].map((n) => `http://proxy.example.com:${1000 + n}`); + const proxyConfiguration = new ProxyConfiguration({ proxyUrls }); + + const sessions: Session[] = []; + const proxyInfos: (ProxyInfo | undefined)[] = []; + + const crawler = new BasicCrawler({ + proxyConfiguration, + requestHandler: async ({ session, proxyInfo }) => { + sessions.push(session); + proxyInfos.push(proxyInfo); + }, + }); + + await crawler.run([ + { url: 'https://example.com/a' }, + { url: 'https://example.com/b' }, + { url: 'https://example.com/c' }, + ]); + + expect(sessions).toHaveLength(3); + for (let i = 0; i < sessions.length; i++) { + const proxyInfo = proxyInfos[i]; + expect(proxyInfo).toBeDefined(); + expect(proxyUrls).toContain(proxyInfo!.url); + expect(sessions[i].proxyInfo).toBe(proxyInfo); + } + }); + }); + test('extendContext', async () => { const url = 'https://example.com'; const requestHandlerImplementation = vi.fn(); From 8ef88098f75df4d2a9d7f97ca5d0e2584a031b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Tue, 28 Apr 2026 08:35:06 +0300 Subject: [PATCH 08/11] chore: apply PR suggestions --- .../src/internals/basic-crawler.ts | 3 +- test/core/crawlers/basic_crawler.test.ts | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 7e91e6bb7e64..9f37db96108b 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -887,7 +887,8 @@ export class BasicCrawler< new SessionPool({ createSessionFunction: async (pool, opts) => new Session({ - proxyInfo: await this.proxyConfiguration?.newProxyInfo(), + proxyInfo: + opts?.sessionOptions?.proxyInfo ?? 
(await this.proxyConfiguration?.newProxyInfo()), ...opts?.sessionOptions, sessionPool: pool, }), diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index 9d776c85eb9f..9a4c0f709cad 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -1635,6 +1635,36 @@ describe('BasicCrawler', () => { expect(sessions[i].proxyInfo).toBe(proxyInfo); } }); + + it('reuses the same Session across multiple requests when the pool is restricted', async () => { + const sessions: Session[] = []; + const proxyInfos: (ProxyInfo | undefined)[] = []; + + const crawler = new BasicCrawler({ + sessionPool: new SessionPool({ maxPoolSize: 1 }), + requestHandler: async ({ session, proxyInfo }) => { + sessions.push(session); + proxyInfos.push(proxyInfo); + }, + }); + + await crawler.run([ + { url: 'https://example.com/a' }, + { url: 'https://example.com/b' }, + { url: 'https://example.com/c' }, + ]); + + expect(sessions).toHaveLength(3); + const firstId = sessions[0].id; + for (const session of sessions) { + expect(session.id).toBe(firstId); + expect(session.proxyInfo).toBe(sessions[0].proxyInfo); + } + for (const proxyInfo of proxyInfos) { + expect(proxyInfo).toBe(sessions[0].proxyInfo); + } + expect(sessions[0].usageCount).toBe(3); + }); }); test('extendContext', async () => { From e3805ba2651b38913bd1000d6289750999abe07f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Tue, 28 Apr 2026 09:19:07 +0300 Subject: [PATCH 09/11] refactor: simplify `SessionPoolOptions` lifecycle --- docs/upgrading/upgrading_v4.md | 31 +++++++++++++++++++ .../src/internals/basic-crawler.ts | 2 +- .../core/src/session_pool/session_pool.ts | 25 +++++++++------ 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md index e4f8ebd119ca..06767a9dfad8 100644 --- a/docs/upgrading/upgrading_v4.md +++ b/docs/upgrading/upgrading_v4.md @@ -105,6 +105,37 @@ const count = await sessionPool.usableSessionsCount(); const state = await sessionPool.getState(); ``` +## Custom `createSessionFunction` receives merged session options + +`SessionPool` now merges its pool-wide `sessionOptions` (including the pool-scoped logger) with per-call overrides before invoking `createSessionFunction`. Custom implementations no longer need to spread `pool.sessionOptions` themselves to inherit pool defaults. + +**Before:** +```typescript +new SessionPool({ + sessionOptions: { maxUsageCount: 5 }, + createSessionFunction: async (pool, opts) => + new Session({ + ...pool.sessionOptions, // had to be spread manually for the logger / pool defaults to apply + ...opts?.sessionOptions, + sessionPool: pool, + }), +}); +``` + +**After:** +```typescript +new SessionPool({ + sessionOptions: { maxUsageCount: 5 }, + createSessionFunction: async (pool, opts) => + new Session({ + ...opts?.sessionOptions, // already merged with pool-wide defaults + sessionPool: pool, + }), +}); +``` + +If you were already spreading `pool.sessionOptions`, the change is harmless - pool defaults now appear twice in the spread chain, with the later (per-call) one winning, exactly as before. + ## `retireOnBlockedStatusCodes` is removed from `Session` `Session.retireOnBlockedStatusCodes` is removed. Blocked status code handling is now internal to the crawler. Configure blocked status codes via the `blockedStatusCodes` crawler option (moved from `sessionPoolOptions`). 
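For instance (a sketch - the status codes are illustrative):

```typescript
const crawler = new BasicCrawler({
    blockedStatusCodes: [403, 429],
    requestHandler: async ({ request, sendRequest }) => {
        await sendRequest({ url: request.url });
    },
});
```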
diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts
index 9f37db96108b..24ae05b00796 100644
--- a/packages/basic-crawler/src/internals/basic-crawler.ts
+++ b/packages/basic-crawler/src/internals/basic-crawler.ts
@@ -887,9 +887,9 @@ export class BasicCrawler<
             new SessionPool({
                 createSessionFunction: async (pool, opts) =>
                     new Session({
+                        ...opts?.sessionOptions,
                         proxyInfo:
                             opts?.sessionOptions?.proxyInfo ?? (await this.proxyConfiguration?.newProxyInfo()),
-                        ...opts?.sessionOptions,
                         sessionPool: pool,
                     }),
             });
diff --git a/packages/core/src/session_pool/session_pool.ts b/packages/core/src/session_pool/session_pool.ts
index 9001134a014d..53243db19821 100644
--- a/packages/core/src/session_pool/session_pool.ts
+++ b/packages/core/src/session_pool/session_pool.ts
@@ -179,11 +179,11 @@ export class SessionPool extends EventEmitter {
         this.maxPoolSize = maxPoolSize;
         this.createSessionFunction = createSessionFunction || this._defaultCreateSessionFunction;
 
-        // Session configuration
+        // Session configuration. The pool-scoped logger is merged into per-call sessionOptions inside
+        // `_invokeCreateSessionFunction`, so every Session inherits it without custom createSessionFunctions
+        // having to know about it.
         this.sessionOptions = {
             ...sessionOptions,
-            // the log needs to propagate to createSessionFunction as in "new Session({ ...sessionPool.sessionOptions })"
-            // and can't go inside _defaultCreateSessionFunction
             log: this.log,
         };
 
@@ -264,8 +264,7 @@
             this._removeRetiredSessions();
         }
 
-        const newSession =
-            options instanceof Session ? options : await this.createSessionFunction(this, { sessionOptions: options });
+        const newSession = options instanceof Session ? options : await this._invokeCreateSessionFunction(options);
 
         this.log.debug(`Adding new Session - ${newSession.id}`);
         this._addSession(newSession);
@@ -280,7 +279,7 @@
     async newSession(sessionOptions?: SessionOptions): Promise<Session> {
         await this.ensureInitialized();
 
-        const newSession = await this.createSessionFunction(this, { sessionOptions });
+        const newSession = await this._invokeCreateSessionFunction(sessionOptions);
         this._addSession(newSession);
 
         return newSession;
@@ -446,18 +445,26 @@
         const { sessionOptions = {} } = options;
 
         return new Session({
-            ...this.sessionOptions,
             ...sessionOptions,
             sessionPool,
         });
     }
 
+    /**
+     * Invokes `createSessionFunction` with `sessionOptions` already merged from pool-wide defaults and
+     * the supplied per-call overrides, so custom implementations don't need to spread `pool.sessionOptions` themselves.
+     */
+    private async _invokeCreateSessionFunction(perCallOptions?: SessionOptions): Promise<Session> {
+        const sessionOptions = { ...this.sessionOptions, ...perCallOptions };
+        return this.createSessionFunction(this, { sessionOptions });
+    }
+
     /**
      * Creates new session and adds it to the pool.
     * @returns Newly created `Session` instance.
     */
    protected async _createSession(): Promise<Session> {
-        const newSession = await this.createSessionFunction(this);
+        const newSession = await this._invokeCreateSessionFunction();
         this._addSession(newSession);
 
         this.log.debug(`Created new Session - ${newSession.id}`);
@@ -498,7 +505,7 @@
             sessionObject.sessionPool = this;
             sessionObject.createdAt = new Date(sessionObject.createdAt as string);
             sessionObject.expiresAt = new Date(sessionObject.expiresAt as string);
-            const recreatedSession = await this.createSessionFunction(this, { sessionOptions: sessionObject });
+            const recreatedSession = await this._invokeCreateSessionFunction(sessionObject);
 
             if (recreatedSession.isUsable()) {
                 this._addSession(recreatedSession);

From 954f7953e282bd4edfcf01cd8a3a2073576ea13e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?=
Date: Mon, 4 May 2026 12:21:43 +0200
Subject: [PATCH 10/11] chore: fix failing tests

---
 test/core/crawlers/browser_crawler.test.ts   | 2 +-
 test/core/crawlers/puppeteer_crawler.test.ts | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts
index 140a56435643..3f9f42d62a20 100644
--- a/test/core/crawlers/browser_crawler.test.ts
+++ b/test/core/crawlers/browser_crawler.test.ts
@@ -820,7 +820,7 @@ describe('BrowserCrawler', () => {
                 { url: `${serverAddress}/?q=6` },
             ]);
 
-            expect(sessionUsageHistory).toEqual([0, 0, 0, 0, 0, 0]);
+            expect(sessionUsageHistory).toEqual([0, 1, 2, 3, 4, 5]);
         } finally {
             await localStorageEmulator.destroy();
         }
diff --git a/test/core/crawlers/puppeteer_crawler.test.ts b/test/core/crawlers/puppeteer_crawler.test.ts
index c3a3a5a4cb72..dbc9e13fa8eb 100644
--- a/test/core/crawlers/puppeteer_crawler.test.ts
+++ b/test/core/crawlers/puppeteer_crawler.test.ts
@@ -363,11 +363,6 @@ describe('PuppeteerCrawler', () => {
                 },
             },
             maxConcurrency: 1,
-            sessionPool: new SessionPool({
-                sessionOptions: {
-                    maxUsageCount: 1,
-                },
-            }),
             proxyConfiguration,
             requestHandler: async ({ proxyInfo, session }) => {
                 proxies.add(proxyInfo!.url);

From 781bf7e8afdd2a48abc08e88189397c3ff849564 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?=
Date: Mon, 4 May 2026 12:22:17 +0200
Subject: [PATCH 11/11] chore: pass through `proxyConfiguration` option to `BasicCrawler`

---
 packages/browser-crawler/src/internals/browser-crawler.ts | 2 --
 1 file changed, 2 deletions(-)

diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts
index 61025a5f89d7..68bcddef2ab3 100644
--- a/packages/browser-crawler/src/internals/browser-crawler.ts
+++ b/packages/browser-crawler/src/internals/browser-crawler.ts
@@ -355,7 +355,6 @@
             ignoreShadowRoots = false,
             contextPipelineBuilder,
             extendContext,
-            proxyConfiguration,
             ...basicCrawlerOptions
         } = options;
@@ -371,7 +370,6 @@
         this.launchContext = launchContext;
         this.navigationTimeoutMillis = navigationTimeoutSecs * 1000;
-        this.proxyConfiguration = proxyConfiguration;
         this.preNavigationHooks = preNavigationHooks;
         this.postNavigationHooks = postNavigationHooks;
         this.ignoreIframes = ignoreIframes;