diff --git a/docs/upgrading/upgrading_v4.md b/docs/upgrading/upgrading_v4.md index be076fda4aed..853b1031c0fa 100644 --- a/docs/upgrading/upgrading_v4.md +++ b/docs/upgrading/upgrading_v4.md @@ -106,6 +106,37 @@ const count = await sessionPool.usableSessionsCount(); const state = await sessionPool.getState(); ``` +## Custom `createSessionFunction` receives merged session options + +`SessionPool` now merges its pool-wide `sessionOptions` (including the pool-scoped logger) with per-call overrides before invoking `createSessionFunction`. Custom implementations no longer need to spread `pool.sessionOptions` themselves to inherit pool defaults. + +**Before:** +```typescript +new SessionPool({ + sessionOptions: { maxUsageCount: 5 }, + createSessionFunction: async (pool, opts) => + new Session({ + ...pool.sessionOptions, // had to be spread manually for the logger / pool defaults to apply + ...opts?.sessionOptions, + sessionPool: pool, + }), +}); +``` + +**After:** +```typescript +new SessionPool({ + sessionOptions: { maxUsageCount: 5 }, + createSessionFunction: async (pool, opts) => + new Session({ + ...opts?.sessionOptions, // already merged with pool-wide defaults + sessionPool: pool, + }), +}); +``` + +If you were already spreading `pool.sessionOptions`, the change is harmless - pool defaults now appear twice in the spread chain, with the later (per-call) one winning, exactly as before. + ## `retireOnBlockedStatusCodes` is removed from `Session` `Session.retireOnBlockedStatusCodes` is removed. Blocked status code handling is now internal to the crawler. Configure blocked status codes via the `blockedStatusCodes` crawler option (moved from `sessionPoolOptions`). diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index 36514c1b8c8e..24ae05b00796 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -21,7 +21,6 @@ import type { RequestTransform, RouterHandler, RouterRoutes, - Session, SkippedRequestCallback, Source, StatisticsOptions, @@ -59,6 +58,7 @@ import { Router, ServiceLocator, serviceLocator, + Session, SessionError, SessionPool, Statistics, @@ -873,7 +873,26 @@ export class BasicCrawler< ...statisticsOptions, }); - this.sessionPool = sessionPool ?? new SessionPool(); + if (sessionPool && proxyConfiguration) { + this.log.warning( + 'Both `sessionPool` and `proxyConfiguration` were provided to the crawler. ' + + 'The `proxyConfiguration` is ignored - sessions from the supplied pool keep whatever ' + + '`proxyInfo` they were created with. Configure proxies on the pool instead, ' + + 'e.g. via `addSession({ proxyInfo })` or a custom `createSessionFunction`.', + ); + } + + this.sessionPool = + sessionPool ?? + new SessionPool({ + createSessionFunction: async (pool, opts) => + new Session({ + ...opts?.sessionOptions, + proxyInfo: + opts?.sessionOptions?.proxyInfo ?? (await this.proxyConfiguration?.newProxyInfo()), + sessionPool: pool, + }), + }); this.sessionPool.setMaxListeners(20); this.ownsSessionPool = !sessionPool; @@ -1116,12 +1135,7 @@ export class BasicCrawler< return existingSession; } - return await this.sessionPool!.newSession({ - proxyInfo: await this.proxyConfiguration?.newProxyInfo({ - request: request ?? undefined, - }), - maxUsageCount: 1, - }); + return await this.sessionPool!.getSession(); }, this.internalTimeoutMillis, `Fetching session timed out after ${this.internalTimeoutMillis / 1e3} seconds.`, diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index 61025a5f89d7..68bcddef2ab3 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -355,7 +355,6 @@ export abstract class BrowserCrawler< ignoreShadowRoots = false, contextPipelineBuilder, extendContext, - proxyConfiguration, ...basicCrawlerOptions } = options; @@ -371,7 +370,6 @@ export abstract class BrowserCrawler< this.launchContext = launchContext; this.navigationTimeoutMillis = navigationTimeoutSecs * 1000; - this.proxyConfiguration = proxyConfiguration; this.preNavigationHooks = preNavigationHooks; this.postNavigationHooks = postNavigationHooks; this.ignoreIframes = ignoreIframes; diff --git a/packages/core/src/session_pool/session_pool.ts b/packages/core/src/session_pool/session_pool.ts index 0a4d20b8bd5a..3bad85c7405d 100644 --- a/packages/core/src/session_pool/session_pool.ts +++ b/packages/core/src/session_pool/session_pool.ts @@ -196,11 +196,11 @@ export class SessionPool extends EventEmitter { this.maxPoolSize = maxPoolSize; this.createSessionFunction = createSessionFunction || this._defaultCreateSessionFunction; - // Session configuration + // Session configuration. The pool-scoped logger is merged into per-call sessionOptions inside + // `_invokeCreateSessionFunction`, so every Session inherits it without custom createSessionFunctions + // having to know about it. this.sessionOptions = { ...sessionOptions, - // the log needs to propagate to createSessionFunction as in "new Session({ ...sessionPool.sessionOptions })" - // and can't go inside _defaultCreateSessionFunction log: this.log, }; @@ -281,8 +281,7 @@ export class SessionPool extends EventEmitter { this._removeRetiredSessions(); } - const newSession = - options instanceof Session ? options : await this.createSessionFunction(this, { sessionOptions: options }); + const newSession = options instanceof Session ? options : await this._invokeCreateSessionFunction(options); this.log.debug(`Adding new Session - ${newSession.id}`); this._addSession(newSession); @@ -297,7 +296,7 @@ export class SessionPool extends EventEmitter { async newSession(sessionOptions?: SessionOptions): Promise { await this.ensureInitialized(); - const newSession = await this.createSessionFunction(this, { sessionOptions }); + const newSession = await this._invokeCreateSessionFunction(sessionOptions); this._addSession(newSession); return newSession; @@ -461,18 +460,26 @@ export class SessionPool extends EventEmitter { const { sessionOptions = {} } = options; return new Session({ - ...this.sessionOptions, ...sessionOptions, sessionPool, }); } + /** + * Invokes `createSessionFunction` with `sessionOptions` already merged from pool-wide defaults and + * the supplied per-call overrides, so custom implementations don't need to spread `pool.sessionOptions` themselves. + */ + private async _invokeCreateSessionFunction(perCallOptions?: SessionOptions): Promise { + const sessionOptions = { ...this.sessionOptions, ...perCallOptions }; + return this.createSessionFunction(this, { sessionOptions }); + } + /** * Creates new session and adds it to the pool. * @returns Newly created `Session` instance. */ protected async _createSession(): Promise { - const newSession = await this.createSessionFunction(this); + const newSession = await this._invokeCreateSessionFunction(); this._addSession(newSession); this.log.debug(`Created new Session - ${newSession.id}`); @@ -528,7 +535,7 @@ export class SessionPool extends EventEmitter { sessionObject.sessionPool = this; sessionObject.createdAt = new Date(sessionObject.createdAt as string); sessionObject.expiresAt = new Date(sessionObject.expiresAt as string); - const recreatedSession = await this.createSessionFunction(this, { sessionOptions: sessionObject }); + const recreatedSession = await this._invokeCreateSessionFunction(sessionObject); if (recreatedSession.isUsable()) { this._addSession(recreatedSession); diff --git a/test/core/crawlers/basic_crawler.test.ts b/test/core/crawlers/basic_crawler.test.ts index cb239ed042c8..9a4c0f709cad 100644 --- a/test/core/crawlers/basic_crawler.test.ts +++ b/test/core/crawlers/basic_crawler.test.ts @@ -4,6 +4,7 @@ import http from 'node:http'; import type { AddressInfo } from 'node:net'; import type { EnqueueLinksOptions, ErrorHandler, RequestHandler, RequestOptions, Source } from '@crawlee/basic'; +import type { Session } from '@crawlee/basic'; import { BasicCrawler, Configuration, @@ -12,6 +13,7 @@ import { KeyValueStore, MissingRouteError, NonRetryableError, + ProxyConfiguration, Request, RequestList, RequestQueue, @@ -19,6 +21,7 @@ import { SessionPool, } from '@crawlee/basic'; import { RequestState } from '@crawlee/core'; +import type { ProxyInfo } from '@crawlee/types'; import type { Dictionary } from '@crawlee/utils'; import { RobotsTxtFile, sleep } from '@crawlee/utils'; import express from 'express'; @@ -1602,6 +1605,68 @@ describe('BasicCrawler', () => { }); }); + describe('proxyConfiguration', () => { + it('assigns a proxyInfo from the proxyConfiguration to each Session and exposes it on the context', async () => { + const proxyUrls = [0, 1, 2].map((n) => `http://proxy.example.com:${1000 + n}`); + const proxyConfiguration = new ProxyConfiguration({ proxyUrls }); + + const sessions: Session[] = []; + const proxyInfos: (ProxyInfo | undefined)[] = []; + + const crawler = new BasicCrawler({ + proxyConfiguration, + requestHandler: async ({ session, proxyInfo }) => { + sessions.push(session); + proxyInfos.push(proxyInfo); + }, + }); + + await crawler.run([ + { url: 'https://example.com/a' }, + { url: 'https://example.com/b' }, + { url: 'https://example.com/c' }, + ]); + + expect(sessions).toHaveLength(3); + for (let i = 0; i < sessions.length; i++) { + const proxyInfo = proxyInfos[i]; + expect(proxyInfo).toBeDefined(); + expect(proxyUrls).toContain(proxyInfo!.url); + expect(sessions[i].proxyInfo).toBe(proxyInfo); + } + }); + + it('reuses the same Session across multiple requests when the pool is restricted', async () => { + const sessions: Session[] = []; + const proxyInfos: (ProxyInfo | undefined)[] = []; + + const crawler = new BasicCrawler({ + sessionPool: new SessionPool({ maxPoolSize: 1 }), + requestHandler: async ({ session, proxyInfo }) => { + sessions.push(session); + proxyInfos.push(proxyInfo); + }, + }); + + await crawler.run([ + { url: 'https://example.com/a' }, + { url: 'https://example.com/b' }, + { url: 'https://example.com/c' }, + ]); + + expect(sessions).toHaveLength(3); + const firstId = sessions[0].id; + for (const session of sessions) { + expect(session.id).toBe(firstId); + expect(session.proxyInfo).toBe(sessions[0].proxyInfo); + } + for (const proxyInfo of proxyInfos) { + expect(proxyInfo).toBe(sessions[0].proxyInfo); + } + expect(sessions[0].usageCount).toBe(3); + }); + }); + test('extendContext', async () => { const url = 'https://example.com'; const requestHandlerImplementation = vi.fn(); diff --git a/test/core/crawlers/browser_crawler.test.ts b/test/core/crawlers/browser_crawler.test.ts index 140a56435643..3f9f42d62a20 100644 --- a/test/core/crawlers/browser_crawler.test.ts +++ b/test/core/crawlers/browser_crawler.test.ts @@ -820,7 +820,7 @@ describe('BrowserCrawler', () => { { url: `${serverAddress}/?q=6` }, ]); - expect(sessionUsageHistory).toEqual([0, 0, 0, 0, 0, 0]); + expect(sessionUsageHistory).toEqual([0, 1, 2, 3, 4, 5]); } finally { await localStorageEmulator.destroy(); } diff --git a/test/core/crawlers/puppeteer_crawler.test.ts b/test/core/crawlers/puppeteer_crawler.test.ts index c3a3a5a4cb72..dbc9e13fa8eb 100644 --- a/test/core/crawlers/puppeteer_crawler.test.ts +++ b/test/core/crawlers/puppeteer_crawler.test.ts @@ -363,11 +363,6 @@ describe('PuppeteerCrawler', () => { }, }, maxConcurrency: 1, - sessionPool: new SessionPool({ - sessionOptions: { - maxUsageCount: 1, - }, - }), proxyConfiguration, requestHandler: async ({ proxyInfo, session }) => { proxies.add(proxyInfo!.url);