From e41f0afb9c85ca053217aaf0f14e17433b36bd99 Mon Sep 17 00:00:00 2001 From: KevinShiCN Date: Sat, 25 Apr 2026 17:02:09 +0800 Subject: [PATCH] fix(proxy): classify errors by rule category to preserve failover for service errors Previously, isNonRetryableClientErrorAsync (and the sync companion) treated any matched error rule as NON_RETRYABLE_CLIENT_ERROR, regardless of the rule's category. This caused user-added rules with category like service_error or network_error to short-circuit failover/circuit-breaker logic, even though their underlying errors should be retried or routed to a different provider. Worse, regex patterns without word boundaries (e.g. "503|service.unavailable", "529|service.overloaded") matched 3-digit substrings inside upstream response bodies (request IDs, price amounts), causing INSUFFICIENT_BALANCE responses to be misclassified as service-overloaded and silenced. This change introduces an explicit category white-list for client-input errors that should genuinely bypass retry/failover. Only matches whose category falls within this set are classified as NON_RETRYABLE_CLIENT_ERROR; others fall back to the standard ProxyError-based classification (typically PROVIDER_ERROR or SYSTEM_ERROR), preserving failover and circuit-breaker behaviour. The white-list reflects the categories actually used by DEFAULT_ERROR_RULES: prompt_limit, input_limit, context_limit, token_limit, content_filter, validation_error, thinking_error, parameter_error, pdf_limit, media_limit, cache_limit, invalid_request, model_error, store_error. Reproduction context (CCH 0.7.2 production, observed 2026-04-25): - User added rule pattern "529|service.overloaded" with category=service_error - Upstream returned 403 INSUFFICIENT_BALANCE; response body contained substring "529" inside the price "0.352942" - Old logic: matched -> NON_RETRYABLE -> stop_immediately -> client received generic permission_error with no upstream detail - New logic: matched but category not white-listed -> falls back to PROVIDER_ERROR -> failover to next provider -> standard behaviour Tests: 10 new unit tests covering the white-list and regression cases for service_error / network_error / rate_limit categories. --- src/app/v1/_lib/proxy/errors.ts | 75 +++- ...rror-category-aware-classification.test.ts | 363 ++++++++++++++++++ 2 files changed, 435 insertions(+), 3 deletions(-) create mode 100644 tests/unit/proxy/error-category-aware-classification.test.ts diff --git a/src/app/v1/_lib/proxy/errors.ts b/src/app/v1/_lib/proxy/errors.ts index 91e220263..d7925818f 100644 --- a/src/app/v1/_lib/proxy/errors.ts +++ b/src/app/v1/_lib/proxy/errors.ts @@ -570,6 +570,61 @@ function extractErrorContentForDetection(error: Error): string { return error.message; } +/** + * 真正属于 "客户端输入错误" 的规则 category 白名单 + * + * 仅当匹配的规则 category 落入此集合时,对应错误才被分类为 + * NON_RETRYABLE_CLIENT_ERROR(不重试 + 不计入熔断器 + 直接返回)。 + * + * 设计原因: + * - error_rules 表允许用户自定义规则,category 字段可以是任意字符串 + * - 历史实现中,任何匹配规则都会被当作 NON_RETRYABLE_CLIENT_ERROR + * - 但 service_error / network_error / rate_limit 这类 category 实际是 + * 供应商侧或网络侧的可恢复错误,应进入正常 failover/熔断器流程,而不是 + * 被白名单化导致 "stopping immediately" 不再切换供应商 + * + * 通过显式白名单,确保只有真正的客户端输入错误(如 prompt_limit、 + * validation_error、content_filter 等)触发不可重试逻辑。 + * + * 此集合的设计原则: + * - 仅包含官方默认规则(DEFAULT_ERROR_RULES)实际使用的客户端错误 category + * - 用户自定义规则若使用集合外的 category(如 service_error),将不再 + * 被错误地白名单化;这些错误会回退到 ProxyError 的标准分类逻辑 + */ +const NON_RETRYABLE_CLIENT_ERROR_CATEGORIES = new Set([ + "prompt_limit", + "input_limit", + "context_limit", + "token_limit", + "content_filter", + "validation_error", + "thinking_error", + "parameter_error", + "pdf_limit", + "media_limit", + "cache_limit", + "invalid_request", + "model_error", + "store_error", +]); + +/** + * 判断已匹配的检测结果是否应触发 NON_RETRYABLE_CLIENT_ERROR 分类 + * + * @param result - errorRuleDetector 返回的检测结果 + * @returns true = 视为不可重试客户端错误;false = 不触发该分类(让标准分类逻辑处理) + */ +function shouldTreatMatchedRuleAsNonRetryable(result: ErrorDetectionResult): boolean { + if (!result.matched) { + return false; + } + // category 必须显式存在,且落在白名单集合内 + return ( + typeof result.category === "string" && + NON_RETRYABLE_CLIENT_ERROR_CATEGORIES.has(result.category) + ); +} + /** * 错误规则检测结果缓存 * @@ -623,7 +678,7 @@ export async function getErrorDetectionResultAsync(error: Error): Promise undefined); } - return result.matched; + return shouldTreatMatchedRuleAsNonRetryable(result); } /** @@ -645,11 +700,25 @@ export function isNonRetryableClientError(error: Error): boolean { * * 此函数会确保错误规则已加载后再进行检测 * 生产代码路径(forwarder、error-handler)应优先使用此版本 + * + * 仅当匹配规则的 category 落入 NON_RETRYABLE_CLIENT_ERROR_CATEGORIES 白名单时 + * 才返回 true。其他 category(如 service_error、network_error 等)即使匹配也 + * 会回退到 ProxyError 的标准分类(PROVIDER_ERROR/SYSTEM_ERROR),从而保留 + * 故障转移和熔断器逻辑。 */ export async function isNonRetryableClientErrorAsync(error: Error): Promise { // 使用缓存的检测结果,避免重复执行规则匹配 const result = await detectErrorRuleOnceAsync(error); - return result.matched; + return shouldTreatMatchedRuleAsNonRetryable(result); +} + +/** + * 暴露内部白名单常量供测试使用 + * + * 注意:这是只读副本,外部不应修改运行时白名单。 + */ +export function getNonRetryableClientErrorCategoriesForTesting(): ReadonlySet { + return NON_RETRYABLE_CLIENT_ERROR_CATEGORIES; } /** diff --git a/tests/unit/proxy/error-category-aware-classification.test.ts b/tests/unit/proxy/error-category-aware-classification.test.ts new file mode 100644 index 000000000..5c0299954 --- /dev/null +++ b/tests/unit/proxy/error-category-aware-classification.test.ts @@ -0,0 +1,363 @@ +/** + * Error rule category-aware classification tests + * + * Validates that isNonRetryableClientErrorAsync (and the sync companion) + * only classify a matched error as NON_RETRYABLE_CLIENT_ERROR when the + * matched rule's category is one of the recognised client-input categories. + * + * Without this guard, user-added rules with category like "service_error" + * or "network_error" would be incorrectly white-listed, causing + * categorizeErrorAsync to short-circuit failover/circuit-breaker logic + * for upstream/transport problems that should be retried or routed to a + * different provider. + * + * Repro context (CCH 0.7.2 production): + * - User added rule pattern "529|service.overloaded" with category=service_error + * - Upstream INSUFFICIENT_BALANCE response body contained substring "529" + * (e.g. inside "0.352942" price amount or request id) + * - Old logic flagged the error as NON_RETRYABLE_CLIENT_ERROR -> stopped + * failover -> client received generic "permission_error" + * - Expected: service_error category should NOT be white-listed; the error + * falls back to PROVIDER_ERROR and standard failover applies. + */ + +import { afterEach, beforeEach, describe, expect, test, vi } from "vitest"; + +const mocks = vi.hoisted(() => { + const listeners = new Map void>>(); + + return { + getActiveErrorRules: vi.fn(), + subscribeCacheInvalidation: vi.fn(async () => undefined), + eventEmitter: { + on(event: string, handler: (...args: unknown[]) => void) { + const current = listeners.get(event) ?? new Set<(...args: unknown[]) => void>(); + current.add(handler); + listeners.set(event, current); + }, + emit(event: string, ...args: unknown[]) { + for (const handler of listeners.get(event) ?? []) { + handler(...args); + } + }, + removeAllListeners() { + listeners.clear(); + }, + }, + logger: { + debug: vi.fn(), + info: vi.fn(), + warn: vi.fn(), + trace: vi.fn(), + error: vi.fn(), + fatal: vi.fn(), + }, + }; +}); + +vi.mock("@/repository/error-rules", () => ({ + getActiveErrorRules: mocks.getActiveErrorRules, +})); + +vi.mock("@/lib/event-emitter", () => ({ + eventEmitter: mocks.eventEmitter, +})); + +vi.mock("@/lib/redis/pubsub", () => ({ + CHANNEL_ERROR_RULES_UPDATED: "errorRulesUpdated", + subscribeCacheInvalidation: mocks.subscribeCacheInvalidation, +})); + +vi.mock("@/lib/logger", () => ({ + logger: mocks.logger, +})); + +interface BuildRuleOverrides { + id?: number; + pattern?: string; + matchType?: "regex" | "contains" | "exact"; + category?: string; + description?: string; + isEnabled?: boolean; + isDefault?: boolean; + priority?: number; +} + +function buildRule(overrides: BuildRuleOverrides = {}) { + return { + id: 100, + pattern: "default-pattern", + matchType: "contains" as const, + category: "validation_error", + description: "test rule", + overrideResponse: undefined, + overrideStatusCode: undefined, + isEnabled: true, + isDefault: false, + priority: 10, + createdAt: new Date("2026-04-25T00:00:00.000Z"), + updatedAt: new Date("2026-04-25T00:00:00.000Z"), + ...overrides, + }; +} + +async function loadDetectorWithRules(rules: ReturnType[]) { + mocks.getActiveErrorRules.mockResolvedValue(rules); + + const { errorRuleDetector } = await import("@/lib/error-rule-detector"); + await new Promise((resolve) => setTimeout(resolve, 0)); + await errorRuleDetector.reload(); + + return errorRuleDetector; +} + +describe("isNonRetryableClientErrorAsync - category-aware white-list", () => { + beforeEach(() => { + vi.resetModules(); + vi.clearAllMocks(); + mocks.eventEmitter.removeAllListeners(); + }); + + afterEach(() => { + mocks.eventEmitter.removeAllListeners(); + }); + + test("matches rule with white-listed category (validation_error) -> NON_RETRYABLE", async () => { + await loadDetectorWithRules([ + buildRule({ pattern: "ValidationException", category: "validation_error" }), + ]); + + const { isNonRetryableClientErrorAsync, ProxyError } = await import( + "@/app/v1/_lib/proxy/errors" + ); + const error = new ProxyError("ValidationException: bad input", 400, { + body: '{"error":{"type":"validation","message":"ValidationException"}}', + parsed: null, + providerId: 1, + providerName: "test", + }); + + expect(await isNonRetryableClientErrorAsync(error)).toBe(true); + }); + + test("matches rule with non-white-listed category (service_error) -> NOT NON_RETRYABLE", async () => { + await loadDetectorWithRules([ + buildRule({ + id: 40, + pattern: "529|service.overloaded", + matchType: "regex", + category: "service_error", + }), + ]); + + const { isNonRetryableClientErrorAsync, ProxyError } = await import( + "@/app/v1/_lib/proxy/errors" + ); + // Reproduces real-world false positive: substring "529" appears inside + // the price amount "0.352942" of an INSUFFICIENT_BALANCE response body. + const error = new ProxyError("Provider returned 403: insufficient balance", 403, { + body: '{"error":{"type":"new_api_error","message":"insufficient balance, remaining: 0.214220, required: 0.352942"}}', + parsed: null, + providerId: 92, + providerName: "test-upstream", + }); + + expect(await isNonRetryableClientErrorAsync(error)).toBe(false); + }); + + test("matches rule with non-white-listed category (network_error) -> NOT NON_RETRYABLE", async () => { + await loadDetectorWithRules([ + buildRule({ + id: 38, + pattern: "ECONNREFUSED|ECONNRESET", + matchType: "regex", + category: "network_error", + }), + ]); + + const { isNonRetryableClientErrorAsync } = await import("@/app/v1/_lib/proxy/errors"); + const error = new Error("connect ECONNREFUSED 127.0.0.1:8080"); + + expect(await isNonRetryableClientErrorAsync(error)).toBe(false); + }); + + test("matches rule with non-white-listed category (rate_limit) -> NOT NON_RETRYABLE", async () => { + await loadDetectorWithRules([ + buildRule({ + id: 36, + pattern: "rate_limit_error|rate.limit", + matchType: "regex", + category: "rate_limit", + }), + ]); + + const { isNonRetryableClientErrorAsync, ProxyError } = await import( + "@/app/v1/_lib/proxy/errors" + ); + const error = new ProxyError("Provider returned 429: rate_limit_error", 429, { + body: '{"error":{"type":"rate_limit_error"}}', + parsed: null, + providerId: 1, + providerName: "test", + }); + + expect(await isNonRetryableClientErrorAsync(error)).toBe(false); + }); + + test("no rule match -> NOT NON_RETRYABLE", async () => { + await loadDetectorWithRules([ + buildRule({ pattern: "completely-unrelated-pattern", category: "validation_error" }), + ]); + + const { isNonRetryableClientErrorAsync } = await import("@/app/v1/_lib/proxy/errors"); + const error = new Error("a totally different error message"); + + expect(await isNonRetryableClientErrorAsync(error)).toBe(false); + }); + + test("each white-listed category triggers NON_RETRYABLE classification", async () => { + const { getNonRetryableClientErrorCategoriesForTesting } = await import( + "@/app/v1/_lib/proxy/errors" + ); + const categories = Array.from(getNonRetryableClientErrorCategoriesForTesting()); + + expect(categories.length).toBeGreaterThan(0); + + for (let i = 0; i < categories.length; i++) { + const category = categories[i]; + vi.resetModules(); + vi.clearAllMocks(); + mocks.eventEmitter.removeAllListeners(); + + const sentinel = `__sentinel_${i}_${category}__`; + await loadDetectorWithRules([ + buildRule({ + id: 200 + i, + pattern: sentinel, + matchType: "contains", + category, + }), + ]); + + const { isNonRetryableClientErrorAsync: detect } = await import("@/app/v1/_lib/proxy/errors"); + const error = new Error(`upstream said: ${sentinel}`); + + expect( + await detect(error), + `category="${category}" should be classified as NON_RETRYABLE` + ).toBe(true); + } + }); +}); + +describe("categorizeErrorAsync - regression: service_error rule must not stop failover", () => { + beforeEach(() => { + vi.resetModules(); + vi.clearAllMocks(); + mocks.eventEmitter.removeAllListeners(); + }); + + afterEach(() => { + mocks.eventEmitter.removeAllListeners(); + }); + + test("substring match on user-added service_error rule still produces PROVIDER_ERROR", async () => { + // User-added rule with substring-prone pattern (no \b boundary). + await loadDetectorWithRules([ + buildRule({ + id: 41, + pattern: "503|service.unavailable", + matchType: "regex", + category: "service_error", + }), + ]); + + const { ErrorCategory, categorizeErrorAsync, ProxyError } = await import( + "@/app/v1/_lib/proxy/errors" + ); + + // INSUFFICIENT_BALANCE 403 response body where "503" appears as a + // substring inside the request-id token. + const error = new ProxyError("Provider returned 403: insufficient balance", 403, { + body: '{"error":{"message":"insufficient balance (request id: 202604250550399959166838268d9d6K9sUhgdW)"}}', + parsed: null, + providerId: 85, + providerName: "synai996_Aws", + }); + + // Without the fix, this would return NON_RETRYABLE_CLIENT_ERROR + // because the rule's `matched=true` short-circuits classification. + expect(await categorizeErrorAsync(error)).toBe(ErrorCategory.PROVIDER_ERROR); + }); + + test("network_error rule on Error instance still produces SYSTEM_ERROR", async () => { + await loadDetectorWithRules([ + buildRule({ + id: 37, + pattern: "timeout|timed.out|ETIMEDOUT", + matchType: "regex", + category: "network_error", + }), + ]); + + const { ErrorCategory, categorizeErrorAsync } = await import("@/app/v1/_lib/proxy/errors"); + + const error = new Error("ETIMEDOUT: connection timed out after 30000ms"); + expect(await categorizeErrorAsync(error)).toBe(ErrorCategory.SYSTEM_ERROR); + }); + + test("validation_error rule (white-listed) still produces NON_RETRYABLE_CLIENT_ERROR", async () => { + await loadDetectorWithRules([ + buildRule({ + id: 5, + pattern: "ValidationException", + matchType: "contains", + category: "validation_error", + }), + ]); + + const { ErrorCategory, categorizeErrorAsync, ProxyError } = await import( + "@/app/v1/_lib/proxy/errors" + ); + const error = new ProxyError("Provider returned 400: ValidationException", 400, { + body: '{"error":"ValidationException: invalid model"}', + parsed: null, + providerId: 1, + providerName: "test", + }); + + expect(await categorizeErrorAsync(error)).toBe(ErrorCategory.NON_RETRYABLE_CLIENT_ERROR); + }); +}); + +describe("isNonRetryableClientError (sync) - category-aware white-list", () => { + beforeEach(() => { + vi.resetModules(); + vi.clearAllMocks(); + mocks.eventEmitter.removeAllListeners(); + }); + + afterEach(() => { + mocks.eventEmitter.removeAllListeners(); + }); + + test("sync version respects white-list when cache hit", async () => { + await loadDetectorWithRules([ + buildRule({ + id: 40, + pattern: "service-overloaded-token", + matchType: "contains", + category: "service_error", + }), + ]); + + const { isNonRetryableClientError, isNonRetryableClientErrorAsync } = await import( + "@/app/v1/_lib/proxy/errors" + ); + + const error = new Error("upstream said: service-overloaded-token"); + // Prime the cache via async path first so sync path hits the cached entry. + await isNonRetryableClientErrorAsync(error); + expect(isNonRetryableClientError(error)).toBe(false); + }); +});