From 9dc89243439bf4a423870729db40fe4eb98ca43a Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Wed, 11 Feb 2026 15:39:05 +0530 Subject: [PATCH 01/16] chore: move dockerode to optionalDependencies, wrap require Co-Authored-By: Claude Opus 4.6 --- package.json | 4 ++-- src/headroom/launcher.js | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index 982dc1a..73e6797 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "lynkr", - "version": "7.2.3", + "version": "7.2.5", "description": "Self-hosted Claude Code & Cursor proxy with Databricks,AWS BedRock,Azure adapters, openrouter, Ollama,llamacpp,LM Studio, workspace tooling, and MCP integration.", "main": "index.js", "bin": { @@ -49,7 +49,6 @@ "@babel/traverse": "^7.29.0", "compression": "^1.7.4", "diff": "^5.2.0", - "dockerode": "^4.0.2", "dotenv": "^16.4.5", "express": "^5.1.0", "express-rate-limit": "^8.2.1", @@ -62,6 +61,7 @@ }, "optionalDependencies": { "better-sqlite3": "^12.6.2", + "dockerode": "^4.0.2", "tree-sitter": "^0.21.1", "tree-sitter-javascript": "^0.21.0", "tree-sitter-python": "^0.21.0", diff --git a/src/headroom/launcher.js b/src/headroom/launcher.js index f35231c..cc40c6d 100644 --- a/src/headroom/launcher.js +++ b/src/headroom/launcher.js @@ -5,12 +5,17 @@ * Provides automatic container creation, health checking, and graceful shutdown. */ -const Docker = require("dockerode"); +let Docker; +try { + Docker = require("dockerode"); +} catch { + Docker = null; +} const logger = require("../logger"); const config = require("../config"); -// Initialize Docker client -const docker = new Docker(); +// Initialize Docker client (only if dockerode is available) +const docker = Docker ? new Docker() : null; // Launcher state let containerInstance = null; From 526fa3072e7b35d016c7d10df146046abb9e098e Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Thu, 12 Feb 2026 12:01:17 +0530 Subject: [PATCH 02/16] Added tiered System --- .claude/settings.local.json | 6 +- config/model-tiers.json | 84 ++++++ src/api/middleware/session.js | 17 +- src/api/router.js | 93 ++++++ src/budget/index.js | 4 +- src/clients/databricks.js | 169 +++++++++-- src/clients/gpt-utils.js | 181 ++++++++++++ src/clients/standard-tools.js | 9 +- src/config/index.js | 40 ++- src/orchestrator/index.js | 121 +++++++- src/routing/agentic-detector.js | 320 +++++++++++++++++++++ src/routing/complexity-analyzer.js | 204 +++++++++++++- src/routing/cost-optimizer.js | 305 ++++++++++++++++++++ src/routing/index.js | 149 +++++++++- src/routing/model-registry.js | 437 +++++++++++++++++++++++++++++ src/routing/model-tiers.js | 363 ++++++++++++++++++++++++ src/sessions/cleanup.js | 6 +- src/sessions/record.js | 11 +- src/sessions/store.js | 9 +- src/tools/agent-task.js | 49 +++- src/tools/index.js | 17 +- 21 files changed, 2517 insertions(+), 77 deletions(-) create mode 100644 config/model-tiers.json create mode 100644 src/clients/gpt-utils.js create mode 100644 src/routing/agentic-detector.js create mode 100644 src/routing/cost-optimizer.js create mode 100644 src/routing/model-registry.js create mode 100644 src/routing/model-tiers.js diff --git a/.claude/settings.local.json b/.claude/settings.local.json index e518c4f..a1dd8cd 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -77,7 +77,11 @@ "Bash(kill:*)", "Bash(grep:*)", "WebFetch(domain:opencode.ai)", - "Bash(find:*)" + "Bash(find:*)", + "WebFetch(domain:www.databricks.com)", + "WebFetch(domain:docs.databricks.com)", + "Bash(env:*)", + "Bash(DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test:*)" ], "deny": [], "ask": [] diff --git a/config/model-tiers.json b/config/model-tiers.json new file mode 100644 index 0000000..6358395 --- /dev/null +++ b/config/model-tiers.json @@ -0,0 +1,84 @@ +{ + "tiers": { + "SIMPLE": { + "description": "Greetings, simple Q&A, confirmations, basic lookups", + "range": [0, 25], + "priority": 1, + "preferred": { + "ollama": ["llama3.2", "gemma2", "phi3", "qwen2.5:7b", "mistral"], + "llamacpp": ["default"], + "lmstudio": ["default"], + "openai": ["gpt-4o-mini", "gpt-3.5-turbo"], + "azure-openai": ["gpt-4o-mini", "gpt-35-turbo"], + "anthropic": ["claude-3-haiku-20240307", "claude-3-5-haiku-20241022"], + "bedrock": ["anthropic.claude-3-haiku-20240307-v1:0", "amazon.nova-lite-v1:0"], + "databricks": ["databricks-claude-haiku-4-5", "databricks-gpt-5-nano"], + "google": ["gemini-2.0-flash", "gemini-1.5-flash"], + "openrouter": ["google/gemini-flash-1.5", "deepseek/deepseek-chat"], + "zai": ["GLM-4-Flash"] + } + }, + "MEDIUM": { + "description": "Code reading, simple edits, research, documentation", + "range": [26, 50], + "priority": 2, + "preferred": { + "ollama": ["qwen2.5:32b", "deepseek-coder:33b", "codellama:34b"], + "llamacpp": ["default"], + "lmstudio": ["default"], + "openai": ["gpt-4o", "gpt-4-turbo"], + "azure-openai": ["gpt-4o", "gpt-4"], + "anthropic": ["claude-sonnet-4-20250514", "claude-3-5-sonnet-20241022"], + "bedrock": ["anthropic.claude-3-5-sonnet-20241022-v2:0", "amazon.nova-pro-v1:0"], + "databricks": ["databricks-claude-sonnet-4-5", "databricks-gpt-5-1"], + "google": ["gemini-1.5-pro", "gemini-2.0-pro"], + "openrouter": ["anthropic/claude-3.5-sonnet", "openai/gpt-4o"], + "zai": ["GLM-4.7"] + } + }, + "COMPLEX": { + "description": "Multi-file changes, debugging, architecture, refactoring", + "range": [51, 75], + "priority": 3, + "preferred": { + "ollama": ["qwen2.5:72b", "llama3.1:70b", "deepseek-coder-v2:236b"], + "openai": ["o1-mini", "o3-mini", "gpt-4o"], + "azure-openai": ["o1-mini", "gpt-4o"], + "anthropic": ["claude-sonnet-4-20250514", "claude-3-5-sonnet-20241022"], + "bedrock": ["anthropic.claude-3-5-sonnet-20241022-v2:0"], + "databricks": ["databricks-claude-sonnet-4-5", "databricks-gpt-5-1-codex-max"], + "google": ["gemini-2.5-pro", "gemini-1.5-pro"], + "openrouter": ["anthropic/claude-3.5-sonnet", "meta-llama/llama-3.1-405b"], + "zai": ["GLM-4.7"] + } + }, + "REASONING": { + "description": "Complex analysis, security audits, novel problems, deep thinking", + "range": [76, 100], + "priority": 4, + "preferred": { + "openai": ["o1", "o1-pro", "o3"], + "azure-openai": ["o1", "o1-pro"], + "anthropic": ["claude-opus-4-20250514", "claude-3-opus-20240229"], + "bedrock": ["anthropic.claude-3-opus-20240229-v1:0"], + "databricks": ["databricks-claude-opus-4-6", "databricks-claude-opus-4-5", "databricks-gpt-5-2"], + "google": ["gemini-2.5-pro"], + "openrouter": ["anthropic/claude-3-opus", "deepseek/deepseek-reasoner", "openai/o1"], + "deepseek": ["deepseek-reasoner", "deepseek-r1"] + } + } + }, + "localProviders": { + "ollama": { "free": true, "defaultTier": "SIMPLE" }, + "llamacpp": { "free": true, "defaultTier": "SIMPLE" }, + "lmstudio": { "free": true, "defaultTier": "SIMPLE" } + }, + "providerAliases": { + "azure": "azure-openai", + "aws": "bedrock", + "amazon": "bedrock", + "claude": "anthropic", + "gemini": "google", + "vertex": "google" + } +} diff --git a/src/api/middleware/session.js b/src/api/middleware/session.js index f19c58a..15381da 100644 --- a/src/api/middleware/session.js +++ b/src/api/middleware/session.js @@ -45,8 +45,21 @@ function sessionMiddleware(req, res, next) { // Add sessionId to logger context for this request req.log = logger.child({ sessionId }); - const session = getOrCreateSession(sessionId); - req.session = session; + // Skip DB persistence for auto-generated (ephemeral) session IDs. + // These are created when the client doesn't send a session header, + // so storing them just bloats the DB with throwaway records. + if (req.generatedSessionId) { + req.session = { + id: sessionId, + createdAt: Date.now(), + updatedAt: Date.now(), + metadata: {}, + history: [], + _ephemeral: true, + }; + } else { + req.session = getOrCreateSession(sessionId); + } return next(); } catch (err) { return next(err); diff --git a/src/api/router.js b/src/api/router.js index b3ed198..f6e469a 100644 --- a/src/api/router.js +++ b/src/api/router.js @@ -71,6 +71,99 @@ router.get("/routing/stats", (req, res) => { }); }); +// Model registry info (from LiteLLM + models.dev APIs) +router.get("/routing/models", async (req, res) => { + try { + const { getModelRegistry } = require("../routing/model-registry"); + const registry = await getModelRegistry(); + res.json({ + status: "ok", + ...registry.getStats(), + }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// Get specific model info +router.get("/routing/models/:model", async (req, res) => { + try { + const { getModelRegistry } = require("../routing/model-registry"); + const registry = await getModelRegistry(); + const model = registry.getModel(req.params.model); + if (!model || model.source === "default") { + return res.status(404).json({ error: "Model not found", model: req.params.model }); + } + res.json({ status: "ok", model: req.params.model, ...model }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// Routing tier information +router.get("/routing/tiers", (req, res) => { + try { + const { getModelTierSelector } = require("../routing/model-tiers"); + const selector = getModelTierSelector(); + res.json({ + status: "ok", + ...selector.getTierStats(), + }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// Cost optimization stats +router.get("/metrics/cost-optimization", (req, res) => { + try { + const { getCostOptimizer } = require("../routing/cost-optimizer"); + const optimizer = getCostOptimizer(); + res.json({ + status: "ok", + ...optimizer.getStats(), + }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + +// Request analysis test endpoint +router.post("/routing/analyze", async (req, res) => { + try { + const { getAgenticDetector } = require("../routing/agentic-detector"); + const { getModelTierSelector } = require("../routing/model-tiers"); + const { getModelRegistry } = require("../routing/model-registry"); + + const analysis = analyzeComplexity(req.body, { weighted: req.query.weighted === "true" }); + const agentic = getAgenticDetector().detect(req.body); + const selector = getModelTierSelector(); + const tier = selector.getTier(analysis.score); + + // Get recommended model for tier + const provider = req.query.provider || "openai"; + const modelSelection = selector.selectModel(tier, provider); + + // Get model cost info + let modelInfo = null; + if (modelSelection.model) { + const registry = await getModelRegistry(); + modelInfo = registry.getCost(modelSelection.model); + } + + res.json({ + status: "ok", + analysis, + agentic, + tier, + modelSelection, + modelInfo, + }); + } catch (err) { + res.status(500).json({ error: err.message }); + } +}); + router.get("/debug/session", (req, res) => { if (!req.sessionId) { return res.status(400).json({ error: "missing_session_id", message: "Provide x-session-id header" }); diff --git a/src/budget/index.js b/src/budget/index.js index 2d05b79..3961c8d 100644 --- a/src/budget/index.js +++ b/src/budget/index.js @@ -25,14 +25,14 @@ class BudgetManager { } this.db = new Database(dbPath); + this.dbPath = dbPath; this.initDatabase(); + logger.info({ dbPath }, 'Budget manager initialized'); } catch (err) { logger.warn({ err: err.message }, "BudgetManager: better-sqlite3 not available"); this.enabled = false; return; } - - logger.info({ dbPath }, 'Budget manager initialized'); } initDatabase() { diff --git a/src/clients/databricks.js b/src/clients/databricks.js index 9b536cd..4eb24ed 100644 --- a/src/clients/databricks.js +++ b/src/clients/databricks.js @@ -6,11 +6,12 @@ const { getCircuitBreakerRegistry } = require("./circuit-breaker"); const { getMetricsCollector } = require("../observability/metrics"); const { getHealthTracker } = require("../observability/health-tracker"); const logger = require("../logger"); -const { STANDARD_TOOLS } = require("./standard-tools"); +const { STANDARD_TOOLS, STANDARD_TOOL_NAMES } = require("./standard-tools"); const { convertAnthropicToolsToOpenRouter } = require("./openrouter-utils"); const { detectModelFamily } = require("./bedrock-utils"); +const { getGPTSystemPromptAddendum } = require("./gpt-utils"); @@ -185,7 +186,7 @@ async function invokeDatabricks(body) { databricksBody.tools = STANDARD_TOOLS; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (Databricks) ==="); } @@ -226,7 +227,7 @@ async function invokeAzureAnthropic(body) { body.tools = STANDARD_TOOLS; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (Azure Anthropic) ==="); } @@ -428,7 +429,7 @@ async function invokeOpenRouter(body) { toolsInjected = true; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (OpenRouter) ==="); } @@ -490,6 +491,9 @@ async function invokeAzureOpenAI(body) { }); } + // System prompt injection disabled - breaks model response + // Tool guidance now provided via tool descriptions instead + const azureBody = { messages, temperature: body.temperature ?? 0.3, // Lower temperature for more deterministic, action-oriented behavior @@ -509,14 +513,14 @@ async function invokeAzureOpenAI(body) { toolsInjected = true; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS ==="); } if (Array.isArray(toolsToSend) && toolsToSend.length > 0) { azureBody.tools = convertAnthropicToolsToOpenRouter(toolsToSend); - azureBody.parallel_tool_calls = true; // Enable parallel tool calling for better performance + azureBody.parallel_tool_calls = true; // Enable parallel tool calls azureBody.tool_choice = "auto"; // Explicitly enable tool use (helps GPT models understand they should use tools) logger.info({ toolCount: toolsToSend.length, @@ -563,14 +567,83 @@ async function invokeAzureOpenAI(body) { // Track function call IDs for matching with outputs const pendingCallIds = []; + // Detect if this is a continuation request (has tool results) + // Azure content filter triggers on full system prompt in continuations + // Check for: + // 1. tool_result blocks in user messages (Anthropic format) + // 2. tool messages (OpenAI format) + // 3. assistant messages with tool_use or tool_calls (indicates prior tool invocation) + // 4. Flattened continuation pattern from orchestrator (contains "IMPORTANT: Focus on") + const hasToolResults = (body.messages || []).some(msg => { + // Check for Anthropic format tool_result in user messages + if (msg.role === "user" && Array.isArray(msg.content)) { + if (msg.content.some(block => block.type === "tool_result")) return true; + } + // Check for OpenAI format tool messages + if (msg.role === "tool") return true; + // Check for assistant messages with tool_use (Anthropic) or tool_calls (OpenAI) + // If there's a prior tool use, this is a continuation + if (msg.role === "assistant") { + if (Array.isArray(msg.content)) { + if (msg.content.some(block => block.type === "tool_use")) return true; + } + if (msg.tool_calls && msg.tool_calls.length > 0) return true; + } + return false; + }) || azureBody.messages.some(msg => { + // Also check converted messages for flattened continuation pattern + // The orchestrator flattens tool results into user message with this marker + if (msg.role === "user" && typeof msg.content === "string") { + if (msg.content.includes("IMPORTANT: Focus on and respond ONLY to my most recent request")) return true; + } + return false; + }); + + if (hasToolResults) { + logger.info({ + hasToolResults: true, + originalMessageCount: (body.messages || []).length, + convertedMessageCount: azureBody.messages.length, + messageRoles: (body.messages || []).map(m => m.role), + }, "=== CONTINUATION REQUEST DETECTED - using minimal system prompt to avoid Azure content filter ==="); + } else { + logger.debug({ + hasToolResults: false, + originalMessageCount: (body.messages || []).length, + messageRoles: (body.messages || []).map(m => m.role), + }, "Initial request - using full system prompt"); + } + + // Helper function to strip tags and meta-instructions from content + // Azure's jailbreak filter triggers on these instructions in continuation requests + const stripSystemReminders = (content) => { + if (!content || typeof content !== 'string') return content; + // Remove ... blocks + let cleaned = content.replace(/[\s\S]*?<\/system-reminder>/gi, ''); + // Remove the continuation marker that orchestrator adds + cleaned = cleaned.replace(/---\s*IMPORTANT:\s*Focus on and respond ONLY to my most recent request[^\n]*/gi, ''); + // Trim whitespace + return cleaned.trim(); + }; + for (const msg of azureBody.messages) { if (msg.role === "system") { - // System messages become developer messages - responsesInput.push({ - type: "message", - role: "developer", - content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content) - }); + // For continuation requests, use minimal system prompt to avoid content filter + // Azure's jailbreak detection triggers on security-related text in continuations + if (hasToolResults) { + responsesInput.push({ + type: "message", + role: "developer", + content: "You are a helpful coding assistant. Continue helping the user based on the tool results." + }); + } else { + // Initial request - use full system prompt + responsesInput.push({ + type: "message", + role: "developer", + content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content) + }); + } } else if (msg.role === "user") { // Check if content contains tool_result blocks (Anthropic format) if (Array.isArray(msg.content)) { @@ -585,19 +658,30 @@ async function invokeAzureOpenAI(body) { output: typeof block.content === 'string' ? block.content : JSON.stringify(block.content || "") }); } else if (block.type === "text") { - responsesInput.push({ - type: "message", - role: "user", - content: block.text || "" - }); + // For continuation requests, strip system-reminder tags to avoid jailbreak filter + const textContent = hasToolResults ? stripSystemReminders(block.text || "") : (block.text || ""); + if (textContent) { // Only add if there's content after stripping + responsesInput.push({ + type: "message", + role: "user", + content: textContent + }); + } } } } else { - responsesInput.push({ - type: "message", - role: "user", - content: typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content) - }); + // For continuation requests, strip system-reminder tags to avoid jailbreak filter + let userContent = typeof msg.content === 'string' ? msg.content : JSON.stringify(msg.content); + if (hasToolResults) { + userContent = stripSystemReminders(userContent); + } + if (userContent) { // Only add if there's content after stripping + responsesInput.push({ + type: "message", + role: "user", + content: userContent + }); + } } } else if (msg.role === "assistant") { // Assistant messages - handle tool_calls (OpenAI format) and tool_use blocks (Anthropic format) @@ -681,7 +765,7 @@ async function invokeAzureOpenAI(body) { const textContent = messageOutput?.content?.find(c => c.type === "output_text")?.text || ""; // Find function_call outputs (tool calls are separate items in output array) - const toolCalls = outputArray + const rawToolCalls = outputArray .filter(o => o.type === "function_call") .map(tc => ({ id: tc.call_id || tc.id || `call_${Date.now()}`, @@ -692,6 +776,29 @@ async function invokeAzureOpenAI(body) { } })); + // Deduplicate identical tool calls (GPT sometimes returns multiple identical calls) + const seenSignatures = new Set(); + const toolCalls = rawToolCalls.filter(tc => { + const signature = `${tc.function.name}:${tc.function.arguments}`; + if (seenSignatures.has(signature)) { + logger.warn({ + toolName: tc.function.name, + signature: signature.substring(0, 100), + }, "Filtered duplicate tool call from GPT response"); + return false; + } + seenSignatures.add(signature); + return true; + }); + + if (rawToolCalls.length !== toolCalls.length) { + logger.info({ + originalCount: rawToolCalls.length, + dedupedCount: toolCalls.length, + removed: rawToolCalls.length - toolCalls.length, + }, "Deduplicated identical tool calls from single response"); + } + logger.info({ outputTypes: outputArray.map(o => o.type), hasMessage: !!messageOutput, @@ -841,6 +948,8 @@ async function invokeOpenAI(body) { }); } + // System prompt injection disabled - breaks model response + const openAIBody = { model: body._suggestionModeModel || config.openai.model || "gpt-4o", messages, @@ -860,14 +969,14 @@ async function invokeOpenAI(body) { toolsInjected = true; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (OpenAI) ==="); } if (Array.isArray(toolsToSend) && toolsToSend.length > 0) { openAIBody.tools = convertAnthropicToolsToOpenRouter(toolsToSend); - openAIBody.parallel_tool_calls = true; // Enable parallel tool calling + openAIBody.parallel_tool_calls = false; // Disable parallel tool calls - GPT often makes duplicate calls openAIBody.tool_choice = "auto"; // Let the model decide when to use tools logger.info({ toolCount: toolsToSend.length, @@ -961,7 +1070,7 @@ async function invokeLlamaCpp(body) { toolsInjected = true; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (llama.cpp) ==="); } else if (!injectToolsLlamacpp) { @@ -1044,7 +1153,7 @@ async function invokeLMStudio(body) { toolsInjected = true; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (LM Studio) ==="); } @@ -1091,7 +1200,7 @@ async function invokeBedrock(body) { toolsInjected = true; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (Bedrock) ==="); } @@ -1357,7 +1466,7 @@ async function invokeZai(body) { // "required" was forcing tools even for simple greetings zaiBody.tool_choice = "auto"; // Also enable parallel tool calls - zaiBody.parallel_tool_calls = true; + zaiBody.parallel_tool_calls = false; // Disable parallel tool calls - GPT often makes duplicate calls } headers = { @@ -1374,7 +1483,7 @@ async function invokeZai(body) { zaiBody.tools = STANDARD_TOOLS; logger.info({ injectedToolCount: STANDARD_TOOLS.length, - injectedToolNames: STANDARD_TOOLS.map(t => t.name), + injectedToolNames: STANDARD_TOOL_NAMES, reason: "Client did not send tools (passthrough mode)" }, "=== INJECTING STANDARD TOOLS (Z.AI Anthropic) ==="); } diff --git a/src/clients/gpt-utils.js b/src/clients/gpt-utils.js new file mode 100644 index 0000000..945ea43 --- /dev/null +++ b/src/clients/gpt-utils.js @@ -0,0 +1,181 @@ +/** + * GPT-specific utilities for handling tool calls and responses + * All settings are hardcoded - no env vars required + * + * This module addresses GPT model compatibility issues when using Azure OpenAI + * through Lynkr proxy with Claude Code: + * - GPT doesn't interpret "0 files found" as a final answer + * - GPT retries the same tool expecting different results + * - GPT needs explicit guidance on tool result interpretation + */ + +const logger = require("../logger"); + +// Hardcoded GPT settings - optimized for GPT model behavior +const GPT_SETTINGS = { + toolLoopThreshold: 2, // Lower than Claude's 3 to catch loops earlier + enhancedFormatting: true, // Always format results explicitly for GPT + similarityThreshold: 0.8, // For detecting similar (not just identical) tool calls +}; + +// Provider identifiers that use GPT models +const GPT_PROVIDERS = ['azure-openai', 'openai']; + +/** + * Check if a provider uses GPT models + * @param {string} provider - Provider type (e.g., 'azure-openai', 'databricks') + * @returns {boolean} - True if provider uses GPT models + */ +function isGPTProvider(provider) { + if (!provider) return false; + return GPT_PROVIDERS.includes(provider.toLowerCase()); +} + +/** + * Get the tool loop threshold for GPT models + * @returns {number} - Threshold (2 for GPT, lower than Claude's 3) + */ +function getGPTToolLoopThreshold() { + return GPT_SETTINGS.toolLoopThreshold; +} + +/** + * Format tool result with explicit structure for GPT models + * GPT models need clear, unambiguous formatting to understand tool results + * + * @param {string} toolName - Name of the tool that was called + * @param {string} content - The tool result content + * @param {Object} args - The arguments passed to the tool + * @returns {string} - Formatted result with explicit status and instructions + */ +function formatToolResultForGPT(toolName, content, args) { + // Handle empty/no results explicitly - add clear messaging to prevent retries + const isEmpty = !content || + content.trim() === '' || + content.includes('0 files found') || + content.includes('No matches found') || + content.includes('No results') || + content.includes('Found 0') || + /^Found \d+ files?\.$/.test(content.trim()) && content.includes('Found 0'); + + if (isEmpty) { + // Only format empty results - add explicit "don't retry" instruction + return `Tool "${toolName}" completed with no results found. +Query: ${JSON.stringify(args)} + +This is a FINAL result - do not retry this query. Respond to the user based on this outcome.`; + } + + // For successful results, return content as-is (don't add markers that might confuse GPT) + return content; +} + +/** + * Get system prompt addendum for GPT models + * This teaches GPT how to properly interpret and use tools + * + * @returns {string} - System prompt instructions for GPT + */ +function getGPTSystemPromptAddendum() { + return `Use the Bash tool with ls command for listing files. After any tool returns results, respond to the user.`; +} + +/** + * Calculate string similarity using Jaccard index + * Used to detect semantically similar tool calls + * + * @param {string} s1 - First string + * @param {string} s2 - Second string + * @returns {number} - Similarity score between 0 and 1 + */ +function stringSimilarity(s1, s2) { + if (!s1 || !s2) return 0; + if (s1 === s2) return 1; + + // Tokenize by whitespace and common delimiters + const tokenize = (s) => new Set( + s.toLowerCase() + .split(/[\s\-_\/\.\,\:\;]+/) + .filter(t => t.length > 0) + ); + + const set1 = tokenize(s1); + const set2 = tokenize(s2); + + const intersection = new Set([...set1].filter(x => set2.has(x))); + const union = new Set([...set1, ...set2]); + + return union.size > 0 ? intersection.size / union.size : 0; +} + +/** + * Check if two tool calls are semantically similar + * GPT often retries with slightly different parameters that are functionally equivalent + * + * @param {Object} call1 - First tool call {name, arguments} + * @param {Object} call2 - Second tool call {name, arguments} + * @returns {boolean} - True if calls are similar enough to be considered duplicates + */ +function areSimilarToolCalls(call1, call2) { + if (!call1 || !call2) return false; + + // Must be the same tool + const name1 = call1.function?.name ?? call1.name; + const name2 = call2.function?.name ?? call2.name; + if (name1 !== name2) return false; + + // Get arguments + const args1 = call1.function?.arguments ?? call1.arguments ?? call1.input ?? {}; + const args2 = call2.function?.arguments ?? call2.arguments ?? call2.input ?? {}; + + // Stringify for comparison + const argsStr1 = typeof args1 === 'string' ? args1 : JSON.stringify(args1); + const argsStr2 = typeof args2 === 'string' ? args2 : JSON.stringify(args2); + + // Exact match + if (argsStr1 === argsStr2) return true; + + // For search-related tools, check semantic similarity + const searchTools = ['grep', 'glob', 'search', 'find', 'read', 'bash', 'shell']; + const toolName = (name1 || '').toLowerCase(); + const isSearchTool = searchTools.some(t => toolName.includes(t)); + + if (isSearchTool) { + const similarity = stringSimilarity(argsStr1, argsStr2); + if (similarity >= GPT_SETTINGS.similarityThreshold) { + logger.debug({ + tool: name1, + similarity, + threshold: GPT_SETTINGS.similarityThreshold, + args1: argsStr1.substring(0, 100), + args2: argsStr2.substring(0, 100), + }, "GPT similar tool call detected"); + return true; + } + } + + return false; +} + +/** + * Get a signature for a tool call (for tracking in history) + * @param {Object} call - Tool call object + * @returns {string} - Unique signature for the call + */ +function getToolCallSignature(call) { + const name = call.function?.name ?? call.name ?? 'unknown'; + const args = call.function?.arguments ?? call.arguments ?? call.input ?? {}; + const argsStr = typeof args === 'string' ? args : JSON.stringify(args); + return `${name}:${argsStr}`; +} + +module.exports = { + GPT_SETTINGS, + isGPTProvider, + getGPTToolLoopThreshold, + formatToolResultForGPT, + getGPTSystemPromptAddendum, + stringSimilarity, + areSimilarToolCalls, + getToolCallSignature, +}; diff --git a/src/clients/standard-tools.js b/src/clients/standard-tools.js index 61ac791..6de81a9 100644 --- a/src/clients/standard-tools.js +++ b/src/clients/standard-tools.js @@ -76,7 +76,7 @@ const STANDARD_TOOLS = [ }, { name: "Bash", - description: "Executes a bash command in a persistent shell session. Use for terminal operations like git, npm, docker, etc. DO NOT use for file operations - use specialized tools instead.", + description: "Executes a bash command in a persistent shell session. Use for terminal operations like git, npm, docker, listing files (ls), etc. PREFERRED for listing directory contents - use 'ls' command. DO NOT use for reading file contents - use Read tool instead.", input_schema: { type: "object", properties: { @@ -98,7 +98,7 @@ const STANDARD_TOOLS = [ }, { name: "Glob", - description: "Fast file pattern matching tool. Supports glob patterns like '**/*.js' or 'src/**/*.ts'. Returns matching file paths sorted by modification time.", + description: "File pattern matching for finding files by name pattern. Use ONLY when you need to find files matching a specific pattern like '**/*.js'. For simple directory listing, use Bash with 'ls' instead.", input_schema: { type: "object", properties: { @@ -354,4 +354,7 @@ EXAMPLE: User says "explore this project" → Call Task with subagent_type="Expl } ]; -module.exports = { STANDARD_TOOLS }; +// Pre-computed name list to avoid re-mapping on every log call +const STANDARD_TOOL_NAMES = STANDARD_TOOLS.map(t => t.name); + +module.exports = { STANDARD_TOOLS, STANDARD_TOOL_NAMES }; diff --git a/src/config/index.js b/src/config/index.js index 466585d..fd2e34c 100644 --- a/src/config/index.js +++ b/src/config/index.js @@ -704,8 +704,8 @@ var config = { semanticCache: { enabled: process.env.SEMANTIC_CACHE_ENABLED !== 'false', // Disable via env if needed similarityThreshold: parseFloat(process.env.SEMANTIC_CACHE_THRESHOLD || '0.95'), // Higher threshold - maxEntries: 500, - ttlMs: 3600000, // 1 hour + maxEntries: Number.parseInt(process.env.SEMANTIC_CACHE_MAX_ENTRIES ?? "50", 10), // Reduced from 500 to prevent memory bloat + ttlMs: Number.parseInt(process.env.SEMANTIC_CACHE_TTL_MS ?? "300000", 10), // 5 minutes (was 1 hour) }, agents: { enabled: agentsEnabled, @@ -857,6 +857,23 @@ var config = { taskTimeoutMs: Number.isNaN(workerTaskTimeoutMs) ? 5000 : workerTaskTimeoutMs, offloadThresholdBytes: Number.isNaN(workerOffloadThresholdBytes) ? 10000 : workerOffloadThresholdBytes, }, + + // Intelligent Routing + routing: { + weightedScoring: true, + costOptimization: true, + agenticDetection: true, + }, + + // Model Tier Configuration (REQUIRED) + // Format: TIER_=provider:model (e.g., TIER_SIMPLE=ollama:llama3.2) + modelTiers: { + enabled: true, + SIMPLE: process.env.TIER_SIMPLE?.trim() || null, + MEDIUM: process.env.TIER_MEDIUM?.trim() || null, + COMPLEX: process.env.TIER_COMPLEX?.trim() || null, + REASONING: process.env.TIER_REASONING?.trim() || null, + }, }; /** @@ -902,4 +919,23 @@ function reloadConfig() { // Make config mutable for hot reload config.reloadConfig = reloadConfig; +// Validate mandatory TIER_* configuration +const missingTiers = []; +if (!config.modelTiers.SIMPLE) missingTiers.push('TIER_SIMPLE'); +if (!config.modelTiers.MEDIUM) missingTiers.push('TIER_MEDIUM'); +if (!config.modelTiers.COMPLEX) missingTiers.push('TIER_COMPLEX'); +if (!config.modelTiers.REASONING) missingTiers.push('TIER_REASONING'); + +if (missingTiers.length > 0) { + throw new Error( + `Missing required tier configuration: ${missingTiers.join(', ')}\n` + + `Format: TIER_=provider:model\n` + + `Example:\n` + + ` TIER_SIMPLE=ollama:llama3.2\n` + + ` TIER_MEDIUM=azure-openai:gpt-4o\n` + + ` TIER_COMPLEX=azure-openai:gpt-4o\n` + + ` TIER_REASONING=openai:o1` + ); +} + module.exports = config; diff --git a/src/orchestrator/index.js b/src/orchestrator/index.js index 55a47a5..260ce23 100644 --- a/src/orchestrator/index.js +++ b/src/orchestrator/index.js @@ -19,6 +19,7 @@ const crypto = require("crypto"); const { asyncClone, asyncTransform, getPoolStats } = require("../workers/helpers"); const { getSemanticCache, isSemanticCacheEnabled } = require("../cache/semantic"); const lazyLoader = require("../tools/lazy-loader"); +const { isGPTProvider, getGPTToolLoopThreshold, areSimilarToolCalls } = require("../clients/gpt-utils"); /** * Get destination URL for audit logging based on provider type @@ -516,13 +517,51 @@ function parseExecutionContent(content) { const trimmed = content.trim(); if (trimmed.startsWith("{") || trimmed.startsWith("[")) { try { - return JSON.parse(trimmed); + const parsed = JSON.parse(trimmed); + // Handle Anthropic content blocks array - extract text + if (Array.isArray(parsed)) { + const textParts = parsed + .filter(block => block && typeof block === 'object') + .map(block => { + if (block.type === 'text' && typeof block.text === 'string') { + return block.text; + } + // Handle other block types gracefully + if (block.text) return block.text; + if (block.content) return typeof block.content === 'string' ? block.content : JSON.stringify(block.content); + return null; + }) + .filter(text => text !== null); + + if (textParts.length > 0) { + return textParts.join('\n'); + } + } + return parsed; } catch { return content; } } return content; } + // Handle content that's already an array (content blocks) + if (Array.isArray(content)) { + const textParts = content + .filter(block => block && typeof block === 'object') + .map(block => { + if (block.type === 'text' && typeof block.text === 'string') { + return block.text; + } + if (block.text) return block.text; + if (block.content) return typeof block.content === 'string' ? block.content : JSON.stringify(block.content); + return null; + }) + .filter(text => text !== null); + + if (textParts.length > 0) { + return textParts.join('\n'); + } + } return content; } @@ -1397,6 +1436,19 @@ async function runAgentLoop({ "Agent loop step", ); + // Trim messages when they grow too large to prevent OOM. + // Keep the first message (system/user) and the last MAX_LOOP_MESSAGES. + const MAX_LOOP_MESSAGES = 40; + if (cleanPayload.messages && cleanPayload.messages.length > MAX_LOOP_MESSAGES) { + const excess = cleanPayload.messages.length - MAX_LOOP_MESSAGES; + // Keep first 2 messages (system context + initial user) and trim from the middle + cleanPayload.messages.splice(2, excess); + logger.info( + { trimmed: excess, remaining: cleanPayload.messages.length }, + "Trimmed intermediate messages to prevent memory growth", + ); + } + // Debug: Log payload before sending to Azure if (providerType === "azure-anthropic") { logger.debug( @@ -2179,6 +2231,7 @@ IMPORTANT TOOL USAGE RULES: session, cwd, requestMessages: cleanPayload.messages, + provider: providerType, // Pass provider for GPT-specific formatting })) ); @@ -2412,6 +2465,7 @@ IMPORTANT TOOL USAGE RULES: session, cwd, requestMessages: cleanPayload.messages, + provider: providerType, // Pass provider for GPT-specific formatting }); let toolMessage; @@ -2527,33 +2581,59 @@ IMPORTANT TOOL USAGE RULES: // === TOOL CALL LOOP DETECTION === // Track tool calls to detect infinite loops where the model calls the same tool // repeatedly with identical parameters + // GPT models get stricter thresholds (2 vs 3) and similarity-based detection + const isGPT = isGPTProvider(providerType); + const loopThreshold = isGPT ? getGPTToolLoopThreshold() : 3; + for (const call of toolCalls) { const signature = getToolCallSignature(call); - const count = (toolCallHistory.get(signature) || 0) + 1; - toolCallHistory.set(signature, count); + const existingEntry = toolCallHistory.get(signature); + let count = (existingEntry?.count || 0) + 1; + toolCallHistory.set(signature, { count, call }); const toolName = call.function?.name ?? call.name ?? 'unknown'; - if (count === 3 && !loopWarningInjected) { + // For GPT models, also check for similar (not just identical) tool calls + // This catches cases where GPT slightly varies parameters but is essentially looping + if (isGPT) { + for (const [existingSig, existingData] of toolCallHistory.entries()) { + if (existingSig !== signature && areSimilarToolCalls(call, existingData.call)) { + // Found a similar call - increase count to trigger loop detection earlier + count = Math.max(count, existingData.count + 1); + logger.debug({ + tool: toolName, + currentSignature: signature, + similarSignature: existingSig, + combinedCount: count, + }, "GPT similar tool call detected - combining counts"); + } + } + } + + if (count === loopThreshold && !loopWarningInjected) { logger.warn( { sessionId: session?.id ?? null, correlationId: options?.correlationId, tool: toolName, loopCount: count, + loopThreshold, + isGPT, signature: signature, action: 'warning_injected', totalSteps: steps, remainingSteps: settings.maxSteps - steps, }, - "Tool call loop detected - same tool called 3 times with identical parameters", + `Tool call loop detected - same tool called ${loopThreshold} times with identical/similar parameters`, ); - // Inject warning message to model + // Inject warning message to model - GPT gets a more explicit message loopWarningInjected = true; const warningMessage = { role: "user", - content: "⚠️ System Warning: You have called the same tool with identical parameters 3 times in this request. This may indicate an infinite loop. Please provide a final answer to the user instead of calling the same tool again, or explain why you need to continue retrying with the same parameters.", + content: isGPT + ? `⚠️ CRITICAL SYSTEM WARNING: You have called the "${toolName}" tool ${count} times with identical or similar parameters. This IS an infinite loop. STOP calling this tool immediately. You MUST now provide a direct text response to the user based on the results you have received. If the tool returned "no results" or empty output, that IS the final answer - do not retry. Summarize your findings and respond.` + : `⚠️ System Warning: You have called the same tool with identical parameters ${count} times in this request. This may indicate an infinite loop. Please provide a final answer to the user instead of calling the same tool again, or explain why you need to continue retrying with the same parameters.`, }; cleanPayload.messages.push(warningMessage); @@ -2568,11 +2648,13 @@ IMPORTANT TOOL USAGE RULES: reason: "tool_call_loop_warning", toolName, loopCount: count, + isGPT, + loopThreshold, }, }); } - } else if (count > 3) { - // Force termination after 3 identical calls + } else if (count > loopThreshold) { + // Force termination after threshold exceeded // Log FULL context for debugging why the loop occurred logger.error( { @@ -2580,6 +2662,8 @@ IMPORTANT TOOL USAGE RULES: correlationId: options?.correlationId, tool: toolName, loopCount: count, + loopThreshold, + isGPT, signature: signature, action: 'request_terminated', totalSteps: steps, @@ -2600,7 +2684,7 @@ IMPORTANT TOOL USAGE RULES: body: { error: { type: "tool_call_loop_detected", - message: `Tool call loop detected: The model called the same tool ("${toolName}") with identical parameters ${count} times. This indicates an infinite loop and execution has been terminated. Please try rephrasing your request or provide different parameters.`, + message: `Tool call loop detected: The model called the same tool ("${toolName}") with identical parameters ${count} times (threshold: ${loopThreshold}). This indicates an infinite loop and execution has been terminated. Please try rephrasing your request or provide different parameters.`, }, }, terminationReason: "tool_call_loop", @@ -3059,6 +3143,7 @@ IMPORTANT TOOL USAGE RULES: session, cwd, requestMessages: cleanPayload.messages, + provider: providerType, // Pass provider for GPT-specific formatting }); const toolResultMessage = createFallbackToolResultMessage(providerType, { @@ -3332,7 +3417,12 @@ async function processMessage({ payload, headers, session, cwd, options = {} }) // === TOOL LOOP GUARD (EARLY CHECK) === // Check BEFORE sanitization since sanitizePayload removes conversation history - const toolLoopThreshold = config.policy?.toolLoopThreshold ?? 3; + // GPT models get a lower threshold (2) to catch loops earlier + const providerType = config.modelProvider?.type ?? "databricks"; + const isGPT = isGPTProvider(providerType); + const toolLoopThreshold = isGPT + ? getGPTToolLoopThreshold() // Hardcoded: 2 for GPT + : (config.policy?.toolLoopThreshold ?? 3); const { toolResultCount, toolUseCount } = countToolCallsInHistory(payload?.messages); console.log('[ToolLoopGuard EARLY] Checking ORIGINAL messages:', { @@ -3340,13 +3430,18 @@ async function processMessage({ payload, headers, session, cwd, options = {} }) toolResultCount, toolUseCount, threshold: toolLoopThreshold, + isGPT, + providerType, }); - if (toolResultCount >= toolLoopThreshold) { + // Temporarily increase threshold in client mode to allow tool execution flow + const effectiveThreshold = config.toolExecutionMode === 'client' ? 10 : toolLoopThreshold; + + if (toolResultCount >= effectiveThreshold) { logger.error({ toolResultCount, toolUseCount, - threshold: toolLoopThreshold, + threshold: effectiveThreshold, sessionId: session?.id ?? null, }, "[ToolLoopGuard] FORCE TERMINATING - too many tool calls in conversation"); diff --git a/src/routing/agentic-detector.js b/src/routing/agentic-detector.js new file mode 100644 index 0000000..889d404 --- /dev/null +++ b/src/routing/agentic-detector.js @@ -0,0 +1,320 @@ +/** + * Agentic Workflow Detector + * Detects multi-step tool chains and autonomous agent patterns + * Used to boost complexity tier for agentic workloads + */ + +const logger = require('../logger'); + +// Agent type classification with tier requirements +const AGENT_TYPES = { + SINGLE_SHOT: { + minTier: 'SIMPLE', + scoreBoost: 0, + description: 'Simple request-response, no tool chains', + }, + TOOL_CHAIN: { + minTier: 'MEDIUM', + scoreBoost: 15, + requiresToolUse: true, + description: 'Sequential tool usage (read -> edit -> test)', + }, + ITERATIVE: { + minTier: 'COMPLEX', + scoreBoost: 25, + requiresToolUse: true, + description: 'Retry loops, debugging cycles, iterative refinement', + }, + AUTONOMOUS: { + minTier: 'REASONING', + scoreBoost: 35, + requiresToolUse: true, + description: 'Open-ended tasks, full autonomy, complex decision making', + }, +}; + +// Detection patterns +const PATTERNS = { + // Tool chain indicators + toolChain: /\b(then\s+use|after\s+that|next\s+step|finally|first.*then|step\s*\d+)\b/i, + + // Iterative work indicators + iterative: /\b(keep\s+trying|until|repeat|loop|retry|iterate|fix.*again|try.*different|debug)\b/i, + + // Autonomous work indicators + autonomous: /\b(figure\s+out|solve|complete\s+the\s+task|do\s+whatever|make\s+it\s+work|find\s+a\s+way|whatever\s+it\s+takes)\b/i, + + // Multi-file work + multiFile: /\b(multiple\s+files?|across\s+(the\s+)?codebase|all\s+files?|refactor\s+entire|whole\s+project|everywhere)\b/i, + + // Planning indicators + planning: /\b(plan|design|architect|strategy|roadmap|approach|how\s+would\s+you)\b/i, + + // Implementation indicators + implementation: /\b(implement|build|create|develop|write|code|add\s+feature)\b/i, + + // Analysis indicators + analysis: /\b(analyze|investigate|understand|explain|why\s+is|what\s+causes|root\s+cause)\b/i, + + // Testing indicators + testing: /\b(test|verify|validate|check|ensure|confirm|make\s+sure)\b/i, +}; + +// High-complexity tools that indicate agentic work +const AGENTIC_TOOLS = new Set([ + // Execution tools + 'Bash', 'bash', 'shell', 'execute', 'run_command', + // Write tools + 'Write', 'write_file', 'fs_write', 'create_file', + // Edit tools + 'Edit', 'edit_file', 'fs_edit', 'edit_patch', 'str_replace_editor', + // Agent tools + 'Task', 'agent_task', 'spawn_agent', 'delegate', + // Git tools + 'Git', 'git_commit', 'git_push', 'git_create_branch', + // Test tools + 'Test', 'run_tests', 'pytest', 'jest', + // Notebook tools + 'NotebookEdit', 'notebook_edit', +]); + +// Read-only tools (lower complexity) +const READ_ONLY_TOOLS = new Set([ + 'Read', 'read_file', 'fs_read', + 'Glob', 'glob', 'find_files', + 'Grep', 'grep', 'search', 'ripgrep', + 'WebFetch', 'web_fetch', 'fetch_url', + 'WebSearch', 'web_search', +]); + +class AgenticDetector { + /** + * Detect agentic workflow patterns + * @param {Object} payload - Request payload with messages and tools + * @returns {Object} Detection result + */ + detect(payload) { + const messages = payload?.messages || []; + const tools = payload?.tools || []; + const content = this._extractContent(messages); + + let score = 0; + const signals = []; + + // Signal 1: Tool count (many tools = likely multi-step) + const toolCount = tools.length; + if (toolCount > 10) { + score += 25; + signals.push({ signal: 'very_high_tool_count', value: toolCount, weight: 25 }); + } else if (toolCount > 5) { + score += 15; + signals.push({ signal: 'high_tool_count', value: toolCount, weight: 15 }); + } else if (toolCount > 3) { + score += 8; + signals.push({ signal: 'moderate_tool_count', value: toolCount, weight: 8 }); + } + + // Signal 2: Agentic tools present (Bash, Write, Edit, Task) + const agenticToolCount = tools.filter(t => { + const name = t.name || t.function?.name || ''; + return AGENTIC_TOOLS.has(name); + }).length; + + if (agenticToolCount > 3) { + score += 25; + signals.push({ signal: 'many_agentic_tools', value: agenticToolCount, weight: 25 }); + } else if (agenticToolCount > 1) { + score += 15; + signals.push({ signal: 'has_agentic_tools', value: agenticToolCount, weight: 15 }); + } else if (agenticToolCount === 1) { + score += 8; + signals.push({ signal: 'single_agentic_tool', value: agenticToolCount, weight: 8 }); + } + + // Signal 3: Prior tool results (already in agentic loop) + const toolResultCount = this._countToolResults(messages); + if (toolResultCount > 5) { + score += 30; + signals.push({ signal: 'deep_tool_loop', value: toolResultCount, weight: 30 }); + } else if (toolResultCount > 2) { + score += 20; + signals.push({ signal: 'active_tool_loop', value: toolResultCount, weight: 20 }); + } else if (toolResultCount > 0) { + score += 10; + signals.push({ signal: 'has_tool_results', value: toolResultCount, weight: 10 }); + } + + // Signal 4: Pattern matching on content + if (PATTERNS.autonomous.test(content)) { + score += 25; + signals.push({ signal: 'autonomous_pattern', weight: 25 }); + } + + if (PATTERNS.iterative.test(content)) { + score += 20; + signals.push({ signal: 'iterative_pattern', weight: 20 }); + } + + if (PATTERNS.toolChain.test(content)) { + score += 15; + signals.push({ signal: 'tool_chain_pattern', weight: 15 }); + } + + if (PATTERNS.multiFile.test(content)) { + score += 15; + signals.push({ signal: 'multi_file_work', weight: 15 }); + } + + if (PATTERNS.planning.test(content)) { + score += 10; + signals.push({ signal: 'planning_required', weight: 10 }); + } + + if (PATTERNS.implementation.test(content) && PATTERNS.testing.test(content)) { + score += 15; + signals.push({ signal: 'implementation_with_testing', weight: 15 }); + } + + // Signal 5: Conversation depth + const messageCount = messages.length; + if (messageCount > 15) { + score += 20; + signals.push({ signal: 'very_deep_conversation', value: messageCount, weight: 20 }); + } else if (messageCount > 8) { + score += 12; + signals.push({ signal: 'deep_conversation', value: messageCount, weight: 12 }); + } else if (messageCount > 4) { + score += 6; + signals.push({ signal: 'ongoing_conversation', value: messageCount, weight: 6 }); + } + + // Signal 6: Content length (longer prompts often = more complex tasks) + if (content.length > 2000) { + score += 10; + signals.push({ signal: 'long_prompt', value: content.length, weight: 10 }); + } + + // Determine agent type + const agentType = this._classifyAgentType(score, signals); + const isAgentic = score >= 25; + + const result = { + isAgentic, + agentType, + confidence: Math.min(score / 100, 1), + score, + signals, + minTier: AGENT_TYPES[agentType].minTier, + scoreBoost: AGENT_TYPES[agentType].scoreBoost, + description: AGENT_TYPES[agentType].description, + }; + + if (isAgentic) { + logger.debug({ + agentType, + score, + signalCount: signals.length, + toolCount, + toolResultCount, + }, '[AgenticDetector] Agentic workflow detected'); + } + + return result; + } + + /** + * Classify agent type based on score and signals + */ + _classifyAgentType(score, signals) { + // Check for specific signal combinations + const hasAutonomousPattern = signals.some(s => s.signal === 'autonomous_pattern'); + const hasDeepToolLoop = signals.some(s => s.signal === 'deep_tool_loop'); + const hasManyAgenticTools = signals.some(s => s.signal === 'many_agentic_tools'); + + // Autonomous: high score + autonomous pattern or very deep tool usage + if (score >= 60 || (hasAutonomousPattern && score >= 40)) { + return 'AUTONOMOUS'; + } + + // Iterative: moderate-high score with tool loops + if (score >= 40 || (hasDeepToolLoop && score >= 30)) { + return 'ITERATIVE'; + } + + // Tool chain: some tool usage indicated + if (score >= 20 || hasManyAgenticTools) { + return 'TOOL_CHAIN'; + } + + return 'SINGLE_SHOT'; + } + + /** + * Extract user content from messages + */ + _extractContent(messages) { + const userMsgs = messages.filter(m => m?.role === 'user'); + if (userMsgs.length === 0) return ''; + + // Get last user message + const last = userMsgs[userMsgs.length - 1]; + + if (typeof last.content === 'string') { + return last.content; + } + + if (Array.isArray(last.content)) { + return last.content + .filter(block => block?.type === 'text') + .map(block => block.text || '') + .join(' '); + } + + return ''; + } + + /** + * Count tool results in conversation + */ + _countToolResults(messages) { + let count = 0; + + for (const msg of messages) { + if (msg?.role === 'user' && Array.isArray(msg.content)) { + count += msg.content.filter(c => c?.type === 'tool_result').length; + } + } + + return count; + } + + /** + * Get detection stats for debugging + */ + getPatternStats(content) { + const stats = {}; + for (const [name, pattern] of Object.entries(PATTERNS)) { + stats[name] = pattern.test(content); + } + return stats; + } +} + +// Singleton instance +let instance = null; + +function getAgenticDetector() { + if (!instance) { + instance = new AgenticDetector(); + } + return instance; +} + +module.exports = { + AgenticDetector, + getAgenticDetector, + AGENT_TYPES, + PATTERNS, + AGENTIC_TOOLS, + READ_ONLY_TOOLS, +}; diff --git a/src/routing/complexity-analyzer.js b/src/routing/complexity-analyzer.js index 0929de4..8781362 100644 --- a/src/routing/complexity-analyzer.js +++ b/src/routing/complexity-analyzer.js @@ -88,6 +88,58 @@ const FORCE_LOCAL_PATTERNS = [ /^(help|menu|commands?|options?)[\s\.\!\?]*$/i, ]; +// Weighted Scoring (15 Dimensions) +const DIMENSION_WEIGHTS = { + // Content Analysis (35%) + tokenCount: 0.08, + promptComplexity: 0.10, + technicalDepth: 0.10, + domainSpecificity: 0.07, + // Tool Analysis (25%) + toolCount: 0.08, + toolComplexity: 0.10, + toolChainPotential: 0.07, + // Reasoning Requirements (25%) + multiStepReasoning: 0.10, + codeGeneration: 0.08, + analysisDepth: 0.07, + // Context Factors (15%) + conversationDepth: 0.05, + priorToolUsage: 0.05, + ambiguity: 0.05, +}; + +// Tool complexity weights (higher = more complex) +const TOOL_COMPLEXITY_WEIGHTS = { + Bash: 0.9, + bash: 0.9, + shell: 0.9, + Write: 0.8, + write_file: 0.8, + Edit: 0.7, + edit_file: 0.7, + NotebookEdit: 0.7, + Task: 0.9, + agent_task: 0.9, + WebSearch: 0.5, + WebFetch: 0.4, + Read: 0.3, + read_file: 0.3, + Glob: 0.2, + Grep: 0.2, + default: 0.5, +}; + +// Domain-specific keywords for complexity +const DOMAIN_KEYWORDS = { + security: /\b(auth|encrypt|vulnerability|injection|xss|csrf|jwt|oauth|password|credential|secret)\b/i, + ml: /\b(model|train|inference|tensor|embedding|neural|llm|gpt|transformer|pytorch|tensorflow)\b/i, + distributed: /\b(microservice|kafka|redis|queue|scale|cluster|replicate|kubernetes|docker|container)\b/i, + database: /\b(sql|nosql|migration|index|query|transaction|orm|postgres|mongodb|mysql)\b/i, + frontend: /\b(react|vue|angular|svelte|css|html|component|state|redux|hooks)\b/i, + devops: /\b(ci\/cd|pipeline|deploy|terraform|ansible|github\s*actions|jenkins)\b/i, +}; + // ============================================================================ // PHASE 3: Metrics Tracking // ============================================================================ @@ -360,6 +412,116 @@ function scoreReasoning(content) { return { score: Math.min(score, 15), reasons }; } +// ============================================================================ +// WEIGHTED SCORING FUNCTION (15 Dimensions) +// ============================================================================ + +/** + * Calculate weighted complexity score (0-100) + * Uses 15 dimensions with configurable weights + * @param {Object} payload - Request payload + * @param {string} content - Extracted content + * @returns {Object} Weighted score result + */ +function calculateWeightedScore(payload, content) { + const dimensions = {}; + + // 1. Token count (0-100) + const tokens = estimateTokens(payload); + dimensions.tokenCount = tokens < 500 ? 10 : tokens < 2000 ? 30 : tokens < 5000 ? 50 : tokens < 10000 ? 70 : 90; + + // 2. Prompt complexity (sentence structure, avg length) + const sentences = content.split(/[.!?]+/).filter(s => s.trim().length > 0); + const avgLength = content.length / Math.max(sentences.length, 1); + dimensions.promptComplexity = Math.min(avgLength / 2, 100); + + // 3. Technical depth (keyword density) + const techMatches = (content.match(PATTERNS.technical) || []).length; + dimensions.technicalDepth = Math.min(techMatches * 15, 100); + + // 4. Domain specificity (how many domains are touched) + let domainScore = 0; + const domainsMatched = []; + for (const [domain, regex] of Object.entries(DOMAIN_KEYWORDS)) { + if (regex.test(content)) { + domainScore += 20; + domainsMatched.push(domain); + } + } + dimensions.domainSpecificity = Math.min(domainScore, 100); + + // 5. Tool count + const toolCount = payload?.tools?.length ?? 0; + dimensions.toolCount = toolCount === 0 ? 0 : + toolCount <= 3 ? 20 : + toolCount <= 6 ? 40 : + toolCount <= 10 ? 60 : + toolCount <= 15 ? 80 : 100; + + // 6. Tool complexity (weighted by tool types) + if (payload?.tools?.length > 0) { + const totalWeight = payload.tools.reduce((sum, t) => { + const name = t.name || t.function?.name || ''; + return sum + (TOOL_COMPLEXITY_WEIGHTS[name] || TOOL_COMPLEXITY_WEIGHTS.default); + }, 0); + const avgWeight = totalWeight / payload.tools.length; + dimensions.toolComplexity = avgWeight * 100; + } else { + dimensions.toolComplexity = 0; + } + + // 7. Tool chain potential (sequential operations) + dimensions.toolChainPotential = /\b(then|after|next|finally|first.*then|step\s*\d+)\b/i.test(content) ? 70 : 20; + + // 8. Multi-step reasoning + dimensions.multiStepReasoning = ADVANCED_PATTERNS.reasoning.stepByStep.test(content) ? 80 : + ADVANCED_PATTERNS.reasoning.planning.test(content) ? 60 : 20; + + // 9. Code generation requirement + dimensions.codeGeneration = /\b(write|create|implement|build|generate)\s+(a\s+)?(new\s+)?(function|class|module|api|endpoint|service|component)/i.test(content) ? 80 : 20; + + // 10. Analysis depth + dimensions.analysisDepth = ADVANCED_PATTERNS.reasoning.tradeoffs.test(content) ? 80 : + ADVANCED_PATTERNS.reasoning.analysis.test(content) ? 60 : 20; + + // 11. Conversation depth + const messageCount = payload?.messages?.length ?? 0; + dimensions.conversationDepth = messageCount < 3 ? 10 : + messageCount < 6 ? 30 : + messageCount < 10 ? 50 : 70; + + // 12. Prior tool usage (tool results in conversation) + const toolResults = (payload?.messages || []).filter(m => + m.role === 'user' && Array.isArray(m.content) && m.content.some(c => c.type === 'tool_result') + ).length; + dimensions.priorToolUsage = toolResults === 0 ? 10 : + toolResults < 3 ? 40 : + toolResults < 6 ? 60 : 80; + + // 13. Ambiguity (inverse of specificity) + const hasSpecifics = /\b(file|function|line\s*\d+|error|bug|at\s+[\w.]+:\d+|\/[\w/]+\.\w+)\b/i.test(content); + dimensions.ambiguity = hasSpecifics ? 20 : content.length < 50 ? 70 : 40; + + // Calculate weighted total + let weightedTotal = 0; + for (const [dimension, weight] of Object.entries(DIMENSION_WEIGHTS)) { + weightedTotal += (dimensions[dimension] || 0) * weight; + } + + return { + score: Math.round(weightedTotal), + dimensions, + weights: DIMENSION_WEIGHTS, + meta: { + tokens, + toolCount, + messageCount, + toolResults, + domainsMatched, + }, + }; +} + /** * Get threshold based on SMART_TOOL_SELECTION_MODE */ @@ -381,13 +543,45 @@ function getThreshold() { * Analyze request complexity and return full analysis * * @param {Object} payload - Request payload + * @param {Object} options - Analysis options * @returns {Object} Complexity analysis result */ -function analyzeComplexity(payload) { +function analyzeComplexity(payload, options = {}) { const content = extractContent(payload); const messageCount = payload?.messages?.length ?? 0; + const useWeighted = options.weighted ?? config.routing?.weightedScoring ?? false; + + // Use weighted scoring if enabled + if (useWeighted) { + const weighted = calculateWeightedScore(payload, content); + const threshold = getThreshold(); + const mode = config.smartToolSelection?.mode ?? 'heuristic'; + + // Check force patterns + const taskTypeResult = scoreTaskType(content); + let recommendation; + if (taskTypeResult.reason === 'force_local') { + recommendation = 'local'; + } else if (taskTypeResult.reason === 'force_cloud') { + recommendation = 'cloud'; + } else { + recommendation = weighted.score >= threshold ? 'cloud' : 'local'; + } - // Calculate individual scores + return { + score: weighted.score, + threshold, + mode: 'weighted', + recommendation, + breakdown: weighted.dimensions, + weights: weighted.weights, + meta: weighted.meta, + forceReason: taskTypeResult.reason?.startsWith('force_') ? taskTypeResult.reason : null, + content: content.slice(0, 100) + (content.length > 100 ? '...' : ''), + }; + } + + // Standard scoring (original logic) const tokenScore = scoreTokens(payload); const toolScore = scoreTools(payload); const taskTypeResult = scoreTaskType(content); @@ -577,6 +771,9 @@ module.exports = { scoreCodeComplexity, scoreReasoning, + // Weighted scoring + calculateWeightedScore, + // Configuration getThreshold, @@ -592,4 +789,7 @@ module.exports = { ADVANCED_PATTERNS, FORCE_CLOUD_PATTERNS, FORCE_LOCAL_PATTERNS, + DIMENSION_WEIGHTS, + TOOL_COMPLEXITY_WEIGHTS, + DOMAIN_KEYWORDS, }; diff --git a/src/routing/cost-optimizer.js b/src/routing/cost-optimizer.js new file mode 100644 index 0000000..696f73b --- /dev/null +++ b/src/routing/cost-optimizer.js @@ -0,0 +1,305 @@ +/** + * Cost Optimizer Module + * Tracks and optimizes LLM costs across providers + * Uses ModelRegistry for dynamic pricing data + */ + +const logger = require('../logger'); +const config = require('../config'); +const { getModelRegistry, getModelRegistrySync } = require('./model-registry'); +const { getModelTierSelector, TIER_DEFINITIONS } = require('./model-tiers'); + +// Session cost tracking (in-memory) +const sessionCosts = new Map(); // sessionId -> { total, requests, byModel, byProvider } + +// Global stats +const globalStats = { + totalCost: 0, + totalSavings: 0, + requestCount: 0, + byProvider: {}, + byTier: {}, +}; + +class CostOptimizer { + constructor() { + this.registry = null; + this.tierSelector = null; + } + + /** + * Initialize with registry (async) + */ + async initialize() { + this.registry = await getModelRegistry(); + this.tierSelector = getModelTierSelector(); + } + + /** + * Get registry (sync fallback) + */ + _getRegistry() { + if (!this.registry) { + this.registry = getModelRegistrySync(); + } + return this.registry; + } + + /** + * Get tier selector + */ + _getTierSelector() { + if (!this.tierSelector) { + this.tierSelector = getModelTierSelector(); + } + return this.tierSelector; + } + + /** + * Estimate cost for a request before sending + * @param {string} model - Model name + * @param {number} inputTokens - Estimated input tokens + * @param {number} outputTokens - Estimated output tokens (optional) + * @returns {Object} Cost estimate + */ + estimateCost(model, inputTokens, outputTokens = null) { + const registry = this._getRegistry(); + const costs = registry.getCost(model); + + const inputCost = (inputTokens / 1_000_000) * costs.input; + const estimatedOutputTokens = outputTokens || Math.min(inputTokens * 0.5, 4096); + const outputCost = (estimatedOutputTokens / 1_000_000) * costs.output; + + return { + inputCost: Math.round(inputCost * 1_000_000) / 1_000_000, + outputCost: Math.round(outputCost * 1_000_000) / 1_000_000, + totalEstimate: Math.round((inputCost + outputCost) * 1_000_000) / 1_000_000, + model, + inputTokens, + outputTokens: estimatedOutputTokens, + pricePerMillion: { + input: costs.input, + output: costs.output, + }, + source: costs.source, + }; + } + + /** + * Find cheapest model capable of handling a complexity tier + * @param {string} requiredTier - Minimum tier required + * @param {string[]} availableProviders - Providers to consider + * @returns {Object|null} Cheapest model info + */ + findCheapestForTier(requiredTier, availableProviders) { + const registry = this._getRegistry(); + const tierSelector = this._getTierSelector(); + + const tierOrder = ['SIMPLE', 'MEDIUM', 'COMPLEX', 'REASONING']; + const minTierIndex = tierOrder.indexOf(requiredTier); + + if (minTierIndex === -1) { + logger.warn({ tier: requiredTier }, '[CostOptimizer] Unknown tier'); + return null; + } + + const candidates = []; + + // Collect models from all capable tiers (>= required tier) + for (let i = minTierIndex; i < tierOrder.length; i++) { + const tier = tierOrder[i]; + + for (const provider of availableProviders) { + const models = tierSelector.getPreferredModels(tier, provider); + + for (const model of models) { + const cost = registry.getCost(model); + const totalCost = cost.input + cost.output; // Simple cost metric + + candidates.push({ + model, + provider, + tier, + inputCost: cost.input, + outputCost: cost.output, + totalCost, + context: cost.context, + source: cost.source, + }); + } + } + } + + if (candidates.length === 0) { + return null; + } + + // Sort by total cost (input + output per 1M tokens) + candidates.sort((a, b) => a.totalCost - b.totalCost); + + const cheapest = candidates[0]; + + logger.debug({ + requiredTier, + selectedModel: cheapest.model, + selectedProvider: cheapest.provider, + cost: cheapest.totalCost, + candidateCount: candidates.length, + }, '[CostOptimizer] Found cheapest model'); + + return cheapest; + } + + /** + * Record actual cost after response + * @param {string} sessionId - Session identifier + * @param {string} provider - Provider used + * @param {string} model - Model used + * @param {number} inputTokens - Actual input tokens + * @param {number} outputTokens - Actual output tokens + * @param {string} tier - Complexity tier + * @returns {number} Actual cost + */ + recordCost(sessionId, provider, model, inputTokens, outputTokens, tier = 'MEDIUM') { + const registry = this._getRegistry(); + const costs = registry.getCost(model); + + const inputCost = (inputTokens / 1_000_000) * costs.input; + const outputCost = (outputTokens / 1_000_000) * costs.output; + const actualCost = inputCost + outputCost; + + // Update session costs + if (sessionId) { + if (!sessionCosts.has(sessionId)) { + sessionCosts.set(sessionId, { + total: 0, + requests: 0, + byModel: {}, + byProvider: {}, + byTier: {}, + }); + } + + const session = sessionCosts.get(sessionId); + session.total += actualCost; + session.requests++; + session.byModel[model] = (session.byModel[model] || 0) + actualCost; + session.byProvider[provider] = (session.byProvider[provider] || 0) + actualCost; + session.byTier[tier] = (session.byTier[tier] || 0) + actualCost; + } + + // Update global stats + globalStats.totalCost += actualCost; + globalStats.requestCount++; + globalStats.byProvider[provider] = (globalStats.byProvider[provider] || 0) + actualCost; + globalStats.byTier[tier] = (globalStats.byTier[tier] || 0) + actualCost; + + logger.debug({ + sessionId, + provider, + model, + inputTokens, + outputTokens, + cost: actualCost.toFixed(6), + tier, + }, '[CostOptimizer] Recorded cost'); + + return actualCost; + } + + /** + * Calculate potential savings from routing optimization + */ + calculateSavings(originalModel, optimizedModel, tokens) { + const registry = this._getRegistry(); + + const originalCost = registry.getCost(originalModel); + const optimizedCost = registry.getCost(optimizedModel); + + const originalTotal = (tokens / 1_000_000) * (originalCost.input + originalCost.output); + const optimizedTotal = (tokens / 1_000_000) * (optimizedCost.input + optimizedCost.output); + + const savings = originalTotal - optimizedTotal; + + if (savings > 0) { + globalStats.totalSavings += savings; + } + + return { + originalCost: originalTotal, + optimizedCost: optimizedTotal, + savings: Math.max(0, savings), + percentSaved: originalTotal > 0 ? (savings / originalTotal) * 100 : 0, + }; + } + + + /** + * Get session cost summary + */ + getSessionCost(sessionId) { + return sessionCosts.get(sessionId) || { + total: 0, + requests: 0, + byModel: {}, + byProvider: {}, + byTier: {}, + }; + } + + /** + * Get global stats + */ + getStats() { + return { + ...globalStats, + sessionCount: sessionCosts.size, + avgCostPerRequest: globalStats.requestCount > 0 + ? (globalStats.totalCost / globalStats.requestCount).toFixed(6) + : '0', + totalCostFormatted: `$${globalStats.totalCost.toFixed(4)}`, + totalSavingsFormatted: `$${globalStats.totalSavings.toFixed(4)}`, + }; + } + + /** + * Clear session data (for cleanup) + */ + clearSession(sessionId) { + sessionCosts.delete(sessionId); + } + + /** + * Reset all stats (for testing) + */ + resetStats() { + sessionCosts.clear(); + globalStats.totalCost = 0; + globalStats.totalSavings = 0; + globalStats.requestCount = 0; + globalStats.byProvider = {}; + globalStats.byTier = {}; + } +} + +// Singleton instance +let instance = null; + +function getCostOptimizer() { + if (!instance) { + instance = new CostOptimizer(); + } + return instance; +} + +async function getCostOptimizerAsync() { + const optimizer = getCostOptimizer(); + await optimizer.initialize(); + return optimizer; +} + +module.exports = { + CostOptimizer, + getCostOptimizer, + getCostOptimizerAsync, +}; diff --git a/src/routing/index.js b/src/routing/index.js index f47853f..40f0ace 100644 --- a/src/routing/index.js +++ b/src/routing/index.js @@ -19,6 +19,11 @@ const { analyzeWithEmbeddings, } = require('./complexity-analyzer'); +// Intelligent routing modules +const { getAgenticDetector, AGENT_TYPES } = require('./agentic-detector'); +const { getModelTierSelector, TIER_DEFINITIONS } = require('./model-tiers'); +const { getCostOptimizer } = require('./cost-optimizer'); + // Local providers const LOCAL_PROVIDERS = ['ollama', 'llamacpp', 'lmstudio']; @@ -196,7 +201,8 @@ async function determineProviderSmart(payload, options = {}) { } // Full complexity analysis for non-tool requests - const analysis = analyzeComplexity(payload); + const useWeightedScoring = config.routing?.weightedScoring ?? false; + const analysis = analyzeComplexity(payload, { weighted: useWeightedScoring }); // Phase 4: Optional embeddings adjustment let embeddingsResult = null; @@ -214,25 +220,116 @@ async function determineProviderSmart(payload, options = {}) { } } - // Apply routing decision based on complexity + // Agentic workflow detection + let agenticResult = null; + if (config.routing?.agenticDetection !== false) { + try { + const detector = getAgenticDetector(); + agenticResult = detector.detect(payload); + + // Boost complexity score for agentic workflows + if (agenticResult.isAgentic) { + analysis.score = Math.min(100, analysis.score + agenticResult.scoreBoost); + analysis.agenticBoost = agenticResult.scoreBoost; + analysis.agentType = agenticResult.agentType; + + logger.debug({ + agentType: agenticResult.agentType, + boost: agenticResult.scoreBoost, + newScore: analysis.score, + }, '[Routing] Agentic workflow detected, boosting score'); + + // Force cloud for autonomous workflows + if (agenticResult.agentType === 'AUTONOMOUS' && isFallbackEnabled()) { + const provider = getBestCloudProvider({ toolCount }); + const decision = { + provider, + method: 'agentic', + reason: 'autonomous_workflow', + score: analysis.score, + agenticResult, + }; + routingMetrics.record(decision); + return decision; + } + } + } catch (err) { + logger.debug({ err: err.message }, 'Agentic detection failed'); + } + } + + // Tier-based model selection + let selectedModel = null; + let tier = null; + if (config.modelTiers?.enabled) { + try { + const selector = getModelTierSelector(); + tier = selector.getTier(analysis.score); + + // Check if agentic detection requires a higher tier + if (agenticResult?.minTier) { + const agenticTierPriority = TIER_DEFINITIONS[agenticResult.minTier]?.priority || 0; + const currentTierPriority = TIER_DEFINITIONS[tier]?.priority || 0; + if (agenticTierPriority > currentTierPriority) { + tier = agenticResult.minTier; + logger.debug({ from: selector.getTier(analysis.score), to: tier }, '[Routing] Upgrading tier for agentic workflow'); + } + } + + // Select model for the tier (will be applied after provider selection) + analysis.tier = tier; + } catch (err) { + logger.debug({ err: err.message }, 'Tier selection failed'); + } + } + + // Apply routing decision based on tier config (TIER_* env vars are mandatory) let provider; - let method = 'complexity'; + let method = 'tier_config'; - if (analysis.recommendation === 'local') { - provider = getBestLocalProvider(); - } else { - // Cloud recommendation - if (isFallbackEnabled()) { - provider = getBestCloudProvider({ toolCount }); - } else { - // Fallback disabled, use local anyway - provider = getBestLocalProvider(); - method = 'fallback_disabled'; + const selector = getModelTierSelector(); + const modelSelection = selector.selectModel(tier, null); + + provider = modelSelection.provider; + selectedModel = modelSelection.model; + logger.debug({ tier, provider, model: selectedModel }, '[Routing] Using tier config'); + + // Cost optimization: check if cheaper model can handle this tier + let costOptimized = false; + if (config.routing?.costOptimization && tier) { + try { + const optimizer = getCostOptimizer(); + const availableProviders = [provider]; + + // Also consider local provider if not already selected + const localProvider = getBestLocalProvider(); + if (localProvider !== provider) { + availableProviders.push(localProvider); + } + + const cheapest = optimizer.findCheapestForTier(tier, availableProviders); + if (cheapest && cheapest.provider !== provider) { + logger.debug({ + from: provider, + to: cheapest.provider, + tier, + savings: `${cheapest.model} is cheaper`, + }, '[Routing] Cost optimization: switching provider'); + + provider = cheapest.provider; + selectedModel = cheapest.model; + costOptimized = true; + method = 'cost_optimized'; + } + } catch (err) { + logger.debug({ err: err.message }, 'Cost optimization failed'); } } const decision = { provider, + model: selectedModel, + tier, method, reason: analysis.recommendation, score: analysis.score, @@ -240,6 +337,8 @@ async function determineProviderSmart(payload, options = {}) { mode: analysis.mode, analysis, embeddingsResult, + agenticResult, + costOptimized, }; // Phase 3: Record metrics @@ -343,6 +442,23 @@ function getRoutingHeaders(decision) { headers['X-Lynkr-Routing-Reason'] = decision.reason; } + // Tier and model headers + if (decision.tier) { + headers['X-Lynkr-Tier'] = decision.tier; + } + + if (decision.model) { + headers['X-Lynkr-Model'] = decision.model; + } + + if (decision.agenticResult?.isAgentic) { + headers['X-Lynkr-Agentic'] = decision.agenticResult.agentType; + } + + if (decision.costOptimized) { + headers['X-Lynkr-Cost-Optimized'] = 'true'; + } + return headers; } @@ -372,4 +488,11 @@ module.exports = { // Re-export analyzer for direct access analyzeComplexity: require('./complexity-analyzer').analyzeComplexity, + + // Intelligent routing modules + getAgenticDetector, + getModelTierSelector, + getCostOptimizer, + AGENT_TYPES, + TIER_DEFINITIONS, }; diff --git a/src/routing/model-registry.js b/src/routing/model-registry.js new file mode 100644 index 0000000..e52258b --- /dev/null +++ b/src/routing/model-registry.js @@ -0,0 +1,437 @@ +/** + * Model Registry + * Multi-source pricing: LiteLLM -> models.dev -> Databricks fallback + * Caches data locally with 24h TTL + */ + +const fs = require('fs'); +const path = require('path'); +const logger = require('../logger'); + +// API URLs +const LITELLM_URL = 'https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json'; +const MODELS_DEV_URL = 'https://models.dev/api.json'; + +// Cache settings +const CACHE_FILE = path.join(__dirname, '../../data/model-prices-cache.json'); +const CACHE_TTL_MS = 24 * 60 * 60 * 1000; // 24 hours + +// Databricks fallback pricing (based on Anthropic direct API prices) +const DATABRICKS_FALLBACK = { + // Claude models + 'databricks-claude-opus-4-6': { input: 5.0, output: 25.0, context: 1000000 }, + 'databricks-claude-opus-4-5': { input: 5.0, output: 25.0, context: 200000 }, + 'databricks-claude-opus-4-1': { input: 15.0, output: 75.0, context: 200000 }, + 'databricks-claude-sonnet-4-5': { input: 3.0, output: 15.0, context: 200000 }, + 'databricks-claude-sonnet-4': { input: 3.0, output: 15.0, context: 200000 }, + 'databricks-claude-3-7-sonnet': { input: 3.0, output: 15.0, context: 200000 }, + 'databricks-claude-haiku-4-5': { input: 1.0, output: 5.0, context: 200000 }, + + // Llama models + 'databricks-llama-4-maverick': { input: 1.0, output: 1.0, context: 128000 }, + 'databricks-meta-llama-3-3-70b-instruct': { input: 0.9, output: 0.9, context: 128000 }, + 'databricks-meta-llama-3-1-405b-instruct': { input: 2.0, output: 2.0, context: 128000 }, + 'databricks-meta-llama-3-1-8b-instruct': { input: 0.2, output: 0.2, context: 128000 }, + + // GPT models via Databricks + 'databricks-gpt-5-2': { input: 5.0, output: 15.0, context: 200000 }, + 'databricks-gpt-5-1': { input: 3.0, output: 12.0, context: 200000 }, + 'databricks-gpt-5': { input: 2.5, output: 10.0, context: 128000 }, + 'databricks-gpt-5-mini': { input: 0.5, output: 1.5, context: 128000 }, + 'databricks-gpt-5-nano': { input: 0.15, output: 0.6, context: 128000 }, + + // Gemini models via Databricks + 'databricks-gemini-3-flash': { input: 0.075, output: 0.3, context: 1000000 }, + 'databricks-gemini-3-pro': { input: 1.25, output: 5.0, context: 2000000 }, + 'databricks-gemini-2-5-pro': { input: 1.25, output: 5.0, context: 1000000 }, + 'databricks-gemini-2-5-flash': { input: 0.075, output: 0.3, context: 1000000 }, + + // DBRX + 'databricks-dbrx-instruct': { input: 0.75, output: 2.25, context: 32000 }, + + // Embedding models (price per 1M tokens) + 'databricks-gte-large-en': { input: 0.02, output: 0, context: 8192 }, + 'databricks-bge-large-en': { input: 0.02, output: 0, context: 512 }, +}; + +// Default cost for unknown models +const DEFAULT_COST = { input: 1.0, output: 3.0, context: 128000 }; + +class ModelRegistry { + constructor() { + this.litellmPrices = {}; + this.modelsDevPrices = {}; + this.loaded = false; + this.lastFetch = 0; + this.modelIndex = new Map(); + } + + /** + * Initialize registry - load from cache or fetch fresh data + */ + async initialize() { + if (this.loaded) return; + + // Try cache first + if (this._loadFromCache()) { + this.loaded = true; + // Background refresh if stale + if (Date.now() - this.lastFetch > CACHE_TTL_MS) { + this._fetchAll().catch(err => + logger.warn({ err: err.message }, '[ModelRegistry] Background refresh failed') + ); + } + return; + } + + // Fetch fresh data + await this._fetchAll(); + this.loaded = true; + } + + /** + * Fetch from both sources + */ + async _fetchAll() { + const results = await Promise.allSettled([ + this._fetchLiteLLM(), + this._fetchModelsDev(), + ]); + + const litellmOk = results[0].status === 'fulfilled'; + const modelsDevOk = results[1].status === 'fulfilled'; + + if (litellmOk || modelsDevOk) { + this._buildIndex(); + this._saveToCache(); + this.lastFetch = Date.now(); + + logger.info({ + litellm: litellmOk ? Object.keys(this.litellmPrices).length : 0, + modelsDev: modelsDevOk ? Object.keys(this.modelsDevPrices).length : 0, + total: this.modelIndex.size, + }, '[ModelRegistry] Loaded pricing data'); + } else { + logger.warn('[ModelRegistry] All sources failed, using Databricks fallback only'); + } + } + + /** + * Fetch LiteLLM pricing + */ + async _fetchLiteLLM() { + try { + const response = await fetch(LITELLM_URL, { + signal: AbortSignal.timeout(15000), + headers: { 'Accept': 'application/json' }, + }); + + if (!response.ok) throw new Error(`HTTP ${response.status}`); + + const data = await response.json(); + this.litellmPrices = this._processLiteLLM(data); + + logger.debug({ count: Object.keys(this.litellmPrices).length }, '[ModelRegistry] LiteLLM loaded'); + } catch (err) { + logger.warn({ err: err.message }, '[ModelRegistry] LiteLLM fetch failed'); + throw err; + } + } + + /** + * Process LiteLLM format into our format + * LiteLLM uses cost per token, we use cost per 1M tokens + */ + _processLiteLLM(data) { + const prices = {}; + + for (const [modelId, info] of Object.entries(data)) { + if (!info || typeof info !== 'object') continue; + + // Convert per-token to per-million-tokens + const inputCost = (info.input_cost_per_token || 0) * 1_000_000; + const outputCost = (info.output_cost_per_token || 0) * 1_000_000; + + prices[modelId.toLowerCase()] = { + input: inputCost, + output: outputCost, + context: info.max_input_tokens || info.max_tokens || 128000, + maxOutput: info.max_output_tokens || 4096, + toolCall: info.supports_function_calling ?? true, + vision: info.supports_vision ?? false, + source: 'litellm', + }; + + // Also index without provider prefix for flexible lookup + const shortName = modelId.split('/').pop().toLowerCase(); + if (shortName !== modelId.toLowerCase()) { + prices[shortName] = prices[modelId.toLowerCase()]; + } + } + + return prices; + } + + /** + * Fetch models.dev pricing + */ + async _fetchModelsDev() { + try { + const response = await fetch(MODELS_DEV_URL, { + signal: AbortSignal.timeout(15000), + headers: { 'Accept': 'application/json' }, + }); + + if (!response.ok) throw new Error(`HTTP ${response.status}`); + + const data = await response.json(); + this.modelsDevPrices = this._processModelsDev(data); + + logger.debug({ count: Object.keys(this.modelsDevPrices).length }, '[ModelRegistry] models.dev loaded'); + } catch (err) { + logger.warn({ err: err.message }, '[ModelRegistry] models.dev fetch failed'); + throw err; + } + } + + /** + * Process models.dev format into our format + */ + _processModelsDev(data) { + const prices = {}; + + for (const [providerId, providerData] of Object.entries(data)) { + if (!providerData?.models) continue; + + for (const [modelId, info] of Object.entries(providerData.models)) { + const fullId = `${providerId}/${modelId}`.toLowerCase(); + + prices[fullId] = { + input: info.cost?.input || 0, + output: info.cost?.output || 0, + cacheRead: info.cost?.cache_read, + cacheWrite: info.cost?.cache_write, + context: info.context || 128000, + maxOutput: info.output || 4096, + toolCall: info.tool_call ?? false, + reasoning: info.reasoning ?? false, + vision: Array.isArray(info.input) && info.input.includes('image'), + source: 'models.dev', + }; + + // Also index by short name + prices[modelId.toLowerCase()] = prices[fullId]; + } + } + + return prices; + } + + /** + * Build unified index from all sources + */ + _buildIndex() { + this.modelIndex.clear(); + + // Add Databricks fallback first (lowest priority) + for (const [modelId, info] of Object.entries(DATABRICKS_FALLBACK)) { + this.modelIndex.set(modelId.toLowerCase(), { ...info, source: 'databricks-fallback' }); + } + + // Add models.dev (medium priority) + for (const [modelId, info] of Object.entries(this.modelsDevPrices)) { + this.modelIndex.set(modelId, info); + } + + // Add LiteLLM (highest priority) + for (const [modelId, info] of Object.entries(this.litellmPrices)) { + this.modelIndex.set(modelId, info); + } + } + + /** + * Get cost for a model + * @param {string} modelName - Model name/ID + * @returns {Object} Cost info { input, output, context, ... } + */ + getCost(modelName) { + if (!modelName) return { ...DEFAULT_COST, source: 'default' }; + + const normalizedName = modelName.toLowerCase(); + + // Direct lookup + if (this.modelIndex.has(normalizedName)) { + return this.modelIndex.get(normalizedName); + } + + // Try common variations + const variations = [ + normalizedName, + normalizedName.replace('databricks-', ''), + normalizedName.replace('azure/', ''), + normalizedName.replace('bedrock/', ''), + normalizedName.replace('anthropic.', ''), + normalizedName.split('/').pop(), + ]; + + for (const variant of variations) { + if (this.modelIndex.has(variant)) { + return this.modelIndex.get(variant); + } + } + + // Fuzzy match for partial names + for (const [key, value] of this.modelIndex.entries()) { + if (key.includes(normalizedName) || normalizedName.includes(key)) { + return value; + } + } + + logger.debug({ model: modelName }, '[ModelRegistry] Model not found, using default'); + return { ...DEFAULT_COST, source: 'default' }; + } + + /** + * Get model info by name + */ + getModel(modelName) { + return this.getCost(modelName); + } + + /** + * Check if model is free (local) + */ + isFree(modelName) { + const cost = this.getCost(modelName); + return cost.input === 0 && cost.output === 0; + } + + /** + * Check if model supports tool calling + */ + supportsTools(modelName) { + const model = this.getCost(modelName); + return model.toolCall === true; + } + + /** + * Find models matching criteria + */ + findModels(criteria = {}) { + const results = []; + + for (const [modelId, info] of this.modelIndex.entries()) { + if (criteria.maxInputCost && info.input > criteria.maxInputCost) continue; + if (criteria.minContext && info.context < criteria.minContext) continue; + if (criteria.toolCall && !info.toolCall) continue; + if (criteria.reasoning && !info.reasoning) continue; + if (criteria.vision && !info.vision) continue; + + results.push({ modelId, ...info }); + } + + // Sort by input cost ascending + return results.sort((a, b) => a.input - b.input); + } + + /** + * Get stats for metrics endpoint + */ + getStats() { + const sources = { litellm: 0, 'models.dev': 0, 'databricks-fallback': 0, default: 0 }; + + for (const info of this.modelIndex.values()) { + const source = info.source || 'default'; + sources[source] = (sources[source] || 0) + 1; + } + + return { + totalModels: this.modelIndex.size, + bySource: sources, + lastFetch: this.lastFetch, + cacheAge: this.lastFetch ? Date.now() - this.lastFetch : null, + cacheTTL: CACHE_TTL_MS, + }; + } + + /** + * Force refresh from APIs + */ + async refresh() { + await this._fetchAll(); + } + + // Cache management + _loadFromCache() { + try { + if (!fs.existsSync(CACHE_FILE)) return false; + + const cache = JSON.parse(fs.readFileSync(CACHE_FILE, 'utf8')); + this.litellmPrices = cache.litellm || {}; + this.modelsDevPrices = cache.modelsDev || {}; + this.lastFetch = cache.timestamp || 0; + + this._buildIndex(); + + logger.debug({ + age: Math.round((Date.now() - this.lastFetch) / 60000) + 'min', + models: this.modelIndex.size, + }, '[ModelRegistry] Loaded from cache'); + + return true; + } catch (err) { + logger.debug({ err: err.message }, '[ModelRegistry] Cache load failed'); + return false; + } + } + + _saveToCache() { + try { + const dir = path.dirname(CACHE_FILE); + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir, { recursive: true }); + } + + const cache = { + litellm: this.litellmPrices, + modelsDev: this.modelsDevPrices, + timestamp: Date.now(), + }; + + fs.writeFileSync(CACHE_FILE, JSON.stringify(cache, null, 2)); + logger.debug('[ModelRegistry] Cache saved'); + } catch (err) { + logger.warn({ err: err.message }, '[ModelRegistry] Cache save failed'); + } + } +} + +// Singleton with lazy initialization +let instance = null; + +async function getModelRegistry() { + if (!instance) { + instance = new ModelRegistry(); + await instance.initialize(); + } + return instance; +} + +// Sync getter (uses cache only, no network) +function getModelRegistrySync() { + if (!instance) { + instance = new ModelRegistry(); + instance._loadFromCache(); + instance._buildIndex(); + instance.loaded = true; + } + return instance; +} + +module.exports = { + ModelRegistry, + getModelRegistry, + getModelRegistrySync, + DATABRICKS_FALLBACK, + DEFAULT_COST, +}; diff --git a/src/routing/model-tiers.js b/src/routing/model-tiers.js new file mode 100644 index 0000000..5989dce --- /dev/null +++ b/src/routing/model-tiers.js @@ -0,0 +1,363 @@ +/** + * Model Tier Selector + * Maps complexity scores to appropriate models per provider + * Uses config/model-tiers.json for tier preferences + */ + +const fs = require('fs'); +const path = require('path'); +const logger = require('../logger'); +const config = require('../config'); + +// Load tier config +const TIER_CONFIG_PATH = path.join(__dirname, '../../config/model-tiers.json'); + +// Tier definitions with complexity ranges +const TIER_DEFINITIONS = { + SIMPLE: { + description: 'Greetings, simple Q&A, confirmations', + range: [0, 25], + priority: 1, + }, + MEDIUM: { + description: 'Code reading, simple edits, research', + range: [26, 50], + priority: 2, + }, + COMPLEX: { + description: 'Multi-file changes, debugging, architecture', + range: [51, 75], + priority: 3, + }, + REASONING: { + description: 'Complex analysis, security audits, novel problems', + range: [76, 100], + priority: 4, + }, +}; + +class ModelTierSelector { + constructor() { + this.tierConfig = null; + this.localProviders = {}; + this.providerAliases = {}; + this._loadConfig(); + } + + /** + * Load tier configuration from JSON file + */ + _loadConfig() { + try { + if (fs.existsSync(TIER_CONFIG_PATH)) { + const data = JSON.parse(fs.readFileSync(TIER_CONFIG_PATH, 'utf8')); + this.tierConfig = data.tiers || {}; + this.localProviders = data.localProviders || {}; + this.providerAliases = data.providerAliases || {}; + logger.debug({ tiers: Object.keys(this.tierConfig) }, '[ModelTiers] Config loaded'); + } else { + logger.warn('[ModelTiers] Config file not found, using defaults'); + this._loadDefaults(); + } + } catch (err) { + logger.warn({ err: err.message }, '[ModelTiers] Config load failed, using defaults'); + this._loadDefaults(); + } + } + + /** + * Load default tier config + */ + _loadDefaults() { + this.tierConfig = { + SIMPLE: { preferred: { ollama: ['llama3.2'], openai: ['gpt-4o-mini'] } }, + MEDIUM: { preferred: { openai: ['gpt-4o'], anthropic: ['claude-sonnet-4-20250514'] } }, + COMPLEX: { preferred: { openai: ['o1-mini'], anthropic: ['claude-sonnet-4-20250514'] } }, + REASONING: { preferred: { openai: ['o1'], anthropic: ['claude-opus-4-20250514'] } }, + }; + this.localProviders = { + ollama: { free: true, defaultTier: 'SIMPLE' }, + llamacpp: { free: true, defaultTier: 'SIMPLE' }, + lmstudio: { free: true, defaultTier: 'SIMPLE' }, + }; + } + + /** + * Normalize provider name using aliases + */ + _normalizeProvider(provider) { + if (!provider) return 'openai'; + const lower = provider.toLowerCase(); + return this.providerAliases[lower] || lower; + } + + /** + * Get tier from complexity score + * @param {number} complexityScore - Score from 0-100 + * @returns {string} Tier name (SIMPLE, MEDIUM, COMPLEX, REASONING) + */ + getTier(complexityScore) { + const score = Math.max(0, Math.min(100, complexityScore || 0)); + + for (const [tier, def] of Object.entries(TIER_DEFINITIONS)) { + if (score >= def.range[0] && score <= def.range[1]) { + return tier; + } + } + + return score > 75 ? 'REASONING' : 'SIMPLE'; + } + + /** + * Get tier definition + */ + getTierDefinition(tier) { + return TIER_DEFINITIONS[tier] || TIER_DEFINITIONS.MEDIUM; + } + + /** + * Get tier priority (1-4) + */ + getTierPriority(tier) { + return TIER_DEFINITIONS[tier]?.priority || 2; + } + + /** + * Compare two tiers, returns positive if tier1 > tier2 + */ + compareTiers(tier1, tier2) { + return this.getTierPriority(tier1) - this.getTierPriority(tier2); + } + + /** + * Get preferred models for a tier and provider + * @param {string} tier - Tier name + * @param {string} provider - Provider name + * @returns {string[]} Array of model names + */ + getPreferredModels(tier, provider) { + const normalizedProvider = this._normalizeProvider(provider); + return this.tierConfig[tier]?.preferred?.[normalizedProvider] || []; + } + + /** + * Select model for tier from TIER_* env var (mandatory) + * @param {string} tier - Tier name (SIMPLE, MEDIUM, COMPLEX, REASONING) + * @param {string} _unused - Deprecated parameter + * @returns {Object} { model, provider, source, tier } + */ + selectModel(tier, _unused = null) { + const tierConfig = config.modelTiers?.[tier]; + if (!tierConfig) { + throw new Error(`TIER_${tier} not configured. Set TIER_${tier}=provider:model in .env`); + } + + const parsed = this._parseTierConfig(tierConfig); + if (!parsed) { + throw new Error(`Invalid TIER_${tier} format. Expected provider:model, got: ${tierConfig}`); + } + + return { + model: parsed.model, + provider: parsed.provider, + source: 'env_tier', + tier, + }; + } + + /** + * Parse tier config string (format: provider:model) + * Examples: "ollama:llama3.2", "azure-openai:gpt-5.2-chat", "openai:gpt-4o" + */ + _parseTierConfig(configStr) { + if (!configStr || typeof configStr !== 'string') return null; + + const colonIndex = configStr.indexOf(':'); + if (colonIndex === -1) { + // No colon - treat as model name, use default provider + return { + provider: config.modelProvider?.type || 'openai', + model: configStr.trim(), + }; + } + + const provider = configStr.substring(0, colonIndex).trim().toLowerCase(); + const model = configStr.substring(colonIndex + 1).trim(); + + if (!provider || !model) return null; + + return { provider, model }; + } + + /** + * Get the model configured for a provider from .env + */ + _getProviderModel(provider) { + switch (provider) { + case 'azure-openai': + case 'azureopenai': + return config.azureOpenAI?.deployment || null; + case 'openai': + return config.openai?.model || null; + case 'ollama': + return config.ollama?.model || null; + case 'openrouter': + return config.openrouter?.model || null; + case 'llamacpp': + return config.llamacpp?.model || null; + case 'lmstudio': + return config.lmstudio?.model || null; + case 'bedrock': + return config.bedrock?.modelId || null; + case 'zai': + return config.zai?.model || null; + case 'vertex': + return config.vertex?.model || null; + case 'databricks': + return config.modelProvider?.defaultModel || null; + default: + return null; + } + } + + /** + * Get provider for a specific tier (from env or fallback) + */ + getProviderForTier(tier) { + const tierConfig = config.modelTiers?.[tier]; + if (tierConfig) { + const parsed = this._parseTierConfig(tierConfig); + if (parsed) return parsed.provider; + } + return config.modelProvider?.type || 'openai'; + } + + /** + * Get fallback model if provider can't handle requested tier + */ + _getFallbackModel(requestedTier, provider) { + const tierOrder = ['REASONING', 'COMPLEX', 'MEDIUM', 'SIMPLE']; + const startIndex = tierOrder.indexOf(requestedTier); + + // Try lower tiers + for (let i = startIndex + 1; i < tierOrder.length; i++) { + const fallbackTier = tierOrder[i]; + const models = this.getPreferredModels(fallbackTier, provider); + + if (models.length > 0) { + logger.debug({ + from: requestedTier, + to: fallbackTier, + provider, + model: models[0], + }, '[ModelTiers] Downgrading tier'); + + return { model: models[0], tier: fallbackTier }; + } + } + + return null; + } + + /** + * Check if provider can handle a specific tier + */ + canHandleTier(provider, tier) { + const normalizedProvider = this._normalizeProvider(provider); + const models = this.getPreferredModels(tier, normalizedProvider); + return models.length > 0; + } + + /** + * Check if provider is local/free + */ + isLocalProvider(provider) { + const normalizedProvider = this._normalizeProvider(provider); + return this.localProviders[normalizedProvider]?.free === true; + } + + /** + * Get all providers that can handle a tier + */ + getProvidersForTier(tier) { + const tierConfig = this.tierConfig[tier]; + if (!tierConfig?.preferred) return []; + return Object.keys(tierConfig.preferred); + } + + /** + * Get all tiers a provider can handle + */ + getTiersForProvider(provider) { + const normalizedProvider = this._normalizeProvider(provider); + const tiers = []; + + for (const tier of Object.keys(TIER_DEFINITIONS)) { + if (this.canHandleTier(normalizedProvider, tier)) { + tiers.push(tier); + } + } + + return tiers; + } + + /** + * Get tier stats for metrics endpoint + */ + getTierStats() { + const stats = { + tiers: {}, + providers: {}, + }; + + for (const [tier, def] of Object.entries(TIER_DEFINITIONS)) { + const providers = this.getProvidersForTier(tier); + stats.tiers[tier] = { + ...def, + providerCount: providers.length, + providers: providers, + }; + } + + // Count models per provider + const allProviders = new Set(); + for (const tierConfig of Object.values(this.tierConfig)) { + if (tierConfig.preferred) { + Object.keys(tierConfig.preferred).forEach(p => allProviders.add(p)); + } + } + + for (const provider of allProviders) { + stats.providers[provider] = { + tiers: this.getTiersForProvider(provider), + isLocal: this.isLocalProvider(provider), + }; + } + + return stats; + } + + /** + * Reload configuration (for hot reload) + */ + reload() { + this._loadConfig(); + logger.info('[ModelTiers] Configuration reloaded'); + } +} + +// Singleton instance +let instance = null; + +function getModelTierSelector() { + if (!instance) { + instance = new ModelTierSelector(); + } + return instance; +} + +module.exports = { + ModelTierSelector, + getModelTierSelector, + TIER_DEFINITIONS, +}; diff --git a/src/sessions/cleanup.js b/src/sessions/cleanup.js index 50f0b65..c4d22a4 100644 --- a/src/sessions/cleanup.js +++ b/src/sessions/cleanup.js @@ -4,9 +4,9 @@ const { cleanupOldSessions, cleanupOldHistory } = require("./store"); class SessionCleanupManager { constructor(options = {}) { this.enabled = options.enabled !== false; - this.intervalMs = options.intervalMs || 3600000; // 1 hour - this.sessionMaxAgeMs = options.sessionMaxAgeMs || 7 * 24 * 60 * 60 * 1000; // 7 days - this.historyMaxAgeMs = options.historyMaxAgeMs || 30 * 24 * 60 * 60 * 1000; // 30 days + this.intervalMs = options.intervalMs || 300000; // 5 minutes (was 1 hour) + this.sessionMaxAgeMs = options.sessionMaxAgeMs || 24 * 60 * 60 * 1000; // 1 day (was 7 days) + this.historyMaxAgeMs = options.historyMaxAgeMs || 7 * 24 * 60 * 60 * 1000; // 7 days (was 30 days) this.timer = null; } diff --git a/src/sessions/record.js b/src/sessions/record.js index 0a5da1f..ebfa25b 100644 --- a/src/sessions/record.js +++ b/src/sessions/record.js @@ -1,5 +1,8 @@ const { appendSessionTurn } = require("./store"); +// Cap in-memory history to prevent unbounded growth during long tool loops +const MAX_IN_MEMORY_HISTORY = 100; + function ensureSessionShape(session) { if (!session) return null; if (!Array.isArray(session.history)) { @@ -19,7 +22,13 @@ function appendTurnToSession(session, entry) { target.history.push(turn); target.updatedAt = turn.timestamp; - if (target.id) { + // Trim in-memory history if it exceeds the cap + if (target.history.length > MAX_IN_MEMORY_HISTORY) { + target.history = target.history.slice(-MAX_IN_MEMORY_HISTORY); + } + + // Skip DB write for ephemeral sessions (auto-generated, no client session ID) + if (target.id && !target._ephemeral) { appendSessionTurn(target.id, turn, target.metadata ?? {}); } diff --git a/src/sessions/store.js b/src/sessions/store.js index f88238e..0c04f55 100644 --- a/src/sessions/store.js +++ b/src/sessions/store.js @@ -4,11 +4,15 @@ const logger = require("../logger"); const selectSessionStmt = db.prepare( "SELECT id, created_at, updated_at, metadata FROM sessions WHERE id = ?", ); +// Limit history to last 50 entries to prevent unbounded memory growth. +// Older entries remain in DB for auditing but aren't loaded into memory. +const MAX_HISTORY_ROWS = 50; const selectHistoryStmt = db.prepare( `SELECT role, type, status, content, metadata, timestamp FROM session_history WHERE session_id = ? - ORDER BY timestamp ASC, id ASC`, + ORDER BY timestamp DESC, id DESC + LIMIT ${MAX_HISTORY_ROWS}`, ); const insertSessionStmt = db.prepare( "INSERT INTO sessions (id, created_at, updated_at, metadata) VALUES (@id, @created_at, @updated_at, @metadata)", @@ -75,7 +79,8 @@ function getSession(sessionId) { if (!sessionId) return null; const sessionRow = selectSessionStmt.get(sessionId); if (!sessionRow) return null; - const historyRows = selectHistoryStmt.all(sessionId); + // Query returns rows in DESC order (for LIMIT to grab newest), reverse to ASC + const historyRows = selectHistoryStmt.all(sessionId).reverse(); return toSession(sessionRow, historyRows); } diff --git a/src/tools/agent-task.js b/src/tools/agent-task.js index 4e69e22..74700d4 100644 --- a/src/tools/agent-task.js +++ b/src/tools/agent-task.js @@ -2,6 +2,50 @@ const { registerTool } = require("."); const { spawnAgent, autoSelectAgent } = require("../agents"); const logger = require("../logger"); +/** + * Extract text from Anthropic content blocks format + * Handles: [{"type":"text","text":"..."}] -> "..." + */ +function extractTextFromContentBlocks(content) { + if (typeof content !== 'string') { + return content; + } + + const trimmed = content.trim(); + if (!trimmed.startsWith('[')) { + return content; + } + + try { + const parsed = JSON.parse(trimmed); + if (!Array.isArray(parsed)) { + return content; + } + + // Extract text from content blocks + const textParts = parsed + .filter(block => block && typeof block === 'object') + .map(block => { + if (block.type === 'text' && typeof block.text === 'string') { + return block.text; + } + if (typeof block.text === 'string') { + return block.text; + } + return null; + }) + .filter(text => text !== null); + + if (textParts.length > 0) { + return textParts.join('\n\n'); + } + + return content; + } catch { + return content; + } +} + function registerAgentTaskTool() { registerTool( "Task", @@ -49,10 +93,13 @@ function registerAgentTaskTool() { }); if (result.success) { + // Extract text from Anthropic content blocks if present + const cleanContent = extractTextFromContentBlocks(result.result); + return { ok: true, status: 200, - content: result.result, + content: cleanContent, metadata: { agentType: subagentType, agentId: result.stats.agentId, diff --git a/src/tools/index.js b/src/tools/index.js index 11227f0..1983c00 100644 --- a/src/tools/index.js +++ b/src/tools/index.js @@ -1,5 +1,6 @@ const logger = require("../logger"); const { truncateToolOutput } = require("./truncate"); +const { isGPTProvider, formatToolResultForGPT } = require("../clients/gpt-utils"); const registry = new Map(); const registryLowercase = new Map(); @@ -254,7 +255,18 @@ async function executeToolCall(call, context = {}) { const formatted = normalizeHandlerResult(result); // Apply tool output truncation for token efficiency - const truncatedContent = truncateToolOutput(normalisedCall.name, formatted.content); + let truncatedContent = truncateToolOutput(normalisedCall.name, formatted.content); + + // GPT-specific formatting temporarily disabled for testing + // const isGPT = context?.provider && isGPTProvider(context.provider); + // if (isGPT) { + // truncatedContent = formatToolResultForGPT( + // normalisedCall.name, + // truncatedContent, + // normalisedCall.arguments + // ); + // } + const isGPT = false; // Disabled for testing return { id: normalisedCall.id, @@ -267,7 +279,8 @@ async function executeToolCall(call, context = {}) { registered: true, truncated: truncatedContent !== formatted.content, originalLength: formatted.content?.length, - truncatedLength: truncatedContent?.length + truncatedLength: truncatedContent?.length, + gptFormatted: isGPT, }, }; } catch (err) { From c5fc75d73ba85459dbdbf1e0aa6e8529fb82b878 Mon Sep 17 00:00:00 2001 From: vishal veerareddy Date: Thu, 12 Feb 2026 12:16:10 +0530 Subject: [PATCH 03/16] Added docs --- .claude/settings.local.json | 3 +- docs/docs.html | 1 + documentation/README.md | 3 +- documentation/features.md | 30 +-- documentation/routing.md | 423 ++++++++++++++++++++++++++++++++++++ 5 files changed, 445 insertions(+), 15 deletions(-) create mode 100644 documentation/routing.md diff --git a/.claude/settings.local.json b/.claude/settings.local.json index a1dd8cd..c39e6b4 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -81,7 +81,8 @@ "WebFetch(domain:www.databricks.com)", "WebFetch(domain:docs.databricks.com)", "Bash(env:*)", - "Bash(DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test:*)" + "Bash(DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test:*)", + "Bash(DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node:*)" ], "deny": [], "ask": [] diff --git a/docs/docs.html b/docs/docs.html index 71d5d12..22fe761 100644 --- a/docs/docs.html +++ b/docs/docs.html @@ -51,6 +51,7 @@
Features