From a64819c36be2e95941c65fce3c14d798246556dd Mon Sep 17 00:00:00 2001
From: Aaron Stannard
Date: Thu, 7 May 2026 14:17:24 +0000
Subject: [PATCH 1/3] fix: add response content diagnostics and thinking-only retry guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add debug-level logging across the LLM response pipeline to diagnose
sessions that produce tokens but no visible Slack output. Three layers now
report content type breakdowns (text/thinking/tool call counts and char
totals): OpenAiCompatibleChatClient (SSE), LoggingChatClient (middleware),
and LlmSessionActor (actor). Comparing the three logs pinpoints whether
content is misrouted upstream (llama.cpp) or dropped downstream (Netclaw
parsing/ToolCallTextFilter).

Unify the empty-response and thinking-only guards into a single check that
retries via EvaluateEmptyResponse when the LLM produces reasoning content
but no visible text or tool calls — prevents silent fallback replies in
Slack.

Absorb stale LlmCallFailed and LlmResponseReceived messages in the Ready
state to eliminate noisy dead letters after watchdog timeouts.
---
 .../Sessions/LlmSessionActor.cs               | 41 +++++++++++++++----
 .../Configuration/LoggingChatClient.cs        | 37 +++++++++++++++--
 .../SelfHosted/OpenAiCompatibleChatClient.cs  | 24 ++++++++++-
 3 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/src/Netclaw.Actors/Sessions/LlmSessionActor.cs b/src/Netclaw.Actors/Sessions/LlmSessionActor.cs
index eef6bdf1..743140db 100644
--- a/src/Netclaw.Actors/Sessions/LlmSessionActor.cs
+++ b/src/Netclaw.Actors/Sessions/LlmSessionActor.cs
@@ -363,6 +363,8 @@ private void Ready()
         {
         });
         Command(_ => { });
+        Command<LlmCallFailed>(_ => { }); // stale failure arriving after watchdog timeout
+        Command<LlmResponseReceived>(_ => { }); // stale response arriving after transition to Ready
         Command(_ => { });
         Command(_ => { });
         CommandDistillationAckNoOp();
@@ -443,8 +445,31 @@ private void Processing()
             var response = msg.Response;
             var lastMessage = response.Messages[^1];

-            // Check for tool calls
-            var toolCalls = lastMessage.Contents.OfType<FunctionCallContent>().ToList();
+            var toolCalls = new List<FunctionCallContent>();
+            bool hasText = false, hasThinking = false;
+            int textChars = 0, thinkingChars = 0;
+            foreach (var c in lastMessage.Contents)
+            {
+                switch (c)
+                {
+                    case TextContent tc:
+                        textChars += tc.Text?.Length ?? 0;
+                        if (!string.IsNullOrWhiteSpace(tc.Text)) hasText = true;
+                        break;
+                    case TextReasoningContent rc:
+                        thinkingChars += rc.Text?.Length ?? 0;
+                        if (!string.IsNullOrWhiteSpace(rc.Text)) hasThinking = true;
+                        break;
+                    case FunctionCallContent fc:
+                        toolCalls.Add(fc);
+                        break;
+                }
+            }
+
+            _log.Debug(
+                "LLM response content breakdown: text={TextChars}ch thinking={ThinkingChars}ch toolCalls={ToolCallCount} finishReason={FinishReason}",
+                textChars, thinkingChars, toolCalls.Count, response.FinishReason?.ToString() ?? "null");
+
             if (toolCalls.Count > 0 && _turnState.ForceNoToolsActive)
             {
                 TurnLog().Warning(
@@ -465,21 +490,19 @@
                 return;
             }

-            // Guard: empty response (no text, no thinking, no tool calls) — delegate decision to tracker
-            var hasContent = lastMessage.Contents.Any(c =>
-                (c is TextContent tc && !string.IsNullOrWhiteSpace(tc.Text)) ||
-                (c is TextReasoningContent rc && !string.IsNullOrWhiteSpace(rc.Text)));
-            if (!hasContent)
+            if (!hasText && toolCalls.Count == 0)
             {
+                var label = hasThinking ? "thinking-only" : "empty";
                 switch (_turnState.EvaluateEmptyResponse())
                 {
                     case EmptyResponseAction.Retry retry:
-                        _log.Warning("LLM produced empty response — retrying with nudge");
+                        _log.Warning("LLM produced {Label} response ({ThinkingChars} chars) — retrying with nudge",
+                            label, thinkingChars);
                         _state = _state.AddSystemNudge(retry.NudgeText);
                         FireLlmCall();
                         return;
                     case EmptyResponseAction.Fail fail:
-                        _log.Warning("LLM produced empty response — failing turn");
+                        _log.Warning("LLM produced {Label} response — failing turn", label);
                         FailCurrentTurn(fail.ErrorMessage, fail.Cause, ErrorCategory.ProviderFailure);
                         return;
                 }
"thinking-only" : "empty"; switch (_turnState.EvaluateEmptyResponse()) { case EmptyResponseAction.Retry retry: - _log.Warning("LLM produced empty response — retrying with nudge"); + _log.Warning("LLM produced {Label} response ({ThinkingChars} chars) — retrying with nudge", + label, thinkingChars); _state = _state.AddSystemNudge(retry.NudgeText); FireLlmCall(); return; case EmptyResponseAction.Fail fail: - _log.Warning("LLM produced empty response — failing turn"); + _log.Warning("LLM produced {Label} response — failing turn", label); FailCurrentTurn(fail.ErrorMessage, fail.Cause, ErrorCategory.ProviderFailure); return; } diff --git a/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs b/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs index ab7a95f1..c4115856 100644 --- a/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs +++ b/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs @@ -72,6 +72,12 @@ public override async IAsyncEnumerable GetStreamingResponseA var start = _timeProvider.GetTimestamp(); long inputTokens = 0; long outputTokens = 0; + int textDeltaCount = 0; + int textDeltaChars = 0; + int thinkingDeltaCount = 0; + int thinkingDeltaChars = 0; + int toolCallDeltaCount = 0; + ChatFinishReason? lastFinishReason = null; IAsyncEnumerable stream; try @@ -87,16 +93,41 @@ public override async IAsyncEnumerable GetStreamingResponseA await foreach (var update in stream) { - if (update.Contents?.OfType().FirstOrDefault() is { } usage) + foreach (var item in update.Contents) { - inputTokens += usage.Details?.InputTokenCount ?? 0; - outputTokens += usage.Details?.OutputTokenCount ?? 0; + switch (item) + { + case TextContent tc: + textDeltaCount++; + textDeltaChars += tc.Text?.Length ?? 0; + break; + case TextReasoningContent rc: + thinkingDeltaCount++; + thinkingDeltaChars += rc.Text?.Length ?? 0; + break; + case FunctionCallContent: + toolCallDeltaCount++; + break; + case UsageContent usage: + inputTokens += usage.Details?.InputTokenCount ?? 0; + outputTokens += usage.Details?.OutputTokenCount ?? 0; + break; + } } + if (update.FinishReason is not null) + lastFinishReason = update.FinishReason; + yield return update; } var totalElapsed = _timeProvider.GetElapsedTime(start); + + _logger.LogDebug( + "LLM streaming response content breakdown: textDeltas={TextDeltaCount} textChars={TextChars} thinkingDeltas={ThinkingDeltaCount} thinkingChars={ThinkingChars} toolCallDeltas={ToolCallDeltaCount} finishReason={FinishReason}", + textDeltaCount, textDeltaChars, thinkingDeltaCount, thinkingDeltaChars, toolCallDeltaCount, + lastFinishReason?.ToString() ?? 
"null"); + if (inputTokens > 0 || outputTokens > 0) { var delta = RecordInputTokens(inputTokens); diff --git a/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs b/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs index af6a701f..bc2ffc28 100644 --- a/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs +++ b/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs @@ -9,6 +9,8 @@ using System.Text.Json; using System.Text.Json.Nodes; using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; namespace Netclaw.Providers.SelfHosted; @@ -22,11 +24,15 @@ public sealed class OpenAiCompatibleChatClient : IChatClient private readonly HttpClient _httpClient; private readonly OpenAiCompatibleEndpoint _endpoint; private readonly string _modelId; - public OpenAiCompatibleChatClient(HttpClient httpClient, OpenAiCompatibleEndpoint endpoint, string modelId) + private readonly ILogger _logger; + + public OpenAiCompatibleChatClient(HttpClient httpClient, OpenAiCompatibleEndpoint endpoint, string modelId, + ILogger? logger = null) { _httpClient = httpClient; _endpoint = endpoint; _modelId = modelId; + _logger = logger ?? NullLogger.Instance; } public async Task GetResponseAsync( @@ -67,6 +73,9 @@ public async IAsyncEnumerable GetStreamingResponseAsync( var hadStructuredToolCalls = false; ChatResponseUpdate? finalUpdate = null; var filter = new ToolCallTextFilter(); + int textDeltaCount = 0, textDeltaChars = 0; + int thinkingDeltaCount = 0, thinkingDeltaChars = 0; + int toolCallDeltaCount = 0, suppressedDeltaCount = 0; while (!cancellationToken.IsCancellationRequested) { @@ -99,11 +108,18 @@ public async IAsyncEnumerable GetStreamingResponseAsync( { case TextContent tc: accumulatedText.Append(tc.Text); + textDeltaCount++; + textDeltaChars += tc.Text?.Length ?? 0; if (filter.ShouldSuppress(tc.Text)) suppressThisUpdate = true; break; + case TextReasoningContent rc: + thinkingDeltaCount++; + thinkingDeltaChars += rc.Text?.Length ?? 0; + break; case FunctionCallContent: hadStructuredToolCalls = true; + toolCallDeltaCount++; break; } } @@ -115,6 +131,7 @@ public async IAsyncEnumerable GetStreamingResponseAsync( // content-free keepalive so the caller knows the stream is alive. if (suppressThisUpdate) { + suppressedDeltaCount++; yield return KeepaliveUpdate; continue; } @@ -123,6 +140,11 @@ public async IAsyncEnumerable GetStreamingResponseAsync( } } + _logger.LogDebug( + "SSE stream content breakdown: textDeltas={TextDeltas} textChars={TextChars} thinkingDeltas={ThinkingDeltas} thinkingChars={ThinkingChars} toolCallDeltas={ToolCallDeltas} suppressedDeltas={SuppressedDeltas} finishReason={FinishReason}", + textDeltaCount, textDeltaChars, thinkingDeltaCount, thinkingDeltaChars, toolCallDeltaCount, + suppressedDeltaCount, finalUpdate?.FinishReason?.ToString() ?? "null"); + // Fallback: if the model stopped without structured tool calls but the text // contains XML-like tool call blocks, emit a synthetic tool call update. if (!hadStructuredToolCalls From 3a86cf54981000a02162863d747e5790ed9157a9 Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Fri, 8 May 2026 11:01:08 +0000 Subject: [PATCH 2/3] fix: inject ILoggerFactory so SSE-layer breakdown log actually fires OpenAiCompatibleProviderPlugin was constructing OpenAiCompatibleChatClient without a logger, so _logger fell back to NullLogger.Instance and the SSE-layer "stream content breakdown" Debug log was silently swallowed. 
From 1bf8da37d1e389818d25dea523d33c5c82e1cf18 Mon Sep 17 00:00:00 2001
From: Aaron Stannard
Date: Sat, 9 May 2026 14:09:55 +0000
Subject: [PATCH 3/3] fix(logging): drop hardcoded Information floor in RollingFileLogger (#908)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RollingFileLogger.IsEnabled was hardcoded to LogLevel.Information, which
silently overrode any Debug-level configuration coming through
Logging.LogLevel.Default or SetMinimumLevel. The framework was correctly
configured for Debug, but every Debug log was rejected at the file sink —
only the console sink saw them. This made the new SSE / middleware / actor
content-breakdown diagnostics invisible in production daemon logs, and is
consistent with operator reports that "structured log output isn't
appearing in daemon logs."

Defer entirely to the framework's configured minimum level instead of
imposing our own floor.

Cherry-picked from the diagnostic-investigation branch (the original
6558a471 also bumped specific call sites to Warning/Info temporarily; that
bump is intentionally skipped here — the diagnostic logs stay at Debug and
we let configuration decide what gets persisted).
---
 src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs b/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs
index 72177a00..726f088f 100644
--- a/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs
+++ b/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs
@@ -212,7 +212,7 @@ public RollingFileLogger(string category, RollingFileLoggerProvider provider)
 
     public IDisposable? BeginScope<TState>(TState state) where TState : notnull => null;
 
-    public bool IsEnabled(LogLevel logLevel) => logLevel >= LogLevel.Information;
+    public bool IsEnabled(LogLevel logLevel) => logLevel != LogLevel.None;
 
     public void Log<TState>(LogLevel logLevel, EventId eventId, TState state, Exception? exception, Func<TState, Exception?, string> formatter)
     {
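
With the Information floor gone, what the rolling file sink persists is decided entirely
by the host's logging configuration. A sketch of what that configuration might look like
follows; the host-builder call and the category filters are illustrative assumptions
rather than code from this series, and only the category prefixes reference namespaces
that actually appear in the diffs above.

    using Microsoft.Extensions.Hosting;
    using Microsoft.Extensions.Logging;

    var builder = Host.CreateApplicationBuilder(args);

    // Global floor, equivalent to "Logging": { "LogLevel": { "Default": "Debug" } } in
    // appsettings.json. Before PATCH 3 the file sink ignored this and dropped everything
    // below Information; now it sees whatever the framework lets through.
    builder.Logging.SetMinimumLevel(LogLevel.Debug);

    // Alternatively, keep the global floor at Information and open up Debug only for the
    // new content-breakdown diagnostics.
    builder.Logging.AddFilter("Netclaw.Providers.SelfHosted", LogLevel.Debug);
    builder.Logging.AddFilter("Netclaw.Daemon.Configuration", LogLevel.Debug);

    using var host = builder.Build();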