From a64819c36be2e95941c65fce3c14d798246556dd Mon Sep 17 00:00:00 2001
From: Aaron Stannard
Date: Thu, 7 May 2026 14:17:24 +0000
Subject: [PATCH 1/3] fix: add response content diagnostics and thinking-only retry guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add debug-level logging across the LLM response pipeline to diagnose
sessions that produce tokens but no visible Slack output. Three layers now
report content type breakdowns (text/thinking/tool call counts and char
totals): OpenAiCompatibleChatClient (SSE), LoggingChatClient (middleware),
and LlmSessionActor (actor). Comparing the three logs pinpoints whether
content is misrouted upstream (llama.cpp) or dropped downstream (Netclaw
parsing/ToolCallTextFilter).

Unify the empty-response and thinking-only guards into a single check that
retries via EvaluateEmptyResponse when the LLM produces reasoning content
but no visible text or tool calls — prevents silent fallback replies in
Slack.

Absorb stale LlmCallFailed and LlmResponseReceived messages in the Ready
state to eliminate noisy dead letters after watchdog timeouts.
---
 .../Sessions/LlmSessionActor.cs               | 41 +++++++++++++++----
 .../Configuration/LoggingChatClient.cs        | 37 +++++++++++++++--
 .../SelfHosted/OpenAiCompatibleChatClient.cs  | 24 ++++++++++-
 3 files changed, 89 insertions(+), 13 deletions(-)

diff --git a/src/Netclaw.Actors/Sessions/LlmSessionActor.cs b/src/Netclaw.Actors/Sessions/LlmSessionActor.cs
index eef6bdf1..743140db 100644
--- a/src/Netclaw.Actors/Sessions/LlmSessionActor.cs
+++ b/src/Netclaw.Actors/Sessions/LlmSessionActor.cs
@@ -363,6 +363,8 @@ private void Ready()
         {
         });
         Command(_ => { });
+        Command<LlmCallFailed>(_ => { }); // stale failure arriving after watchdog timeout
+        Command<LlmResponseReceived>(_ => { }); // stale response arriving after transition to Ready
         Command(_ => { });
         Command(_ => { });
         CommandDistillationAckNoOp();
@@ -443,8 +445,31 @@ private void Processing()
             var response = msg.Response;
             var lastMessage = response.Messages[^1];

-            // Check for tool calls
-            var toolCalls = lastMessage.Contents.OfType<FunctionCallContent>().ToList();
+            var toolCalls = new List<FunctionCallContent>();
+            bool hasText = false, hasThinking = false;
+            int textChars = 0, thinkingChars = 0;
+            foreach (var c in lastMessage.Contents)
+            {
+                switch (c)
+                {
+                    case TextContent tc:
+                        textChars += tc.Text?.Length ?? 0;
+                        if (!string.IsNullOrWhiteSpace(tc.Text)) hasText = true;
+                        break;
+                    case TextReasoningContent rc:
+                        thinkingChars += rc.Text?.Length ?? 0;
+                        if (!string.IsNullOrWhiteSpace(rc.Text)) hasThinking = true;
+                        break;
+                    case FunctionCallContent fc:
+                        toolCalls.Add(fc);
+                        break;
+                }
+            }
+
+            _log.Debug(
+                "LLM response content breakdown: text={TextChars}ch thinking={ThinkingChars}ch toolCalls={ToolCallCount} finishReason={FinishReason}",
+                textChars, thinkingChars, toolCalls.Count, response.FinishReason?.ToString() ?? "null");
+
             if (toolCalls.Count > 0 && _turnState.ForceNoToolsActive)
             {
                 TurnLog().Warning(
@@ -465,21 +490,19 @@
                 return;
             }

-            // Guard: empty response (no text, no thinking, no tool calls) — delegate decision to tracker
-            var hasContent = lastMessage.Contents.Any(c =>
-                (c is TextContent tc && !string.IsNullOrWhiteSpace(tc.Text)) ||
-                (c is TextReasoningContent rc && !string.IsNullOrWhiteSpace(rc.Text)));
-            if (!hasContent)
+            if (!hasText && toolCalls.Count == 0)
             {
+                var label = hasThinking ? "thinking-only" : "empty";
                 switch (_turnState.EvaluateEmptyResponse())
                 {
                     case EmptyResponseAction.Retry retry:
-                        _log.Warning("LLM produced empty response — retrying with nudge");
+                        _log.Warning("LLM produced {Label} response ({ThinkingChars} chars) — retrying with nudge",
+                            label, thinkingChars);
                         _state = _state.AddSystemNudge(retry.NudgeText);
                         FireLlmCall();
                         return;
                     case EmptyResponseAction.Fail fail:
-                        _log.Warning("LLM produced empty response — failing turn");
+                        _log.Warning("LLM produced {Label} response — failing turn", label);
                         FailCurrentTurn(fail.ErrorMessage, fail.Cause, ErrorCategory.ProviderFailure);
                         return;
                 }
"thinking-only" : "empty"; switch (_turnState.EvaluateEmptyResponse()) { case EmptyResponseAction.Retry retry: - _log.Warning("LLM produced empty response — retrying with nudge"); + _log.Warning("LLM produced {Label} response ({ThinkingChars} chars) — retrying with nudge", + label, thinkingChars); _state = _state.AddSystemNudge(retry.NudgeText); FireLlmCall(); return; case EmptyResponseAction.Fail fail: - _log.Warning("LLM produced empty response — failing turn"); + _log.Warning("LLM produced {Label} response — failing turn", label); FailCurrentTurn(fail.ErrorMessage, fail.Cause, ErrorCategory.ProviderFailure); return; } diff --git a/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs b/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs index ab7a95f1..c4115856 100644 --- a/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs +++ b/src/Netclaw.Daemon/Configuration/LoggingChatClient.cs @@ -72,6 +72,12 @@ public override async IAsyncEnumerable GetStreamingResponseA var start = _timeProvider.GetTimestamp(); long inputTokens = 0; long outputTokens = 0; + int textDeltaCount = 0; + int textDeltaChars = 0; + int thinkingDeltaCount = 0; + int thinkingDeltaChars = 0; + int toolCallDeltaCount = 0; + ChatFinishReason? lastFinishReason = null; IAsyncEnumerable stream; try @@ -87,16 +93,41 @@ public override async IAsyncEnumerable GetStreamingResponseA await foreach (var update in stream) { - if (update.Contents?.OfType().FirstOrDefault() is { } usage) + foreach (var item in update.Contents) { - inputTokens += usage.Details?.InputTokenCount ?? 0; - outputTokens += usage.Details?.OutputTokenCount ?? 0; + switch (item) + { + case TextContent tc: + textDeltaCount++; + textDeltaChars += tc.Text?.Length ?? 0; + break; + case TextReasoningContent rc: + thinkingDeltaCount++; + thinkingDeltaChars += rc.Text?.Length ?? 0; + break; + case FunctionCallContent: + toolCallDeltaCount++; + break; + case UsageContent usage: + inputTokens += usage.Details?.InputTokenCount ?? 0; + outputTokens += usage.Details?.OutputTokenCount ?? 0; + break; + } } + if (update.FinishReason is not null) + lastFinishReason = update.FinishReason; + yield return update; } var totalElapsed = _timeProvider.GetElapsedTime(start); + + _logger.LogDebug( + "LLM streaming response content breakdown: textDeltas={TextDeltaCount} textChars={TextChars} thinkingDeltas={ThinkingDeltaCount} thinkingChars={ThinkingChars} toolCallDeltas={ToolCallDeltaCount} finishReason={FinishReason}", + textDeltaCount, textDeltaChars, thinkingDeltaCount, thinkingDeltaChars, toolCallDeltaCount, + lastFinishReason?.ToString() ?? 
"null"); + if (inputTokens > 0 || outputTokens > 0) { var delta = RecordInputTokens(inputTokens); diff --git a/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs b/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs index af6a701f..bc2ffc28 100644 --- a/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs +++ b/src/Netclaw.Providers/SelfHosted/OpenAiCompatibleChatClient.cs @@ -9,6 +9,8 @@ using System.Text.Json; using System.Text.Json.Nodes; using Microsoft.Extensions.AI; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; namespace Netclaw.Providers.SelfHosted; @@ -22,11 +24,15 @@ public sealed class OpenAiCompatibleChatClient : IChatClient private readonly HttpClient _httpClient; private readonly OpenAiCompatibleEndpoint _endpoint; private readonly string _modelId; - public OpenAiCompatibleChatClient(HttpClient httpClient, OpenAiCompatibleEndpoint endpoint, string modelId) + private readonly ILogger _logger; + + public OpenAiCompatibleChatClient(HttpClient httpClient, OpenAiCompatibleEndpoint endpoint, string modelId, + ILogger? logger = null) { _httpClient = httpClient; _endpoint = endpoint; _modelId = modelId; + _logger = logger ?? NullLogger.Instance; } public async Task GetResponseAsync( @@ -67,6 +73,9 @@ public async IAsyncEnumerable GetStreamingResponseAsync( var hadStructuredToolCalls = false; ChatResponseUpdate? finalUpdate = null; var filter = new ToolCallTextFilter(); + int textDeltaCount = 0, textDeltaChars = 0; + int thinkingDeltaCount = 0, thinkingDeltaChars = 0; + int toolCallDeltaCount = 0, suppressedDeltaCount = 0; while (!cancellationToken.IsCancellationRequested) { @@ -99,11 +108,18 @@ public async IAsyncEnumerable GetStreamingResponseAsync( { case TextContent tc: accumulatedText.Append(tc.Text); + textDeltaCount++; + textDeltaChars += tc.Text?.Length ?? 0; if (filter.ShouldSuppress(tc.Text)) suppressThisUpdate = true; break; + case TextReasoningContent rc: + thinkingDeltaCount++; + thinkingDeltaChars += rc.Text?.Length ?? 0; + break; case FunctionCallContent: hadStructuredToolCalls = true; + toolCallDeltaCount++; break; } } @@ -115,6 +131,7 @@ public async IAsyncEnumerable GetStreamingResponseAsync( // content-free keepalive so the caller knows the stream is alive. if (suppressThisUpdate) { + suppressedDeltaCount++; yield return KeepaliveUpdate; continue; } @@ -123,6 +140,11 @@ public async IAsyncEnumerable GetStreamingResponseAsync( } } + _logger.LogDebug( + "SSE stream content breakdown: textDeltas={TextDeltas} textChars={TextChars} thinkingDeltas={ThinkingDeltas} thinkingChars={ThinkingChars} toolCallDeltas={ToolCallDeltas} suppressedDeltas={SuppressedDeltas} finishReason={FinishReason}", + textDeltaCount, textDeltaChars, thinkingDeltaCount, thinkingDeltaChars, toolCallDeltaCount, + suppressedDeltaCount, finalUpdate?.FinishReason?.ToString() ?? "null"); + // Fallback: if the model stopped without structured tool calls but the text // contains XML-like tool call blocks, emit a synthetic tool call update. if (!hadStructuredToolCalls From 3a86cf54981000a02162863d747e5790ed9157a9 Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Fri, 8 May 2026 11:01:08 +0000 Subject: [PATCH 2/3] fix: inject ILoggerFactory so SSE-layer breakdown log actually fires OpenAiCompatibleProviderPlugin was constructing OpenAiCompatibleChatClient without a logger, so _logger fell back to NullLogger.Instance and the SSE-layer "stream content breakdown" Debug log was silently swallowed. 
From 1bf8da37d1e389818d25dea523d33c5c82e1cf18 Mon Sep 17 00:00:00 2001
From: Aaron Stannard
Date: Sat, 9 May 2026 14:09:55 +0000
Subject: [PATCH 3/3] fix(logging): drop hardcoded Information floor in RollingFileLogger (#908)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RollingFileLogger.IsEnabled was hardcoded to LogLevel.Information, which
silently overrode any Debug-level configuration coming through
Logging.LogLevel.Default or SetMinimumLevel. The framework was correctly
configured for Debug, but every Debug log was rejected at the file sink —
only the console sink saw them. This made the new SSE / middleware / actor
content-breakdown diagnostics invisible in production daemon logs, and is
consistent with operator reports that "structured log output isn't
appearing in daemon logs."

Defer entirely to the framework's configured minimum level instead of
imposing our own floor.

Cherry-picked from the diagnostic-investigation branch (the original
6558a471 also bumped specific call sites to Warning/Info temporarily; that
bump is intentionally skipped here — the diagnostic logs stay at Debug and
we let configuration decide what gets persisted).
---
 src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs b/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs
index 72177a00..726f088f 100644
--- a/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs
+++ b/src/Netclaw.Daemon/Configuration/RollingFileLoggerProvider.cs
@@ -212,7 +212,7 @@ public RollingFileLogger(string category, RollingFileLoggerProvider provider)
 
     public IDisposable? BeginScope<TState>(TState state) where TState : notnull => null;
 
-    public bool IsEnabled(LogLevel logLevel) => logLevel >= LogLevel.Information;
+    public bool IsEnabled(LogLevel logLevel) => logLevel != LogLevel.None;
 
     public void Log<TState>(LogLevel logLevel, EventId eventId, TState state, Exception? exception, Func<TState, Exception?, string> formatter)
     {
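
With the Information floor gone, what the rolling file sink persists is decided entirely
by the host's logging configuration. A sketch of what that configuration might look like
follows; the host-builder call and the category filters are illustrative assumptions
rather than code from this series, and only the category prefixes reference namespaces
that actually appear in the diffs above.

    using Microsoft.Extensions.Hosting;
    using Microsoft.Extensions.Logging;

    var builder = Host.CreateApplicationBuilder(args);

    // Global floor, equivalent to "Logging": { "LogLevel": { "Default": "Debug" } } in
    // appsettings.json. Before PATCH 3 the file sink ignored this and dropped everything
    // below Information; now it sees whatever the framework lets through.
    builder.Logging.SetMinimumLevel(LogLevel.Debug);

    // Alternatively, keep the global floor at Information and open up Debug only for the
    // new content-breakdown diagnostics.
    builder.Logging.AddFilter("Netclaw.Providers.SelfHosted", LogLevel.Debug);
    builder.Logging.AddFilter("Netclaw.Daemon.Configuration", LogLevel.Debug);

    using var host = builder.Build();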