From cad1e7acf7694842dba56d2af202fad5da263fe8 Mon Sep 17 00:00:00 2001 From: mouxin Date: Sun, 12 Apr 2026 15:31:35 +0800 Subject: [PATCH 01/40] [Feature] Add troubleshoot and stat-cache-hitrate skills --- docs/online_serving/router.md | 2 +- docs/online_serving/router_faq.md | 41 +- docs/zh/online_serving/router.md | 2 +- docs/zh/online_serving/router_faq.md | 41 +- .../skills/stat-cache-hitrate/SKILL.md | 119 +++ .../evals/trigger_eval.json | 18 + .../references/log_formats.md | 139 +++ .../references/report_templates.md | 199 +++++ .../stat-cache-hitrate/scripts/chart.py | 249 ++++++ .../stat-cache-hitrate/scripts/log_parser.py | 358 ++++++++ .../scripts/stat_cache_hitrate.py | 669 ++++++++++++++ .../stat-cache-hitrate/scripts/stats.py | 278 ++++++ .../.claude/skills/troubleshoot/SKILL.md | 148 ++++ .../troubleshoot/evals/trigger_eval.json | 18 + .../troubleshoot/references/error_catalog.md | 122 +++ .../references/fastdeploy_cross_reference.md | 102 +++ .../troubleshoot/references/log_patterns.md | 282 ++++++ .../references/report_templates.md | 120 +++ .../scripts/analyzers/__init__.py | 1 + .../troubleshoot/scripts/analyzers/cache.py | 458 ++++++++++ .../troubleshoot/scripts/analyzers/errors.py | 314 +++++++ .../troubleshoot/scripts/analyzers/health.py | 421 +++++++++ .../troubleshoot/scripts/analyzers/latency.py | 355 ++++++++ .../troubleshoot/scripts/analyzers/load.py | 389 ++++++++ .../troubleshoot/scripts/analyzers/trace.py | 391 ++++++++ .../skills/troubleshoot/scripts/chart.py | 351 ++++++++ .../skills/troubleshoot/scripts/log_parser.py | 832 ++++++++++++++++++ .../skills/troubleshoot/scripts/stats.py | 278 ++++++ .../troubleshoot/scripts/troubleshoot.py | 334 +++++++ fastdeploy/golang_router/.gitignore | 2 + 30 files changed, 7021 insertions(+), 12 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json
create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py create 
mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py create mode 100644 fastdeploy/golang_router/.gitignore diff --git a/docs/online_serving/router.md b/docs/online_serving/router.md index 82940e5680c..7abc9c06af3 100644 --- a/docs/online_serving/router.md +++ b/docs/online_serving/router.md @@ -194,7 +194,7 @@ scheduler: policy: "power_of_two" # Scheduling policy (optional): random, power_of_two, round_robin, process_tokens, request_num, cache_aware, remote_cache_aware, fd_metrics_score, fd_remote_metrics_score prefill-policy: "cache_aware" # Prefill scheduling policy in PD mode decode-policy: "request_num" # Decode scheduling policy in PD mode - eviction-interval-secs: 60 # Cache eviction interval for CacheAware scheduling + eviction-interval-secs: 60 # Counter eviction interval for CacheAware scheduling eviction-duration-mins: 30 # Eviction duration for cache-aware radix tree nodes (minutes); default: 30 balance-abs-threshold: 1 # Absolute threshold for CacheAware balancing balance-rel-threshold: 0.2 # Relative threshold for CacheAware balancing diff --git a/docs/online_serving/router_faq.md b/docs/online_serving/router_faq.md index 49083539d4c..c0fb8cba4bf 100644 --- a/docs/online_serving/router_faq.md +++ b/docs/online_serving/router_faq.md @@ -29,6 +29,24 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `empty baseURL provided` | Health check received an empty base URL | Health check cannot be performed | Registration parameters | | `failed to create request: {error}` | Failed to create health check request | The instance may be marked as unhealthy | Network environment | | `failed to read response body: {error}` | Failed to read health check response body | The instance may be marked as unhealthy | Backend 
instance status | +| `Failed to select mixed worker: {error}` | Failed to select Mixed worker in centralized mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to select prefill worker: {error}` | Failed to select Prefill worker in PD disaggregated mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to read register request body: {error}` | Failed to read registration request body | Registration request returns 400 | Request format | +| `Failed to unmarshal register request JSON: {error}` | Failed to parse registration request JSON | Registration request returns 400 | Request format | +| `Failed to create decode request for {url}: {error}` | Failed to create HTTP request to Decode instance | Current request fails | Network environment | +| `Failed to create prefill request for {url}: {error}` | Failed to create HTTP request to Prefill instance | Current request fails | Network environment | +| `Decode request failed for {url}: {error}` | Request to Decode instance failed | Current request fails | Backend instance status, network connectivity | +| `Prefill request failed for {url}: {error}` | Request to Prefill instance failed | Current request fails | Backend instance status, network connectivity | +| `Failed to read request body: {error}` | Failed to read inference request body | Current request returns 400 | Request format | +| `Failed to unmarshal request JSON: {error}` | Failed to parse inference request JSON | Current request returns 400 | Request format | +| `Failed to select worker pair: {error}` | Failed to select worker pair in PD disaggregated mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to build disaggregate_info: {error}` | Failed to build PD disaggregation communication info | Current request returns 500 | Registration parameters (connector_port, device_ids, etc.) 
| +| `Failed to encode modified request: {error}` | Failed to encode modified request body | Current request returns 500 | Request content | +| `Failed to select worker: {error}` | Failed to select worker in centralized mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to connect to backend service: {error}` | Failed to connect to backend inference instance (after 3 retries) | Current request returns 502 | Backend instance status, network connectivity | +| `Request failed (attempt {n}/{max}): {error}` | Request attempt {n} failed | If retries exhausted, request returns 502 | Backend instance status, network connectivity | +| `Failed to create backend request for {url}: {error}` | Failed to create HTTP request to backend | Current request fails | Network environment | +| `Backend request failed for {url}: {error}` | Request to backend instance failed | Current request fails | Backend instance status, network connectivity | ### Warn-Level Logs @@ -37,8 +55,9 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `Server {url} is not healthy` | The instance at this URL failed health check | Router cannot register the instance, or will remove it from the registered list | Health status | | `Instance {url} role is unknown` | Instance role cannot be recognized | The instance will not be added to the scheduling list | Registration parameters | | `cache-aware prefill: tokenizer failed, fallback to char tokens: {error}` | Tokenizer service call failed, automatically falling back to character-based tokenization | cache_aware strategy remains active, using character-based tokenization for cache matching instead of the Tokenizer; normal request processing is not affected | Tokenizer service status | -| `cache-aware prefill: tokenize failed, fallback to process_tokens: {error}` | Tokenization completely failed (e.g., empty input), falling back to process_tokens strategy | Prefill scheduling temporarily does not 
use cache_aware strategy; normal request processing is not affected | Request content, Tokenizer service status | -| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | Tokenization failed (new format), falling back to process_tokens strategy | Prefill scheduling temporarily does not use cache_aware strategy; normal request processing is not affected | Request content, Tokenizer service status | +| `GetRemoteMetrics failed for {url}, falling back to local counter: {error}` | Failed to fetch remote metrics, falling back to local counter | Scheduling accuracy may decrease; normal request processing is not affected | Backend instance metrics port, network connectivity | +| `release worker: {url} skipped, counter already cleaned up` | Worker counter was already cleaned up when trying to release | May occur when a worker is removed by health check while requests are still in-flight | Health status, request timing | +| `release worker: {url} skipped, counter already zero (possible double-release)` | Worker counter is already zero when trying to release | Possible duplicate counter release | Request processing logic | ### Info-Level Logs @@ -49,7 +68,6 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `Successfully registered instance from index {index}` | Instance from config file registered successfully | Normal startup log | | `No instances found in config file {path}` | No instances found in the registration config file | Check whether register.yaml is empty | | `Request completed successfully.` | Request processing completed | Normal operation log | -| `Request failed, retrying...` | Request failed, retrying | Router will retry up to 3 times | | `select worker (prefill): {url}, tokens: {tokens}` | Prefill scheduler selected a worker, showing current token processing count | Normal operation log | | `select worker ({type}): {url}, count: {count}` | Decode/Mixed scheduler selected a 
worker, showing current request concurrency | Normal operation log | | `release worker: {url}, count: {count}` | Request ended, worker counter released | Normal operation log | @@ -58,7 +76,6 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `removed counters for {count} unhealthy workers: {urls}` | Batch cleanup of counters for unhealthy workers | Normal operation log | | `[stats] total_running={n}, workers: [{loads}], cache_hit_rate={rate}% (hits={hits}/total={total})` | Periodic stats: total requests, worker loads, cache hit rate | Normal operation log, useful for monitoring and tuning | | `Parsing completed; starting worker selection.` | Request parsing completed, starting worker selection | Normal operation log | -| `Request completed with an error.` | Request processing completed with an error | Check backend instance status | | `[SelectWorkerPair] decode selection failed, releasing prefill counter url={url}` | Decode selection failed in PD disaggregated mode, releasing Prefill counter | Error handling log | | `[prefill] first chunk received, release counter url={url}` | Prefill streaming response received first chunk, counter released | Normal operation log | | `[prefill] non-stream prefill response done, release counter url={url}` | Prefill non-streaming response completed, counter released | Normal operation log | @@ -66,12 +83,17 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `[prefill] release in defer (fallback) url={url}, isStream={bool}` | Fallback resource release when Prefill request exits abnormally | Error handling log | | `[prefill] release in CommonCompletions defer (error path) url={url}` | Prefill resource release on error path | Error handling log | | `cache-aware prefill: final strategy: process_tokens, reason: strategy not initialized` | cache_aware strategy not initialized, falling back to process_tokens | Check cache_aware configuration | +| `cache-aware prefill: 
final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | Tokenization failed, falling back to process_tokens strategy | Prefill scheduling temporarily does not use cache_aware strategy; normal request processing is not affected | | `cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads={loads}. ts_ms={ts}` | Load imbalanced across instances, falling back to process_tokens strategy | Normal operation log, automatic load balancing switch | | `cache-aware prefill: final strategy: cache_aware_scoring, selected={url}, loads={loads}, hitRatios={ratios}. ts_ms={ts}` | cache_aware scoring strategy selected a worker | Normal operation log, showing loads and hit ratios | | `[{method}] {path} {proto} {status} {latency} {clientIP}` | HTTP request access log | Normal operation log, records basic info for each request | | `before SelectWorker prefill. ts_ms={ts}` | Starting Prefill worker selection in PD disaggregated mode | Normal operation log, for performance tracing | | `before SelectWorker decode, after prefill. ts_ms={ts}` | Starting Decode worker selection after Prefill selection | Normal operation log, for performance tracing | | `after SelectWorker decode, before return. 
ts_ms={ts}` | Decode worker selection completed | Normal operation log, for performance tracing | +| `unhealthy worker counter preserved (inflight requests): {url}, count: {count}` | Unhealthy worker still has in-flight requests, counter temporarily preserved | Normal operation log, will be auto-cleaned after in-flight requests complete | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {tokens}` | Unhealthy worker still has in-flight token load, token counter temporarily preserved | Normal operation log, will be auto-cleaned after in-flight requests complete | +| `cleanup unhealthy worker token counter: {url}` | Cleaned up token counter for unhealthy worker | Normal operation log | +| `preserved counters for {count} workers with inflight requests: {urls}` | Batch preserved counters for workers with in-flight requests | Normal operation log | ### Debug-Level Logs @@ -100,6 +122,10 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `{"error": "Failed to build disaggregate_info"}` | 500 | Failed to build PD disaggregation communication info | Registration parameters (connector_port, device_ids, etc.) | | `{"error": "Invalid request body"}` | 400 | Failed to read request body | Request format | | `{"error": "Invalid JSON format"}` | 400 | Failed to parse request body JSON | Request format | +| `{"error": "Failed to encode modified request: {error}"}` | 500 | Failed to encode modified request body | Request content | +| `{"code": 500, "msg": "Internal server error"}` | 500 | A panic occurred during request processing and was recovered | Backend instance status, request content | + +> **Note**: In PD disaggregated (splitwise) mode, the above error responses include an additional `request_id` field, e.g., `{"error": "...", "request_id": "xxx"}`. Additionally, `Invalid request body` and `Invalid JSON format` responses include specific error details, e.g., `{"error": "Invalid request body: EOF"}`. 
### Registration Request Errors (/register) @@ -111,6 +137,7 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `{"code": 400, "msg": "splitwise mode only supports PREFILL/DECODE instances"}` | 400 | MIXED instances are not allowed in PD disaggregated mode | Deployment mode, instance role | | `{"code": 400, "msg": "only MIXED instances are allowed"}` | 400 | Only MIXED instances are allowed in centralized mode | Deployment mode, instance role | | `{"code": 400, "msg": "invalid InstanceInfo format: {error}"}` | 400 | Instance registration info validation failed | Registration parameters | +| `{"code": 400, "msg": "DefaultManager is nil"}` | 400 | Router internal manager not initialized | Router startup status | | `{"code": 200, "msg": "Register success"}` | 200 | Registration successful | — | ### Common Registration Parameter Validation Errors @@ -124,6 +151,10 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `port is required` | Missing port field | Add the port field | | `invalid port: {port}` | port is not a valid port number | Provide a port number in the range 1-65535 | | `invalid protocol: {protocol}` | Invalid transfer protocol | Use a valid protocol value: ipc / rdma | +| `invalid connector_port: {port}` | connector_port is not a valid port number | Provide a port number in the range 1-65535 | +| `invalid engine_worker_queue_port: {port}` | engine_worker_queue_port is not a valid port number | Provide a port number in the range 1-65535 | +| `invalid metrics_port: {port}` | metrics_port is not a valid port number | Provide a port number in the range 1-65535 | +| `rdma_ports[{index}] invalid port: {port}` | Port at index {index} in RDMA ports list is not valid | Provide a port number in the range 1-65535 | ## Troubleshooting Guide @@ -236,7 +267,7 @@ If `Failed to start server` appears in startup logs, check: When using the `cache_aware` scheduling strategy, the Router calls a Tokenizer 
service to tokenize requests for cache hit ratio computation. When the Tokenizer service is unavailable, the Router has a two-level degradation mechanism: 1. **Fallback to character-based tokenization** (common case): The log will show `tokenizer failed, fallback to char tokens`. The cache_aware strategy remains active, using character-based tokenization for cache matching instead of the Tokenizer. Cache hit accuracy may decrease, but normal request processing is not affected. -2. **Fallback to process_tokens strategy** (extreme case): When tokenization completely fails (e.g., empty request content), the log will show `tokenize failed, fallback to process_tokens`. The cache_aware strategy temporarily becomes inactive, and scheduling falls back to token processing volume. Normal request processing is not affected. +2. **Fallback to process_tokens strategy** (extreme case): When tokenization completely fails (e.g., empty request content), the log will show `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` (Info level). The cache_aware strategy temporarily becomes inactive, and scheduling falls back to token processing volume. Normal request processing is not affected. 
To restore full cache_aware functionality: diff --git a/docs/zh/online_serving/router.md b/docs/zh/online_serving/router.md index 0ace28c2da1..375f036ad2c 100644 --- a/docs/zh/online_serving/router.md +++ b/docs/zh/online_serving/router.md @@ -194,7 +194,7 @@ scheduler: policy: "power_of_two" # 调度策略(可选): random, power_of_two, round_robin, process_tokens, request_num, cache_aware, remote_cache_aware, fd_metrics_score, fd_remote_metrics_score; 默认: request_num prefill-policy: "cache_aware" # pd分离模式下prefill节点调度策略; 默认: process_tokens decode-policy: "request_num" # pd分离模式下decode节点调度策略; 默认: request_num - eviction-interval-secs: 60 # cache-aware策略清理过期cache的间隔时间 + eviction-interval-secs: 60 # cache-aware策略清理过期计数器的间隔时间 eviction-duration-mins: 30 # cache-aware策略radix tree节点驱逐时间(分钟); 默认: 30 balance-abs-threshold: 1 # cache-aware策略绝对阈值 balance-rel-threshold: 0.2 # cache-aware策略相对阈值 diff --git a/docs/zh/online_serving/router_faq.md b/docs/zh/online_serving/router_faq.md index a42ed015283..9c32726f4dc 100644 --- a/docs/zh/online_serving/router_faq.md +++ b/docs/zh/online_serving/router_faq.md @@ -29,6 +29,24 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `empty baseURL provided` | 健康检查时传入了空的基础 URL | 健康检查无法执行 | 注册参数 | | `failed to create request: {error}` | 创建健康检查请求失败 | 该实例可能被判定为不健康 | 网络环境 | | `failed to read response body: {error}` | 读取健康检查响应体失败 | 该实例可能被判定为不健康 | 后端实例状态 | +| `Failed to select mixed worker: {error}` | 集中式模式下选择 Mixed Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to select prefill worker: {error}` | PD 分离模式下选择 Prefill Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to read register request body: {error}` | 读取注册请求体失败 | 该注册请求返回 400 | 请求格式 | +| `Failed to unmarshal register request JSON: {error}` | 解析注册请求 JSON 失败 | 该注册请求返回 400 | 请求格式 | +| `Failed to create decode request for {url}: {error}` | 创建发往 Decode 实例的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Failed to create prefill request for {url}: {error}` | 创建发往 Prefill 实例的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Decode request failed 
for {url}: {error}` | 发往 Decode 实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | +| `Prefill request failed for {url}: {error}` | 发往 Prefill 实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | +| `Failed to read request body: {error}` | 读取推理请求体失败 | 当前请求返回 400 | 请求格式 | +| `Failed to unmarshal request JSON: {error}` | 解析推理请求 JSON 失败 | 当前请求返回 400 | 请求格式 | +| `Failed to select worker pair: {error}` | PD 分离模式下选择 Worker 对失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to build disaggregate_info: {error}` | 构建 PD 分离通信信息失败 | 当前请求返回 500 | 注册参数(connector_port、device_ids 等) | +| `Failed to encode modified request: {error}` | 编码修改后的请求体失败 | 当前请求返回 500 | 请求内容 | +| `Failed to select worker: {error}` | 集中式模式下选择 Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to connect to backend service: {error}` | 连接后端推理实例失败(已重试 3 次仍失败) | 当前请求返回 502 | 后端实例状态、网络连通性 | +| `Request failed (attempt {n}/{max}): {error}` | 请求发送第 {n} 次尝试失败 | 若重试耗尽则请求返回 502 | 后端实例状态、网络连通性 | +| `Failed to create backend request for {url}: {error}` | 创建发往后端的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Backend request failed for {url}: {error}` | 发往后端实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | ### Warn 级别日志 @@ -37,8 +55,9 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `Server {url} is not healthy` | 该 URL 对应的实例未通过健康检查 | Router 无法注册该实例,或将该实例从已注册列表中移除 | 健康状况 | | `Instance {url} role is unknown` | 实例角色无法识别 | 该实例不会被加入调度列表 | 注册参数 | | `cache-aware prefill: tokenizer failed, fallback to char tokens: {error}` | Tokenizer 服务调用失败,已自动回退至字符级分词 | cache_aware 策略仍然生效,使用字符级分词代替 Tokenizer 进行缓存匹配,不影响正常请求处理 | Tokenizer 服务状态 | -| `cache-aware prefill: tokenize failed, fallback to process_tokens: {error}` | 分词彻底失败(如输入为空),回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | 请求内容、Tokenizer 服务状态 | -| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. 
ts_ms={ts}` | 分词失败(新格式),回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | 请求内容、Tokenizer 服务状态 | +| `GetRemoteMetrics failed for {url}, falling back to local counter: {error}` | 获取远程 metrics 失败,已回退至本地计数器 | 调度精度可能下降,不影响正常请求处理 | 后端实例 metrics 端口、网络连通性 | +| `release worker: {url} skipped, counter already cleaned up` | 释放 Worker 计数器时发现已被清理 | 可能是 Worker 被健康检查移除后仍有在途请求完成 | 健康状况、请求时序 | +| `release worker: {url} skipped, counter already zero (possible double-release)` | 释放 Worker 计数器时发现已归零 | 可能存在计数器重复释放 | 请求处理逻辑 | ### Info 级别日志 @@ -49,7 +68,6 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `Successfully registered instance from index {index}` | 配置文件中的实例注册成功 | 正常启动日志 | | `No instances found in config file {path}` | 注册配置文件中未找到实例信息 | 请检查 register.yaml 内容是否为空 | | `Request completed successfully.` | 请求处理完成 | 正常运行日志 | -| `Request failed, retrying...` | 请求失败,正在进行重试 | Router 最多重试 3 次 | | `select worker (prefill): {url}, tokens: {tokens}` | Prefill 调度选中 Worker,显示当前 token 处理量 | 正常运行日志 | | `select worker ({type}): {url}, count: {count}` | Decode/Mixed 调度选中 Worker,显示当前请求并发数 | 正常运行日志 | | `release worker: {url}, count: {count}` | 请求结束,释放 Worker 计数器 | 正常运行日志 | @@ -58,7 +76,6 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `removed counters for {count} unhealthy workers: {urls}` | 批量清理不健康 Worker 的计数器 | 正常运行日志 | | `[stats] total_running={n}, workers: [{loads}], cache_hit_rate={rate}% (hits={hits}/total={total})` | 周期性统计:总请求数、各 Worker 负载、缓存命中率 | 正常运行日志,用于监控调优 | | `Parsing completed; starting worker selection.` | 请求解析完成,开始选择 Worker | 正常运行日志 | -| `Request completed with an error.` | 请求处理完成但发生错误 | 请排查后端实例状态 | | `[SelectWorkerPair] decode selection failed, releasing prefill counter url={url}` | PD 分离模式下 Decode 选择失败,释放 Prefill 计数器 | 异常处理日志 | | `[prefill] first chunk received, release counter url={url}` | Prefill 流式响应收到首个数据块,释放计数器 | 正常运行日志 | | `[prefill] non-stream prefill response done, release counter url={url}` | Prefill 非流式响应完成,释放计数器 | 正常运行日志 | @@ -72,6 +89,11 @@ Router 
的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `before SelectWorker prefill. ts_ms={ts}` | PD 分离模式下开始选择 Prefill Worker | 正常运行日志,用于性能追踪 | | `before SelectWorker decode, after prefill. ts_ms={ts}` | Prefill 选择完成后开始选择 Decode Worker | 正常运行日志,用于性能追踪 | | `after SelectWorker decode, before return. ts_ms={ts}` | Decode Worker 选择完成 | 正常运行日志,用于性能追踪 | +| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | 分词失败,回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | +| `unhealthy worker counter preserved (inflight requests): {url}, count: {count}` | 不健康 Worker 仍有在途请求,计数器暂时保留 | 正常运行日志,待在途请求完成后自动清理 | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {tokens}` | 不健康 Worker 仍有在途 token 负载,token 计数器暂时保留 | 正常运行日志,待在途请求完成后自动清理 | +| `cleanup unhealthy worker token counter: {url}` | 清理不健康 Worker 的 token 计数器 | 正常运行日志 | +| `preserved counters for {count} workers with inflight requests: {urls}` | 批量保留仍有在途请求的 Worker 计数器 | 正常运行日志 | ### Debug 级别日志 @@ -100,6 +122,10 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `{"error": "Failed to build disaggregate_info"}` | 500 | 构建 PD 分离通信信息失败 | 注册参数(connector_port、device_ids 等) | | `{"error": "Invalid request body"}` | 400 | 请求体读取失败 | 请求格式 | | `{"error": "Invalid JSON format"}` | 400 | 请求体 JSON 解析失败 | 请求格式 | +| `{"error": "Failed to encode modified request: {error}"}` | 500 | 编码修改后的请求体失败 | 请求内容 | +| `{"code": 500, "msg": "Internal server error"}` | 500 | 请求处理过程中发生 panic 并被恢复 | 后端实例状态、请求内容 | + +> **说明**:在 PD 分离(splitwise)模式下,以上错误响应会额外包含 `request_id` 字段,如 `{"error": "...", "request_id": "xxx"}`。此外,`Invalid request body` 和 `Invalid JSON format` 的实际输出会包含具体的错误详情,如 `{"error": "Invalid request body: EOF"}`。 ### 注册请求错误(/register) @@ -112,6 +138,7 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `{"code": 400, "msg": "only MIXED instances are allowed"}` | 400 | 集中式模式下只允许注册 MIXED 实例 | 部署模式、实例角色 | | `{"code": 400, "msg": "invalid InstanceInfo format: {error}"}` | 400 | 
实例注册信息校验失败 | 注册参数 | | `{"code": 200, "msg": "Register success"}` | 200 | 注册成功 | — | +| `{"code": 400, "msg": "DefaultManager is nil"}` | 400 | Router 内部管理器未初始化 | Router 启动状态 | ### 常见注册参数校验错误 @@ -124,6 +151,10 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `port is required` | 缺少 port 字段 | 添加 port 字段 | | `invalid port: {port}` | port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | | `invalid protocol: {protocol}` | 传输协议不合法 | 使用合法的协议值:ipc / rdma | +| `invalid connector_port: {port}` | connector_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `invalid engine_worker_queue_port: {port}` | engine_worker_queue_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `invalid metrics_port: {port}` | metrics_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `rdma_ports[{index}] invalid port: {port}` | RDMA 端口列表中第 {index} 个端口号不合法 | 填写 1-65535 范围内的端口号 | ## 常见问题排查方式 @@ -236,7 +267,7 @@ PD 分离模式下建议完整配置以下参数,以确保 KV Cache 传输正 使用 `cache_aware` 调度策略时,Router 会调用 Tokenizer 服务对请求进行分词以计算缓存命中率。当 Tokenizer 服务不可用时,Router 内置了两级退化机制: 1. **回退至字符级分词**(常见情况):日志出现 `tokenizer failed, fallback to char tokens`。此时 cache_aware 策略仍然生效,只是使用字符级分词代替 Tokenizer 进行缓存匹配,缓存命中精度会有所下降,但不影响正常请求处理。 -2. **回退至 process_tokens 策略**(极端情况):当分词彻底失败(如请求内容为空)时,日志出现 `tokenize failed, fallback to process_tokens`。此时 cache_aware 策略暂时不生效,改为按 token 处理量进行调度,同样不影响正常请求处理。 +2. **回退至 process_tokens 策略**(极端情况):当分词彻底失败(如请求内容为空)时,日志出现 `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. 
ts_ms={ts}`(Info 级别)。此时 cache_aware 策略暂时不生效,改为按 token 处理量进行调度,同样不影响正常请求处理。 如需恢复 cache_aware 策略的完整功能: diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md new file mode 100644 index 00000000000..6534fb332f2 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -0,0 +1,119 @@ +--- +name: stat-cache-hitrate +description: > + 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 + 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 + Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 + 持续监控模式。 + + 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 + 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 + 关键词:cache 命中率、hitRatio、cache-aware、prefix hit、session hit rate、 + 缓存预热、/stat-cache-hitrate。 + +IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析规则。 +--- + +# Cache Hit Rate Statistics + +统计 FastDeploy Go Router 的三层 cache 命中率,生成可视化报告。 + +## 执行前交互 + +运行脚本前,Claude 必须先向用户确认以下参数: + +### 1. 日志文件路径 +使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +- 选项 1: `logs/router.log`(默认) +- 选项 2: `fd-router.log`(golang_router 根目录) +- 选项 3: 用户通过 Other 输入自定义路径 + +**重要规则**: +- 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 +- 用户指定路径后不要质疑、推荐替代文件、或以任何理由尝试切换到其他文件 +- 支持绝对路径(如 `/home/user/logs/xxx.log`)和相对路径(如 `logs/fd-router (2).log`) + +如果用户直接确认或未指定路径,使用默认值 `logs/router.log`。 + +### 2. 分析模式 +向用户询问分析模式: +> "请选择分析模式: +> 1. **全量统计**(默认)— 扫描完整日志 +> 2. **快速查看尾部** — 只看最近的数据(可指定行数如 2000 或时间如 30m) +> 3. **持续监控** — 全量分析后提示监控命令 +> 4. **指定时间段** — 分析特定时间范围(如 `--start "16:00" --end "17:00"`)" + +如果用户未选择,默认使用全量统计。 + +`--start/--end` 与 `--tail` 互斥。`--start` 和 `--end` 可单独或同时指定。 +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 +缺失部分自动从日志首末行推断。 + +### 3. 
输出目录 +分析结果默认保存到 `skill_output/stat-cache-hitrate/<timestamp>/`(自动按运行时间创建子目录)。 +用户可通过 `--output` 指定自定义目录。 + +## 使用方式 + +运行统计脚本(相对于 `fastdeploy/golang_router/` 目录): + +```bash +# 全量统计 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --output skill_output/stat-cache-hitrate/ + +# 快速查看尾部数据 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail # 默认最后 2000 行 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 5000 # 指定行数 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 指定时间 + +# 持续监控 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --watch + +# 指定时间段(--start 和 --end 可单独或同时使用) +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "16:00:00" --end "17:00:00" +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "2026/03/31 16:00:00" +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" +``` + +默认日志路径:`logs/router.log` 或 `fd-router.log`(相对于 `fastdeploy/golang_router/`)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate/<timestamp>/`。 + +脚本会自动根据文件大小选择解析策略:小文件(<5000 行)在内存中处理,大文件用 grep + 管道流式处理。 + +## 输出说明 + +### 三层指标 + +| 层级 | 指标 | 含义 | +|------|------|------| +| 第一层 | Prefix Hit Ratio | 被选中 worker 的 KV cache 命中率,反映内容级复用度 | +| 第二层 | Session Hit Rate | 带 session_id 的请求被路由到同一 worker 的比例 | +| 第三层 | Per-Worker Stats | 每个 prefill worker 被选中的次数和平均命中率排名 | + +### 输出文件位置 + +详细报告和图表输出到 `skill_output/stat-cache-hitrate/<timestamp>/` 目录,每次运行自动创建带时间戳的子目录。 + +- 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 +- `details/per_window_data.md` — 每5s窗口的完整明细数据(Prefix HR / Session HR / Scoring / Fallback / Running) + +### 交叉诊断矩阵 + +| Session HR | Prefix HR | 诊断 | +|------------|-----------|------| +| 高 | 高 | cache-aware 策略运行良好 | +| 高 | 低 | session 粘性好但 prompt 内容变化大,KV cache 实际复用低 | +| 低 | 高 | 换 worker 了但新 worker 
也有类似前缀缓存 | +| 低 | 低 | 负载均衡强制分散或缓存未预热 | + +## 重要规则 + +1. **`[stats]` 计数器 per-interval**:每 5s `atomic.Swap(0)` 重置,必须 sum 所有行计算累计值 +2. **Session HR 只统计带 session_id 的请求** +3. **Prefix HR 取 selected worker 的值**:不在 hitRatios map 中则为 0 +4. **此 skill 只关注 cache 命中率**:延迟/错误/健康等排查由 troubleshoot skill 负责 +5. **与 troubleshoot-cache 互补**:本 skill 做数值统计,troubleshoot-cache 做调度策略诊断 + +## 参考文件 + +- `references/log_formats.md` — 日志格式和解析规则 +- `references/report_templates.md` — 终端报告和详细导出的模板 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json new file mode 100644 index 00000000000..23c7f6d86aa --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json @@ -0,0 +1,18 @@ +[ + {"query": "帮我统计一下 router 的 cache 命中率,日志在 logs/fd-router.log", "should_trigger": true}, + {"query": "我想看看 cache-aware 调度的效果怎么样,hitRatio 数据是多少", "should_trigger": true}, + {"query": "prefix hit ratio 和 session hit rate 分别是多少?分析一下 logs/router.log", "should_trigger": true}, + {"query": "看一下最近30分钟的缓存预热情况,用 tail 模式快速扫一下", "should_trigger": true}, + {"query": "我刚部署了新的 cache-aware 策略,帮我跑一下 /stat-cache-hitrate 看看效果", "should_trigger": true}, + {"query": "每个 prefill worker 的缓存利用率排名是怎样的?哪个 worker 命中率最高", "should_trigger": true}, + {"query": "stat cache hitrate on our go router log, need to check the KV cache reuse rate", "should_trigger": true}, + {"query": "持续监控 cache 命中率变化趋势,我想看实时数据", "should_trigger": true}, + {"query": "router 最近老是返回 502,帮我排查一下什么问题", "should_trigger": false}, + {"query": "分析一下 router 的请求延迟,p99 是不是太高了", "should_trigger": false}, + {"query": "帮我 trace 一下这个请求 ID: abc-123-def,看看整个链路", "should_trigger": false}, + {"query": "Worker 健康状态怎么样?有没有频繁下线的", "should_trigger": false}, + {"query": "帮我写一个 Go 语言的 HTTP 路由框架", "should_trigger": false}, + {"query": "分析一下 nginx 的 access log,统计各个 URL 的访问量", "should_trigger": false}, + {"query": "router 负载不均衡,某些 
worker 的 running 计数异常高", "should_trigger": false}, + {"query": "帮我看看 FastDeploy 的部署文档,我想部署一个新模型", "should_trigger": false} +] diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md new file mode 100644 index 00000000000..bc29a4cbb25 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md @@ -0,0 +1,139 @@ +# 日志格式参考 + +本文件描述 FastDeploy Go Router 的日志格式和解析规则。统计 cache 命中率前必须阅读。 + +--- + +## 通用日志行格式 + +``` +[LEVEL] YYYY/MM/DD HH:MM:SS logger.go:: +``` + +- **Level**:`[INFO]`、`[ERROR]`、`[WARN]`、`[DEBUG]` +- **Timestamp**:`YYYY/MM/DD HH:MM:SS` +- **可选 context 前缀**:`[trace_id:...]`、`[req_id:...]`、`[session_id:...]`、`[request_id:...]` 可能出现在 `logger.go:XX:` 和实际消息之间,顺序固定(trace_id → req_id → session_id → request_id),但不一定全部出现 + +--- + +## 类别 A:Cache-Aware 策略行 + +### A1. cache_aware_scoring(正常走 cache-aware 路径) + +``` +[INFO] 2026/03/30 20:16:57 logger.go:79: [session_id:slimshetty/swebench-verified:sweb.eval.x86_64.psf__requests-1766] [request_id:565a594c-...] cache-aware prefill: final strategy: cache_aware_scoring, selected=http://10.52.95.17:9263, loads=map[http://10.52.95.146:9263:20 http://10.52.95.17:9263:20 ...], hitRatios=map[http://10.52.95.17:9263:100]. ts_ms=2026-03-30 20:16:57.021 +``` + +**提取字段**: +- `selected=` — 被选中的 worker URL,格式 `http://IP:PORT` +- `hitRatios=map[...]` — Go map 格式,详见下方解析规则 +- `loads=map[...]` — 各 worker 的负载 + +### A2. process_tokens fallback(未走 cache-aware 路径) + +``` +cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads=map[...] 
+cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: +cache-aware prefill: final strategy: process_tokens, reason: strategy not initialized +``` + +--- + +## 类别 B:Stats 行 + +``` +[INFO] 2026/03/30 20:14:38 logger.go:79: [stats] total_running=14, workers: [http://10.52.96.143:9867: running=0, http://10.52.95.26:9867: running=1, ...], cache_hit_rate=0.00% (hits=0/total=7) +``` + +**提取字段**: +- `total_running=` — 所有 worker 的运行请求总数 +- `workers: [...]` — 各 worker 的 `running=N` +- `cache_hit_rate=%` — 该窗口的命中率百分比 +- `(hits=/total=)` — 该 5s 窗口的命中次数和总次数 + +**关键**:`hits` 和 `total` 是 **per-interval** 的,代码使用 `atomic.Swap(0)` 每 5s 重置为 0。 + +--- + +## 类别 C:推理请求行 + +``` +[INFO] 2026/03/30 18:25:49 logger.go:79: [POST] /v1/chat/completions HTTP/1.1 200 2.798235ms 10.52.95.139 +``` + +格式:`[METHOD] /path HTTP/1.1 ` + +延迟单位可能是 `s`、`ms`、`µs`/`us`。 + +**注意**:仅 `POST /v1/chat/completions` 和 `POST /v1/completions` 为推理请求。其余路径(`/register`、`/registered_number`、`/registered`、`/health_generate`、`/metrics`)为管理/监控请求,统计推理吞吐量时应排除。 + +--- + +## Go Map 解析规则 + +Go 的 `fmt.Sprintf("%v", map)` 输出格式:`map[key1:val1 key2:val2 ...]` + +### hitRatios 的特殊挑战 + +Worker URL 包含 `:`(如 `http://10.52.95.17:9263`),而 Go map 的 key-value 分隔符也是 `:`。 +因此 `hitRatios=map[http://10.52.95.17:9263:100]` 中: +- URL = `http://10.52.95.17:9263` +- Ratio = `100` + +### 推荐解析方法 + +**方法 1:正则匹配**(推荐) + +提取 `hitRatios=map[` 和 `]` 之间的内容,然后用正则匹配每个 entry: + +``` +正则:(http://[^\s:]+:\d+):(\d+) +``` + +示例: +``` +输入:http://10.52.95.17:9263:100 http://10.52.96.143:9867:50 +匹配1:group1=http://10.52.95.17:9263, group2=100 +匹配2:group1=http://10.52.96.143:9867, group2=50 +``` + +**方法 2:从右分割** + +对 map 内容按空格分割每个 token,然后对每个 token 找最后一个 `:` 分割: +``` +token = "http://10.52.95.17:9263:100" +lastColon = 最后一个 ":" 的位置 +url = token[:lastColon] → "http://10.52.95.17:9263" +ratio = token[lastColon+1:] → "100" +``` + +### 空 map + +`hitRatios=map[]` 表示冷启动,没有任何 worker 有匹配的前缀缓存。 + +### loads map 解析 + +同样的规则适用于 `loads=map[...]`,value 
是负载数: +``` +loads=map[http://10.52.95.146:9263:20 http://10.52.95.17:9263:20] +``` + +### workers 列表解析(stats 行) + +`workers: [http://10.52.96.143:9867: running=0, ...]` 格式不同: +- 用 `,` 分割每个 entry +- 每个 entry 格式:`http://IP:PORT: running=N` +- 注意 URL 后面跟的是 `: running=`(带空格),不是 Go map 的 `:val` + +--- + +## 时间戳解析 + +日志时间戳格式:`YYYY/MM/DD HH:MM:SS` + +提取正则:`(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})` + +用于: +- 确定日志时间跨度 +- 按时间分窗口(5s、1min 等) +- 按 quartile 分段统计趋势 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md new file mode 100644 index 00000000000..dcef9c47498 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -0,0 +1,199 @@ +# 报告输出模板 + +本文件包含 cache 命中率分析报告的终端输出模板和详细数据导出模板。 + +--- + +## 终端概览报告模板 + +``` +## Cache Hit Rate Analysis Report +**File**: | **Lines**: N | **Span**: ~ () + +### 1. Prefix Hit Ratio (KV Cache 内容复用度) + 累计平均: XX.X% (被选中 worker) + 分布: + 0-20% ██░░░░░░░░░░░░░░░░░░ X% (N=xxx) + 20-40% ███░░░░░░░░░░░░░░░░░ X% (N=xxx) + 40-60% █████░░░░░░░░░░░░░░░ X% (N=xxx) + 60-80% ████████████░░░░░░░░ X% (N=xxx) + 80-100% ████████████████████ X% (N=xxx) + 冷启动率: X.X% + 趋势: Q1=X% → Q2=X% → Q3=X% → Q4=X% ↑/↓/→ + + Prefix Hit Ratio (5s 窗口): + 100%| ····················· + 80%| ····· ··· + 60%| ····· + 40%| ····· + 20%| ······ + 0%|···· + +---+---+---+---+---+---+---+---+---+---→ time + 18:25 18:26 18:27 18:28 18:29 18:30 + +### 2. Session Hit Rate (请求级路由粘性) + 累计: XX.X% (hits=N / total=N) + 覆盖率: X.X% 的推理请求带 session_id + 趋势: Q1=X% → Q2=X% → Q3=X% → Q4=X% + + Session Hit Rate (5s 窗口): + 100%| ···················· + 80%| ·········· + 60%| ··········· + 40%| + 20%| + 0%|······· + +---+---+---+---+---+---+---+---+---+---→ time + +### 3. 
Per-Worker Cache Stats + ┌───────────────────────────┬──────────┬──────────┬─────────────────┐ + │ Prefill Worker │ Selected │ Select % │ Avg Hit(Select) │ + ├───────────────────────────┼──────────┼──────────┼─────────────────┤ + │ http://10.52.95.17:9263 │ 1,234 │ 15.2% │ 82% │ + │ http://10.52.96.143:9867 │ 890 │ 11.0% │ 74% │ + │ ... │ ... │ ... │ ... │ + └───────────────────────────┴──────────┴──────────┴─────────────────┘ + +### 4. Scheduling Strategy + cache_aware_scoring: N (X%) | fallback: N (X%) + fallback reasons: load_imbalanced=N, tokenize_failed=N, not_initialized=N + 非最优命中选择: X% (负载均衡优先于命中率的比例) + +### 5. Diagnosis + ✅/⚠/❌ <综合诊断> + +📄 详细数据见: skill_output/stat-cache-hitrate//cache_hitrate_report_.md +``` + +--- + +## 格式规则 + +### Unicode 柱状图 + +- 总宽度 20 个字符 +- `█` 表示已填充部分,`░` 表示空白部分 +- 后跟百分比和绝对数量 + +``` +计算方法: +filled = round(percentage / 100 * 20) +bar = "█" * filled + "░" * (20 - filled) +output = f"{bar} {percentage}% (N={count})" +``` + +示例: +``` +████████████░░░░░░░░ 60% (N=1200) +██████████████████░░ 90% (N=1800) +██░░░░░░░░░░░░░░░░░░ 10% (N=200) +``` + +### ASCII 折线图 + +- Y 轴:0-100% 范围,6 行(0%, 20%, 40%, 60%, 80%, 100%) +- X 轴:时间,标注关键时间点 +- 数据点用 `·` 绘制 +- 坐标轴用 `|` `+` `─` `→` + +``` +时间粒度自动调整: +- 日志跨度 <30min → 5s 原始粒度 +- 日志跨度 <3h → 1min 粒度 +- 日志跨度 >3h → 5min 粒度 +``` + +图表宽度约 60 列。数据点太多时自动聚合到更粗的粒度。 + +### 表格 + +使用 Unicode box-drawing 字符: + +``` +┌ ─ ┬ ─ ┐ 顶部 +│ │ │ 数据行 +├ ─ ┼ ─ ┤ 分隔行 +│ │ │ 数据行 +└ ─ ┴ ─ ┘ 底部 +``` + +### 趋势箭头 + +- `↑` — 上升趋势(Q4 > Q1 + 10%) +- `↓` — 下降趋势(Q4 < Q1 - 10%) +- `→` — 稳定(变化 < 10%) + +--- + +## 详细数据导出模板 + +主报告:`skill_output/stat-cache-hitrate//cache_hitrate_report_.md` +每窗口明细:`skill_output/stat-cache-hitrate//details/per_window_data.md` + +### 主报告 + +```markdown +# Cache Hit Rate Detailed Report + +**Generated**: +**Source**: + +## 1. 
Per-Worker 完整统计 + +| Worker | Selected | Select % | Avg Hit (Selected) | Avg Hit (All) | Max Hit | +|--------|----------|----------|--------------------|----- ---------|---------| +| http://10.52.95.17:9263 | 1,234 | 15.2% | 82% | 68% | 100% | +| ... | ... | ... | ... | ... | ... | + +## 2. Fallback 明细 + +### 3.1 load imbalanced (N 次) +| Time | Loads | +|------|-------| +| 20:15:03 | map[...] | + +### 3.2 tokenize failed (N 次) +| Time | Error | +|------|-------| +| ... | ... | + +## 4. 非最优命中选择明细 + +| Time | Selected | Selected HR | Best Worker | Best HR | Load Diff | +|------|----------|-------------|-------------|---------|-----------| +| 20:15:10 | w1:9263 | 60% | w2:9867 | 85% | w1=5, w2=18 | +| ... | ... | ... | ... | ... | ... | +``` + +--- + +## --tail 快速查看模板 + +`--tail` 模式下只输出核心指标: + +``` +## Cache Hit Rate (Recent) +**File**: | **tail lines** | **Span**: ~ + + Prefix Hit Ratio: XX.X% (avg) | Cold start: X.X% + Session Hit Rate: XX.X% (hits=N/total=N) | Coverage: X.X% + Strategy: scoring N (X%) | fallback N (X%) + + Recent trend (1min buckets): + 100%| ····· + 80%| ····· + 60%|····· + +---+---+---+---+---→ + -5m -4m -3m -2m -1m + +💡 持续跟踪: /loop 30s /analyze-cache-hitrate --tail +``` + +## --watch 持续监控模板 + +`--watch` 模式先输出完整报告(同终端概览报告模板),末尾额外提示: + +``` +💡 全量分析完成。持续跟踪后续变化: + /loop 30s /analyze-cache-hitrate --tail +``` diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py new file mode 100644 index 00000000000..cc5534a757d --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +Chart — 终端可视化渲染工具 + +提供 sparkline 折线图、Unicode 柱状图、Markdown 表格的渲染函数。 +所有函数返回字符串(不直接打印),方便组装到报告中。 + +Python 3 stdlib only,零依赖。 +""" + + +# ════════════════════════════════════════════════════════════════ +# Sparkline 折线图 +# ════════════════════════════════════════════════════════════════ + 
# 8-level block characters used for sparkline rendering: index 0 = blank,
# index 8 = full block. Values are mapped linearly onto these 9 glyphs.
BLOCK_CHARS = " ▁▂▃▄▅▆▇█"


def render_sparkline(
    records, value_field="value", bucket_field="bucket", title=None, y_label=None, y_range=None, width=60
):
    """Render an 8-level Unicode sparkline chart.

    Args:
        records: list of dicts, each containing bucket_field and value_field
        value_field: name of the numeric field
        bucket_field: name of the time-bucket field
        title: chart title
        y_label: Y-axis label (e.g. '%')
        y_range: (min, max) tuple for the Y axis; None = auto-scale from data
        width: chart width in characters

    Returns:
        str: the rendered chart text (multi-line)
    """
    if not records:
        return " (no data)"

    # Collect all non-None values to establish the scale and min/max header.
    all_values = []
    for r in records:
        v = r.get(value_field)
        if v is not None:
            all_values.append(float(v))

    if not all_values:
        return " (no numeric data)"

    # Y-axis range: explicit y_range wins; otherwise auto-scale, widening a
    # degenerate (flat) range so division below is well-defined.
    if y_range:
        y_min, y_max = y_range
    else:
        y_min = min(all_values)
        y_max = max(all_values)
        if y_max == y_min:
            y_min = 0 if y_max > 0 else y_max - 1
            y_max = max(y_max, 1)

    y_span = y_max - y_min if y_max != y_min else 1

    # Downsample: when there are more records than columns, average each
    # chunk into one synthetic record carrying the chunk's first bucket label.
    n = len(records)
    if n > width:
        step = n / width
        sampled = []
        for i in range(width):
            start_idx = int(i * step)
            end_idx = int((i + 1) * step)
            chunk = records[start_idx:end_idx]
            vals = [float(r.get(value_field, 0)) for r in chunk if r.get(value_field) is not None]
            avg_record = {
                bucket_field: chunk[0].get(bucket_field, ""),
                value_field: sum(vals) / len(vals) if vals else 0,
            }
            sampled.append(avg_record)
        records = sampled

    lines = []

    # Header line: title plus min/max of the *original* (pre-downsampling) data.
    def fmt_val(v):
        # Adaptive precision: large values lose decimals, small values keep two.
        if abs(v) >= 1000:
            return f"{v:.0f}"
        elif abs(v) >= 10:
            return f"{v:.1f}"
        return f"{v:.2f}"

    header_parts = []
    if title:
        header_parts.append(title)
    header_parts.append(f"min={fmt_val(min(all_values))}")
    header_parts.append(f"max={fmt_val(max(all_values))}")
    if y_label:
        header_parts.append(f"({y_label})")
    lines.append(" " + " ".join(header_parts))

    # Sparkline characters: normalize each value into [0, 1] then map onto the
    # 9-glyph scale (0..8). None values render as a gap.
    spark_chars = []
    for r in records:
        v = r.get(value_field)
        if v is None:
            spark_chars.append(" ")
            continue
        v = float(v)
        normalized = (v - y_min) / y_span
        level = max(0, min(8, round(normalized * 8)))
        spark_chars.append(BLOCK_CHARS[level])
    lines.append(" " + "".join(spark_chars))

    # X-axis labels: place a handful of shortened bucket labels under the
    # sparkline, skipping any label that would overlap the previous one.
    data_width = len(records)
    if data_width > 0:

        def short_bucket(r):
            # Keep only the time-of-day part and truncate to 5 chars (HH:MM).
            b = str(r.get(bucket_field, ""))
            if " " in b:
                b = b.split(" ")[-1]
            return b[:5] if len(b) >= 5 else b

        lbl_width = 6
        max_labels = max(1, data_width // lbl_width)
        n_records = len(records)

        if n_records <= 2:
            indices = list(range(n_records))
        elif n_records <= max_labels:
            indices = [0, n_records - 1]
        else:
            # Up to 5 labels, spread evenly across the record range.
            n_labels = min(5, max(2, max_labels))
            indices = [int(i * (n_records - 1) / (n_labels - 1)) for i in range(n_labels)]

        label_line = [" "] * (data_width + lbl_width + 2)
        last_end = -1
        for idx in indices:
            lbl = short_bucket(records[idx])
            pos = idx
            if pos < last_end:
                continue
            for ci, c in enumerate(lbl):
                p = pos + ci
                if p < len(label_line):
                    label_line[p] = c
            last_end = pos + len(lbl) + 1
        lines.append(" " + "".join(label_line).rstrip())

    return "\n".join(lines)


# ════════════════════════════════════════════════════════════════
# Unicode bar chart
# ════════════════════════════════════════════════════════════════


def render_bar(data, bar_width=20, show_count=False):
    """Render a Unicode horizontal bar chart.

    Args:
        data: list of dicts, each with label, value (percentage 0-100) and
            optionally count
        bar_width: bar width in characters
        show_count: whether to append the absolute count after the bar

    Returns:
        str: the rendered chart text (one line per entry)
    """
    if not data:
        return " (no data)"

    max_label_len = max(len(str(d.get("label", ""))) for d in data)
    max_label_len = max(max_label_len, 4)

    lines = []
    for d in data:
        label = str(d.get("label", ""))
        value = float(d.get("value", 0))
        count = d.get("count")

        # A non-zero value always shows at least one filled cell so small
        # percentages remain visible; clamp to the bar width.
        filled = round(value / 100 * bar_width) if value > 0 else 0
        filled = max(1, filled) if value > 0 else 0
        filled = min(bar_width, filled)
        empty = bar_width - filled
        bar = "█" * filled + "░" * empty

        line = f" {label:<{max_label_len}} {bar} {value:>5.1f}%"
        if show_count and count is not None:
            line += f" (N={count})"
        lines.append(line)

    return "\n".join(lines)


# ════════════════════════════════════════════════════════════════
# Markdown table
# ════════════════════════════════════════════════════════════════


def render_table(data, columns=None, right_align=None):
    """Render a Markdown table.

    Args:
        data: list of dicts (rows)
        columns: column names; None = use the keys of the first record
        right_align: set of column names to right-align

    Returns:
        str: the rendered table text
    """
    if not data:
        return " (no data)"

    if columns is None:
        columns = list(data[0].keys())
    if right_align is None:
        right_align = set()

    # Column widths: wide enough for the header and every cell value.
    col_widths = {}
    for col in columns:
        col_widths[col] = len(col)
        for row in data:
            val = str(row.get(col, ""))
            col_widths[col] = max(col_widths[col], len(val))

    # Header row and separator row.
    header_parts = []
    sep_parts = []
    for col in columns:
        w = col_widths[col]
        if col in right_align:
            header_parts.append(f" {col:>{w}} ")
        else:
            header_parts.append(f" {col:<{w}} ")
        sep_parts.append("-" * (w + 2))

    lines = []
    lines.append("|" + "|".join(header_parts) + "|")
    lines.append("|" + "|".join(sep_parts) + "|")

    # Data rows.
    for row in data:
        row_parts = []
        for col in columns:
            val = str(row.get(col, ""))
            w = col_widths[col]
            if col in right_align:
                row_parts.append(f" {val:>{w}} ")
            else:
                row_parts.append(f" {val:<{w}} ")
        lines.append("|" + "|".join(row_parts) + "|")

    return "\n".join(lines)
作为 CLI 工具:grep 'pattern' logfile | python3 log_parser.py parse-cache-strategy + +Python 3 stdlib only,零依赖。 +""" + +import argparse +import json +import re +import sys +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# 通用解析原语 +# ════════════════════════════════════════════════════════════════ + + +def parse_go_map(s): + """解析 Go fmt.Sprintf('%v', map) 输出:map[key1:val1 key2:val2 ...] + + 处理 URL 中冒号与 Go map key-value 分隔符的冲突(从最后一个冒号分割)。 + 空 map 'map[]' 返回空 dict。 + """ + inner_match = re.search(r"map\[(.*?)\]", s) + if not inner_match: + return {} + inner = inner_match.group(1).strip() + if not inner: + return {} + result = {} + for token in inner.split(): + idx = token.rfind(":") + if idx > 0: + key = token[:idx] + val_str = token[idx + 1 :] + try: + result[key] = int(val_str) if "." not in val_str else float(val_str) + except ValueError: + result[key] = val_str + return result + + +# 时间戳:YYYY/MM/DD HH:MM:SS +TS_RE = re.compile(r"(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})") + +# ts_ms:2025-01-15 18:25:33.123 +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + + +def extract_ts(line): + """从日志行提取 YYYY/MM/DD HH:MM:SS 时间戳。""" + m = TS_RE.search(line) + return m.group(1) if m else None + + +def parse_ts(ts_str): + """将 YYYY/MM/DD HH:MM:SS 时间戳解析为 datetime。""" + return datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + + +# ════════════════════════════════════════════════════════════════ +# 时间范围过滤 +# ════════════════════════════════════════════════════════════════ + +import os +import subprocess +import tempfile + +_FULL_DT_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})\s+(\d{1,2}):(\d{2})(?::(\d{2}))?$") +_DATE_ONLY_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$") +_SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$") +_TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$") + + +def _get_log_boundary_ts(log_file, which="first"): + 
"""从日志文件首行或末行提取时间戳。""" + cmd = "head" if which == "first" else "tail" + try: + r = subprocess.run([cmd, "-1", log_file], capture_output=True, text=True, timeout=5) + return extract_ts(r.stdout) if r.returncode == 0 else None + except (subprocess.TimeoutExpired, FileNotFoundError): + return None + + +def complete_time_arg(time_str, log_file, is_end=False): + """解析灵活时间输入,补全缺失部分。 + + 支持格式: + 'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 'YYYY/MM/DD', + 'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM' + + 补全规则: + - 缺年份:从日志首行取 + - 缺日期:从日志末行取 + - 缺时间:start→00:00:00, end→23:59:59 + + Returns: 'YYYY/MM/DD HH:MM:SS' 格式字符串 + """ + if time_str is None: + return None + time_str = time_str.strip() + + # Case 1: 完整日期时间 + m = _FULL_DT_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + h, mi = m.group(4).zfill(2), m.group(5) + s = (m.group(6) or "00").zfill(2) + return f"{y}/{mo}/{d} {h}:{mi}:{s}" + + # Case 2: 仅日期 YYYY/MM/DD + m = _DATE_ONLY_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + t = "23:59:59" if is_end else "00:00:00" + return f"{y}/{mo}/{d} {t}" + + # Case 3: 短日期 MM/DD 或 MM/DD HH:MM[:SS] + m = _SHORT_DATE_RE.match(time_str) + if m: + mo, d = m.group(1).zfill(2), m.group(2).zfill(2) + ts = _get_log_boundary_ts(log_file, "first") + year = ts[:4] if ts else "2026" + if m.group(3): # 有时间部分 + h, mi = m.group(3).zfill(2), m.group(4) + s = (m.group(5) or "00").zfill(2) + return f"{year}/{mo}/{d} {h}:{mi}:{s}" + t = "23:59:59" if is_end else "00:00:00" + return f"{year}/{mo}/{d} {t}" + + # Case 4: 仅时间 HH:MM[:SS] + m = _TIME_ONLY_RE.match(time_str) + if m: + h, mi = m.group(1).zfill(2), m.group(2) + s = (m.group(3) or "00").zfill(2) + ts = _get_log_boundary_ts(log_file, "last") + date_part = ts[:10] if ts else "2026/01/01" + return f"{date_part} {h}:{mi}:{s}" + + # Fallback: 原样返回 + return time_str + + +def filter_file_by_time_range(log_file, start_str=None, end_str=None): + """用 awk 
按时间范围预过滤日志文件。 + + 时间戳 YYYY/MM/DD HH:MM:SS 天然字典序可比,直接用 awk 字符串比较。 + 无时间戳的行(如 panic 堆栈续行)保留。 + + Args: + log_file: 原日志文件路径 + start_str: 起始时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + end_str: 结束时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if not start_str and not end_str: + return (log_file, False) + + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, prefix="router_filtered_") + tmp.close() + + awk_script = r"""{ + ts = "" + if (match($0, /[0-9]{4}\/[0-9]{2}\/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/)) { + ts = substr($0, RSTART, RLENGTH) + } + if (ts == "") { print; next } + if ((start == "" || ts >= start) && (end == "" || ts <= end)) print + }""" + + cmd = ["awk", "-v", f'start={start_str or ""}', "-v", f'end={end_str or ""}', awk_script, log_file] + + try: + with open(tmp.name, "w") as outf: + result = subprocess.run(cmd, stdout=outf, stderr=subprocess.PIPE, text=True, timeout=120) + if result.returncode != 0: + os.unlink(tmp.name) + return (log_file, False) + except (subprocess.TimeoutExpired, OSError): + if os.path.exists(tmp.name): + os.unlink(tmp.name) + return (log_file, False) + + return (tmp.name, True) + + +# Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] +TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") + + +def extract_tags(line): + """从日志行提取 context tag。""" + return {m.group(1): m.group(2) for m in TAG_RE.finditer(line)} + + +# ════════════════════════════════════════════════════════════════ +# Cache-Aware 策略行解析(类别 A) +# ════════════════════════════════════════════════════════════════ + +STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") +SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") + + +def parse_cache_strategy_line(line): + """解析 cache-aware prefill 策略行。 + + 输入示例: + [INFO] 2026/03/30 20:16:57 logger.go:79: ... 
cache-aware prefill: final strategy: + cache_aware_scoring, selected=http://10.52.95.17:9263, loads=map[...], hitRatios=map[...] + + 返回 dict 或 None(如果不是策略行)。 + """ + sm = STRATEGY_RE.search(line) + if not sm: + return None + + ts = extract_ts(line) + strategy = sm.group(1) + record = {"ts": ts or "", "strategy": strategy} + + # selected worker URL + sel_m = SELECTED_RE.search(line) + if sel_m: + record["selected"] = sel_m.group(1) + + # reason(仅 process_tokens fallback) + reason_m = REASON_RE.search(line) + if reason_m and strategy == "process_tokens": + record["reason"] = reason_m.group(1).strip() + + # hitRatios map + hr_match = re.search(r"hitRatios=(map\[.*?\])", line) + if hr_match: + hit_ratios = parse_go_map(hr_match.group(1)) + record["hitRatios"] = hit_ratios + if "selected" in record: + record["selected_hitRatio"] = hit_ratios.get(record["selected"], 0) + else: + record["hitRatios"] = {} + if "selected" in record: + record["selected_hitRatio"] = 0 + + # loads map + loads_match = re.search(r"loads=(map\[.*?\])", line) + if loads_match: + record["loads"] = parse_go_map(loads_match.group(1)) + + # ts_ms(精确到毫秒的调度时间戳) + ts_ms_m = TS_MS_RE.search(line) + if ts_ms_m: + record["ts_ms"] = ts_ms_m.group(1) + + # context tags + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# ════════════════════════════════════════════════════════════════ +# Stats 行解析(类别 B) +# ════════════════════════════════════════════════════════════════ + +TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") +WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") + + +def parse_stats_line(line): + """解析 [stats] 统计行。 + + 输入示例: + [INFO] 2026/03/30 20:14:38 logger.go:79: [stats] total_running=14, + workers: [...], cache_hit_rate=0.00% (hits=0/total=7) + + 注意:hits 和 total 是 per-interval 的(每 5s 重置),累计值必须 sum 所有行。 + + 返回 dict 或 None(如果不是 stats 行)。 + """ + if 
"[stats]" not in line: + return None + + ts = extract_ts(line) + record = {"ts": ts or ""} + + # total_running + tr_m = TOTAL_RUNNING_RE.search(line) + if tr_m: + record["total_running"] = int(tr_m.group(1)) + + # per-worker running + workers = {} + for wm in WORKER_RUNNING_RE.finditer(line): + workers[wm.group(1)] = int(wm.group(2)) + record["workers"] = workers + + # cache_hit_rate + hits/total + chr_m = CACHE_HR_RE.search(line) + if chr_m: + record["cache_hit_rate"] = float(chr_m.group(1)) + record["hits"] = int(chr_m.group(2)) + record["total"] = int(chr_m.group(3)) + + return record + + +# ════════════════════════════════════════════════════════════════ +# CLI 入口 +# ════════════════════════════════════════════════════════════════ + + +def _cli_parse_stream(parse_fn): + """通用 CLI 流式解析:从 stdin 读入日志行,输出 JSON Lines 到 stdout。""" + parsed = 0 + skipped = 0 + for line in sys.stdin: + line = line.rstrip("\n") + record = parse_fn(line) + if record: + print(json.dumps(record, ensure_ascii=False)) + parsed += 1 + else: + skipped += 1 + print(f"Parsed {parsed} lines, skipped {skipped}", file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser( + description="FastDeploy Go Router Log Parser", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + sub = parser.add_subparsers(dest="command") + + sub.add_parser("parse-cache-strategy", help="解析 cache-aware 策略行 → JSON Lines") + sub.add_parser("parse-stats", help="解析 [stats] 统计行 → JSON Lines") + + args = parser.parse_args() + + if args.command == "parse-cache-strategy": + _cli_parse_stream(parse_cache_strategy_line) + elif args.command == "parse-stats": + _cli_parse_stream(parse_stats_line) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py new file mode 100644 index 
#!/usr/bin/env python3
"""
stat_cache_hitrate — FastDeploy Go Router cache hit-rate statistics tool.

Computes the three-layer cache hit-rate metrics:
    1. Prefix Hit Ratio — KV-cache content reuse of the selected worker
    2. Session Hit Rate — request-level routing stickiness
    3. Per-Worker Stats — per-prefill-worker cache utilization ranking

Usage:
    python3 stat_cache_hitrate.py <logfile> [--tail N|Nm] [--watch] [--output DIR]
"""

import argparse
import json
import os
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timedelta

# Sibling modules live in the same scripts/ directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from chart import render_bar, render_sparkline, render_table
from log_parser import (
    complete_time_arg,
    extract_ts,
    filter_file_by_time_range,
    parse_cache_strategy_line,
    parse_stats_line,
    parse_ts,
)
from stats import compute_statistics, count_by, time_bucket

# ════════════════════════════════════════════════════════════════
# Phase 1: log reading
# ════════════════════════════════════════════════════════════════


def _is_time_tail(tail):
    """True when tail is a time-based spec like '30m' (vs. a line count)."""
    return isinstance(tail, str) and tail.endswith("m")


def count_lines(filepath):
    """Count file lines quickly via wc -l; 0 on failure."""
    result = subprocess.run(["wc", "-l", filepath], capture_output=True, text=True)
    if result.returncode == 0:
        return int(result.stdout.strip().split()[0])
    return 0


def read_lines(filepath, tail=None):
    """Read the log file, honoring both line-count and time-based tail modes."""
    if tail:
        if _is_time_tail(tail):
            # Time-based tail: read everything, then keep the last N minutes.
            minutes = int(tail[:-1])
            all_lines = _read_file_lines(filepath)
            return _filter_by_time(all_lines, minutes)
        else:
            # Line-count tail.
            n = int(tail)
            result = subprocess.run(["tail", "-n", str(n), filepath], capture_output=True, text=True)
            return result.stdout.splitlines() if result.returncode == 0 else []
    return _read_file_lines(filepath)


def _read_file_lines(filepath):
    """Read all lines; decoding errors are replaced rather than raised."""
    with open(filepath, "r", errors="replace") as f:
        return f.readlines()


def _filter_by_time(lines, minutes):
    """Keep only the log lines from the last N minutes.

    The reference point is the timestamp of the last timestamped line; lines
    without a timestamp (e.g. stack-trace continuations) are kept.
    """
    last_ts = None
    for line in reversed(lines):
        ts = extract_ts(line)
        if ts:
            last_ts = parse_ts(ts)
            break
    if not last_ts:
        return lines

    cutoff = last_ts - timedelta(minutes=minutes)
    result = []
    for line in lines:
        ts = extract_ts(line)
        if ts:
            try:
                if parse_ts(ts) >= cutoff:
                    result.append(line)
            except ValueError:
                result.append(line)
        else:
            result.append(line)
    return result


# ════════════════════════════════════════════════════════════════
# Phase 2: log extraction and parsing
# ════════════════════════════════════════════════════════════════

STRATEGY_PATTERN = "cache-aware prefill: final strategy:"
STATS_PATTERN = "[stats]"
INFERENCE_PATTERNS = ["] [POST] /v1/chat/completions ", "] [POST] /v1/completions "]
# grep -E equivalent of INFERENCE_PATTERNS, shared by both counting paths.
INFERENCE_GREP_PATTERN = r"\] \[POST\] /v1/chat/completions |\] \[POST\] /v1/completions "


def _shell_quote(s):
    """Single-quote s for the shell, safely handling spaces, parens and quotes."""
    return "'" + s.replace("'", "'\\''") + "'"


def grep_and_parse(filepath, grep_pattern, parse_cmd, tail=None):
    """Large-file mode: grep filter piped into the log_parser.py CLI.

    Only a numeric tail is applied here; time-based tails are handled by the
    caller (extract_data) via record-level post-filtering.
    """
    parser_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log_parser.py")

    if tail and not _is_time_tail(tail):
        grep_cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -F {_shell_quote(grep_pattern)} | python3 {_shell_quote(parser_path)} {parse_cmd}"
    else:
        grep_cmd = f"grep -F {_shell_quote(grep_pattern)} {_shell_quote(filepath)} | python3 {_shell_quote(parser_path)} {parse_cmd}"

    result = subprocess.run(grep_cmd, shell=True, capture_output=True, text=True)
    records = []
    for line in result.stdout.strip().splitlines():
        if line:
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                pass
    return records


def grep_count(filepath, grep_pattern, tail=None):
    """Large-file mode: count matching lines with grep -c.

    Only a numeric tail is applied here (see grep_and_parse).
    """
    if tail and not _is_time_tail(tail):
        cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -cE {_shell_quote(grep_pattern)}"
    else:
        cmd = f"grep -cE {_shell_quote(grep_pattern)} {_shell_quote(filepath)}"

    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    try:
        return int(result.stdout.strip())
    except ValueError:
        return 0


def _grep_lines(filepath, grep_pattern):
    """Large-file mode: return the matching lines themselves (grep -E)."""
    cmd = f"grep -E {_shell_quote(grep_pattern)} {_shell_quote(filepath)}"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return result.stdout.splitlines()


def _file_last_ts(filepath):
    """Last parseable timestamp in the file, scanning the final lines via tail."""
    result = subprocess.run(["tail", "-n", "200", filepath], capture_output=True, text=True)
    if result.returncode != 0:
        return None
    for line in reversed(result.stdout.splitlines()):
        ts = extract_ts(line)
        if ts:
            try:
                return parse_ts(ts)
            except ValueError:
                continue
    return None


def _filter_records_since(records, cutoff):
    """Keep records whose 'ts' is >= cutoff; records without a ts are kept."""
    kept = []
    for rec in records:
        ts = rec.get("ts")
        if not ts:
            kept.append(rec)
            continue
        try:
            if parse_ts(ts) >= cutoff:
                kept.append(rec)
        except ValueError:
            kept.append(rec)
    return kept


def _line_in_window(line, cutoff):
    """True when the line's timestamp is >= cutoff (untimestamped lines pass)."""
    ts = extract_ts(line)
    if not ts:
        return True
    try:
        return parse_ts(ts) >= cutoff
    except ValueError:
        return True


def extract_data(filepath, tail=None):
    """Extract and parse log data, auto-selecting a strategy by file size.

    Small files (<5000 lines) are processed in memory; large files go through
    grep + the log_parser CLI pipeline.

    Bug fix: a time-based tail ('30m') used to be silently ignored on the
    large-file path, so the whole file was analyzed instead of the requested
    window. It is now honored by post-filtering parsed records (and inference
    lines) against a cutoff derived from the file's last timestamp.

    Returns:
        tuple: (strategy_records, stats_records, inference_count, line_count)
    """
    total = count_lines(filepath)

    if total < 5000:
        # Small file: process in memory (read_lines already applies any tail).
        lines = read_lines(filepath, tail)
        strategy_recs = [r for l in lines if (r := parse_cache_strategy_line(l)) is not None]
        stats_recs = [r for l in lines if (r := parse_stats_line(l)) is not None]
        inference_count = sum(1 for l in lines if any(p in l for p in INFERENCE_PATTERNS))
        return strategy_recs, stats_recs, inference_count, len(lines)

    if _is_time_tail(tail):
        # Large file + time-based tail: grep the full file, then post-filter
        # by timestamp against the last N minutes of the log.
        strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy")
        stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats")
        inference_lines = _grep_lines(filepath, INFERENCE_GREP_PATTERN)
        last_ts = _file_last_ts(filepath)
        if last_ts is not None:
            cutoff = last_ts - timedelta(minutes=int(tail[:-1]))
            strategy_recs = _filter_records_since(strategy_recs, cutoff)
            stats_recs = _filter_records_since(stats_recs, cutoff)
            inference_lines = [l for l in inference_lines if _line_in_window(l, cutoff)]
        return strategy_recs, stats_recs, len(inference_lines), total

    # Large file, numeric tail or no tail: grep + subprocess pipeline.
    strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy", tail)
    stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats", tail)
    inference_count = grep_count(filepath, INFERENCE_GREP_PATTERN, tail)
    line_count = int(tail) if tail else total
    return strategy_recs, stats_recs, inference_count, line_count


# ════════════════════════════════════════════════════════════════
# Phase 3: three-layer metric computation
# ════════════════════════════════════════════════════════════════


def compute_prefix_hitrate(strategies):
    """Layer 1: Prefix Hit Ratio (KV-cache reuse of the selected worker).

    Only cache_aware_scoring records contribute; a record whose hitRatios map
    is empty counts as a cold start.
    """
    scoring_recs = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"]
    if not scoring_recs:
        return {"mean": 0, "stats": None, "distribution": [], "cold_start_rate": 0, "trend": [], "count": 0}

    hit_ratios = [r.get("selected_hitRatio", 0) for r in scoring_recs]
    cold_starts = sum(1 for r in scoring_recs if not r.get("hitRatios"))

    stats = compute_statistics(hit_ratios, distribution_spec="0-20,20-40,40-60,60-80,80-100")
    trend = time_bucket(scoring_recs, "auto", [("selected_hitRatio", "mean")])

    return {
        "mean": stats["mean"],
        "stats": stats,
        "distribution": stats.get("distribution", []),
        "cold_start_rate": round(cold_starts / len(scoring_recs) * 100, 1) if scoring_recs else 0,
        "trend": trend,
        "count": len(scoring_recs),
    }


def compute_session_hitrate(stats_recs, inference_count):
    """Layer 2: Session Hit Rate (request-level routing stickiness).

    [stats] counters are per 5s interval, so cumulative hits/total are the
    sums over all stats records. Coverage is the share of inference requests
    that carried a session_id.
    """
    total_hits = sum(r.get("hits", 0) for r in stats_recs)
    total_total = sum(r.get("total", 0) for r in stats_recs)

    session_hr = round(total_hits / total_total * 100, 1) if total_total else 0
    coverage = round(total_total / inference_count * 100, 1) if inference_count else 0

    # Trend: hits/total per time bucket.
    trend = time_bucket(stats_recs, "auto", [("hits", "sum"), ("total", "sum")])
    for t in trend:
        h = t.get("hits_sum", 0)
        tot = t.get("total_sum", 0)
        t["value"] = round(h / tot * 100, 1) if tot else 0

    return {
        "rate": session_hr,
        "hits": total_hits,
        "total": total_total,
        "coverage": coverage,
        "inference_count": inference_count,
        "trend": trend,
    }


def compute_per_worker_stats(strategies):
    """Layer 3: Per-Worker Cache Stats (selection counts and average hit ratio).

    Returns rows sorted by selection count, descending.
    """
    scoring_recs = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"]
    if not scoring_recs:
        return []

    worker_data = defaultdict(lambda: {"selected_count": 0, "hit_ratios": []})
    total_scoring = len(scoring_recs)

    for r in scoring_recs:
        selected = r.get("selected", "")
        if selected:
            worker_data[selected]["selected_count"] += 1
            worker_data[selected]["hit_ratios"].append(r.get("selected_hitRatio", 0))

    result = []
    for worker, data in worker_data.items():
        avg_hr = round(sum(data["hit_ratios"]) / len(data["hit_ratios"]), 1) if data["hit_ratios"] else 0
        result.append(
            {
                "Worker": worker.replace("http://", ""),
                "Selected": data["selected_count"],
                "Select%": f"{round(data['selected_count'] / total_scoring * 100, 1)}%",
                "AvgHitRatio": f"{avg_hr}%",
            }
        )

    result.sort(key=lambda x: x["Selected"], reverse=True)
    return result


def compute_scheduling_stats(strategies):
    """Scheduling-strategy overview: scoring vs. fallback, fallback reasons,
    and the share of suboptimal selections (load balancing winning over the
    best hit ratio)."""
    if not strategies:
        return {"scoring_count": 0, "fallback_count": 0, "scoring_pct": 0, "fallback_reasons": [], "suboptimal_pct": 0}

    scoring = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"]
    fallback = [r for r in strategies if r.get("strategy") == "process_tokens"]

    # Fallback reason breakdown.
    fallback_reasons = count_by(fallback, "reason") if fallback else []

    # Suboptimal-hit selections: selected worker's ratio below the map maximum.
    suboptimal = 0
    for r in scoring:
        hit_ratios = r.get("hitRatios", {})
        if not hit_ratios:
            continue
        selected_hr = r.get("selected_hitRatio", 0)
        max_hr = max(hit_ratios.values()) if hit_ratios else 0
        if selected_hr < max_hr:
            suboptimal += 1

    total = len(strategies)
    return {
        "scoring_count": len(scoring),
        "fallback_count": len(fallback),
        "scoring_pct": round(len(scoring) / total * 100, 1) if total else 0,
        "fallback_reasons": fallback_reasons,
        "suboptimal_count": suboptimal,
        "suboptimal_pct": round(suboptimal / len(scoring) * 100, 1) if scoring else 0,
    }


def cross_diagnose(prefix_hr, session_hr):
    """Cross-diagnosis matrix over the two hit rates (threshold: 60%)."""
    p_high = prefix_hr["mean"] >= 60
    s_high = session_hr["rate"] >= 60

    if s_high and p_high:
        return {
            "icon": "\u2705",
            "summary": "cache-aware 策略运行良好",
            "detail": "Session 粘性好,KV cache 实际复用度高",
        }
    elif s_high and not p_high:
        return {
            "icon": "\u26a0\ufe0f",
            "summary": "Session 粘性好但 Prefix HR 低",
            "detail": "prompt 内容变化大,同 worker 的 KV cache 实际复用低",
        }
    elif not s_high and p_high:
        return {
            "icon": "\u26a0\ufe0f",
            "summary": "换 worker 频繁但 Prefix HR 尚可",
            "detail": "负载均衡分散了请求,但新 worker 也有类似前缀缓存",
        }
    else:
        return {
            "icon": "\u274c",
            "summary": "命中率全面偏低",
            "detail": "负载均衡强制分散或缓存未预热,建议检查 worker 数量和 session 分配策略",
        }


# ════════════════════════════════════════════════════════════════
# Phase 4: report formatting
# ════════════════════════════════════════════════════════════════


def _quartile_trend(trend, value_field):
    """Split trend data into 4 quartiles and average each segment."""
    if not trend:
        return
"" + n = len(trend) + if n < 4: + values = [t.get(value_field, 0) for t in trend] + avg = round(sum(values) / len(values), 1) if values else 0 + return f"{avg}%" + + q_size = n // 4 + quartiles = [] + for i in range(4): + start = i * q_size + end = start + q_size if i < 3 else n + vals = [t.get(value_field, 0) for t in trend[start:end]] + quartiles.append(round(sum(vals) / len(vals), 1) if vals else 0) + + arrow = ( + "\u2191" if quartiles[3] > quartiles[0] + 10 else "\u2193" if quartiles[3] < quartiles[0] - 10 else "\u2192" + ) + return f"Q1={quartiles[0]}% \u2192 Q2={quartiles[1]}% \u2192 Q3={quartiles[2]}% \u2192 Q4={quartiles[3]}% {arrow}" + + +def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None): + """格式化完整终端报告。""" + parts = [] + + # 标题 + span_str = time_span or "" + parts.append("## Cache Hit Rate Report") + parts.append(f"**File**: {filepath} | **Lines**: {line_count:,}") + if span_str: + parts.append(f"**Span**: {span_str}") + parts.append("") + + # 1. Prefix Hit Ratio + parts.append("### 1. 
Prefix Hit Ratio (KV Cache 内容复用度)") + if prefix_hr["stats"]: + _ = prefix_hr["stats"] + parts.append(f' 累计平均: {prefix_hr["mean"]}% (被选中 worker, N={prefix_hr["count"]})') + parts.append(" 分布:") + + dist_data = [ + {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] + ] + parts.append(render_bar(dist_data, show_count=True)) + + parts.append(f' 冷启动率: {prefix_hr["cold_start_rate"]}%') + + trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") + if trend_str: + parts.append(f" 趋势: {trend_str}") + + # Sparkline + if prefix_hr["trend"]: + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] + parts.append("") + parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) + else: + parts.append(" (无 cache_aware_scoring 数据)") + parts.append("") + + # 2. Session Hit Rate + parts.append("### 2. Session Hit Rate (请求级路由粘性)") + parts.append(f' 累计: {session_hr["rate"]}% (hits={session_hr["hits"]} / total={session_hr["total"]})') + parts.append(f' 覆盖率: {session_hr["coverage"]}% 的推理请求带 session_id') + + trend_str = _quartile_trend(session_hr["trend"], "value") + if trend_str: + parts.append(f" 趋势: {trend_str}") + + if session_hr["trend"]: + parts.append("") + parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) + parts.append("") + + # 3. Per-Worker + parts.append("### 3. Per-Worker Cache Stats") + if per_worker: + parts.append( + render_table( + per_worker, + columns=["Worker", "Selected", "Select%", "AvgHitRatio"], + right_align={"Selected", "Select%", "AvgHitRatio"}, + ) + ) + else: + parts.append(" (无数据)") + parts.append("") + + # 4. Scheduling Strategy + parts.append("### 4. 
Scheduling Strategy") + parts.append( + f' cache_aware_scoring: {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' + f' | fallback: {scheduling["fallback_count"]}' + ) + if scheduling["fallback_reasons"]: + reasons = ", ".join(f'{r["value"]}={r["count"]}' for r in scheduling["fallback_reasons"]) + parts.append(f" fallback reasons: {reasons}") + parts.append( + f' 非最优命中选择: {scheduling["suboptimal_pct"]}%' + f' ({scheduling.get("suboptimal_count", 0)} 次, 负载均衡优先于命中率)' + ) + parts.append("") + + # 5. Diagnosis + parts.append("### 5. Diagnosis") + parts.append(f' {diagnosis["icon"]} {diagnosis["summary"]}') + parts.append(f' {diagnosis["detail"]}') + + return "\n".join(parts) + + +def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): + """格式化 --tail 精简报告。""" + parts = [] + parts.append("## Cache Hit Rate (Recent)") + parts.append(f"**File**: {filepath} | **tail {line_count} lines**") + parts.append("") + parts.append(f' Prefix Hit Ratio: {prefix_hr["mean"]}% (avg) | Cold start: {prefix_hr["cold_start_rate"]}%') + parts.append( + f' Session Hit Rate: {session_hr["rate"]}% (hits={session_hr["hits"]}/total={session_hr["total"]})' + f' | Coverage: {session_hr["coverage"]}%' + ) + parts.append( + f' Strategy: scoring {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' + f' | fallback {scheduling["fallback_count"]}' + ) + + # Sparkline + if prefix_hr["trend"]: + parts.append("") + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] + parts.append(render_sparkline(sparkline_data, title="Recent Prefix HR", y_label="%", y_range=(0, 100))) + + return "\n".join(parts) + + +def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir): + """导出详细数据 Markdown 文件。 + + 主报告包含 Per-Worker 统计和 Fallback 明细。 + 每窗口明细数据拆分到 details/per_window_data.md。 + """ + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path 
= os.path.join(output_dir, f"cache_hitrate_report_{timestamp}.md") + + parts = [] + parts.append("# Cache Hit Rate Detailed Report") + parts.append(f'**Generated**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') + parts.append(f"**Source**: {filepath}") + parts.append("") + + # Per-Worker 完整统计 + parts.append("## 1. Per-Worker 完整统计") + parts.append("") + if per_worker: + parts.append( + render_table( + per_worker, + columns=["Worker", "Selected", "Select%", "AvgHitRatio"], + right_align={"Selected", "Select%", "AvgHitRatio"}, + ) + ) + parts.append("") + + # Fallback 明细 + if scheduling["fallback_reasons"]: + parts.append("## 2. Fallback 明细") + for reason in scheduling["fallback_reasons"]: + parts.append(f'- **{reason["value"]}**: {reason["count"]} 次 ({reason["pct"]}%)') + parts.append("") + + # 每窗口明细 → 拆分到 details/ + time_data = defaultdict(lambda: {"prefix_hr": "-", "session_hr": "-", "scoring": 0, "fallback": 0, "running": "-"}) + for r in strategies: + ts = r.get("ts", "") + if r.get("strategy") == "cache_aware_scoring": + time_data[ts]["scoring"] += 1 + else: + time_data[ts]["fallback"] += 1 + + for r in stats_recs: + ts = r.get("ts", "") + h = r.get("hits", 0) + t = r.get("total", 0) + time_data[ts]["session_hr"] = f"{round(h / t * 100, 1)}% ({h}/{t})" if t else "0%" + time_data[ts]["running"] = str(r.get("total_running", "-")) + + if time_data: + # 主报告中添加引用 + parts.append( + f"> 每窗口明细数据 ({len(time_data)} 条): [details/per_window_data.md](details/per_window_data.md)" + ) + parts.append("") + + # 写入 details 子目录 + details_dir = os.path.join(output_dir, "details") + os.makedirs(details_dir, exist_ok=True) + detail_parts = ["# 每窗口明细数据", ""] + detail_parts.append("| Time | Prefix HR | Session HR | Scoring | Fallback | Total Running |") + detail_parts.append("|------|-----------|------------|---------|----------|---------------|") + for ts in sorted(time_data.keys()): + d = time_data[ts] + short_ts = ts.split(" ")[-1] if " " in ts else ts + detail_parts.append( + 
f'| {short_ts} | {d["prefix_hr"]} | {d["session_hr"]} ' + f'| {d["scoring"]} | {d["fallback"]} | {d["running"]} |' + ) + detail_parts.append("") + + detail_path = os.path.join(details_dir, "per_window_data.md") + with open(detail_path, "w") as f: + f.write("\n".join(detail_parts)) + + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + with open(output_path, "w") as f: + f.write("\n".join(parts)) + + return output_path + + +# ════════════════════════════════════════════════════════════════ +# 时间跨度计算 +# ════════════════════════════════════════════════════════════════ + + +def compute_time_span(strategies, stats_recs): + """从数据中计算时间跨度字符串。""" + all_ts = [] + for r in strategies + stats_recs: + ts = r.get("ts", "") + if ts: + try: + all_ts.append(parse_ts(ts)) + except ValueError: + pass + if len(all_ts) < 2: + return None + t_min = min(all_ts) + t_max = max(all_ts) + duration = t_max - t_min + hours = int(duration.total_seconds() // 3600) + minutes = int((duration.total_seconds() % 3600) // 60) + start = t_min.strftime("%H:%M:%S") + end = t_max.strftime("%H:%M:%S") + if hours > 0: + return f"{start} ~ {end} ({hours}h{minutes}m)" + return f"{start} ~ {end} ({minutes}m)" + + +# ════════════════════════════════════════════════════════════════ +# CLI 入口 +# ════════════════════════════════════════════════════════════════ + + +def parse_args(): + parser = argparse.ArgumentParser( + description="FastDeploy Go Router Cache 命中率统计", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("log_file", help="日志文件路径") + parser.add_argument("--tail", nargs="?", const="2000", help="只分析尾部数据(行数如 2000,或时间如 30m)") + parser.add_argument("--watch", action="store_true", help="全量分析后提示持续监控命令") + parser.add_argument( + "--output", default=None, help="详细报告输出目录(默认:skill_output/stat-cache-hitrate//)" + ) + parser.add_argument( + "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' + ) + 
parser.add_argument("--end", default=None, help='结束时间(如 "17:00:00"、"03/31 17:00"、"2026/03/31 17:00:00")') + return parser.parse_args() + + +def main(): + args = parse_args() + + # 验证文件存在 + if not os.path.isfile(args.log_file): + print(f"Error: 文件不存在: {args.log_file}", file=sys.stderr) + sys.exit(1) + + # --tail 与 --start/--end 不能混用(两者是不同的范围选择方式) + if args.tail and (args.start or args.end): + print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr) + sys.exit(1) + + # 时间范围预过滤(--start 和 --end 可单独或同时指定) + import atexit + + log_file = args.log_file + if args.start or args.end: + start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None + end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None + filtered_path, is_temp = filter_file_by_time_range(log_file, start_ts, end_ts) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + + # Phase 2: 提取 + 解析 + strategy_recs, stats_recs, inference_count, line_count = extract_data(log_file, args.tail) + + if not strategy_recs and not stats_recs: + print( + "Warning: 未找到 cache-aware 策略行或 [stats] 行。" "请确认日志文件包含 Go Router 日志。", file=sys.stderr + ) + sys.exit(0) + + # Phase 3: 计算三层指标 + prefix_hr = compute_prefix_hitrate(strategy_recs) + session_hr = compute_session_hitrate(stats_recs, inference_count) + per_worker = compute_per_worker_stats(strategy_recs) + scheduling = compute_scheduling_stats(strategy_recs) + diagnosis = cross_diagnose(prefix_hr, session_hr) + + # Phase 4: 输出 + if args.tail: + print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) + else: + time_span = compute_time_span(strategy_recs, stats_recs) + print( + format_full_report( + args.log_file, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span + ) + ) + + # 导出详细报告 + if args.output: + 
output_dir = args.output + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) + report_path = save_detailed_report( + args.log_file, strategy_recs, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir + ) + print(f"\n\U0001f4c4 详细数据见: {report_path}") + + if args.watch: + print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py new file mode 100644 index 00000000000..a197ee7aff0 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Stats — 通用统计计算工具 + +提供百分位数、分布、时间窗口聚合、分组计数等通用统计函数。 +不含任何业务逻辑或日志格式依赖。 + +Python 3 stdlib only,零依赖。 +""" + +import math +from collections import defaultdict +from datetime import datetime, timedelta + +# ════════════════════════════════════════════════════════════════ +# 百分位数与基础统计 +# ════════════════════════════════════════════════════════════════ + + +def percentile(sorted_vals, p): + """从已排序列表计算第 p 百分位数(线性插值)。""" + if not sorted_vals: + return 0.0 + n = len(sorted_vals) + k = (p / 100.0) * (n - 1) + f = math.floor(k) + c = math.ceil(k) + if f == c: + return sorted_vals[int(k)] + return sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f) + + +def compute_statistics(values, percentiles_list=None, distribution_spec=None): + """计算一组数值的统计量。 + + Args: + values: 数值列表 + percentiles_list: 要计算的百分位数列表,默认 [50, 90, 95, 99] + distribution_spec: 分布区间规格字符串,如 '0-20,20-40,40-60,60-80,80-100' + + Returns: + dict: {count, min, max, mean, sum, stddev, p50, p90, ..., distribution} + """ 
+ if percentiles_list is None: + percentiles_list = [50, 90, 95, 99] + + if not values: + result = {"count": 0, "min": 0, "max": 0, "mean": 0, "sum": 0, "stddev": 0} + for p in percentiles_list: + result[f"p{p}"] = 0 + if distribution_spec is not None: + result["distribution"] = [] + return result + + sorted_vals = sorted(values) + n = len(sorted_vals) + total = sum(sorted_vals) + mean = total / n + variance = sum((x - mean) ** 2 for x in sorted_vals) / n + stddev = math.sqrt(variance) + + result = { + "count": n, + "min": round(sorted_vals[0], 3), + "max": round(sorted_vals[-1], 3), + "mean": round(mean, 3), + "sum": round(total, 3), + "stddev": round(stddev, 3), + } + + for p in percentiles_list: + result[f"p{p}"] = round(percentile(sorted_vals, p), 3) + + if distribution_spec is not None: + result["distribution"] = compute_distribution(sorted_vals, distribution_spec) + + return result + + +def compute_distribution(sorted_vals, spec_str): + """根据区间规格计算分布直方图。 + + spec_str 示例:'0-20,20-40,40-60,60-80,80-100' + 每个区间是左闭右开 [lo, hi)。 + """ + buckets = _parse_distribution_spec(spec_str) + n = len(sorted_vals) + result = [] + for b in buckets: + if b[0] == "lt": + count = sum(1 for v in sorted_vals if v < b[1]) + label = b[2] + elif b[0] == "gt": + count = sum(1 for v in sorted_vals if v > b[1]) + label = b[2] + elif b[0] == "range": + count = sum(1 for v in sorted_vals if b[1] <= v < b[2]) + label = b[3] + else: + continue + result.append({"range": label, "count": count, "pct": round(count / n * 100, 1) if n else 0}) + return result + + +def _parse_distribution_spec(spec_str): + """解析分布区间规格:'<100,100-500,>1000' → bucket 定义列表。""" + buckets = [] + for part in spec_str.split(","): + part = part.strip() + if part.startswith("<"): + buckets.append(("lt", float(part[1:]), part)) + elif part.startswith(">"): + buckets.append(("gt", float(part[1:]), part)) + elif "-" in part: + lo, hi = part.split("-", 1) + buckets.append(("range", float(lo), float(hi), part)) + return buckets + 
+ +# ════════════════════════════════════════════════════════════════ +# 时间窗口聚合 +# ════════════════════════════════════════════════════════════════ + + +def time_bucket(records, window="auto", agg_specs=None, ts_field="ts"): + """按时间窗口聚合记录。 + + Args: + records: dict 列表,每个 dict 必须有 ts_field 字段 + window: 窗口大小 '5s'/'1m'/'5m'/'auto' + agg_specs: 聚合规格列表 [(field, func), ...],如 [('selected_hitRatio', 'mean')] + func 支持:count, sum, mean, min, max, pNN + ts_field: 时间戳字段名 + + Returns: + list[dict]: 每个窗口一条记录 {bucket, count, field_func, ...} + """ + if agg_specs is None: + agg_specs = [("_", "count")] + + if not records: + return [] + + window_td = _parse_window(window, records, ts_field) + + # 按窗口分组 + buckets = defaultdict(list) + for r in records: + ts_str = r.get(ts_field, "") + if not ts_str: + continue + try: + dt = datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + except ValueError: + continue + bucket_dt = _align_to_bucket(dt, window_td) + bucket_key = bucket_dt.strftime("%Y/%m/%d %H:%M:%S") + buckets[bucket_key].append(r) + + # 按时间排序并聚合 + result = [] + for bucket_key in sorted(buckets.keys()): + bucket_records = buckets[bucket_key] + entry = {"bucket": bucket_key, "count": len(bucket_records)} + + for field, func in agg_specs: + if field == "_": + if func == "count": + entry["count"] = len(bucket_records) + continue + + values = [] + for r in bucket_records: + v = r.get(field) + if v is not None: + try: + values.append(float(v)) + except (ValueError, TypeError): + pass + + out_key = f"{field}_{func}" + entry[out_key] = _aggregate_values(values, func) + + result.append(entry) + + return result + + +def _parse_window(window_str, records, ts_field): + """解析窗口字符串为 timedelta。'auto' 根据数据跨度自动选择。""" + if window_str == "auto": + timestamps = [] + for r in records: + ts_str = r.get(ts_field, "") + if ts_str: + try: + timestamps.append(datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S")) + except ValueError: + pass + if len(timestamps) < 2: + return timedelta(minutes=1) + span = 
max(timestamps) - min(timestamps) + if span < timedelta(minutes=30): + return timedelta(seconds=5) + elif span < timedelta(hours=3): + return timedelta(minutes=1) + else: + return timedelta(minutes=5) + elif window_str.endswith("s"): + return timedelta(seconds=int(window_str[:-1])) + elif window_str.endswith("m"): + return timedelta(minutes=int(window_str[:-1])) + elif window_str.endswith("h"): + return timedelta(hours=int(window_str[:-1])) + return timedelta(minutes=1) + + +def _align_to_bucket(dt, window_td): + """将 datetime 对齐到窗口边界。""" + secs = max(1, int(window_td.total_seconds())) + epoch = datetime(dt.year, dt.month, dt.day) + offset = int((dt - epoch).total_seconds()) + aligned = (offset // secs) * secs + return epoch + timedelta(seconds=aligned) + + +def _aggregate_values(values, func): + """用指定函数聚合一组数值。""" + if not values: + return 0 + if func == "count": + return len(values) + elif func == "sum": + return round(sum(values), 3) + elif func == "mean": + return round(sum(values) / len(values), 3) + elif func == "min": + return round(min(values), 3) + elif func == "max": + return round(max(values), 3) + elif func.startswith("p"): + p = int(func[1:]) + return round(percentile(sorted(values), p), 3) + return 0 + + +# ════════════════════════════════════════════════════════════════ +# 分组计数 +# ════════════════════════════════════════════════════════════════ + + +def count_by(records, field, top_n=None): + """按指定字段分组计数。 + + Args: + records: dict 列表 + field: 分组字段名 + top_n: 只返回前 N 个(按计数降序) + + Returns: + list[dict]: [{value, count, pct}],按计数降序排列 + """ + counts = defaultdict(int) + total = 0 + for r in records: + val = r.get(field) + if val is not None: + counts[str(val)] += 1 + total += 1 + + result = [] + for val, count in sorted(counts.items(), key=lambda x: -x[1]): + result.append({"value": val, "count": count, "pct": round(count / total * 100, 1) if total else 0}) + + if top_n: + result = result[:top_n] + + return result diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md new file mode 100644 index 00000000000..ab0c3ce7219 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -0,0 +1,148 @@ +--- +name: troubleshoot +description: > + FastDeploy Go Router 综合问题排查 skill。覆盖错误分类、延迟分析、请求追踪、Worker 健康时间线、 + Cache 调度诊断、负载与计数器分析六个维度。输出按三层问题来源分类:Router 自身、FastDeploy 后端、客户端。 + + 当用户要求以下操作时触发此 skill:排查 router 问题、分析 router 日志、router 排查、 + 查看 router 状态、综合排查、全量扫描、troubleshoot router、/troubleshoot、 + 分析错误日志、502/503 排查、延迟分析、Worker 健康、负载分析、cache 调度诊断、 + 请求追踪、trace 请求。 + 关键词:troubleshoot、排查、router 问题、全量扫描、综合分析、error、502、latency、 + health、load、cache、trace、/troubleshoot。 + +IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格式和提取规则。 +错误分类时参考 references/error_catalog.md。涉及后端问题时参考 references/fastdeploy_cross_reference.md。 +--- + +# Router Troubleshooting + +综合排查 FastDeploy Go Router 问题,输出完整诊断报告。 + +## 执行前交互 + +运行脚本前,Claude 必须按以下顺序向用户确认参数: + +### 1. 日志文件路径 +使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +- 选项 1: `logs/router.log`(默认) +- 选项 2: `fd-router.log`(golang_router 根目录) +- 选项 3: 用户通过 Other 输入自定义路径 + +**重要规则**: +- 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 +- 用户指定路径后不要质疑、推荐替代文件、或以任何理由尝试切换到其他文件 +- 支持绝对路径(如 `/home/user/logs/xxx.log`)和相对路径(如 `logs/fd-router (2).log`) + +如果用户直接确认或未指定路径,使用脚本的自动发现逻辑。 + +### 2. 分析范围 +向用户询问分析范围: +> "请选择分析范围: +> 1. **全量分析**(默认)— 分析整个日志文件 +> 2. **尾部分析** — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) +> 3. **指定时间段** — 分析特定时间范围内的日志" + +如果用户未选择,默认使用全量分析。 + +#### 指定时间段的处理 + +脚本原生支持 `--start` 和 `--end` 参数,无需手动预过滤。两者可单独或同时指定。 + +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 +缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 +`--start/--end` 与 `--tail` 互斥。 + +### 3. 分析模式 +向用户询问分析模式: +> "请选择分析模式: +> 1. **完整分析**(默认)— 运行所有维度(errors + latency + health + cache + load) +> 2. **单维度/多维度分析** — 选择特定维度(errors / latency / health / cache / load),可选多个 +> 3. 
**请求追踪** — 追踪特定请求 ID(需提供 ID)" + +如果用户未选择,默认使用完整分析。 + +### 4. 输出目录 +诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 +用户可通过 `--output` 指定自定义目录。 + +## 用法 + +脚本路径(相对于 `fastdeploy/golang_router/`):`.claude/skills/troubleshoot/scripts/` + +```bash +SCRIPTS=.claude/skills/troubleshoot/scripts + +# 全量扫描(errors + latency + health + cache + load) +python3 $SCRIPTS/troubleshoot.py + +# 单维度分析 +python3 $SCRIPTS/troubleshoot.py --errors +python3 $SCRIPTS/troubleshoot.py --latency +python3 $SCRIPTS/troubleshoot.py --health +python3 $SCRIPTS/troubleshoot.py --cache +python3 $SCRIPTS/troubleshoot.py --load + +# 请求追踪(需指定 ID,支持逗号分隔多 ID) +python3 $SCRIPTS/troubleshoot.py --trace +python3 $SCRIPTS/troubleshoot.py --trace "id1,id2" + +# 尾部分析 +python3 $SCRIPTS/troubleshoot.py --tail 5000 +python3 $SCRIPTS/troubleshoot.py --tail 30m + +# 指定时间段(--start 和 --end 可单独或同时使用) +python3 $SCRIPTS/troubleshoot.py --start "16:00:00" --end "17:00:00" +python3 $SCRIPTS/troubleshoot.py --start "2026/03/31 16:00:00" +python3 $SCRIPTS/troubleshoot.py --start "03/31" --end "03/31 18:00" + +# 组合模式 +python3 $SCRIPTS/troubleshoot.py --errors --latency +python3 $SCRIPTS/troubleshoot.py --errors --tail 5000 +python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --errors --latency +``` + +默认日志路径:`logs/router.log` → `fd-router.log` + +## 输出 + +- **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 +- **文件**:详细报告导出到 `skill_output/troubleshoot//troubleshoot_report_.md` + - 逐分钟事件详情拆分到 `details/health_events.md` + - 请求追踪事件链拆分到 `details/trace_.md` +- **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` + +## 三层诊断框架 + +| 层 | 典型问题 | 日志特征 | +|----|---------|---------| +| Router | Panic、500、Counter 异常、调度瓶颈、Cache 策略不优 | `Panic recovered`、`Failed to encode`、`double-release` | +| FD 后端 | 502、Worker 下线、高推理延迟、请求卡住 | `Failed to connect`、`Removed unhealthy`、p99 高 | +| 客户端 | 断连、请求格式错误 | `context canceled`、400 | + +## 脚本架构 + +``` +scripts/ + log_parser.py — 日志解析原语(HTTP/Cache/Stats/错误归一化/事件匹配) + stats.py — 
通用统计计算(百分位数/时间窗口/分组) + chart.py — 终端可视化(sparkline/柱状图/表格/时间线) + troubleshoot.py — 主编排器 + analyzers/ + errors.py — 错误分类分析 + latency.py — 延迟分析 + health.py — Worker 健康时间线 + cache.py — Cache 调度诊断 + load.py — 负载与计数器分析 + trace.py — 请求追踪 +``` + +## 重要规则 + +1. 大文件 (>5000 行) 用 grep 分类提取,不一次性读取 +2. 每个问题标注来源层(Router / FD 后端 / 客户端) +3. Cache 命中率数值分析用 `/stat-cache-hitrate`,本 skill 做策略诊断 +4. 分析前读取 `references/log_patterns.md` +5. 错误查询参考 `references/error_catalog.md` +6. 后端问题排查参考 `references/fastdeploy_cross_reference.md` +7. 输出格式参考 `references/report_templates.md` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json b/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json new file mode 100644 index 00000000000..4b961e85b36 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json @@ -0,0 +1,18 @@ +[ + {"query": "router 最近频繁 502 和 503,帮我全面排查一下问题", "should_trigger": true}, + {"query": "帮我 troubleshoot 一下 Go Router,感觉有些请求延迟特别高", "should_trigger": true}, + {"query": "分析 logs/fd-router.log 里面的错误日志,看看哪些错误最多", "should_trigger": true}, + {"query": "有几个 Worker 好像不太健康,帮我看看 Worker 健康时间线", "should_trigger": true}, + {"query": "cache 调度策略最近好像有问题,fallback 比例太高了,诊断一下", "should_trigger": true}, + {"query": "帮我追踪请求 trace-id-12345,看看这个请求在 router 里经历了什么", "should_trigger": true}, + {"query": "/troubleshoot 全量扫描 router 日志,给我一份完整的诊断报告", "should_trigger": true}, + {"query": "router 负载分析一下,有没有 counter 异常或者 double-release 的情况", "should_trigger": true}, + {"query": "统计一下 cache 命中率是多少,prefix hit ratio 和 session hit rate 各是多少", "should_trigger": false}, + {"query": "帮我看看 hitRatio 数据,想了解 KV cache 的复用度", "should_trigger": false}, + {"query": "帮我写一个 Go 的 reverse proxy,要支持负载均衡", "should_trigger": false}, + {"query": "分析 Kubernetes pod 的日志,看看为什么 OOMKilled", "should_trigger": false}, + {"query": "FastDeploy 模型部署失败了,帮我看看怎么回事", "should_trigger": false}, + {"query": "帮我优化一下 Python 代码的性能,跑得太慢了", "should_trigger": 
false}, + {"query": "nginx 返回 504 Gateway Timeout,帮我排查原因", "should_trigger": false}, + {"query": "帮我监控 cache 命中率的实时变化趋势", "should_trigger": false} +] diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md new file mode 100644 index 00000000000..ba48297d9c9 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -0,0 +1,122 @@ +# Router 错误目录 + +按 HTTP 状态码和日志级别分类的 Router 错误快速索引。每条含严重程度、根因、影响、排查命令、问题来源层。 + +--- + +## 按 HTTP 状态码索引 + +注意:HTTP 响应体中的错误消息与 logger 输出的 ERROR 消息**可能不同**。 +例如:HTTP 502 响应 `Failed to select worker: {err}` 对应的日志 ERROR 是 `Failed to select mixed worker: {err}`。 +分析时需将两者关联而非简单去重。 + +### 400 Bad Request + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Invalid request body: {err}` | 请求体读取失败 | 客户端 | 检查客户端请求格式 | +| `Invalid JSON format: {err}` | JSON 解析失败 | 客户端 | 检查 JSON 格式 | +| `DefaultManager is nil` | Manager 未初始化 | Router | 检查 Router 启动日志 | + +### 500 Internal Server Error + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Failed to build disaggregate_info: {err}` | PD 模式配置错误 | Router | 检查 register.yaml 参数 | +| `Failed to encode modified request: {err}` | 请求编码失败 | Router | 检查请求参数特殊字符 | +| `Internal server error` (Panic) | Router 代码 bug | Router | 检查 Panic recovered 日志 | + +### 502 Bad Gateway + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Failed to select worker: {err}` | 无可用 Mixed Worker | FD 后端 | `curl /health` 检查后端 | +| `Failed to select worker pair: {err}` | 无可用 PD Worker | FD 后端 | 检查 prefill/decode 注册状态 | +| `Failed to connect to backend service: {err}` | 后端不可达 | FD 后端 | `curl {worker_url}/health` | + +### 503 Service Unavailable + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `No available prefill/decode workers` | 全部 Worker 不健康 | FD 后端 | 检查部署状态 | + +--- + +## 按日志级别索引 + +### ERROR 级别 + +| 消息模板 | 严重程度 | 来源层 
| 影响 | +|---------|---------|-------|------| +| `Failed to select mixed worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to select prefill worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to read register request body: {err}` | MEDIUM | Router | 注册失败 | +| `Failed to unmarshal register request JSON: {err}` | MEDIUM | Router | 注册失败 | +| `Failed to create decode request for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Failed to create prefill request for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Decode request failed for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Prefill request failed for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Failed to read request body: {err}` | LOW | 客户端 | 单请求失败 | +| `Failed to unmarshal request JSON: {err}` | LOW | 客户端 | 单请求失败 | +| `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | +| `Failed to create backend request for {url}: {err}` | HIGH | FD 后端 | 请求失败 | +| `Backend request failed for {url}: {err}` | HIGH | FD 后端 | 请求失败 | +| `scanner error: {err}` | MEDIUM | FD 后端/客户端 | 流式响应中断(gateway redirect 函数) | +| `[prefill] scanner error: {err}, message={msg}` | MEDIUM | FD 后端/客户端 | PD 模式 prefill 流式错误 | +| `copy error: {err}` | MEDIUM | FD 后端/客户端 | 非流式响应中断 | +| `[prefill] copy error: {err}, message={msg}` | MEDIUM | FD 后端/客户端 | PD 模式 prefill 非流式错误 | +| `Removed unhealthy prefill/decode/mixed instance: {url}` | HIGH | FD 后端 | Worker 被移除(注意:这是 ERROR 级别) | + +### WARN 级别 + +| 消息模板 | 严重程度 | 来源层 | 影响 | +|---------|---------|-------|------| +| `GetRemoteMetrics failed for {url}, falling back to local counter` | LOW | FD 后端 | 调度精度降低 | +| `release worker: {url} 
skipped, counter already cleaned up` | LOW | Router | 计数器异常 | +| `release worker: {url} skipped, counter already zero (possible double-release)` | MEDIUM | Router | 计数器逻辑 bug | +| `cache-aware prefill: tokenizer failed, fallback to char tokens: {err}` | LOW | Router | cache-aware 精度降低 | +| `Instance {url} role is unknown` | LOW | Router | 注册角色不识别 | + +### INFO 级别(异常相关) + +| 消息模板 | 含义 | 关注场景 | +|---------|------|---------| +| `unhealthy worker counter preserved (inflight requests): {url}, count: {N}` | 不健康 Worker 仍有 inflight 请求 | 频繁出现说明 Worker 不稳定 | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {N}` | 不健康 Worker 仍有 token 计数 | 同上 | +| `cleanup unhealthy worker counter: {url}` | 清理不健康 Worker 的请求计数 | 正常清理 | +| `cleanup unhealthy worker token counter: {url}` | 清理不健康 Worker 的 token 计数 | 正常清理 | +| `preserved counters for {N} workers with inflight requests: [...]` | 保留了 N 个 Worker 的计数器 | N 大说明多 Worker 不稳定 | +| `removed counters for {N} unhealthy workers: [...]` | 移除了 N 个 Worker 的计数器 | 正常清理 | +| `Server {url} is healthy` | 健康检查恢复 | Worker 恢复(来自 HealthGenerate 端点) | + +注意:以下事件是 **ERROR 级别**,不是 INFO: +- `Removed unhealthy prefill/decode/mixed instance: {url}` — Worker 被移除 + +注意:以下内容是 **HTTP 响应体**,不是 logger 输出(不会出现在日志行中): +- `Register success` — 注册成功的 HTTP 200 响应体 +- Worker 注册检测应通过 H1 行的 `POST /register 200` 判断 + +--- + +## 注册参数校验错误 + +| 错误消息 | 根因 | 排查 | +|---------|------|------| +| `invalid connector_port: {value}` | connector_port 非数字或范围错误 | 检查 register.yaml | +| `invalid engine_worker_queue_port: {value}` | engine_worker_queue_port 非数字或范围错误 | 检查 register.yaml | +| `invalid metrics_port: {value}` | metrics_port 非数字或范围错误 | 检查 register.yaml | +| `rdma_ports[{i}] invalid port: {value}` | RDMA 端口配置错误 | 检查 register.yaml | + +--- + +## scanner error / copy error 区分 + +| error 内容 | 来源层 | 含义 | +|-----------|-------|------| +| `context canceled` | 客户端 | 客户端主动断连(超时或取消) | +| 其他 | FD 后端 | 后端流式响应异常 | diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md new file mode 100644 index 00000000000..f35cbcb303a --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md @@ -0,0 +1,102 @@ +# FastDeploy 后端交叉引用 + +从 Router 日志推断 FastDeploy 后端问题时的排查指引。 + +--- + +## 症状 → 后端排查 + +### 1. 后端不可达 (502) + +**Router 日志特征**: +``` +[ERROR] Failed to connect to backend service: dial tcp {ip}:{port}: connect: connection refused +``` + +**排查步骤**: +1. `curl http://{worker_url}/health` — 确认后端是否存活 +2. `curl http://{worker_url}/v1/models` — 确认模型是否加载完成 +3. 检查后端日志 `logs/workerlog.0` +4. `netstat -tlnp | grep {port}` — 确认端口监听 +5. 检查网络连通性(防火墙、安全组) + +### 2. 后端 OOM / 频繁重启 + +**Router 日志特征**: +- Worker 频繁 REMOVED → RE-REGISTERED(短周期内多次) +- 健康检查间歇性失败 + +**排查步骤**: +1. `dmesg | grep -i oom` — 检查 OOM killer +2. `nvidia-smi` — 检查 GPU 内存 +3. 后端日志搜索 `CUDA out of memory` +4. 检查 `max_num_seqs`、`max_model_len` 配置 + +### 3. 高推理延迟 + +**Router 日志特征**: +- 请求 p99 高(>10s)但调度耗时仅 ms 级 +- 确认延迟不在 Router 层(调度耗时 << 总延迟) + +**排查步骤**: +1. 检查后端 Prometheus metrics:`http://{worker_url}:{metrics_port}/metrics` + - `fastdeploy_llm_running_queue_size` — 推理队列 + - `fastdeploy_llm_waiting_queue_size` — 等待队列 + - `fastdeploy_llm_generation_tokens_per_second` — 吞吐量 +2. 确认 GPU 利用率:`nvidia-smi --query-gpu=utilization.gpu --format=csv` +3. 检查是否有长 prompt 请求拖慢整体 + +### 4. 流式响应异常 + +**Router 日志特征**: +``` +[ERROR] scanner error: {err} (非 context canceled) +[ERROR] copy error: {err} (非 context canceled) +``` + +**排查步骤**: +1. 后端日志搜索对应 request_id +2. 检查后端是否产生格式错误的 SSE +3. 检查网络是否有中间代理超时切断 + +### 5. 请求超时/卡住 + +**Router 日志特征**: +- 有 select worker 但长时间无 release/completed +- [stats] 中 running 持续不降 + +**根因**:Router 的 `http.Client{}` 没有设置超时,后端不响应则阻塞到客户端断连或 TCP 超时。 + +**排查步骤**: +1. 检查后端是否还在处理请求 +2. 检查后端是否出现死锁 +3. 
`ss -tnp | grep {port}` — 检查 TCP 连接状态 + +--- + +## 通用 FastDeploy 排查工具 + +### collect-env + +收集环境信息: +```bash +python -m fastdeploy.utils.collect_env +``` + +### 后端日志位置 + +- 默认:`logs/workerlog.0` +- 多 Worker:`logs/workerlog.{N}` + +### Prometheus Metrics + +后端 metrics 端口(从注册信息获取 `metrics_port`): +``` +http://{worker_ip}:{metrics_port}/metrics +``` + +关键指标: +- `fastdeploy_llm_running_queue_size` — 当前推理中的请求数 +- `fastdeploy_llm_waiting_queue_size` — 等待队列长度 +- `fastdeploy_llm_generation_tokens_per_second` — 生成吞吐 +- `fastdeploy_llm_request_total` — 总请求数 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md new file mode 100644 index 00000000000..cf33b41f723 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -0,0 +1,282 @@ +# 日志格式与提取规则 + +本文档定义 Router 日志的所有类别、Grep 匹配模式、精确正则,供各子 skill 参考。 + +--- + +## 日志基本格式 + +``` +[LEVEL] YYYY/MM/DD HH:MM:SS logger.go:NN: [context_tags] message +``` + +### Context Tags(可选,顺序固定) + +- `[trace_id:{id}]` +- `[req_id:{id}]` +- `[session_id:{id}]` +- `[request_id:{id}]` + +所有 tag 可能同时出现,也可能只有部分或没有。顺序固定为:`trace_id → req_id → session_id → request_id`。 + +### ID 匹配正则 + +搜索某个 ID 时,同时匹配四种 tag: +``` +session_id:|trace_id:|request_id:|req_id: +``` + +--- + +## 日志分类提取 + +| 类别 | Grep 模式 | 用途 | 典型内容 | +|------|----------|------|---------| +| E1 — ERROR | `\[ERROR\]` | 错误分类 | 各类 Failed to ... 
错误 | +| E2 — WARN | `\[WARN\]` | 警告分类 | counter 异常、tokenizer 退化 | +| H1 — HTTP 请求 | `\] \[(POST\|GET)\] /` | 延迟/状态码/吞吐量 | HTTP middleware 日志行 | +| H2 — 健康事件 | `Removed unhealthy\|is not healthy\|is healthy` | Worker 健康时间线 | 上下线事件 | +| H2b — 注册事件 | `\] \[POST\] /register.*200` | Worker 注册 | 从 H1 HTTP 行中匹配 POST /register 返回 200 | +| H3 — 调度事件 | `select worker\|release worker\|Failed to select\|SelectWorkerPair` | 调度/计数器分析 | Worker 选择和释放 | +| H4 — 后端问题 | `Failed to connect\|request failed\|scanner error\|copy error\|Panic recovered` | 后端问题 | 连接/流式/Panic(注意:`scanner error`/`copy error` 与 H9 有重叠,带 `[prefill]` 前缀的行同时属于 H9) | +| H5 — Counter | `counter preserved\|cleanup unhealthy\|removed counters\|counter already\|double-release\|preserved counters` | 计数器异常 | 计数器生命周期 | +| H6 — Cache-aware | `cache-aware prefill: final strategy:` | Cache 调度诊断 | 策略选择 + hitRatios | +| H7 — Stats | `\[stats\]` | 负载/命中率 | 周期性统计行 | +| H8 — ts_ms | `ts_ms=` | 调度耗时 | 调度开始结束时间戳 | +| H9 — Prefill 生命周期 | `\[prefill\]` | PD 模式 prefill 追踪 | 首包/释放/错误 | +| H10 — 请求标记 | `Parsing completed\|Request completed successfully` | 请求生命周期 | 调度开始/请求结束标记 | +| H11 — Token 释放 | `release prefill tokens` | Token 计数器生命周期 | Token 释放事件 | + +--- + +## H1 — HTTP 请求行格式 + +``` +[INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 1.234567s 10.0.0.1 +``` + +字段:`[METHOD] /path HTTP/1.1 STATUS LATENCY CLIENT_IP` + +### 延迟单位归一化 + +Go `time.Duration.String()` 输出格式不固定,需归一化为毫秒: + +| 原始格式 | 含义 | 转换为 ms | +|---------|------|----------| +| `1.5s` | 秒 | × 1000 | +| `150ms` | 毫秒 | 直接使用 | +| `150.5ms` | 毫秒 | 直接使用 | +| `500µs` | 微秒 | ÷ 1000 | +| `500us` | 微秒(ASCII) | ÷ 1000 | +| `500ns` | 纳秒 | ÷ 1000000 | +| `1m30s` | 分+秒 | 分×60000 + 秒×1000 | +| `1h2m3s` | 时+分+秒 | 时×3600000 + 分×60000 + 秒×1000 | + +正则提取延迟值:`(\d+(?:\.\d+)?(?:h|m(?!s)|s|ms|µs|us|ns))+` + +### 仅推理请求 + +延迟分析只统计推理请求路径: +- `/v1/chat/completions` +- `/v1/completions` + +排除健康检查 `/health`、注册 `/register` 等管理路径。 + +--- + +## H6 — Cache-aware 策略行格式 + +``` 
+[INFO] 2025/01/15 18:25:33 logger.go:87: [trace_id:xxx] [session_id:xxx] cache-aware prefill: final strategy: cache_aware_scoring, selected=http://10.0.0.1:9965, loads=map[http://10.0.0.1:9965:2 http://10.0.0.2:9965:5], hitRatios=map[http://10.0.0.1:9965:0.85 http://10.0.0.2:9965:0.42]. ts_ms=2025-01-15 18:25:33.123 +``` + +``` +[INFO] ... cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads=map[...]. ts_ms=2025-01-15 18:25:33.123 +``` + +注意:日志中**没有** `scores=map[...]` 字段。scores 仅在 DEBUG 级别的 `chooseByScore` 中逐条打印。 +如需分析非最优选择,需从 hitRatios + loads 使用公式重新计算: +`score = (100-hitRatio)/100 * hitRatioWeight + loadRatio * loadBalanceWeight` + +### Go map 解析 + +`hitRatios=map[key1:val1 key2:val2]` + +- 空 map:`hitRatios=map[]` — 表示冷启动 +- 正则提取 map 内容:`map\[(.*?)\]` +- 每对 key:value 用空格分隔:`(\S+):(\S+)` +- key 是 worker URL,value 是 float64 + +### selected worker 的 hitRatio + +从 hitRatios map 中查找 selected URL 的值: +- 在 map 中找到 → 使用该值 +- 不在 map 中 → hitRatio = 0 +- map 为空 → 冷启动,hitRatio = 0 + +### ts_ms 格式 + +`ts_ms=2025-01-15 18:25:33.123` + +格式:`2006-01-02 15:04:05.000`(Go reference time) + +用于计算调度耗时(两个 ts_ms 之间的差值)。 + +--- + +## H7 — Stats 行格式 + +``` +[INFO] 2025/01/15 18:25:33 logger.go:87: [stats] total_running=5, workers: [http://10.0.0.1:9965: running=2, http://10.0.0.2:9965: running=3], cache_hit_rate=85.71% (hits=6/total=7) +``` + +注意:由于 Go `log.Lshortfile` 打印的是 `Printf` 调用处,stats 行的源文件始终为 `logger.go:NN:`(行号随编译变化),而非 `handler.go`。 + +注意:stats 行**不包含**任何 context tag(trace_id 等),因为由后台 goroutine 周期输出。 + +### 关键:per-interval 计数器 + +`hits` 和 `total` 是 **per-interval** 的值(每 5s 通过 `atomic.Swap(0)` 重置为 0)。 + +计算累计值必须 **sum 所有行**: +- 累计 Session Hit Rate = `sum(hits) / sum(total) * 100` + +### Worker 负载提取 + +`workers: [url1: running=N, url2: running=N]` + +- 注意格式:`workers:` 带冒号+空格,每个 worker 格式为 `url: running=N`,逗号+空格分隔 +- **不包含 token 数据**(reportStats 只读取 running 计数) + +正则:`(http://[^:]+:\d+): running=(\d+)` + +### cache_hit_rate 提取 + 
+`cache_hit_rate=85.71% (hits=6/total=7)` + +正则:`cache_hit_rate=([\d.]+)% \(hits=(\d+)/total=(\d+)\)` + +--- + +## 模板归一化 + +ERROR/WARN 消息分组时,需将变量替换为占位符: + +| 变量类型 | 正则 | 替换为 | +|---------|------|-------| +| URL | `https?://[\w.:]+` | `{url}` | +| UUID | `[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}` | `{uuid}` | +| 数字 | `\d+` (仅在特定位置) | `{N}` | +| IP:Port | `\d+\.\d+\.\d+\.\d+:\d+` | `{ip:port}` | + +--- + +## Fallback 策略行识别 + +| final strategy | reason 关键词 | 含义 | +|---------------|--------------|------| +| `cache_aware_scoring` | (无 reason) | 正常 cache-aware 调度 | +| `process_tokens` | `tokenize failed` | 退化 B:字符级 tokenize 也失败 | +| `process_tokens` | `load imbalanced` | 退化 C:负载不均衡 | +| `process_tokens` | (其他) | 退化 D:策略未初始化等 | + +退化 A(Tokenizer 服务→字符级)在 WARN 行识别: +``` +[WARN] ... cache-aware prefill: tokenizer failed, fallback to char tokens: {err} +``` +注意完整前缀 `cache-aware prefill: tokenizer failed`。 +退化 A 后仍可走 cache_aware_scoring(精度降低),与 B/C/D 不互斥。 + +--- + +## H4 — 后端问题匹配说明 + +H4 的 `request failed` 模式会匹配多个消息模板: +- `Request failed (attempt {n}/{max}): {err}` — 重试日志 +- `Decode request failed for {url}: {err}` — PD 模式 decode 失败 +- `Prefill request failed for {url}: {err}` — PD 模式 prefill 失败 +- `Backend request failed for {url}: {err}` — 后端请求失败 + +分析时需通过模板归一化去重。 + +--- + +## H9 — Prefill 生命周期事件 + +PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` 前缀日志: + +| 消息模板 | 含义 | +|---------|------| +| `[prefill] first chunk received, release counter url=%s` | Prefill 首包到达,释放计数器 | +| `[prefill] non-stream prefill response done, release counter url=%s` | 非流式 prefill 完成 | +| `[prefill] release in defer (fallback) url=%s, isStream=%v` | defer 兜底释放 | +| `[prefill] release in CommonCompletions defer (error path) url=%s` | 错误路径释放 | +| `[prefill] backendResp is nil or backendResp.Body is nil, url=%s` | 后端响应异常 | +| `[prefill] scanner error: %v, message=%s` | 流式读取错误(ERROR 级别) | +| `[prefill] copy error: %v, message=%s` | 非流式复制错误(ERROR 级别) | + +--- + +## H10 — 
请求生命周期标记 + +| 消息 | 含义 | 级别 | +|------|------|------| +| `Parsing completed; starting worker selection.` | 请求解析完成,开始调度 | INFO | +| `Request completed successfully.` | 请求成功完成 | INFO | + +--- + +## H11 — Token 释放 + +`release prefill tokens: %s, tokens: %d` — 释放 prefill token 计数。 +数据源:`handler.go:333`。用于 troubleshoot-load 的 token 计数器分析。 + +--- + +## 使用脚本工具 + +各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 + +### 快速参考 + +| 任务 | 脚本 | +|------|------| +| 解析 H1 HTTP 行 | `log_parser.py parse-http [--inference-only]` | +| 解析 H6 cache 策略行 | `log_parser.py parse-cache-strategy` | +| 解析 H7 stats 行 | `log_parser.py parse-stats` | +| 检测非支持请求 | `log_parser.py unsupported-requests [--summary-only]` | +| ASCII 折线图 | `chart.py` | +| Unicode 柱状图 | `chart.py` | +| Markdown 表格 | `chart.py` | +| Worker 时间线 | `chart.py` | + +所有工具从 stdin 读取,输出到 stdout。中间数据使用 JSON Lines 格式。 + +--- + +## 已知路由列表 + +Router 支持的全部路由(来自 `internal/router/router.go`): + +| Method | Path | 类型 | +|--------|------|------| +| POST | `/v1/chat/completions` | 推理 | +| POST | `/v1/completions` | 推理 | +| POST | `/register` | 实例注册 | +| GET | `/registered_number` | 注册数量查询 | +| GET | `/registered` | 注册列表查询 | +| GET | `/health_generate` | 健康检查 | +| GET | `/metrics` | Prometheus 指标 | + +### 非支持请求排查 + +客户端可能发送不属于已知路由的请求(如 `/v1/models`),会收到 404 但仍记录在 H1 HTTP 日志中。 + +使用 `log_parser.py unsupported-requests` 子命令检测: +```bash +# 完整输出(详细列表 + 汇总) +grep -E '\] \[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\] /' logfile | python3 log_parser.py unsupported-requests + +# 仅汇总 +grep -E '\] \[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\] /' logfile | python3 log_parser.py unsupported-requests --summary-only +``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md new file mode 100644 index 00000000000..ba9e40e9869 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ 
-0,0 +1,120 @@ +# 报告输出规范 + +所有 troubleshoot 分析维度共享的可视化和格式规范。 + +--- + +## 通用可视化组件 + +### Unicode 柱状图 +- 填充块:`█`(U+2588),空块:`░`(U+2591) +- 总宽度:20 字符,右侧标注百分比和计数 +- 块数 = round(percentage / 100 * 20),最小 1 块(>0% 时) + +### Sparkline 折线图 +- 字符集:`▁▂▃▄▅▆▇█`(8 级高度) +- 图表宽度:60 字符,自动降采样 +- X 轴标注时间(首/尾 + 中间 2-3 个刻度) +- Y 轴自适应:百分比类 0-100%,计数类 0-max + +### Markdown 表格 +- 标准 Markdown 表格格式 +- 数值列右对齐 + +### Worker 可用性时间线 +- `█` = 在线,`░` = 下线 +- 右侧标注在线率百分比 + +--- + +## 严重程度标记 + +| 标记 | 含义 | 使用场景 | +|------|------|---------| +| CRITICAL | 服务不可用 | Panic、全部 Worker 不健康、错误率 >20% | +| HIGH | 部分请求失败 | 502/503、Worker 频繁下线 | +| MEDIUM | 性能下降 | 高延迟、cache 命中率低 | +| LOW | 需关注 | 计数器异常、tokenizer 退化 | +| INFO | 正常 | 统计信息 | + +--- + +## 报告格式 + +### 简洁版(终端输出) + +- 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 按三层分类(Router / FD 后端 / 客户端) +- 每个问题一行摘要 + 关键指标 +- 末尾提示详细版文件路径 + +### 详细版(文件导出) + +- 路径:`skill_output/troubleshoot/{timestamp}/troubleshoot_report_{timestamp}.md` +- 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) +- 详情拆分到 `details/` 子目录: + - `details/health_events.md` — Worker 逐分钟健康事件 + - `details/trace_{trace_id}.md` — 请求追踪事件链 + +--- + +## 状态判定规则 + +- **CRITICAL**:存在 Panic、全部 Worker 不健康、或错误率 >20% +- **DEGRADED**:存在 502/503、Worker 不稳定、或错误率 >5% +- **HEALTHY**:无严重问题 + +--- + +## 各维度报告结构 + +### Errors(错误分析) + +``` +HTTP 状态码分布(柱状图) +错误率趋势(折线图) +ERROR/WARN Top N(柱状图 + 表格,标注来源层) +Panic 列表 +``` + +### Latency(延迟分析)— 待实现 + +``` +延迟百分位数 (p50/p90/p95/p99) +延迟分布(柱状图) +吞吐量趋势(折线图) +慢请求 Top 10 +``` + +### Health(Worker 健康)— 待实现 + +``` +Worker 可用性时间线 +健康事件汇总表 +可用性统计 +``` + +### Cache(调度诊断)— 待实现 + +``` +调度策略分布 +Session 粘性分析 +非最优选择分析 +Fallback 原因分类 +``` + +### Load(负载分析)— 待实现 + +``` +Worker 负载分布 +计数器异常检测 +Token 计数器统计 +``` + +### Trace(请求追踪)— 待实现 + +``` +单请求事件链 +生命周期完整性检查 +Session 多请求汇总 +``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py new file mode 100644 index 00000000000..e7bb50660a8 --- /dev/null +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py @@ -0,0 +1 @@ +# Analyzers package diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py new file mode 100644 index 00000000000..3a18b668a41 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -0,0 +1,458 @@ +#!/usr/bin/env python3 +""" +Cache Analyzer — Cache 调度诊断 + +分析 cache-aware 调度策略:session 粘性、非最优选择评分、驱逐影响、 +fallback 原因、冷启动识别、交叉诊断。 +注意:cache 命中率数值分析由 stat-cache-hitrate skill 负责,本模块做策略诊断。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_table +from log_parser import parse_cache_strategy_line, parse_ts +from stats import compute_statistics, count_by + +# ════════════════════════════════════════════════════════════════ +# Fallback 分类 +# ════════════════════════════════════════════════════════════════ + +TOKENIZER_WARN_RE = re.compile(r"tokenizer failed, fallback to char tokens") + + +def classify_fallback(record, tokenizer_degraded_ts=None): + """对 process_tokens 策略行分类 fallback 原因。 + + Returns: 'A-Tokenizer退化' | 'B-char tokenize失败' | 'C-负载不均衡' | 'D-其他' + """ + reason = record.get("reason", "") + if "load imbalanced" in reason: + return "C-负载不均衡" + if "tokenize failed" in reason: + return "B-char tokenize失败" + return "D-其他" + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weight=1.0, load_balance_weight=1.0): + """分析 cache-aware 调度策略。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制 + eviction_duration_mins: 驱逐时间(分钟,默认 30) + hit_ratio_weight: hitRatio 权重(默认 1.0) + 
load_balance_weight: loadBalance 权重(默认 1.0) + + Returns: + dict: {strategy_dist, fallback_reasons, session_stickiness, suboptimal_selections, + eviction_impact, cold_starts, hitratio_stats, diagnoses, summary} + """ + h6_lines = _grep_lines(log_file, r"cache-aware prefill: final strategy:", tail) + tokenizer_warn_lines = _grep_lines(log_file, r"tokenizer failed, fallback to char tokens", tail) + + # 解析策略行 + strategy_records = [r for line in h6_lines for r in [parse_cache_strategy_line(line)] if r] + + if not strategy_records: + return { + "strategy_dist": [], + "fallback_reasons": [], + "session_stickiness": {}, + "suboptimal_selections": [], + "eviction_impact": [], + "cold_starts": 0, + "hitratio_stats": {}, + "diagnoses": [], + "summary": "未检测到 cache-aware 策略日志", + } + + # Tokenizer 退化次数 + tokenizer_degraded_count = len(tokenizer_warn_lines) + + # 策略分布 + strategy_dist = count_by(strategy_records, "strategy") + + # Fallback 原因 + fallback_records = [r for r in strategy_records if r.get("strategy") == "process_tokens"] + fallback_reasons = [] + if fallback_records: + for r in fallback_records: + r["fallback_type"] = classify_fallback(r) + fallback_reasons = count_by(fallback_records, "fallback_type") + + # hitRatio 统计 + hr_vals = [r.get("selected_hitRatio", 0) for r in strategy_records if "selected_hitRatio" in r] + hitratio_stats = compute_statistics(hr_vals) if hr_vals else {} + + # Session 粘性分析 + session_stickiness = _analyze_session_stickiness(strategy_records) + + # 非最优选择分析 + suboptimal = _analyze_suboptimal(strategy_records, hit_ratio_weight, load_balance_weight) + + # 驱逐影响 + eviction_impact = _analyze_eviction(strategy_records, eviction_duration_mins) + + # 冷启动 + cold_starts = sum(1 for r in strategy_records if r.get("hitRatios") == {}) + + total = len(strategy_records) + cache_aware_count = sum(1 for r in strategy_records if r["strategy"] == "cache_aware_scoring") + fallback_count = len(fallback_records) + + diagnoses = _diagnose( + strategy_dist, + 
fallback_reasons, + session_stickiness, + suboptimal, + eviction_impact, + cold_starts, + total, + tokenizer_degraded_count, + hitratio_stats, + ) + + return { + "strategy_dist": strategy_dist, + "fallback_reasons": fallback_reasons, + "session_stickiness": session_stickiness, + "suboptimal_selections": suboptimal, + "eviction_impact": eviction_impact, + "cold_starts": cold_starts, + "hitratio_stats": hitratio_stats, + "tokenizer_degraded_count": tokenizer_degraded_count, + "diagnoses": diagnoses, + "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " + f"冷启动 {cold_starts}", + } + + +def _analyze_session_stickiness(records): + """Session 粘性分析。""" + sessions = defaultdict(list) + for r in records: + sid = (r.get("tags") or {}).get("session_id") + if sid and "selected" in r: + sessions[sid].append(r["selected"]) + + result = {} + for sid, workers in sessions.items(): + if len(workers) < 2: + continue + same_count = sum(1 for i in range(1, len(workers)) if workers[i] == workers[i - 1]) + stickiness = round(same_count / (len(workers) - 1) * 100, 1) + switches = [(i, workers[i - 1], workers[i]) for i in range(1, len(workers)) if workers[i] != workers[i - 1]] + result[sid] = { + "total_requests": len(workers), + "stickiness_pct": stickiness, + "switches": len(switches), + } + + return result + + +def _analyze_suboptimal(records, hr_weight, lb_weight): + """非最优选择分析:selected 的 hitRatio 不是最高时,重新计算 score 对比。""" + suboptimal = [] + for r in records: + if r.get("strategy") != "cache_aware_scoring": + continue + hit_ratios = r.get("hitRatios", {}) + loads = r.get("loads", {}) + selected = r.get("selected") + if not hit_ratios or not selected or selected not in hit_ratios: + continue + + max_hr = max(hit_ratios.values()) if hit_ratios else 0 + sel_hr = hit_ratios.get(selected, 0) + + if sel_hr >= max_hr: + continue + + # 计算 scores: score = (100-hitRatio)/100 * hrWeight + loadRatio * lbWeight + # Go 源码使用 maxLoad 做归一化: loadRatio = load / maxLoad 
+ max_load = max(loads.values()) if loads else 1 + max_load = max(max_load, 1) + scores = {} + for w_url in hit_ratios: + hr = hit_ratios.get(w_url, 0) + load = loads.get(w_url, 0) + load_ratio = load / max_load + score = (100 - hr) / 100 * hr_weight + load_ratio * lb_weight + scores[w_url] = round(score, 4) + + best_by_hr = min(hit_ratios, key=lambda w: -hit_ratios[w]) + sel_score = scores.get(selected, 0) + best_hr_score = scores.get(best_by_hr, 0) + + # 分类原因 + load_diff = abs(loads.get(selected, 0) - loads.get(best_by_hr, 0)) + if load_diff > 5: + reason = "负载主导" + elif max_hr < 10: + reason = "区分度不够" + elif abs(sel_score - best_hr_score) < 0.05: + reason = "正常竞争" + else: + reason = "综合权衡" + + suboptimal.append( + { + "ts": r.get("ts", ""), + "selected": selected.replace("http://", ""), + "selected_hr": sel_hr, + "best_hr_worker": best_by_hr.replace("http://", ""), + "best_hr": max_hr, + "reason": reason, + } + ) + + return suboptimal + + +def _analyze_eviction(records, eviction_mins): + """驱逐影响分析:同 session 连续请求间隔 > eviction_duration。""" + sessions = defaultdict(list) + for r in records: + sid = (r.get("tags") or {}).get("session_id") + ts = r.get("ts") + if sid and ts: + sessions[sid].append(r) + + impacts = [] + for sid, reqs in sessions.items(): + reqs.sort(key=lambda x: x.get("ts", "")) + for i in range(1, len(reqs)): + try: + prev_dt = parse_ts(reqs[i - 1]["ts"]) + curr_dt = parse_ts(reqs[i]["ts"]) + interval_mins = (curr_dt - prev_dt).total_seconds() / 60 + if interval_mins > eviction_mins: + curr_hr = reqs[i].get("selected_hitRatio", -1) + impacts.append( + { + "session_id": sid, + "interval_mins": round(interval_mins, 1), + "hitRatio_after": curr_hr, + "evicted": curr_hr == 0, + } + ) + except (ValueError, KeyError): + pass + + return impacts + + +def _diagnose( + strategy_dist, + fallback_reasons, + session_stickiness, + suboptimal, + eviction_impact, + cold_starts, + total, + tokenizer_degraded_count, + hitratio_stats, +): + """生成 cache 调度诊断。""" + 
diagnoses = [] + + # Tokenizer 退化 + if tokenizer_degraded_count > 0: + pct = round(tokenizer_degraded_count / max(total, 1) * 100, 1) + sev = "HIGH" if pct > 10 else "MEDIUM" + diagnoses.append( + { + "severity": sev, + "message": f"Tokenizer 退化 {tokenizer_degraded_count} 次 ({pct}%),精度降低", + "source_layer": "Router", + } + ) + + # Fallback 比例 + for s in strategy_dist: + if s["value"] == "process_tokens" and s["pct"] > 20: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'Fallback 到 process_tokens {s["pct"]}%,cache-aware 策略未生效', + "source_layer": "Router", + } + ) + + # 非最优选择 + if suboptimal and total > 0: + pct = round(len(suboptimal) / total * 100, 1) + if pct > 20: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"非最优选择 {pct}%({len(suboptimal)}/{total})", + "source_layer": "Router", + } + ) + + # 冷启动 + if cold_starts > 0 and total > 0: + pct = round(cold_starts / total * 100, 1) + if pct > 10: + diagnoses.append( + {"severity": "LOW", "message": f"冷启动 {pct}%(hitRatios=map[])", "source_layer": "Router"} + ) + + # 驱逐影响 + evicted = [e for e in eviction_impact if e["evicted"]] + if evicted: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"{len(evicted)} 次驱逐后 hitRatio=0,考虑增大 eviction-duration-mins", + "source_layer": "Router", + } + ) + + # hitRatio 整体偏低 + if hitratio_stats.get("mean", 100) < 20: + diagnoses.append( + { + "severity": "LOW", + "message": f'平均 hitRatio {hitratio_stats["mean"]}%,缓存效果较差', + "source_layer": "Router", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_cache_report(result): + """将分析结果格式化为终端报告。""" + sections = ["## Cache 调度诊断", ""] + sections.append(f' {result["summary"]}') + sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] 
{d["message"]}') + sections.append("") + + # 策略分布 + if result["strategy_dist"]: + sections.append("### 策略分布") + sections.append("") + bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # hitRatio 统计 + hs = result.get("hitratio_stats", {}) + if hs: + sections.append("### hitRatio 统计") + sections.append("") + sections.append( + f' mean={hs.get("mean",0)}% p50={hs.get("p50",0)}% p90={hs.get("p90",0)}% ' + f'p99={hs.get("p99",0)}% max={hs.get("max",0)}%' + ) + sections.append("") + + # Fallback 原因 + if result["fallback_reasons"]: + sections.append("### Fallback 原因分布") + sections.append("") + bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # Tokenizer 退化 + if result.get("tokenizer_degraded_count", 0) > 0: + sections.append(f' Tokenizer 退化: {result["tokenizer_degraded_count"]} 次') + sections.append("") + + # Session 粘性 + stickiness = result.get("session_stickiness", {}) + if stickiness: + sections.append("### Session 粘性") + sections.append("") + table_data = [ + { + "Session": sid[:16], + "请求数": str(s["total_requests"]), + "粘性率": f'{s["stickiness_pct"]}%', + "切换次数": str(s["switches"]), + } + for sid, s in sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) + ] + sections.append( + render_table( + table_data[:10], + columns=["Session", "请求数", "粘性率", "切换次数"], + right_align={"请求数", "粘性率", "切换次数"}, + ) + ) + sections.append("") + + # 非最优选择 + if result.get("suboptimal_selections"): + subs = result["suboptimal_selections"] + sections.append(f"### 非最优选择 ({len(subs)} 次)") + sections.append("") + reason_counts = defaultdict(int) + for s in subs: + reason_counts[s["reason"]] += 1 + for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): + sections.append(f" {reason}: 
{count} 次") + sections.append("") + + # 驱逐影响 + if result.get("eviction_impact"): + evictions = result["eviction_impact"] + evicted = [e for e in evictions if e["evicted"]] + sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") + sections.append("") + + # 冷启动 + if result.get("cold_starts", 0) > 0: + sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') + sections.append("") + + return "\n".join(sections) + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py new file mode 100644 index 00000000000..0817e280aa5 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 +""" +Errors Analyzer — 错误分类分析 + +分析 Router 日志中的 ERROR/WARN 日志、HTTP 状态码分布、Panic 事件。 +按问题来源层(Router / FastDeploy 后端 / 客户端)标注每类错误。 +""" + +import os +import subprocess +import sys + +# 让 analyzers 能 import 同级 scripts 下的模块 +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_sparkline, render_table +from log_parser import extract_ts, parse_error_line, parse_http_line +from stats import count_by, 
time_bucket + +# ════════════════════════════════════════════════════════════════ +# 错误来源层映射(从 error_catalog.md 提取的核心规则) +# ════════════════════════════════════════════════════════════════ + +# 模板 → 来源层 映射(归一化后的模板匹配) +SOURCE_LAYER_RULES = [ + # Router 自身 + ("Failed to build disaggregate_info", "Router"), + ("Failed to encode modified request", "Router"), + ("Panic recovered", "Router"), + ("DefaultManager is nil", "Router"), + ("double-release", "Router"), + ("counter already cleaned up", "Router"), + ("counter already zero", "Router"), + ("tokenizer failed", "Router"), + ("Instance {url} role is unknown", "Router"), + # 客户端 + ("Invalid request body", "客户端"), + ("Invalid JSON format", "客户端"), + ("Failed to read request body", "客户端"), + ("Failed to unmarshal request JSON", "客户端"), + # FD 后端(默认多数 ERROR 来自后端) + ("Failed to select", "FD 后端"), + ("Failed to connect to backend", "FD 后端"), + ("No available", "FD 后端"), + ("request failed", "FD 后端"), + ("Removed unhealthy", "FD 后端"), + ("Backend request failed", "FD 后端"), + ("Decode request failed", "FD 后端"), + ("Prefill request failed", "FD 后端"), + ("Failed to create decode request", "FD 后端"), + ("Failed to create prefill request", "FD 后端"), + ("Failed to create backend request", "FD 后端"), + ("GetRemoteMetrics failed", "FD 后端"), +] + +# scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 +SCANNER_COPY_PATTERNS = ("scanner error", "copy error") + + +def classify_source_layer(template, original=""): + """根据错误模板判断来源层。""" + # scanner error / copy error 特殊判断 + for pat in SCANNER_COPY_PATTERNS: + if pat in template or pat in original: + if "context canceled" in original: + return "客户端" + return "FD 后端" + + for pattern, layer in SOURCE_LAYER_RULES: + if pattern in template: + return layer + + return "未知" + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_errors(log_file, tail=None, top_n=20): + 
"""分析日志中的错误。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制(None 则全量) + top_n: 错误 Top N + + Returns: + dict: { + error_top_n: [{template, count, pct, source_layer, level, urls}], + status_code_dist: [{value, count, pct}], + panic_list: [{ts, context}], + error_rate: float, + error_trend: [{bucket, count}], + total_errors: int, + total_warns: int, + total_requests: int, + summary: str, + } + """ + # Phase 1: Grep 提取各类日志 + error_lines = _grep_lines(log_file, r"\[ERROR\]", tail) + warn_lines = _grep_lines(log_file, r"\[WARN\]", tail) + http_lines = _grep_lines(log_file, r"\[(POST|GET)\] /", tail) + panic_lines = _grep_lines(log_file, "Panic recovered", tail) + + # Phase 2: 解析 + # 2.1 ERROR + WARN 归一化 + error_records = [parse_error_line(line) for line in error_lines] + warn_records = [parse_error_line(line) for line in warn_lines] + all_error_records = error_records + warn_records + + # 2.2 HTTP 请求解析 + http_records = [] + for line in http_lines: + r = parse_http_line(line) + if r: + http_records.append(r) + + # 2.3 Panic 提取 + panic_list = [] + for line in panic_lines: + ts = extract_ts(line) + panic_list.append({"ts": ts or "", "context": line.strip()}) + + # Phase 3: 分析 + # 3.1 按模板分组 Top N + error_top = _compute_error_top_n(all_error_records, top_n) + + # 3.2 HTTP 状态码分布 + status_dist = count_by(http_records, "status") + + # 3.3 错误率 + total_requests = len(http_records) + non_200 = sum(1 for r in http_records if r["status"] != 200) + error_rate = round(non_200 / total_requests * 100, 2) if total_requests else 0 + + # 3.4 错误趋势(按时间窗口统计非 200 请求数) + non_200_records = [r for r in http_records if r["status"] != 200] + error_trend = time_bucket(non_200_records, window="auto") + + return { + "error_top_n": error_top, + "status_code_dist": status_dist, + "panic_list": panic_list, + "error_rate": error_rate, + "error_trend": error_trend, + "total_errors": len(error_records), + "total_warns": len(warn_records), + "total_requests": total_requests, + } + + +def 
_compute_error_top_n(records, top_n): + """按模板分组并标注来源层。""" + # 分组 + groups = {} + for r in records: + tpl = r["template"] + if tpl not in groups: + groups[tpl] = { + "template": tpl, + "count": 0, + "level": r["level"], + "originals": [], + } + groups[tpl]["count"] += 1 + # 保留最多 5 个原始消息用于详细报告中提取 URL + if len(groups[tpl]["originals"]) < 5: + groups[tpl]["originals"].append(r["original"]) + + total = len(records) + result = [] + for g in sorted(groups.values(), key=lambda x: -x["count"]): + source_layer = classify_source_layer(g["template"], g["originals"][0] if g["originals"] else "") + result.append( + { + "template": g["template"], + "count": g["count"], + "pct": round(g["count"] / total * 100, 1) if total else 0, + "source_layer": source_layer, + "level": g["level"], + "sample_originals": g["originals"], + } + ) + if len(result) >= top_n: + break + + return result + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + # 先 tail 再 grep + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + """简单 shell 引号转义。""" + return "'" + s.replace("'", "'\\''") + "'" + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_errors_report(result): + """将分析结果格式化为终端报告。 + + Args: + result: analyze_errors 返回的 dict + + Returns: + str: 格式化后的报告文本 + """ + sections = [] + + # 标题 + sections.append("## 错误分析") + sections.append("") + + # 概览 + sections.append( + f' ERROR: {result["total_errors"]} | ' + f'WARN: {result["total_warns"]} | ' + f'请求总数: 
{result["total_requests"]} | ' + f'错误率: {result["error_rate"]}%' + ) + sections.append("") + + # Panic + if result["panic_list"]: + sections.append(f' ⚠ Panic 事件: {len(result["panic_list"])} 次') + for p in result["panic_list"][:5]: + sections.append(f' [{p["ts"]}] {p["context"][:100]}') + sections.append("") + + # 错误 Top N + if result["error_top_n"]: + sections.append("### ERROR/WARN Top 分类") + sections.append("") + bar_data = [] + for e in result["error_top_n"][:10]: + label = e["template"][:50] + bar_data.append( + { + "label": label, + "value": e["pct"], + "count": e["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 来源层表格 + table_data = [] + for e in result["error_top_n"][:10]: + table_data.append( + { + "模板": e["template"][:60], + "数量": e["count"], + "占比": f'{e["pct"]}%', + "级别": e["level"], + "来源层": e["source_layer"], + } + ) + sections.append( + render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) + ) + sections.append("") + + # 状态码分布 + if result["status_code_dist"]: + sections.append("### HTTP 状态码分布") + sections.append("") + bar_data = [] + for s in result["status_code_dist"]: + bar_data.append( + { + "label": str(s["value"]), + "value": s["pct"], + "count": s["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 错误趋势 + if result["error_trend"] and len(result["error_trend"]) > 1: + sections.append("### 非 200 请求趋势") + sections.append("") + sections.append( + render_sparkline( + result["error_trend"], + value_field="count", + title="Error Count", + y_label="req", + ) + ) + sections.append("") + + return "\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py new file mode 100644 index 00000000000..d2d7ca77acb --- /dev/null +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -0,0 +1,421 @@ +#!/usr/bin/env python3 +""" +Health Analyzer — Worker 健康时间线分析 + +追踪 Worker 上下线事件、恢复检测、可用性统计。 +按 Worker URL 聚合事件,构建状态时间线。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_table, render_timeline +from log_parser import extract_ts, parse_http_line, parse_ts + +# ════════════════════════════════════════════════════════════════ +# 健康事件解析 +# ════════════════════════════════════════════════════════════════ + +NOT_HEALTHY_RE = re.compile(r"(http://\S+)\s+is not healthy") +REMOVED_RE = re.compile(r"Removed unhealthy \w+ instance:\s*(http://\S+)") +IS_HEALTHY_RE = re.compile(r"(http://\S+)\s+is healthy") +COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") +CLEANUP_UNHEALTHY_RE = re.compile(r"cleanup unhealthy.*?(http://\S+)") + + +def parse_health_event(line): + """解析 H2 健康事件行。返回 {ts, worker, event_type} 或 None。""" + ts = extract_ts(line) + m = REMOVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "REMOVED"} + m = NOT_HEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "NOT_HEALTHY"} + m = IS_HEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "HEALTHY"} + return None + + +def parse_counter_preserved(line): + """解析 H5 counter preserved / cleanup 事件。""" + ts = extract_ts(line) + m = COUNTER_PRESERVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "COUNTER_PRESERVED"} + m = CLEANUP_UNHEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "CLEANUP_UNHEALTHY"} + return None + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def 
analyze_health(log_file, tail=None): + """分析 Worker 健康状态。 + + Returns: + dict: {workers, diagnoses, time_range, summary} + """ + h2_lines = _grep_lines(log_file, r"Removed unhealthy|is not healthy|is healthy", tail) + h5_lines = _grep_lines(log_file, r"counter preserved|cleanup unhealthy", tail) + register_lines = _grep_lines(log_file, r"\[POST\] /register", tail) + + health_events = [e for line in h2_lines for e in [parse_health_event(line)] if e] + counter_events = [e for line in h5_lines for e in [parse_counter_preserved(line)] if e] + + register_events = [] + for line in register_lines: + r = parse_http_line(line) + if r and r["method"] == "POST" and r["path"] == "/register" and r["status"] == 200: + register_events.append({"ts": r["ts"], "client_ip": r["client_ip"]}) + + if not health_events and not register_events: + return { + "workers": {}, + "diagnoses": [], + "time_range": {"start": "", "end": ""}, + "summary": "未检测到 Worker 健康事件", + } + + workers = _build_worker_timelines(health_events, counter_events, register_events) + + all_ts = sorted([e["ts"] for e in health_events + register_events if e.get("ts")]) + time_range = {"start": all_ts[0] if all_ts else "", "end": all_ts[-1] if all_ts else ""} + + diagnoses = _diagnose(workers) + down_workers = sum(1 for w in workers.values() if w["down_count"] > 0) + + return { + "workers": workers, + "diagnoses": diagnoses, + "time_range": time_range, + "summary": f"{len(workers)} Worker(s), {down_workers} 有下线事件", + } + + +def _build_worker_timelines(health_events, counter_events, register_events): + """构建每个 Worker 的状态时间线。""" + worker_urls = {evt["worker"] for evt in health_events} + + # IP → worker URL 映射 + ip_to_urls = defaultdict(set) + for url in worker_urls: + ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + if ip_m: + ip_to_urls[ip_m.group(1)].add(url) + + worker_events = defaultdict(list) + for evt in health_events: + worker_events[evt["worker"]].append(evt) + + counter_counts = defaultdict(int) + for evt 
in counter_events: + if evt["event_type"] == "COUNTER_PRESERVED": + counter_counts[evt["worker"]] += 1 + + register_by_ip = defaultdict(list) + for evt in register_events: + register_by_ip[evt["client_ip"]].append(evt) + + workers = {} + for url in sorted(worker_urls): + events = sorted(worker_events[url], key=lambda e: e["ts"] or "") + ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + worker_ip = ip_m.group(1) if ip_m else "" + + # 恢复检测:REMOVED 后有 register + recovered = False + recovery_events = [] + for evt in events: + if evt["event_type"] == "REMOVED" and worker_ip: + for reg in register_by_ip.get(worker_ip, []): + if reg["ts"] and evt["ts"] and reg["ts"] > evt["ts"]: + recovered = True + recovery_events.append({"ts": reg["ts"], "type": "RE-REGISTERED"}) + break + + all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + all_events.extend(recovery_events) + all_events.sort(key=lambda e: e["ts"] or "") + + down_periods = _compute_down_periods(all_events) + down_count = len(down_periods) + avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + + workers[url] = { + "events": all_events, + "uptime_pct": _compute_uptime_pct(all_events), + "down_count": down_count, + "avg_down_duration_s": round(avg_down_s, 1), + "recovered": recovered, + "inflight_preserved": counter_counts.get(url, 0), + "down_periods": down_periods, + } + + return workers + + +def _compute_down_periods(events): + """从事件列表计算下线时段。""" + down_periods = [] + down_start = None + for evt in events: + if evt["type"] in ("NOT_HEALTHY", "REMOVED"): + if down_start is None and evt["ts"]: + down_start = evt["ts"] + elif evt["type"] in ("HEALTHY", "RE-REGISTERED"): + if down_start is not None and evt["ts"]: + try: + duration_s = (parse_ts(evt["ts"]) - parse_ts(down_start)).total_seconds() + down_periods.append({"start": down_start, "end": evt["ts"], "duration_s": max(0, duration_s)}) + except ValueError: + pass + down_start = None + if 
down_start is not None: + down_periods.append({"start": down_start, "end": None, "duration_s": 0}) + return down_periods + + +def _compute_uptime_pct(events): + """计算 Worker 可用性百分比。""" + if not events: + return 100.0 + ts_list = [e["ts"] for e in events if e["ts"]] + if len(ts_list) < 2: + return 0.0 if events[0]["type"] in ("NOT_HEALTHY", "REMOVED") else 100.0 + try: + first_dt, last_dt = parse_ts(ts_list[0]), parse_ts(ts_list[-1]) + total_s = (last_dt - first_dt).total_seconds() + if total_s <= 0: + return 100.0 + except ValueError: + return 100.0 + + down_s, down_start = 0.0, None + for evt in events: + if evt["type"] in ("NOT_HEALTHY", "REMOVED") and down_start is None and evt["ts"]: + try: + down_start = parse_ts(evt["ts"]) + except ValueError: + pass + elif evt["type"] in ("HEALTHY", "RE-REGISTERED") and down_start is not None and evt["ts"]: + try: + down_s += (parse_ts(evt["ts"]) - down_start).total_seconds() + except ValueError: + pass + down_start = None + if down_start is not None: + down_s += (last_dt - down_start).total_seconds() + + return round(max(0, total_s - down_s) / total_s * 100, 1) + + +def _diagnose(workers): + """根据 Worker 健康数据生成诊断。""" + diagnoses = [] + if not workers: + return diagnoses + + all_down = all(w["events"] and w["events"][-1]["type"] in ("NOT_HEALTHY", "REMOVED") for w in workers.values()) + if all_down: + diagnoses.append( + { + "severity": "CRITICAL", + "message": f"所有 Worker ({len(workers)}) 当前均不可用", + "source_layer": "FD 后端", + } + ) + + for url, w in workers.items(): + s = url.replace("http://", "") + if w["down_count"] > 3: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{s} 下线 {w["down_count"]} 次,Worker 不稳定', + "source_layer": "FD 后端", + } + ) + for p in w.get("down_periods", []): + if p["duration_s"] > 300: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{s} 下线 {p["duration_s"]/60:.1f}min({p["start"]} ~ {p["end"] or "未恢复"})', + "source_layer": "FD 后端", + } + ) + if len(w["events"]) >= 3: + 
ts_list = [e["ts"] for e in w["events"] if e["ts"]] + if len(ts_list) >= 2: + try: + hours = (parse_ts(ts_list[-1]) - parse_ts(ts_list[0])).total_seconds() / 3600 + if hours > 0 and len(w["events"]) / hours > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{s} 状态变更频繁 ({len(w["events"])/hours:.1f} 次/小时)', + "source_layer": "FD 后端", + } + ) + except ValueError: + pass + if w["inflight_preserved"] > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{s} counter preserved {w["inflight_preserved"]} 次(下线时仍有 inflight 请求)', + "source_layer": "FD 后端", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_health_report(result): + """将分析结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_text) + summary_text: 总结部分(诊断 + 可用性表格 + 时间线) + detail_text: 事件详情(逐条事件记录,可能很长) + """ + sections = ["## Worker 健康分析", ""] + if not result["workers"]: + sections.append(" 未检测到 Worker 健康事件(所有 Worker 状态正常或无健康日志)") + return "\n".join(sections), "" + + sections.append(f' {result["summary"]}') + if result["time_range"]["start"]: + sections.append(f' 时间范围: {result["time_range"]["start"]} ~ {result["time_range"]["end"]}') + sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append("") + + # Worker 可用性表格 + sections.append("### Worker 可用性") + sections.append("") + table_data = [] + for url, w in sorted(result["workers"].items()): + avg_down = "" + if w["avg_down_duration_s"] > 0: + avg_down = ( + f'{w["avg_down_duration_s"]/60:.1f}min' + if w["avg_down_duration_s"] >= 60 + else f'{w["avg_down_duration_s"]:.0f}s' + ) + table_data.append( + { + "Worker": url.replace("http://", ""), + "在线率": f'{w["uptime_pct"]}%', + "下线次数": str(w["down_count"]), + "平均下线时长": avg_down or 
"-", + "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), + "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], + right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + ) + ) + sections.append("") + + # 时间线 + if result["time_range"]["start"] and result["time_range"]["end"]: + sections.append("### Worker 时间线") + sections.append("") + timeline_data = _build_timeline_data(result) + if timeline_data: + sections.append(render_timeline(timeline_data, width=40)) + sections.append("") + + # 事件详情 → 拆分到 detail_text + detail_parts = ["# Worker 健康事件详情", ""] + has_events = False + for url, w in sorted(result["workers"].items()): + if w["events"]: + has_events = True + detail_parts.append(f'## {url.replace("http://", "")}') + detail_parts.append("") + for evt in w["events"]: + detail_parts.append(f' [{evt["ts"]}] {evt["type"]}') + detail_parts.append("") + + detail_text = "\n".join(detail_parts) if has_events else "" + + # 主报告中添加引用 + if has_events: + sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("") + + return "\n".join(sections), detail_text + + +def _build_timeline_data(result): + """构建 render_timeline 需要的数据格式。""" + tr = result["time_range"] + if not tr["start"] or not tr["end"]: + return None + workers_data = {} + for url, w in result["workers"].items(): + periods = [] + status, start = "up", tr["start"] + for evt in w["events"]: + if not evt["ts"]: + continue + if evt["type"] in ("NOT_HEALTHY", "REMOVED") and status == "up": + periods.append({"from": start, "to": evt["ts"], "status": "up"}) + status, start = "down", evt["ts"] + elif evt["type"] in ("HEALTHY", "RE-REGISTERED") and status == "down": + periods.append({"from": start, "to": evt["ts"], "status": "down"}) + status, start = "up", evt["ts"] + periods.append({"from": start, "to": tr["end"], 
"status": status}) + workers_data[url] = periods + return {"start": tr["start"], "end": tr["end"], "workers": workers_data} + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py new file mode 100644 index 00000000000..eec862910e8 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +""" +Latency Analyzer — 延迟分析 + +分析 Router 日志中的请求延迟百分位数、延迟分布、吞吐量趋势、调度耗时、慢请求。 +仅统计推理请求路径(/v1/chat/completions, /v1/completions)。 +""" + +import os +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_sparkline, render_table +from log_parser import TS_MS_RE, extract_tags, parse_http_line +from stats import compute_statistics, time_bucket + +# ════════════════════════════════════════════════════════════════ +# 调度耗时解析 +# ════════════════════════════════════════════════════════════════ + + +def _parse_scheduling_ms(ts_ms_lines): + """从 ts_ms 行计算调度耗时(同一请求两个 ts_ms 之间的差值)。 + + 同一 request_id 的两条 ts_ms 行之间的时间差即为调度耗时。 + 
返回 ms 列表。 + """ + from datetime import datetime + + # 按 request_id 分组 + by_reqid = defaultdict(list) + for line in ts_ms_lines: + m = TS_MS_RE.search(line) + if not m: + continue + ts_ms_str = m.group(1) + tags = extract_tags(line) + rid = tags.get("request_id", "") + if rid: + try: + dt = datetime.strptime(ts_ms_str, "%Y-%m-%d %H:%M:%S.%f") + by_reqid[rid].append(dt) + except ValueError: + pass + + # 计算每个 request_id 的 max - min 差值 + durations = [] + for rid, timestamps in by_reqid.items(): + if len(timestamps) >= 2: + timestamps.sort() + delta_ms = (timestamps[-1] - timestamps[0]).total_seconds() * 1000 + durations.append(round(delta_ms, 3)) + + return durations + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + +LATENCY_DIST_SPEC = "<100,100-500,500-1000,1000-5000,5000-10000,>10000" + + +def analyze_latency(log_file, tail=None): + """分析日志中的请求延迟。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制 + + Returns: + dict: { + stats: {count, p50, p90, p95, p99, max, mean, stddev, distribution}, + latency_trend: [{bucket, latency_ms_p50}], + throughput_trend: [{bucket, count}], + slow_top10: [{ts, path, status, latency_ms, client_ip}], + scheduling_stats: {p50, p90, p99} | None, + diagnoses: [{message, severity}], + } + """ + # Phase 1: Grep 提取 + http_lines = _grep_lines(log_file, r"\[(POST|GET)\] /", tail) + ts_ms_lines = _grep_lines(log_file, "ts_ms=", tail) + + # Phase 2: 解析 HTTP 行(仅推理路径) + http_records = [] + for line in http_lines: + r = parse_http_line(line, inference_only=True) + if r: + http_records.append(r) + + # Phase 3: 分析 + + # 3.1 延迟统计 + latency_values = [r["latency_ms"] for r in http_records] + stats = compute_statistics( + latency_values, + percentiles_list=[50, 90, 95, 99], + distribution_spec=LATENCY_DIST_SPEC, + ) + + # 3.2 延迟趋势 (p50) + latency_trend = time_bucket( + http_records, + window="auto", + agg_specs=[("latency_ms", "p50")], + ) + + # 3.3 吞吐量趋势 + 
throughput_trend = time_bucket(http_records, window="auto") + + # 3.4 慢请求 Top 10 + sorted_by_latency = sorted(http_records, key=lambda r: -r["latency_ms"]) + slow_top10 = [] + for r in sorted_by_latency[:10]: + slow_top10.append( + { + "ts": r["ts"], + "path": r["path"], + "status": r["status"], + "latency_ms": r["latency_ms"], + "client_ip": r["client_ip"], + } + ) + + # 3.5 调度耗时 + scheduling_stats = None + if ts_ms_lines: + sched_durations = _parse_scheduling_ms(ts_ms_lines) + if sched_durations: + sched_raw = compute_statistics(sched_durations, percentiles_list=[50, 90, 99]) + scheduling_stats = { + "p50": sched_raw["p50"], + "p90": sched_raw["p90"], + "p99": sched_raw["p99"], + "count": sched_raw["count"], + } + + # 3.6 诊断规则 + diagnoses = _run_diagnostics(stats, scheduling_stats) + + return { + "stats": stats, + "latency_trend": latency_trend, + "throughput_trend": throughput_trend, + "slow_top10": slow_top10, + "scheduling_stats": scheduling_stats, + "diagnoses": diagnoses, + } + + +def _run_diagnostics(stats, scheduling_stats): + """应用诊断规则。""" + diagnoses = [] + + if stats["count"] == 0: + diagnoses.append({"message": "未找到推理请求", "severity": "INFO"}) + return diagnoses + + p99 = stats.get("p99", 0) + p50 = stats.get("p50", 0) + + # p99 > 10s + if p99 > 10000: + if scheduling_stats and scheduling_stats["p99"] < 100: + diagnoses.append( + { + "message": f'p99={p99:.0f}ms 但调度仅 {scheduling_stats["p99"]:.0f}ms → 延迟在后端推理层', + "severity": "HIGH", + } + ) + elif scheduling_stats and scheduling_stats["p99"] >= 100: + diagnoses.append( + { + "message": f'p99={p99:.0f}ms 且调度 p99={scheduling_stats["p99"]:.0f}ms → 调度层瓶颈', + "severity": "CRITICAL", + } + ) + else: + diagnoses.append( + { + "message": f"p99={p99:.0f}ms (>10s),后端推理延迟高", + "severity": "HIGH", + } + ) + + # 尾延迟 + if p50 > 0 and p99 / p50 > 10: + diagnoses.append( + { + "message": f"p99/p50={p99/p50:.1f}x → 尾延迟严重", + "severity": "MEDIUM", + } + ) + + if not diagnoses: + diagnoses.append( + { + "message": f"延迟正常 
(p50={p50:.0f}ms, p99={p99:.0f}ms)", + "severity": "INFO", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_latency_report(result): + """将分析结果格式化为终端报告。""" + sections = [] + stats = result["stats"] + + sections.append("## 延迟分析") + sections.append("") + + if stats["count"] == 0: + sections.append(" 未找到推理请求 (/v1/chat/completions, /v1/completions)") + return "\n".join(sections) + + # 百分位数概览 + sections.append( + f' 推理请求: {stats["count"]} | ' + f'p50={_fmt_ms(stats["p50"])} p90={_fmt_ms(stats["p90"])} ' + f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' + f'max={_fmt_ms(stats["max"])}' + ) + sections.append("") + + # 延迟分布 + if stats.get("distribution"): + sections.append("### 延迟分布") + sections.append("") + bar_data = [] + for d in stats["distribution"]: + bar_data.append( + { + "label": d["range"], + "value": d["pct"], + "count": d["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 延迟趋势 + if result["latency_trend"] and len(result["latency_trend"]) > 1: + sections.append("### 延迟趋势 (p50)") + sections.append("") + sections.append( + 
render_sparkline( + result["latency_trend"], + value_field="latency_ms_p50", + title="p50 Latency", + y_label="ms", + ) + ) + sections.append("") + + # 吞吐量趋势 + if result["throughput_trend"] and len(result["throughput_trend"]) > 1: + sections.append("### 吞吐量趋势") + sections.append("") + sections.append( + render_sparkline( + result["throughput_trend"], + value_field="count", + title="Throughput", + y_label="req", + ) + ) + sections.append("") + + # 调度耗时 + if result["scheduling_stats"]: + ss = result["scheduling_stats"] + sections.append(f'### 调度耗时 ({ss["count"]} samples)') + sections.append(f' p50={_fmt_ms(ss["p50"])} p90={_fmt_ms(ss["p90"])} p99={_fmt_ms(ss["p99"])}') + sections.append("") + + # 慢请求 Top 10 + if result["slow_top10"]: + sections.append("### 慢请求 Top 10") + sections.append("") + table_data = [] + for r in result["slow_top10"]: + table_data.append( + { + "时间": r["ts"][-8:] if len(r["ts"]) > 8 else r["ts"], + "延迟": _fmt_ms(r["latency_ms"]), + "状态": str(r["status"]), + "路径": r["path"], + "Client": r["client_ip"], + } + ) + sections.append( + render_table( + table_data, + columns=["时间", "延迟", "状态", "路径", "Client"], + ) + ) + sections.append("") + + # 诊断 + if result["diagnoses"]: + sections.append("### 诊断") + for d in result["diagnoses"]: + severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} + mark = severity_mark.get(d["severity"], " ") + sections.append(f' [{mark}] {d["message"]}') + sections.append("") + + return "\n".join(sections) + + +def _fmt_ms(ms): + """格式化毫秒值为人类可读字符串。""" + if ms >= 60000: + return f"{ms/60000:.1f}min" + elif ms >= 1000: + return f"{ms/1000:.2f}s" + elif ms >= 1: + return f"{ms:.1f}ms" + else: + return f"{ms*1000:.0f}µs" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py new file mode 100644 index 00000000000..e712011d932 --- /dev/null +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +""" +Load Analyzer — 负载与计数器分析 + +分析 Worker 负载分布、计数器异常、请求堆积检测、token 计数器。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_sparkline, render_table +from log_parser import extract_ts, match_select_release, parse_stats_line +from stats import compute_statistics, time_bucket + +# ════════════════════════════════════════════════════════════════ +# Counter 异常检测正则 +# ════════════════════════════════════════════════════════════════ + +DOUBLE_RELEASE_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?double-release") +COUNTER_CLEANED_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?counter already cleaned up") +COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") +TOKEN_PRESERVED_RE = re.compile(r"token counter preserved.*?(http://\S+)") + +# Token 事件 +SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://\S+),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") + + +def parse_counter_anomaly(line): + """解析 H5 counter 异常行。""" + ts = extract_ts(line) + m = DOUBLE_RELEASE_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "double-release"} + m = COUNTER_CLEANED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "counter-cleaned-up"} + m = COUNTER_PRESERVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "counter-preserved"} + m = TOKEN_PRESERVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "token-preserved"} + return None + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# 
════════════════════════════════════════════════════════════════ + + +def analyze_load(log_file, tail=None): + """分析负载与计数器。 + + Returns: + dict: {load_stats, worker_load, load_trend, counter_anomalies, + select_release, token_stats, diagnoses, summary} + """ + h7_lines = _grep_lines(log_file, r"\[stats\]", tail) + h3_lines = _grep_lines(log_file, r"select worker|release worker|Failed to select", tail) + h5_lines = _grep_lines( + log_file, + r"counter preserved|cleanup unhealthy|removed counters|counter already|double-release|preserved counters", + tail, + ) + h11_lines = _grep_lines(log_file, r"release prefill tokens", tail) + + # 解析 stats 行 + stats_records = [r for line in h7_lines for r in [parse_stats_line(line)] if r] + + # 负载统计 + total_running_vals = [r["total_running"] for r in stats_records if "total_running" in r] + load_stats = compute_statistics(total_running_vals) if total_running_vals else {} + + # Per-Worker 负载分布 + worker_running = defaultdict(list) + for r in stats_records: + for w_url, running in r.get("workers", {}).items(): + worker_running[w_url].append(running) + + worker_load = [] + for w_url in sorted(worker_running.keys()): + vals = worker_running[w_url] + avg = sum(vals) / len(vals) if vals else 0 + worker_load.append( + { + "worker": w_url.replace("http://", ""), + "avg_running": round(avg, 1), + "max_running": max(vals) if vals else 0, + "samples": len(vals), + } + ) + + # 负载趋势 + load_trend = ( + time_bucket(stats_records, window="auto", agg_specs=[("total_running", "mean")]) if stats_records else [] + ) + + # Counter 异常 + counter_anomalies = defaultdict(lambda: defaultdict(int)) + for line in h5_lines: + evt = parse_counter_anomaly(line) + if evt: + counter_anomalies[evt["anomaly_type"]][evt["worker"]] += 1 + + anomaly_summary = [] + for atype, workers in counter_anomalies.items(): + total = sum(workers.values()) + anomaly_summary.append( + { + "type": atype, + "total": total, + "workers": dict(workers), + } + ) + + # Select/Release 匹配 + 
sr_result = ( + match_select_release(h3_lines) + if h3_lines + else {"matched": [], "unmatched_selects": [], "failed_selects": [], "per_worker": {}} + ) + + # Token 统计 + token_stats = _analyze_tokens(h3_lines, h11_lines) + + # 请求堆积检测 + pileup = _detect_pileup(stats_records) + + # 诊断 + diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup) + + return { + "load_stats": load_stats, + "worker_load": worker_load, + "load_trend": load_trend, + "counter_anomalies": anomaly_summary, + "select_release": sr_result, + "token_stats": token_stats, + "pileup_detected": pileup, + "diagnoses": diagnoses, + "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", + } + + +def _analyze_tokens(h3_lines, h11_lines): + """分析 token 分配与释放。""" + token_alloc = defaultdict(list) + token_release = defaultdict(list) + + for line in h3_lines: + m = SELECT_TOKENS_RE.search(line) + if m: + token_alloc[m.group(1)].append(int(m.group(2))) + + for line in h11_lines: + m = RELEASE_TOKENS_RE.search(line) + if m: + token_release[m.group(1)].append(int(m.group(2))) + + result = [] + all_workers = set(token_alloc.keys()) | set(token_release.keys()) + for w in sorted(all_workers): + allocs = token_alloc.get(w, []) + releases = token_release.get(w, []) + result.append( + { + "worker": w.replace("http://", ""), + "alloc_count": len(allocs), + "alloc_avg": round(sum(allocs) / len(allocs), 0) if allocs else 0, + "release_count": len(releases), + } + ) + return result + + +def _detect_pileup(stats_records): + """检测请求堆积:total_running 连续上升 >5 个采样点。""" + if len(stats_records) < 5: + return False + vals = [r.get("total_running", 0) for r in stats_records] + max_consecutive = 0 + current = 0 + for i in range(1, len(vals)): + if vals[i] > vals[i - 1]: + current += 1 + max_consecutive = max(max_consecutive, current) + else: + current = 0 + return max_consecutive >= 5 + + +def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): + """生成负载诊断。""" + 
diagnoses = [] + + if pileup: + diagnoses.append( + {"severity": "HIGH", "message": "total_running 持续上升,疑似请求堆积", "source_layer": "FD 后端"} + ) + + # 空闲 Worker + for w in worker_load: + if w["avg_running"] == 0 and w["samples"] > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{w["worker"]} running 持续 =0(空闲或故障未移除)', + "source_layer": "Router", + } + ) + + # 负载严重不均 + if load_stats.get("stddev", 0) > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'负载标准差 {load_stats["stddev"]},分布不均衡', + "source_layer": "Router", + } + ) + + # Counter 异常 + for a in anomaly_summary: + if a["type"] == "double-release" and a["total"] > 0: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'double-release {a["total"]} 次(计数器逻辑 bug)', + "source_layer": "Router", + } + ) + + # Select/Release 不一致 + for w_url, pw in sr_result.get("per_worker", {}).items(): + if pw.get("delta", 0) > 0: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{w_url.replace("http://","")} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "source_layer": "FD 后端", + } + ) + + # 卡住的请求 + if sr_result.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'{len(sr_result["unmatched_selects"])} 个 select 无对应 release(疑似卡住)', + "source_layer": "FD 后端", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_load_report(result): + """将分析结果格式化为终端报告。""" + sections = ["## 负载与计数器分析", ""] + sections.append(f' {result["summary"]}') + sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append("") + + # 负载概览 + ls = result.get("load_stats", {}) + if ls: + sections.append("### 负载概览 (total_running)") + sections.append("") + sections.append( + f' 
mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' + f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' + ) + sections.append("") + + # Per-Worker 负载 + if result["worker_load"]: + sections.append("### Per-Worker 负载") + sections.append("") + bar_data = [ + {"label": w["worker"][:25], "value": min(100, w["avg_running"] * 5), "count": w["avg_running"]} + for w in result["worker_load"] + ] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 负载趋势 + if result["load_trend"] and len(result["load_trend"]) > 1: + sections.append("### 负载趋势") + sections.append("") + sections.append( + render_sparkline( + result["load_trend"], value_field="total_running_mean", title="Total Running", y_label="req" + ) + ) + sections.append("") + + # Counter 异常 + if result["counter_anomalies"]: + sections.append("### 计数器异常") + sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{w.replace("http://","")}({c})' for w, c in a["workers"].items()) + sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') + sections.append("") + + # Select/Release 匹配 + sr = result.get("select_release", {}) + if sr.get("per_worker"): + sections.append("### Select/Release 匹配") + sections.append("") + table_data = [] + for w_url, pw in sorted(sr["per_worker"].items()): + table_data.append( + { + "Worker": w_url.replace("http://", ""), + "Select": str(pw["selects"]), + "Release": str(pw["releases"]), + "Delta": str(pw["delta"]), + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "Select", "Release", "Delta"], + right_align={"Select", "Release", "Delta"}, + ) + ) + sections.append("") + + if sr.get("unmatched_selects"): + sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') + for u in sr["unmatched_selects"][:5]: + sections.append(f' [{u.get("select_ts","")}] {u["worker"].replace("http://","")} ({u["type"]})') + sections.append("") + + # Token 统计 + if 
result.get("token_stats"): + sections.append("### Token 计数器") + sections.append("") + sections.append( + render_table( + result["token_stats"], + columns=["worker", "alloc_count", "alloc_avg", "release_count"], + right_align={"alloc_count", "alloc_avg", "release_count"}, + ) + ) + sections.append("") + + return "\n".join(sections) + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py new file mode 100644 index 00000000000..45a5056616e --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +""" +Trace Analyzer — 请求追踪 + +通过 session_id / trace_id / request_id / req_id 追踪单个或多个请求的 +完整生命周期,重建事件链,检测不完整生命周期。 +""" + +import os +import re +import subprocess +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from log_parser import ( + extract_tags, + extract_ts, + parse_cache_strategy_line, + parse_http_line, +) + +# ════════════════════════════════════════════════════════════════ +# 事件识别正则 +# ════════════════════════════════════════════════════════════════ + +PARSING_COMPLETE_RE = re.compile(r"Parsing completed.*worker 
selection") +SELECT_WORKER_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://\S+)") +RELEASE_WORKER_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://\S+)") +RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +REQUEST_COMPLETE_RE = re.compile(r"Request completed successfully") +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + +# Prefill 事件 +PREFILL_FIRST_CHUNK_RE = re.compile(r"\[prefill\] first chunk received.*?(http://\S+)") +PREFILL_DONE_RE = re.compile(r"\[prefill\] non-stream prefill response done.*?(http://\S+)") +PREFILL_ERROR_RE = re.compile(r"\[prefill\] (scanner error|copy error).*?(http://\S+)") +PREFILL_DEFER_RE = re.compile(r"\[prefill\] release in defer.*?(http://\S+)") +PREFILL_ERR_PATH_RE = re.compile(r"\[prefill\] release in CommonCompletions defer \(error path\).*?(http://\S+)") +FAILED_SELECT_RE = re.compile(r"Failed to select") + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_trace(log_file, trace_ids, tail=None): + """追踪指定 ID 的请求生命周期。 + + Args: + log_file: 日志文件路径 + trace_ids: ID 列表(逗号分隔的字符串或列表) + tail: 尾部行数限制 + + Returns: + dict: {traces: {id: {events, lifecycle_complete, diagnoses}}, summary} + """ + if isinstance(trace_ids, str): + trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] + + if not trace_ids: + return {"traces": {}, "summary": "未指定追踪 ID"} + + traces = {} + for tid in trace_ids: + # Grep 搜索四种 context tag + pattern = f"session_id:{tid}|trace_id:{tid}|request_id:{tid}|req_id:{tid}" + matching_lines = _grep_lines(log_file, pattern, tail) + + if not matching_lines: + traces[tid] = { + "events": [], + "lifecycle_complete": False, + "diagnoses": [{"severity": "INFO", "message": f"未找到 ID={tid} 的匹配行"}], + "matched_tag": None, + "related_ids": {}, + } + continue + + # 识别匹配到的 tag 类型,并展开 session 下所有 request_id + 
first_tags = extract_tags(matching_lines[0]) + is_session = tid in [first_tags.get("session_id", "")] + + # 如果是 session_id,收集所有关联的 request_id + related_request_ids = set() + if is_session: + for line in matching_lines: + tags = extract_tags(line) + rid = tags.get("request_id", "") + if rid: + related_request_ids.add(rid) + + # 为每个 request_id 额外搜索行 + extra_lines = [] + for rid in related_request_ids: + rid_lines = _grep_lines(log_file, f"request_id:{rid}", tail) + extra_lines.extend(rid_lines) + all_lines = list(set(matching_lines + extra_lines)) + else: + all_lines = matching_lines + + # 解析事件链 + events = _parse_event_chain(all_lines) + lifecycle_complete = _check_lifecycle_complete(events) + diagnoses = _diagnose_trace(events, lifecycle_complete) + + traces[tid] = { + "events": events, + "lifecycle_complete": lifecycle_complete, + "diagnoses": diagnoses, + "matched_tag": "session_id" if is_session else "request_id/trace_id", + "related_ids": { + "request_ids": sorted(related_request_ids) if is_session else [], + }, + } + + total_traced = len(traces) + complete = sum(1 for t in traces.values() if t["lifecycle_complete"]) + + return { + "traces": traces, + "summary": f"{total_traced} ID(s) 追踪, {complete} 生命周期完整", + } + + +def _parse_event_chain(lines): + """从匹配行重建事件链,按时间排序。""" + events = [] + + for line in lines: + ts = extract_ts(line) + tags = extract_tags(line) + + # HTTP 请求进入/完成 + http = parse_http_line(line) + if http: + events.append( + { + "ts": ts, + "type": "HTTP", + "tags": tags, + "method": http["method"], + "path": http["path"], + "status": http["status"], + "latency_ms": http["latency_ms"], + } + ) + continue + + # Parsing completed + if PARSING_COMPLETE_RE.search(line): + events.append({"ts": ts, "type": "PARSING_COMPLETE", "tags": tags}) + continue + + # Cache-aware strategy + strategy = parse_cache_strategy_line(line) + if strategy: + events.append( + { + "ts": ts, + "type": "CACHE_STRATEGY", + "tags": tags, + "strategy": strategy.get("strategy"), + 
"selected": strategy.get("selected", ""), + "selected_hitRatio": strategy.get("selected_hitRatio", 0), + } + ) + continue + + # Select worker + m = SELECT_WORKER_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "SELECT_WORKER", + "tags": tags, + "worker_type": m.group(1) or "unknown", + "worker": m.group(2), + } + ) + continue + + # Release worker + m = RELEASE_WORKER_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "RELEASE_WORKER", + "tags": tags, + "worker_type": m.group(1) or "unknown", + "worker": m.group(2), + } + ) + continue + + # Release tokens + m = RELEASE_TOKENS_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "RELEASE_TOKENS", + "tags": tags, + "worker": m.group(1), + "tokens": int(m.group(2)), + } + ) + continue + + # Prefill events + m = PREFILL_FIRST_CHUNK_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1)}) + continue + m = PREFILL_DONE_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_DONE", "tags": tags, "worker": m.group(1)}) + continue + m = PREFILL_ERROR_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2)}) + continue + m = PREFILL_DEFER_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_DEFER_RELEASE", "tags": tags, "worker": m.group(1)}) + continue + m = PREFILL_ERR_PATH_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1)}) + continue + + # Request completed + if REQUEST_COMPLETE_RE.search(line): + events.append({"ts": ts, "type": "REQUEST_COMPLETE", "tags": tags}) + continue + + # ts_ms + m = TS_MS_RE.search(line) + if m: + events.append({"ts": ts, "type": "TS_MS", "tags": tags, "ts_ms": m.group(1)}) + continue + + # Failed to select + if FAILED_SELECT_RE.search(line): + events.append({"ts": ts, "type": "FAILED_SELECT", "tags": tags}) + 
continue + + # 按时间排序 + events.sort(key=lambda e: e.get("ts") or "") + return events + + +def _check_lifecycle_complete(events): + """检查生命周期是否完整。""" + types = {e["type"] for e in events} + has_entry = "HTTP" in types or "PARSING_COMPLETE" in types + has_exit = "REQUEST_COMPLETE" in types or ( + "HTTP" in types and any(e["type"] == "HTTP" and e.get("status") for e in events) + ) + has_select = "SELECT_WORKER" in types + has_release = "RELEASE_WORKER" in types + + return has_entry and has_exit and (not has_select or has_release) + + +def _diagnose_trace(events, lifecycle_complete): + """生成追踪诊断。""" + diagnoses = [] + types = [e["type"] for e in events] + + if not lifecycle_complete: + if "SELECT_WORKER" in types and "RELEASE_WORKER" not in types: + diagnoses.append({"severity": "HIGH", "message": "有 select 但无 release — 疑似请求卡住"}) + elif "HTTP" not in types and "PARSING_COMPLETE" not in types: + diagnoses.append({"severity": "MEDIUM", "message": "未找到 HTTP 入口事件"}) + elif "REQUEST_COMPLETE" not in types: + diagnoses.append({"severity": "MEDIUM", "message": "未检测到请求完成事件 — 疑似异常退出"}) + + if "PREFILL_ERROR" in types: + for e in events: + if e["type"] == "PREFILL_ERROR": + diagnoses.append( + {"severity": "HIGH", "message": f'Prefill 错误: {e.get("error","")} @ {e.get("worker","")}'} + ) + + if "FAILED_SELECT" in types: + diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_trace_report(result): + """将追踪结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_dict) + summary_text: 总结部分(概览 + 诊断 + 生命周期状态) + detail_dict: {trace_id: event_chain_text} 各 ID 的完整事件链 + """ + sections = ["## 请求追踪", ""] + sections.append(f' {result["summary"]}') + sections.append("") + + detail_dict = {} + + for tid, trace in result["traces"].items(): + sections.append(f"### ID: {tid}") + 
if trace.get("matched_tag"): + sections.append(f' 匹配类型: {trace["matched_tag"]}') + if trace.get("related_ids", {}).get("request_ids"): + sections.append(f' 关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') + + status = "完整" if trace["lifecycle_complete"] else "不完整" + sections.append(f" 生命周期: {status}") + sections.append("") + + # 诊断 + if trace["diagnoses"]: + for d in trace["diagnoses"]: + sections.append(f' [{d["severity"]}] {d["message"]}') + sections.append("") + + # 事件链 → 拆分到 detail_dict + if trace["events"]: + detail_lines = [f"# 请求追踪事件链: {tid}", ""] + if trace.get("matched_tag"): + detail_lines.append(f'匹配类型: {trace["matched_tag"]}') + if trace.get("related_ids", {}).get("request_ids"): + detail_lines.append(f'关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') + detail_lines.append(f"生命周期: {status}") + detail_lines.append("") + detail_lines.append("## 事件链") + detail_lines.append("") + for evt in trace["events"]: + line = f' [{evt.get("ts","")}] {evt["type"]}' + if evt.get("worker"): + line += f' → {evt["worker"].replace("http://","")}' + if evt.get("status"): + line += f' [{evt["status"]}]' + if evt.get("latency_ms"): + line += f' {evt["latency_ms"]}ms' + if evt.get("strategy"): + line += f' strategy={evt["strategy"]}' + if evt.get("selected_hitRatio"): + line += f' hitRatio={evt["selected_hitRatio"]}' + if evt.get("tokens"): + line += f' tokens={evt["tokens"]}' + if evt.get("error"): + line += f' error={evt["error"]}' + detail_lines.append(line) + detail_lines.append("") + detail_dict[tid] = "\n".join(detail_lines) + + # 主报告中添加引用和摘要 + safe_tid = tid.replace("/", "_") + sections.append(f' 事件数: {len(trace["events"])}') + sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append("") + + return "\n".join(sections), detail_dict + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + 
+def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py new file mode 100644 index 00000000000..83bb0203432 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python3 +""" +Chart — 终端可视化渲染工具 + +提供 sparkline 折线图、Unicode 柱状图、Markdown 表格、Worker 时间线的渲染函数。 +所有函数返回字符串(不直接打印),方便组装到报告中。 + +Python 3 stdlib only,零依赖。 +""" + +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# Sparkline 折线图 +# ════════════════════════════════════════════════════════════════ + +BLOCK_CHARS = " ▁▂▃▄▅▆▇█" + + +def render_sparkline( + records, value_field="value", bucket_field="bucket", title=None, y_label=None, y_range=None, width=60 +): + """渲染 8 级 Unicode sparkline 折线图。 + + Args: + records: dict 列表,每个 dict 包含 bucket_field 和 value_field + value_field: 数值字段名 + bucket_field: 时间桶字段名 + title: 图表标题 + y_label: Y 轴标签(如 '%') + y_range: Y 轴范围 (min, max) 元组,None 则自动 + width: 图表宽度(字符数) + + Returns: + str: 渲染后的图表文本 + """ + if not records: + return " (no data)" + + all_values = [] + for r in records: + v = r.get(value_field) + if v is not None: + all_values.append(float(v)) + + if not all_values: + return " (no numeric data)" + + # Y 轴范围 + if y_range: + y_min, y_max = y_range + else: + y_min = min(all_values) + y_max = max(all_values) 
+ if y_max == y_min: + y_min = 0 if y_max > 0 else y_max - 1 + y_max = max(y_max, 1) + + y_span = y_max - y_min if y_max != y_min else 1 + + # 降采样 + n = len(records) + if n > width: + step = n / width + sampled = [] + for i in range(width): + start_idx = int(i * step) + end_idx = int((i + 1) * step) + chunk = records[start_idx:end_idx] + vals = [float(r.get(value_field, 0)) for r in chunk if r.get(value_field) is not None] + avg_record = { + bucket_field: chunk[0].get(bucket_field, ""), + value_field: sum(vals) / len(vals) if vals else 0, + } + sampled.append(avg_record) + records = sampled + + lines = [] + + # 标题行 + def fmt_val(v): + if abs(v) >= 1000: + return f"{v:.0f}" + elif abs(v) >= 10: + return f"{v:.1f}" + return f"{v:.2f}" + + header_parts = [] + if title: + header_parts.append(title) + header_parts.append(f"min={fmt_val(min(all_values))}") + header_parts.append(f"max={fmt_val(max(all_values))}") + if y_label: + header_parts.append(f"({y_label})") + lines.append(" " + " ".join(header_parts)) + + # Sparkline 字符 + spark_chars = [] + for r in records: + v = r.get(value_field) + if v is None: + spark_chars.append(" ") + continue + v = float(v) + normalized = (v - y_min) / y_span + level = max(0, min(8, round(normalized * 8))) + spark_chars.append(BLOCK_CHARS[level]) + lines.append(" " + "".join(spark_chars)) + + # X 轴标签 + data_width = len(records) + if data_width > 0: + + def short_bucket(r): + b = str(r.get(bucket_field, "")) + if " " in b: + b = b.split(" ")[-1] + return b[:5] if len(b) >= 5 else b + + lbl_width = 6 + max_labels = max(1, data_width // lbl_width) + n_records = len(records) + + if n_records <= 2: + indices = list(range(n_records)) + elif n_records <= max_labels: + indices = [0, n_records - 1] + else: + n_labels = min(5, max(2, max_labels)) + indices = [int(i * (n_records - 1) / (n_labels - 1)) for i in range(n_labels)] + + label_line = [" "] * (data_width + lbl_width + 2) + last_end = -1 + for idx in indices: + lbl = 
short_bucket(records[idx]) + pos = idx + if pos < last_end: + continue + for ci, c in enumerate(lbl): + p = pos + ci + if p < len(label_line): + label_line[p] = c + last_end = pos + len(lbl) + 1 + lines.append(" " + "".join(label_line).rstrip()) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Unicode 柱状图 +# ════════════════════════════════════════════════════════════════ + + +def render_bar(data, bar_width=20, show_count=False): + """渲染 Unicode 柱状图。 + + Args: + data: dict 列表,每个 dict 包含 label, value(百分比 0-100), 可选 count + bar_width: 柱状图宽度(字符数) + show_count: 是否显示绝对数量 + + Returns: + str: 渲染后的图表文本 + """ + if not data: + return " (no data)" + + max_label_len = max(len(str(d.get("label", ""))) for d in data) + max_label_len = max(max_label_len, 4) + + lines = [] + for d in data: + label = str(d.get("label", "")) + value = float(d.get("value", 0)) + count = d.get("count") + + filled = round(value / 100 * bar_width) if value > 0 else 0 + filled = max(1, filled) if value > 0 else 0 + filled = min(bar_width, filled) + empty = bar_width - filled + bar = "█" * filled + "░" * empty + + line = f" {label:<{max_label_len}} {bar} {value:>5.1f}%" + if show_count and count is not None: + line += f" (N={count})" + lines.append(line) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Markdown 表格 +# ════════════════════════════════════════════════════════════════ + + +def render_table(data, columns=None, right_align=None): + """渲染 Markdown 表格。 + + Args: + data: dict 列表 + columns: 列名列表,None 则用第一条记录的所有 key + right_align: 右对齐的列名集合 + + Returns: + str: 渲染后的表格文本 + """ + if not data: + return " (no data)" + + if columns is None: + columns = list(data[0].keys()) + if right_align is None: + right_align = set() + + # 计算列宽 + col_widths = {} + for col in columns: + col_widths[col] = len(col) + for row in data: + val = str(row.get(col, "")) + col_widths[col] = max(col_widths[col], len(val)) + + 
# 表头 + header_parts = [] + sep_parts = [] + for col in columns: + w = col_widths[col] + if col in right_align: + header_parts.append(f" {col:>{w}} ") + else: + header_parts.append(f" {col:<{w}} ") + sep_parts.append("-" * (w + 2)) + + lines = [] + lines.append("|" + "|".join(header_parts) + "|") + lines.append("|" + "|".join(sep_parts) + "|") + + # 数据行 + for row in data: + row_parts = [] + for col in columns: + val = str(row.get(col, "")) + w = col_widths[col] + if col in right_align: + row_parts.append(f" {val:>{w}} ") + else: + row_parts.append(f" {val:<{w}} ") + lines.append("|" + "|".join(row_parts) + "|") + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Worker 可用性时间线 +# ════════════════════════════════════════════════════════════════ + + +def render_timeline(data, width=40): + """渲染 Worker 可用性时间线。 + + Args: + data: dict,结构为: + { + 'start': 'YYYY/MM/DD HH:MM:SS', + 'end': 'YYYY/MM/DD HH:MM:SS', + 'workers': { + 'http://ip:port': [ + {'from': 'ts', 'to': 'ts', 'status': 'up'|'down'}, + ... + ], + ... 
+ } + } + width: 时间线宽度(字符数) + + Returns: + str: 渲染后的时间线文本 + """ + if not data: + return " (no data)" + + start_str = data.get("start", "") + end_str = data.get("end", "") + workers = data.get("workers", {}) + + if not workers or not start_str or not end_str: + return " (insufficient data)" + + # Parse time range + try: + if "/" in start_str: + fmt = "%Y/%m/%d %H:%M:%S" + else: + fmt = "%H:%M:%S" + t_start = datetime.strptime(start_str, fmt) + t_end = datetime.strptime(end_str, fmt) + except ValueError: + return f" (cannot parse time range: {start_str} ~ {end_str})" + + total_seconds = (t_end - t_start).total_seconds() + if total_seconds <= 0: + total_seconds = 1 + + lines = [] + + for worker_url, periods in workers.items(): + # Short label: just IP:PORT + label = worker_url.replace("http://", "") + bar = ["░"] * width + + for period in periods: + p_start_str = period.get("from", start_str) + p_end_str = period.get("to", end_str) + status = period.get("status", "up") + + try: + if "/" in p_start_str: + p_start = datetime.strptime(p_start_str, "%Y/%m/%d %H:%M:%S") + p_end = datetime.strptime(p_end_str, "%Y/%m/%d %H:%M:%S") + else: + p_start = datetime.strptime(p_start_str, "%H:%M:%S") + p_end = datetime.strptime(p_end_str, "%H:%M:%S") + except ValueError: + continue + + start_pos = int((p_start - t_start).total_seconds() / total_seconds * width) + end_pos = int((p_end - t_start).total_seconds() / total_seconds * width) + start_pos = max(0, min(width - 1, start_pos)) + end_pos = max(0, min(width, end_pos)) + + char = "█" if status == "up" else "░" + for i in range(start_pos, end_pos): + bar[i] = char + + up_count = bar.count("█") + uptime_pct = round(up_count / width * 100, 1) + + max_label_len = max(len(w.replace("http://", "")) for w in workers) + lines.append(f' {label:<{max_label_len}} {"".join(bar)} {uptime_pct}%') + + # Time axis + if lines: + max_label_len = max(len(w.replace("http://", "")) for w in workers) + padding = " " * (max_label_len + 4) + start_lbl = 
start_str.split(" ")[-1] if " " in start_str else start_str + end_lbl = end_str.split(" ")[-1] if " " in end_str else end_str + gap = width - len(start_lbl) - len(end_lbl) + lines.append(f'{padding}{start_lbl}{" " * max(1, gap)}{end_lbl}') + lines.append(f"{padding}█ = online ░ = offline") + + return "\n".join(lines) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py new file mode 100644 index 00000000000..2a90d39b632 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -0,0 +1,832 @@ +#!/usr/bin/env python3 +""" +Router Log Parser — FastDeploy Go Router 日志解析原语 + +支持两种调用方式: +1. 作为模块导入:from log_parser import parse_http_line, parse_cache_strategy_line, ... +2. 作为 CLI 工具:grep 'pattern' logfile | python3 log_parser.py parse-http + +Python 3 stdlib only,零依赖。 +""" + +import argparse +import json +import re +import sys +from collections import defaultdict +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# 通用解析原语 +# ════════════════════════════════════════════════════════════════ + +# Go time.Duration.String() parser: handles 1h2m3.456s, 500µs, 150.5ms, etc. +DURATION_RE = re.compile(r"(\d+(?:\.\d+)?)(h|m(?!s)|s|ms|[µu]s|ns)") + + +def parse_go_duration_ms(s): + """解析 Go time.Duration.String() 输出为毫秒。 + + Examples: '1.5s' -> 1500.0, '500µs' -> 0.5, '1m30s' -> 90000.0 + """ + total = 0.0 + for m in DURATION_RE.finditer(s): + val, unit = float(m.group(1)), m.group(2) + if unit == "h": + total += val * 3600000 + elif unit == "m": + total += val * 60000 + elif unit == "s": + total += val * 1000 + elif unit == "ms": + total += val + elif unit in ("µs", "us"): + total += val / 1000 + elif unit == "ns": + total += val / 1000000 + return total + + +def parse_go_map(s): + """解析 Go fmt.Sprintf('%v', map) 输出:map[key1:val1 key2:val2 ...] 
+ + 处理 URL 中冒号与 Go map key-value 分隔符的冲突(从最后一个冒号分割)。 + 空 map 'map[]' 返回空 dict。 + """ + inner_match = re.search(r"map\[(.*?)\]", s) + if not inner_match: + return {} + inner = inner_match.group(1).strip() + if not inner: + return {} + result = {} + for token in inner.split(): + idx = token.rfind(":") + if idx > 0: + key = token[:idx] + val_str = token[idx + 1 :] + try: + result[key] = int(val_str) if "." not in val_str else float(val_str) + except ValueError: + result[key] = val_str + return result + + +# 时间戳:YYYY/MM/DD HH:MM:SS +TS_RE = re.compile(r"(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})") + +# ts_ms:2025-01-15 18:25:33.123 +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + + +def extract_ts(line): + """从日志行提取 YYYY/MM/DD HH:MM:SS 时间戳。""" + m = TS_RE.search(line) + return m.group(1) if m else None + + +def parse_ts(ts_str): + """将 YYYY/MM/DD HH:MM:SS 时间戳解析为 datetime。""" + return datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + + +# ════════════════════════════════════════════════════════════════ +# 时间范围过滤 +# ════════════════════════════════════════════════════════════════ + +import os +import subprocess +import tempfile + +_FULL_DT_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})\s+(\d{1,2}):(\d{2})(?::(\d{2}))?$") +_DATE_ONLY_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$") +_SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$") +_TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$") + + +def _get_log_boundary_ts(log_file, which="first"): + """从日志文件首行或末行提取时间戳。""" + cmd = "head" if which == "first" else "tail" + try: + r = subprocess.run([cmd, "-1", log_file], capture_output=True, text=True, timeout=5) + return extract_ts(r.stdout) if r.returncode == 0 else None + except (subprocess.TimeoutExpired, FileNotFoundError): + return None + + +def complete_time_arg(time_str, log_file, is_end=False): + """解析灵活时间输入,补全缺失部分。 + + 支持格式: + 'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 
'YYYY/MM/DD', + 'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM' + + 补全规则: + - 缺年份:从日志首行取 + - 缺日期:从日志末行取 + - 缺时间:start→00:00:00, end→23:59:59 + + Returns: 'YYYY/MM/DD HH:MM:SS' 格式字符串 + """ + if time_str is None: + return None + time_str = time_str.strip() + + # Case 1: 完整日期时间 + m = _FULL_DT_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + h, mi = m.group(4).zfill(2), m.group(5) + s = (m.group(6) or "00").zfill(2) + return f"{y}/{mo}/{d} {h}:{mi}:{s}" + + # Case 2: 仅日期 YYYY/MM/DD + m = _DATE_ONLY_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + t = "23:59:59" if is_end else "00:00:00" + return f"{y}/{mo}/{d} {t}" + + # Case 3: 短日期 MM/DD 或 MM/DD HH:MM[:SS] + m = _SHORT_DATE_RE.match(time_str) + if m: + mo, d = m.group(1).zfill(2), m.group(2).zfill(2) + ts = _get_log_boundary_ts(log_file, "first") + year = ts[:4] if ts else "2026" + if m.group(3): # 有时间部分 + h, mi = m.group(3).zfill(2), m.group(4) + s = (m.group(5) or "00").zfill(2) + return f"{year}/{mo}/{d} {h}:{mi}:{s}" + t = "23:59:59" if is_end else "00:00:00" + return f"{year}/{mo}/{d} {t}" + + # Case 4: 仅时间 HH:MM[:SS] + m = _TIME_ONLY_RE.match(time_str) + if m: + h, mi = m.group(1).zfill(2), m.group(2) + s = (m.group(3) or "00").zfill(2) + ts = _get_log_boundary_ts(log_file, "last") + date_part = ts[:10] if ts else "2026/01/01" + return f"{date_part} {h}:{mi}:{s}" + + # Fallback: 原样返回 + return time_str + + +def filter_file_by_time_range(log_file, start_str=None, end_str=None): + """用 awk 按时间范围预过滤日志文件。 + + 时间戳 YYYY/MM/DD HH:MM:SS 天然字典序可比,直接用 awk 字符串比较。 + 无时间戳的行(如 panic 堆栈续行)保留。 + + Args: + log_file: 原日志文件路径 + start_str: 起始时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + end_str: 结束时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if not start_str and not end_str: + return (log_file, False) + + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, 
prefix="router_filtered_") + tmp.close() + + awk_script = r"""{ + ts = "" + if (match($0, /[0-9]{4}\/[0-9]{2}\/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/)) { + ts = substr($0, RSTART, RLENGTH) + } + if (ts == "") { print; next } + if ((start == "" || ts >= start) && (end == "" || ts <= end)) print + }""" + + cmd = ["awk", "-v", f'start={start_str or ""}', "-v", f'end={end_str or ""}', awk_script, log_file] + + try: + with open(tmp.name, "w") as outf: + result = subprocess.run(cmd, stdout=outf, stderr=subprocess.PIPE, text=True, timeout=120) + if result.returncode != 0: + os.unlink(tmp.name) + return (log_file, False) + except (subprocess.TimeoutExpired, OSError): + if os.path.exists(tmp.name): + os.unlink(tmp.name) + return (log_file, False) + + return (tmp.name, True) + + +# Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] +TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") + + +def extract_tags(line): + """从日志行提取 context tag。""" + return {m.group(1): m.group(2) for m in TAG_RE.finditer(line)} + + +# Log level +LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN)\]") + + +def extract_level(line): + """从日志行提取日志级别。""" + m = LEVEL_RE.search(line) + return m.group(1) if m else None + + +# ════════════════════════════════════════════════════════════════ +# HTTP 请求行解析(类别 H1) +# ════════════════════════════════════════════════════════════════ + +# H1 pattern: [METHOD] /path HTTP/1.1 STATUS LATENCY CLIENT_IP +HTTP_RE = re.compile( + r"\[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\]\s+" + r"(/\S*)\s+HTTP/\d\.\d\s+" + r"(\d{3})\s+" + r"(\S+)\s+" # latency (Go duration) + r"(\d+\.\d+\.\d+\.\d+)" # client IP +) + + +def parse_http_line(line, inference_only=False): + """解析 H1 HTTP 请求行。 + + 输入示例: + [INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 1.234567s 10.0.0.1 + + Args: + line: 日志行 + inference_only: True 则仅保留推理路径 (/v1/chat/completions, /v1/completions) + + 返回 dict 或 None。 + """ + ts = extract_ts(line) + m 
= HTTP_RE.search(line) + if not m: + return None + + method, path, status, latency_raw, client_ip = m.groups() + latency_ms = parse_go_duration_ms(latency_raw) + + if inference_only and path not in ("/v1/chat/completions", "/v1/completions"): + return None + + record = { + "ts": ts or "", + "method": method, + "path": path, + "status": int(status), + "latency_ms": round(latency_ms, 3), + "client_ip": client_ip, + } + + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# ════════════════════════════════════════════════════════════════ +# Cache-Aware 策略行解析(类别 H6) +# ════════════════════════════════════════════════════════════════ + +STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") +SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") + + +def parse_cache_strategy_line(line): + """解析 cache-aware prefill 策略行。 + + 返回 dict 或 None(如果不是策略行)。 + """ + sm = STRATEGY_RE.search(line) + if not sm: + return None + + ts = extract_ts(line) + strategy = sm.group(1) + record = {"ts": ts or "", "strategy": strategy} + + sel_m = SELECTED_RE.search(line) + if sel_m: + record["selected"] = sel_m.group(1) + + reason_m = REASON_RE.search(line) + if reason_m and strategy == "process_tokens": + record["reason"] = reason_m.group(1).strip() + + hr_match = re.search(r"hitRatios=(map\[.*?\])", line) + if hr_match: + hit_ratios = parse_go_map(hr_match.group(1)) + record["hitRatios"] = hit_ratios + if "selected" in record: + record["selected_hitRatio"] = hit_ratios.get(record["selected"], 0) + else: + record["hitRatios"] = {} + if "selected" in record: + record["selected_hitRatio"] = 0 + + loads_match = re.search(r"loads=(map\[.*?\])", line) + if loads_match: + record["loads"] = parse_go_map(loads_match.group(1)) + + ts_ms_m = TS_MS_RE.search(line) + if ts_ms_m: + record["ts_ms"] = ts_ms_m.group(1) + + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# 
════════════════════════════════════════════════════════════════ +# Stats 行解析(类别 H7) +# ════════════════════════════════════════════════════════════════ + +TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") +WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") + + +def parse_stats_line(line): + """解析 [stats] 统计行。 + + 注意:hits 和 total 是 per-interval 的(每 5s 重置),累计值必须 sum 所有行。 + + 返回 dict 或 None(如果不是 stats 行)。 + """ + if "[stats]" not in line: + return None + + ts = extract_ts(line) + record = {"ts": ts or ""} + + tr_m = TOTAL_RUNNING_RE.search(line) + if tr_m: + record["total_running"] = int(tr_m.group(1)) + + workers = {} + for wm in WORKER_RUNNING_RE.finditer(line): + workers[wm.group(1)] = int(wm.group(2)) + record["workers"] = workers + + chr_m = CACHE_HR_RE.search(line) + if chr_m: + record["cache_hit_rate"] = float(chr_m.group(1)) + record["hits"] = int(chr_m.group(2)) + record["total"] = int(chr_m.group(3)) + + return record + + +# ════════════════════════════════════════════════════════════════ +# 错误消息模板归一化 +# ════════════════════════════════════════════════════════════════ + +NORMALIZE_PATTERNS = [ + (re.compile(r"https?://[\w.:]+"), "{url}"), + (re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I), "{uuid}"), + (re.compile(r"\d+\.\d+\.\d+\.\d+:\d+"), "{ip:port}"), + (re.compile(r"\b\d+\b"), "{N}"), +] + +# Message extraction: everything after "logger.go:NN: " (and optional context tags) +MSG_RE = re.compile(r"logger\.go:\d+:\s*(?:\[[^\]]*\]\s*)*(.+)") + + +def normalize_message(msg): + """将错误消息中的变量替换为占位符模板。""" + for pat, repl in NORMALIZE_PATTERNS: + msg = pat.sub(repl, msg) + return msg + + +def parse_error_line(line): + """解析 ERROR/WARN 行并进行模板归一化。 + + 返回 dict: {ts, level, original, template, tags} + """ + ts = extract_ts(line) + level = extract_level(line) + tags = extract_tags(line) + + mm = MSG_RE.search(line) + original = 
# ════════════════════════════════════════════════════════════════
# Select/Release event matching
# ════════════════════════════════════════════════════════════════

SELECT_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)")
RELEASE_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)")
FAILED_SELECT_RE = re.compile(r"Failed to select")
SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://[^,\s]+),\s*tokens:\s*(\d+)")
RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://[^,\s]+),\s*tokens:\s*(\d+)")


def match_select_release(lines):
    """Pair "select worker" events with their "release worker" counterparts.

    Args:
        lines: list of raw log lines

    Returns:
        dict: {matched, unmatched_selects, failed_selects, per_worker}
    """
    selects, releases, failed_selects = [], [], []

    for idx, text in enumerate(lines, 1):
        when = extract_ts(text)
        ctx = extract_tags(text)

        # Token-bearing variants are checked first — they are strict
        # supersets of the generic patterns.
        token_sel = SELECT_TOKENS_RE.search(text)
        if token_sel:
            selects.append(
                {
                    "ts": when,
                    "worker": token_sel.group(1),
                    "type": "prefill",
                    "tags": ctx,
                    "tokens": int(token_sel.group(2)),
                    "line": idx,
                }
            )
            continue

        token_rel = RELEASE_TOKENS_RE.search(text)
        if token_rel:
            releases.append(
                {
                    "ts": when,
                    "worker": token_rel.group(1),
                    "type": "prefill_tokens",
                    "tags": ctx,
                    "tokens": int(token_rel.group(2)),
                    "line": idx,
                }
            )
            continue

        plain_sel = SELECT_RE.search(text)
        if plain_sel:
            selects.append(
                {
                    "ts": when,
                    "worker": plain_sel.group(2),
                    "type": plain_sel.group(1) or "unknown",
                    "tags": ctx,
                    "tokens": None,
                    "line": idx,
                }
            )
            continue

        plain_rel = RELEASE_RE.search(text)
        if plain_rel:
            releases.append(
                {
                    "ts": when,
                    "worker": plain_rel.group(2),
                    "type": plain_rel.group(1) or "unknown",
                    "tags": ctx,
                    "tokens": None,
                    "line": idx,
                }
            )
            continue

        if FAILED_SELECT_RE.search(text):
            failed_selects.append({"ts": when, "tags": ctx, "line": idx})

    # Pair by request_id, consuming each release at most once (in log order).
    matched, unmatched_selects = [], []
    consumed = set()
    by_reqid = defaultdict(list)
    for rel_idx, rel in enumerate(releases):
        rid = rel["tags"].get("request_id", "")
        if rid:
            by_reqid[rid].append(rel_idx)

    for sel in selects:
        rid = sel["tags"].get("request_id", "")
        partner = None
        for rel_idx in (by_reqid.get(rid, []) if rid else []):
            if rel_idx not in consumed:
                partner = rel_idx
                break
        if partner is not None:
            consumed.add(partner)
            matched.append(
                {
                    "request_id": rid,
                    "worker": sel["worker"],
                    "select_ts": sel["ts"],
                    "release_ts": releases[partner]["ts"],
                    "type": sel["type"],
                }
            )
        else:
            unmatched_selects.append(
                {
                    "worker": sel["worker"],
                    "select_ts": sel["ts"],
                    "type": sel["type"],
                    "tags": sel["tags"],
                    "note": "no matching release found",
                }
            )

    # Per-worker select/release tallies with a leak-indicating delta.
    tallies = defaultdict(lambda: [0, 0])
    for sel in selects:
        tallies[sel["worker"]][0] += 1
    for rel in releases:
        tallies[rel["worker"]][1] += 1

    per_worker = {
        worker: {"selects": s, "releases": r, "delta": s - r}
        for worker, (s, r) in tallies.items()
    }

    return {
        "matched": matched,
        "unmatched_selects": unmatched_selects,
        "failed_selects": failed_selects,
        "per_worker": per_worker,
    }


# ════════════════════════════════════════════════════════════════
# Unsupported-request detection
# ════════════════════════════════════════════════════════════════

# Whitelist of (method, path) routes the router knows about.
KNOWN_ROUTES = {
    ("POST", "/v1/chat/completions"),
    ("POST", "/v1/completions"),
    ("POST", "/register"),
    ("GET", "/registered_number"),
    ("GET", "/registered"),
    ("GET", "/health_generate"),
    ("GET", "/metrics"),
}
def find_unsupported_requests(lines):
    """Filter HTTP log lines whose (method, path) matches no known route.

    Returns:
        dict: {details: [...], summary: {total, unique_paths: {path: count}}}
    """
    details = []
    path_counts = defaultdict(int)

    for raw in lines:
        rec = parse_http_line(raw)
        if rec is None:
            continue
        if (rec["method"], rec["path"]) in KNOWN_ROUTES:
            continue
        details.append(
            {
                "ts": rec["ts"],
                "method": rec["method"],
                "path": rec["path"],
                "status": rec["status"],
                "client_ip": rec["client_ip"],
            }
        )
        path_counts[f"{rec['method']} {rec['path']}"] += 1

    return {
        "details": details,
        "summary": {"total": len(details), "unique_paths": dict(path_counts)},
    }


def _cli_unsupported_requests(args):
    """CLI: detect requests that match no known route."""
    lines = [ln.rstrip("\n") for ln in sys.stdin]
    outcome = find_unsupported_requests(lines)
    payload = outcome["summary"] if args.summary_only else outcome
    print(json.dumps(payload, ensure_ascii=False))


# ════════════════════════════════════════════════════════════════
# CLI entry points
# ════════════════════════════════════════════════════════════════


def _cli_parse_stream(parse_fn):
    """Generic streaming CLI: read log lines from stdin, emit JSON Lines on stdout."""
    ok = bad = 0
    for ln in sys.stdin:
        rec = parse_fn(ln.rstrip("\n"))
        if rec:
            print(json.dumps(rec, ensure_ascii=False))
            ok += 1
        else:
            bad += 1
    print(f"Parsed {ok} lines, skipped {bad}", file=sys.stderr)


def _cli_parse_http(args):
    """CLI: parse HTTP request lines."""
    ok = bad = 0
    for ln in sys.stdin:
        rec = parse_http_line(ln.rstrip("\n"), inference_only=args.inference_only)
        if rec:
            print(json.dumps(rec, ensure_ascii=False))
            ok += 1
        else:
            bad += 1
    print(f"Parsed {ok} lines, skipped {bad}", file=sys.stderr)


def _cli_normalize_errors(args):
    """CLI: normalize error messages into templates."""
    count = 0
    for ln in sys.stdin:
        print(json.dumps(parse_error_line(ln.rstrip("\n")), ensure_ascii=False))
        count += 1
    print(f"Normalized {count} lines", file=sys.stderr)


def _cli_match_select_release(args):
    """CLI: pair select/release worker events."""
    print(json.dumps(match_select_release([ln.rstrip("\n") for ln in sys.stdin]), ensure_ascii=False))


def _cli_self_test(args):
    """Run the built-in self tests."""
    n_pass = 0
    n_fail = 0

    def expect(name, got, want):
        nonlocal n_pass, n_fail
        if got == want:
            print(f" PASS: {name}")
            n_pass += 1
        else:
            print(f" FAIL: {name}")
            print(f" expected: {want}")
            print(f" got: {got}")
            n_fail += 1

    # Table-driven: Go duration parsing.
    duration_cases = [
        ("simple seconds", "1.5s", 1500.0),
        ("milliseconds", "150ms", 150.0),
        ("fractional ms", "150.5ms", 150.5),
        ("microseconds µs", "500µs", 0.5),
        ("microseconds us", "500us", 0.5),
        ("nanoseconds", "500ns", 0.0005),
        ("composite m+s", "1m30s", 90000.0),
        ("composite h+m+s", "1h2m3s", 3723000.0),
        ("composite h+m+fractional_s", "1h2m3.456s", 3723456.0),
        ("pure minutes", "2m", 120000.0),
        ("zero", "0s", 0.0),
        ("sub-ms decimal", "2.798235ms", 2.798235),
    ]
    print("=== Testing parse_go_duration_ms ===")
    for label, raw, want in duration_cases:
        expect(label, parse_go_duration_ms(raw), want)

    # Table-driven: Go map literal parsing.
    map_cases = [
        ("single entry", "map[http://10.0.0.1:9263:100]", {"http://10.0.0.1:9263": 100}),
        (
            "multi entry",
            "map[http://10.0.0.1:9263:100 http://10.0.0.2:9867:50]",
            {"http://10.0.0.1:9263": 100, "http://10.0.0.2:9867": 50},
        ),
        ("empty map", "map[]", {}),
        ("float values", "map[http://10.0.0.1:9263:0.85]", {"http://10.0.0.1:9263": 0.85}),
    ]
    print("\n=== Testing parse_go_map ===")
    for label, raw, want in map_cases:
        expect(label, parse_go_map(raw), want)

    print("\n=== Testing extract_ts ===")
    expect("standard", extract_ts("[INFO] 2025/01/15 18:25:33 logger.go:45: msg"), "2025/01/15 18:25:33")
    expect("no timestamp", extract_ts("no timestamp here"), None)

    print("\n=== Testing extract_tags ===")
    expect(
        "session+request",
        extract_tags("[session_id:abc] [request_id:def]"),
        {"session_id": "abc", "request_id": "def"},
    )
    expect(
        "all four",
        extract_tags("[trace_id:t1] [req_id:r1] [session_id:s1] [request_id:rq1]"),
        {"trace_id": "t1", "req_id": "r1", "session_id": "s1", "request_id": "rq1"},
    )
    expect("no tags", extract_tags("no tags here"), {})

    print("\n=== Testing parse_http_line ===")
    http_line = "[INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 2.798235ms 10.0.0.1"
    rec = parse_http_line(http_line)
    expect("http method", rec["method"], "POST")
    expect("http path", rec["path"], "/v1/chat/completions")
    expect("http status", rec["status"], 200)
    expect("http latency", rec["latency_ms"], 2.798)
    expect("http client_ip", rec["client_ip"], "10.0.0.1")

    filtered = parse_http_line(
        "[INFO] 2025/01/15 18:25:33 logger.go:45: [GET] /health HTTP/1.1 200 1ms 10.0.0.1", inference_only=True
    )
    expect("inference_only filters health", filtered, None)

    print("\n=== Testing normalize_message ===")
    expect("url", normalize_message("Failed to connect to http://10.0.0.1:9965"), "Failed to connect to {url}")
    expect("uuid", normalize_message("request abc12345-1234-5678-9012-abcdef123456 failed"), "request {uuid} failed")
    expect(
        "ip:port",
        normalize_message("dial tcp 10.0.0.1:9965: connection refused"),
        "dial tcp {ip:port}: connection refused",
    )

    print(f'\n{"=" * 40}')
    print(f"Results: {n_pass} passed, {n_fail} failed")
    if n_fail:
        sys.exit(1)
= sub.add_parser("parse-http", help="解析 HTTP 请求行 (H1) → JSON Lines") + p.add_argument("--inference-only", action="store_true", help="仅保留推理路径") + + sub.add_parser("parse-cache-strategy", help="解析 cache-aware 策略行 (H6) → JSON Lines") + sub.add_parser("parse-stats", help="解析 [stats] 统计行 (H7) → JSON Lines") + sub.add_parser("normalize-errors", help="ERROR/WARN 行模板归一化 → JSON Lines") + sub.add_parser("match-select-release", help="匹配 select/release worker 事件") + p = sub.add_parser("unsupported-requests", help="检测不匹配已知路由的请求") + p.add_argument("--summary-only", action="store_true", help="仅输出汇总(不含详细列表)") + sub.add_parser("self-test", help="运行内置测试") + + args = parser.parse_args() + + if args.command == "parse-http": + _cli_parse_http(args) + elif args.command == "parse-cache-strategy": + _cli_parse_stream(parse_cache_strategy_line) + elif args.command == "parse-stats": + _cli_parse_stream(parse_stats_line) + elif args.command == "normalize-errors": + _cli_normalize_errors(args) + elif args.command == "match-select-release": + _cli_match_select_release(args) + elif args.command == "unsupported-requests": + _cli_unsupported_requests(args) + elif args.command == "self-test": + _cli_self_test(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py new file mode 100644 index 00000000000..a197ee7aff0 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Stats — 通用统计计算工具 + +提供百分位数、分布、时间窗口聚合、分组计数等通用统计函数。 +不含任何业务逻辑或日志格式依赖。 + +Python 3 stdlib only,零依赖。 +""" + +import math +from collections import defaultdict +from datetime import datetime, timedelta + +# ════════════════════════════════════════════════════════════════ +# 百分位数与基础统计 +# ════════════════════════════════════════════════════════════════ + + +def 
percentile(sorted_vals, p): + """从已排序列表计算第 p 百分位数(线性插值)。""" + if not sorted_vals: + return 0.0 + n = len(sorted_vals) + k = (p / 100.0) * (n - 1) + f = math.floor(k) + c = math.ceil(k) + if f == c: + return sorted_vals[int(k)] + return sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f) + + +def compute_statistics(values, percentiles_list=None, distribution_spec=None): + """计算一组数值的统计量。 + + Args: + values: 数值列表 + percentiles_list: 要计算的百分位数列表,默认 [50, 90, 95, 99] + distribution_spec: 分布区间规格字符串,如 '0-20,20-40,40-60,60-80,80-100' + + Returns: + dict: {count, min, max, mean, sum, stddev, p50, p90, ..., distribution} + """ + if percentiles_list is None: + percentiles_list = [50, 90, 95, 99] + + if not values: + result = {"count": 0, "min": 0, "max": 0, "mean": 0, "sum": 0, "stddev": 0} + for p in percentiles_list: + result[f"p{p}"] = 0 + if distribution_spec is not None: + result["distribution"] = [] + return result + + sorted_vals = sorted(values) + n = len(sorted_vals) + total = sum(sorted_vals) + mean = total / n + variance = sum((x - mean) ** 2 for x in sorted_vals) / n + stddev = math.sqrt(variance) + + result = { + "count": n, + "min": round(sorted_vals[0], 3), + "max": round(sorted_vals[-1], 3), + "mean": round(mean, 3), + "sum": round(total, 3), + "stddev": round(stddev, 3), + } + + for p in percentiles_list: + result[f"p{p}"] = round(percentile(sorted_vals, p), 3) + + if distribution_spec is not None: + result["distribution"] = compute_distribution(sorted_vals, distribution_spec) + + return result + + +def compute_distribution(sorted_vals, spec_str): + """根据区间规格计算分布直方图。 + + spec_str 示例:'0-20,20-40,40-60,60-80,80-100' + 每个区间是左闭右开 [lo, hi)。 + """ + buckets = _parse_distribution_spec(spec_str) + n = len(sorted_vals) + result = [] + for b in buckets: + if b[0] == "lt": + count = sum(1 for v in sorted_vals if v < b[1]) + label = b[2] + elif b[0] == "gt": + count = sum(1 for v in sorted_vals if v > b[1]) + label = b[2] + elif b[0] == "range": + count = sum(1 for v in 
sorted_vals if b[1] <= v < b[2]) + label = b[3] + else: + continue + result.append({"range": label, "count": count, "pct": round(count / n * 100, 1) if n else 0}) + return result + + +def _parse_distribution_spec(spec_str): + """解析分布区间规格:'<100,100-500,>1000' → bucket 定义列表。""" + buckets = [] + for part in spec_str.split(","): + part = part.strip() + if part.startswith("<"): + buckets.append(("lt", float(part[1:]), part)) + elif part.startswith(">"): + buckets.append(("gt", float(part[1:]), part)) + elif "-" in part: + lo, hi = part.split("-", 1) + buckets.append(("range", float(lo), float(hi), part)) + return buckets + + +# ════════════════════════════════════════════════════════════════ +# 时间窗口聚合 +# ════════════════════════════════════════════════════════════════ + + +def time_bucket(records, window="auto", agg_specs=None, ts_field="ts"): + """按时间窗口聚合记录。 + + Args: + records: dict 列表,每个 dict 必须有 ts_field 字段 + window: 窗口大小 '5s'/'1m'/'5m'/'auto' + agg_specs: 聚合规格列表 [(field, func), ...],如 [('selected_hitRatio', 'mean')] + func 支持:count, sum, mean, min, max, pNN + ts_field: 时间戳字段名 + + Returns: + list[dict]: 每个窗口一条记录 {bucket, count, field_func, ...} + """ + if agg_specs is None: + agg_specs = [("_", "count")] + + if not records: + return [] + + window_td = _parse_window(window, records, ts_field) + + # 按窗口分组 + buckets = defaultdict(list) + for r in records: + ts_str = r.get(ts_field, "") + if not ts_str: + continue + try: + dt = datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + except ValueError: + continue + bucket_dt = _align_to_bucket(dt, window_td) + bucket_key = bucket_dt.strftime("%Y/%m/%d %H:%M:%S") + buckets[bucket_key].append(r) + + # 按时间排序并聚合 + result = [] + for bucket_key in sorted(buckets.keys()): + bucket_records = buckets[bucket_key] + entry = {"bucket": bucket_key, "count": len(bucket_records)} + + for field, func in agg_specs: + if field == "_": + if func == "count": + entry["count"] = len(bucket_records) + continue + + values = [] + for r in 
bucket_records: + v = r.get(field) + if v is not None: + try: + values.append(float(v)) + except (ValueError, TypeError): + pass + + out_key = f"{field}_{func}" + entry[out_key] = _aggregate_values(values, func) + + result.append(entry) + + return result + + +def _parse_window(window_str, records, ts_field): + """解析窗口字符串为 timedelta。'auto' 根据数据跨度自动选择。""" + if window_str == "auto": + timestamps = [] + for r in records: + ts_str = r.get(ts_field, "") + if ts_str: + try: + timestamps.append(datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S")) + except ValueError: + pass + if len(timestamps) < 2: + return timedelta(minutes=1) + span = max(timestamps) - min(timestamps) + if span < timedelta(minutes=30): + return timedelta(seconds=5) + elif span < timedelta(hours=3): + return timedelta(minutes=1) + else: + return timedelta(minutes=5) + elif window_str.endswith("s"): + return timedelta(seconds=int(window_str[:-1])) + elif window_str.endswith("m"): + return timedelta(minutes=int(window_str[:-1])) + elif window_str.endswith("h"): + return timedelta(hours=int(window_str[:-1])) + return timedelta(minutes=1) + + +def _align_to_bucket(dt, window_td): + """将 datetime 对齐到窗口边界。""" + secs = max(1, int(window_td.total_seconds())) + epoch = datetime(dt.year, dt.month, dt.day) + offset = int((dt - epoch).total_seconds()) + aligned = (offset // secs) * secs + return epoch + timedelta(seconds=aligned) + + +def _aggregate_values(values, func): + """用指定函数聚合一组数值。""" + if not values: + return 0 + if func == "count": + return len(values) + elif func == "sum": + return round(sum(values), 3) + elif func == "mean": + return round(sum(values) / len(values), 3) + elif func == "min": + return round(min(values), 3) + elif func == "max": + return round(max(values), 3) + elif func.startswith("p"): + p = int(func[1:]) + return round(percentile(sorted(values), p), 3) + return 0 + + +# ════════════════════════════════════════════════════════════════ +# 分组计数 +# 
════════════════════════════════════════════════════════════════ + + +def count_by(records, field, top_n=None): + """按指定字段分组计数。 + + Args: + records: dict 列表 + field: 分组字段名 + top_n: 只返回前 N 个(按计数降序) + + Returns: + list[dict]: [{value, count, pct}],按计数降序排列 + """ + counts = defaultdict(int) + total = 0 + for r in records: + val = r.get(field) + if val is not None: + counts[str(val)] += 1 + total += 1 + + result = [] + for val, count in sorted(counts.items(), key=lambda x: -x[1]): + result.append({"value": val, "count": count, "pct": round(count / total * 100, 1) if total else 0}) + + if top_n: + result = result[:top_n] + + return result diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py new file mode 100644 index 00000000000..4e64a2092b3 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 +""" +Troubleshoot — FastDeploy Go Router 综合问题排查主编排器 + +Usage: + python3 troubleshoot.py [options] + +Options: + --errors 仅分析错误日志 + --latency 仅分析延迟 + --health 仅分析 Worker 健康 + --cache 仅分析 Cache 调度 + --load 仅分析负载与计数器 + --trace ID 追踪指定请求(支持逗号分隔多 ID) + --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) + --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") + --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") + --output DIR 详细报告导出目录(默认: skill_output/troubleshoot//) + +支持维度:errors, latency, health, cache, load, trace +""" + +import argparse +import os +import sys +from datetime import datetime + +# 确保能 import 同级模块 +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from analyzers.cache import analyze_cache, format_cache_report +from analyzers.errors import analyze_errors, format_errors_report +from analyzers.health import analyze_health, format_health_report +from analyzers.latency import analyze_latency, format_latency_report +from analyzers.load import analyze_load, format_load_report +from 
def determine_log_file(user_path=None):
    """Resolve the router log file path.

    Search order:
      1. the user-supplied path (used verbatim when given)
      2. logs/router.log
      3. fd-router.log (golang_router root)

    Exits with an error message on stderr when nothing is found.
    """
    if user_path:
        if os.path.isfile(user_path):
            return user_path
        print(f"ERROR: 文件不存在: {user_path}", file=sys.stderr)
        sys.exit(1)

    # Candidate locations relative to the two CWDs people run this from.
    for candidate in (
        "logs/router.log",  # CWD = golang_router/
        "fd-router.log",  # CWD = golang_router/
        "fastdeploy/golang_router/logs/router.log",  # CWD = repo root
        "fastdeploy/golang_router/fd-router.log",  # CWD = repo root
    ):
        if os.path.isfile(candidate):
            return candidate

    print("ERROR: 未找到日志文件。请指定路径或检查 logs/ 目录。", file=sys.stderr)
    sys.exit(1)


def parse_tail_arg(tail_str):
    """Interpret --tail: a bare number is a line count, 'Nm' means N minutes.

    Minutes are converted to an approximate line count assuming ~20 lines/s.
    """
    if tail_str is None:
        return None
    if tail_str.endswith("m"):
        return int(tail_str[:-1]) * 60 * 20
    return int(tail_str)


def determine_status(results):
    """Derive the global status (CRITICAL/DEGRADED/HEALTHY) from all analyses."""
    reasons = []

    # Errors dimension: panics and extreme error rates dominate everything.
    err = results.get("errors")
    if err:
        if err["panic_list"]:
            return "CRITICAL", f'{len(err["panic_list"])} Panic 事件'
        if err["error_rate"] > 20:
            return "CRITICAL", f'错误率 {err["error_rate"]}%'
        if err["error_rate"] > 5:
            reasons.append(f'错误率 {err["error_rate"]}%')
        for entry in err["status_code_dist"]:
            code = str(entry["value"])
            if code in ("502", "503") and entry["count"] > 0:
                reasons.append(f'{code}: {entry["count"]}')

    # Latency / health / load share one severity rule:
    # any CRITICAL diagnosis short-circuits, HIGH ones accumulate.
    for dim in ("latency", "health", "load"):
        payload = results.get(dim)
        if not payload:
            continue
        for diag in payload.get("diagnoses", []):
            if diag["severity"] == "CRITICAL":
                return "CRITICAL", diag["message"]
            if diag["severity"] == "HIGH":
                reasons.append(diag["message"])

    # Cache dimension never escalates to CRITICAL on its own.
    cache = results.get("cache")
    if cache:
        for diag in cache.get("diagnoses", []):
            if diag["severity"] == "HIGH":
                reasons.append(diag["message"])

    if reasons:
        return "DEGRADED", ", ".join(reasons)
    if not results:
        return "HEALTHY", "无分析数据"
    return "HEALTHY", "无严重问题"


def format_full_report(results, status, status_reason):
    """Assemble the final report.

    Returns:
        tuple: (report_text, details)
        report_text: main report (summary + visualizations)
        details: data to split into standalone files:
            - 'health_events': str or None
            - 'trace_files': {trace_id: text} (possibly empty)
    """
    sections = [f"STATUS: {status} — {status_reason}", "=" * 60, ""]
    details = {"health_events": None, "trace_files": {}}

    if "errors" in results:
        sections.append(format_errors_report(results["errors"]))

    if "latency" in results:
        sections.append(format_latency_report(results["latency"]))

    if "health" in results:
        summary, extra = format_health_report(results["health"])
        sections.append(summary)
        if extra:
            details["health_events"] = extra

    if "load" in results:
        sections.append(format_load_report(results["load"]))

    if "cache" in results:
        sections.append(format_cache_report(results["cache"]))

    if "trace" in results:
        summary, per_trace = format_trace_report(results["trace"])
        sections.append(summary)
        if per_trace:
            details["trace_files"] = per_trace

    return "\n".join(sections), details
def save_detailed_report(report_text, output_dir, details=None):
    """Persist the report (and optional detail files) under output_dir.

    Args:
        report_text: main report body
        output_dir: destination directory (created if missing)
        details: detail payload produced by format_full_report

    Returns:
        path of the main report file.
    """
    os.makedirs(output_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(output_dir, f"troubleshoot_report_{stamp}.md")

    with open(filepath, "w", encoding="utf-8") as f:
        f.write("# Router Troubleshooting Report\n")
        f.write(f'> Generated at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n')
        f.write(report_text)

    # Bulky per-event payloads are split into a details/ subdirectory.
    if details:
        details_dir = os.path.join(output_dir, "details")

        if details.get("health_events"):
            os.makedirs(details_dir, exist_ok=True)
            with open(os.path.join(details_dir, "health_events.md"), "w", encoding="utf-8") as f:
                f.write(details["health_events"])

        for trace_id, trace_text in details.get("trace_files", {}).items():
            os.makedirs(details_dir, exist_ok=True)
            target = os.path.join(details_dir, f'trace_{trace_id.replace("/", "_")}.md')
            with open(target, "w", encoding="utf-8") as f:
                f.write(trace_text)

    return filepath


def main():
    """CLI entry: parse args, select analysis dimensions, run them, report."""
    parser = argparse.ArgumentParser(
        description="FastDeploy Go Router Troubleshooting",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("log_file", nargs="?", help="日志文件路径")
    parser.add_argument("--errors", action="store_true", help="仅分析错误日志")
    parser.add_argument("--latency", action="store_true", help="仅分析延迟")
    parser.add_argument("--health", action="store_true", help="仅分析 Worker 健康")
    parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度")
    parser.add_argument("--load", action="store_true", help="仅分析负载与计数器")
    parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID)")
    parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)")
    parser.add_argument(
        "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")'
    )
    parser.add_argument("--end", default=None, help='结束时间(如 "17:00:00"、"03/31 17:00"、"2026/03/31 17:00:00")')
    parser.add_argument("--output", help="详细报告导出目录(默认:skill_output/troubleshoot//)")

    args = parser.parse_args()

    log_file = determine_log_file(args.log_file)
    print(f"日志文件: {log_file}", file=sys.stderr)

    # --tail and --start/--end are two different range selectors; refuse both.
    if args.tail and (args.start or args.end):
        print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr)
        sys.exit(1)

    import atexit

    # Optional time-range pre-filter (--start and --end may appear alone or together).
    if args.start or args.end:
        start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None
        end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None
        filtered_path, is_temp = filter_file_by_time_range(log_file, start_ts, end_ts)
        if is_temp:
            atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None)
        log_file = filtered_path
        print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr)

    # With no explicit dimension flags, run everything except trace
    # (trace requires an explicit ID and is never part of a full scan).
    any_mode = args.errors or args.latency or args.health or args.cache or args.load or args.trace
    tail = parse_tail_arg(args.tail)

    plan = []
    if args.errors or not any_mode:
        plan.append(("errors", "分析错误日志...", lambda: analyze_errors(log_file, tail=tail)))
    if args.latency or not any_mode:
        plan.append(("latency", "分析请求延迟...", lambda: analyze_latency(log_file, tail=tail)))
    if args.health or not any_mode:
        plan.append(("health", "分析 Worker 健康...", lambda: analyze_health(log_file, tail=tail)))
    if args.cache or not any_mode:
        plan.append(("cache", "分析 Cache 调度...", lambda: analyze_cache(log_file, tail=tail)))
    if args.load or not any_mode:
        plan.append(("load", "分析负载与计数器...", lambda: analyze_load(log_file, tail=tail)))
    if args.trace:
        plan.append(("trace", "追踪请求...", lambda: analyze_trace(log_file, args.trace, tail=tail)))

    results = {}
    for step, (key, message, runner) in enumerate(plan, 1):
        print(f"[{step}/{len(plan)}] {message}", file=sys.stderr)
        results[key] = runner()

    status, status_reason = determine_status(results)

    report, details = format_full_report(results, status, status_reason)
    print(report)

    # Default output directory lives under the golang_router root.
    if args.output:
        output_dir = args.output
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", ".."))
        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = os.path.join(golang_router_root, "skill_output", "troubleshoot", run_timestamp)
    filepath = save_detailed_report(report, output_dir, details=details)
    print(f"\n详细报告已保存到: {filepath}", file=sys.stderr)


if __name__ == "__main__":
    main()
.../troubleshoot/scripts/analyzers/trace.py | 23 ++- .../skills/troubleshoot/scripts/log_parser.py | 168 +++++++++++++++--- .../troubleshoot/scripts/troubleshoot.py | 22 ++- 9 files changed, 285 insertions(+), 83 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py index 0b7377b4865..d43d6909c64 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py @@ -125,7 +125,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -139,7 +139,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -204,9 +204,10 @@ def extract_tags(line): # Cache-Aware 策略行解析(类别 A) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" 
STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -271,7 +272,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index c193e99d47c..5487bc2cc96 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -14,6 +14,7 @@ import argparse import json import os +import re import subprocess import sys from collections import defaultdict @@ -32,6 +33,10 @@ ) from stats import compute_statistics, count_by, time_bucket + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -235,7 +240,7 @@ def compute_per_worker_stats(strategies): avg_hr = round(sum(data["hit_ratios"]) / len(data["hit_ratios"]), 1) if data["hit_ratios"] else 0 result.append( { - "Worker": worker.replace("http://", ""), + "Worker": _strip_scheme(worker), "Selected": data["selected_count"], "Select%": f"{round(data['selected_count'] / total_scoring * 100, 1)}%", "AvgHitRatio": f"{avg_hr}%", diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3a18b668a41..3fca296f4d6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -26,6 +26,10 @@ TOKENIZER_WARN_RE = re.compile(r"tokenizer failed, fallback to char tokens") +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + def classify_fallback(record, tokenizer_degraded_ts=None): """对 process_tokens 策略行分类 fallback 原因。 @@ -210,9 +214,9 @@ def _analyze_suboptimal(records, hr_weight, lb_weight): suboptimal.append( { "ts": r.get("ts", ""), - "selected": selected.replace("http://", ""), + "selected": _strip_scheme(selected), "selected_hr": sel_hr, - "best_hr_worker": best_by_hr.replace("http://", ""), + "best_hr_worker": _strip_scheme(best_by_hr), "best_hr": max_hr, "reason": reason, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index 0817e280aa5..b8217a5ffa4 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -44,6 +44,8 @@ ("No available", "FD 后端"), ("request failed", "FD 后端"), ("Removed unhealthy", "FD 后端"), + ("is not healthy", "FD 后端"), + ("is healthy", "FD 后端"), ("Backend request failed", "FD 后端"), ("Decode request failed", "FD 后端"), ("Prefill request failed", "FD 后端"), diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index d2d7ca77acb..ca01d718dbc 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -21,11 +21,16 @@ # 健康事件解析 # ════════════════════════════════════════════════════════════════ -NOT_HEALTHY_RE = re.compile(r"(http://\S+)\s+is not healthy") -REMOVED_RE = re.compile(r"Removed unhealthy \w+ instance:\s*(http://\S+)") -IS_HEALTHY_RE = re.compile(r"(http://\S+)\s+is healthy") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -CLEANUP_UNHEALTHY_RE = re.compile(r"cleanup unhealthy.*?(http://\S+)") +WORKER_URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +NOT_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is not healthy") +REMOVED_RE = re.compile(rf"Removed unhealthy \w+ instance:\s*{WORKER_URL_RE}") +IS_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is healthy") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{WORKER_URL_RE}") +CLEANUP_UNHEALTHY_RE = re.compile(rf"cleanup unhealthy.*?{WORKER_URL_RE}") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_health_event(line): @@ -110,7 +115,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): # IP → worker URL 映射 ip_to_urls = defaultdict(set) for url in worker_urls: - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) if ip_m: ip_to_urls[ip_m.group(1)].add(url) @@ -130,7 +135,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): workers = {} for url in sorted(worker_urls): events = sorted(worker_events[url], key=lambda e: e["ts"] or "") - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) worker_ip = ip_m.group(1) if ip_m else "" # 恢复检测:REMOVED 后有 register @@ -237,7 +242,7 @@ def _diagnose(workers): ) for url, w in workers.items(): - s = url.replace("http://", "") + s = _strip_scheme(url) if w["down_count"] > 3: diagnoses.append( { @@ -326,7 +331,7 @@ def 
format_health_report(result): ) table_data.append( { - "Worker": url.replace("http://", ""), + "Worker": _strip_scheme(url), "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", @@ -358,7 +363,7 @@ def format_health_report(result): for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True - detail_parts.append(f'## {url.replace("http://", "")}') + detail_parts.append(f"## {_strip_scheme(url)}") detail_parts.append("") for evt in w["events"]: detail_parts.append(f' [{evt["ts"]}] {evt["type"]}') diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index e712011d932..9be82357494 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -21,14 +21,19 @@ # Counter 异常检测正则 # ════════════════════════════════════════════════════════════════ -DOUBLE_RELEASE_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?double-release") -COUNTER_CLEANED_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?counter already cleaned up") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -TOKEN_PRESERVED_RE = re.compile(r"token counter preserved.*?(http://\S+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +DOUBLE_RELEASE_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?double-release") +COUNTER_CLEANED_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?counter already cleaned up") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{URL_RE}") +TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}") # Token 事件 -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://\S+),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") 
+SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*{URL_RE},\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_counter_anomaly(line): @@ -89,7 +94,7 @@ def analyze_load(log_file, tail=None): avg = sum(vals) / len(vals) if vals else 0 worker_load.append( { - "worker": w_url.replace("http://", ""), + "worker": _strip_scheme(w_url), "avg_running": round(avg, 1), "max_running": max(vals) if vals else 0, "samples": len(vals), @@ -121,9 +126,9 @@ def analyze_load(log_file, tail=None): # Select/Release 匹配 sr_result = ( - match_select_release(h3_lines) + match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "failed_selects": [], "per_worker": {}} + else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} ) # Token 统计 @@ -133,7 +138,7 @@ def analyze_load(log_file, tail=None): pileup = _detect_pileup(stats_records) # 诊断 - diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup) + diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup) return { "load_stats": load_stats, @@ -170,7 +175,7 @@ def _analyze_tokens(h3_lines, h11_lines): releases = token_release.get(w, []) result.append( { - "worker": w.replace("http://", ""), + "worker": _strip_scheme(w), "alloc_count": len(allocs), "alloc_avg": round(sum(allocs) / len(allocs), 0) if allocs else 0, "release_count": len(releases), @@ -195,7 +200,7 @@ def _detect_pileup(stats_records): return max_consecutive >= 5 -def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): +def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup): """生成负载诊断。""" diagnoses = [] @@ -236,16 +241,20 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) - # 
Select/Release 不一致 - for w_url, pw in sr_result.get("per_worker", {}).items(): - if pw.get("delta", 0) > 0: - diagnoses.append( - { - "severity": "HIGH", - "message": f'{w_url.replace("http://","")} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', - "source_layer": "FD 后端", - } - ) + id_cov = sr_result.get("id_coverage", {}) + has_correlatable_ids = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) > 0 + + # Select/Release 不一致(仅在存在可关联 ID 时启用,避免无 ID 场景误报) + if has_correlatable_ids: + for w_url, pw in sr_result.get("per_worker", {}).items(): + if pw.get("delta", 0) > 0: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{_strip_scheme(w_url)} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "source_layer": "FD 后端", + } + ) # 卡住的请求 if sr_result.get("unmatched_selects"): @@ -257,6 +266,17 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) + # Token 计数器潜在泄漏 + for t in token_stats: + if t.get("alloc_count", 0) > t.get("release_count", 0): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{t["worker"]} token alloc/release 不平衡 ({t["alloc_count"]}/{t["release_count"]})', + "source_layer": "Router", + } + ) + return diagnoses @@ -316,23 +336,44 @@ def format_load_report(result): sections.append("### 计数器异常") sections.append("") for a in result["counter_anomalies"]: - workers_str = ", ".join(f'{w.replace("http://","")}({c})' for w, c in a["workers"].items()) + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') sections.append("") + id_cov = result.get("select_release", {}).get("id_coverage", {}) + if id_cov: + sections.append("### 请求标识覆盖(基于 select 近似请求数)") + sections.append("") + sections.append( + " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " + "with_alt_id={with_alt} | without_any_id={without_any}".format( + total=id_cov.get("total_requests_estimated", 0), + 
with_rid=id_cov.get("with_request_id", 0), + without_rid=id_cov.get("without_request_id", 0), + with_alt=id_cov.get("with_alt_id", 0), + without_any=id_cov.get("without_any_id", 0), + ) + ) + if id_cov.get("without_any_id", 0) > 0: + sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") + sections.append("") + # Select/Release 匹配 sr = result.get("select_release", {}) if sr.get("per_worker"): sections.append("### Select/Release 匹配") sections.append("") + id_cov = sr.get("id_coverage", {}) + no_correlatable_id = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) == 0 table_data = [] for w_url, pw in sorted(sr["per_worker"].items()): + delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) table_data.append( { - "Worker": w_url.replace("http://", ""), + "Worker": _strip_scheme(w_url), "Select": str(pw["selects"]), "Release": str(pw["releases"]), - "Delta": str(pw["delta"]), + "Delta": delta_display, } ) sections.append( @@ -343,11 +384,20 @@ def format_load_report(result): ) ) sections.append("") + if no_correlatable_id: + sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") + sections.append("") if sr.get("unmatched_selects"): sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') for u in sr["unmatched_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {u["worker"].replace("http://","")} ({u["type"]})') + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append("") + + if sr.get("untracked_selects"): + sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') + for u in sr["untracked_selects"][:5]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') sections.append("") # Token 统计 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 
45a5056616e..6c9a0323724 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -25,21 +25,26 @@ # ════════════════════════════════════════════════════════════════ PARSING_COMPLETE_RE = re.compile(r"Parsing completed.*worker selection") -SELECT_WORKER_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_WORKER_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +SELECT_WORKER_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_WORKER_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") REQUEST_COMPLETE_RE = re.compile(r"Request completed successfully") TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") # Prefill 事件 -PREFILL_FIRST_CHUNK_RE = re.compile(r"\[prefill\] first chunk received.*?(http://\S+)") -PREFILL_DONE_RE = re.compile(r"\[prefill\] non-stream prefill response done.*?(http://\S+)") -PREFILL_ERROR_RE = re.compile(r"\[prefill\] (scanner error|copy error).*?(http://\S+)") -PREFILL_DEFER_RE = re.compile(r"\[prefill\] release in defer.*?(http://\S+)") -PREFILL_ERR_PATH_RE = re.compile(r"\[prefill\] release in CommonCompletions defer \(error path\).*?(http://\S+)") +PREFILL_FIRST_CHUNK_RE = re.compile(rf"\[prefill\] first chunk received.*?{URL_RE}") +PREFILL_DONE_RE = re.compile(rf"\[prefill\] non-stream prefill response done.*?{URL_RE}") +PREFILL_ERROR_RE = re.compile(rf"\[prefill\] (scanner error|copy error).*?{URL_RE}") +PREFILL_DEFER_RE = re.compile(rf"\[prefill\] release in defer.*?{URL_RE}") +PREFILL_ERR_PATH_RE = re.compile(rf"\[prefill\] release in CommonCompletions defer \(error 
path\).*?{URL_RE}") FAILED_SELECT_RE = re.compile(r"Failed to select") +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -342,7 +347,7 @@ def format_trace_report(result): for evt in trace["events"]: line = f' [{evt.get("ts","")}] {evt["type"]}' if evt.get("worker"): - line += f' → {evt["worker"].replace("http://","")}' + line += f' → {_strip_scheme(evt["worker"])}' if evt.get("status"): line += f' [{evt["status"]}]' if evt.get("latency_ms"): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 2a90d39b632..44f5cdebd94 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -14,7 +14,7 @@ import re import sys from collections import defaultdict -from datetime import datetime +from datetime import datetime, timedelta # ════════════════════════════════════════════════════════════════ # 通用解析原语 @@ -152,7 +152,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -166,7 +166,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -218,6 +218,30 @@ def filter_file_by_time_range(log_file, start_str=None, 
end_str=None): return (tmp.name, True) +def filter_file_by_recent_minutes(log_file, minutes): + """按日志末时间戳向前过滤最近 N 分钟日志。 + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if minutes is None or minutes <= 0: + return (log_file, False) + + last_ts = _get_log_boundary_ts(log_file, "last") + if not last_ts: + return (log_file, False) + + try: + end_dt = parse_ts(last_ts) + except ValueError: + return (log_file, False) + + start_dt = end_dt - timedelta(minutes=minutes) + start_str = start_dt.strftime("%Y/%m/%d %H:%M:%S") + end_str = end_dt.strftime("%Y/%m/%d %H:%M:%S") + return filter_file_by_time_range(log_file, start_str=start_str, end_str=end_str) + + # Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") @@ -228,7 +252,7 @@ def extract_tags(line): # Log level -LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN)\]") +LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN|DEBUG)\]") def extract_level(line): @@ -294,9 +318,10 @@ def parse_http_line(line, inference_only=False): # Cache-Aware 策略行解析(类别 H6) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" 
STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -351,7 +376,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") @@ -438,14 +463,37 @@ def parse_error_line(line): # Select/Release 事件匹配 # ════════════════════════════════════════════════════════════════ -SELECT_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") -RELEASE_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") +SELECT_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*({URL_RE})") +RELEASE_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*({URL_RE})") FAILED_SELECT_RE = re.compile(r"Failed to select") -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*({URL_RE}),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") + + +def _parse_ts_safe(ts): + if not ts: + return None + try: + return parse_ts(ts) + except ValueError: + return None + + +def _select_match_key(tags): + """构建请求关联 key,优先 request_id,其次 req_id/trace_id/session_id。""" + if not tags: + return (None, None) + rid = tags.get("request_id") + if rid: + return ("request_id", f"request_id:{rid}") + for k in ("req_id", 
"trace_id", "session_id"): + v = tags.get(k) + if v: + return ("alt_id", f"{k}:{v}") + return (None, None) -def match_select_release(lines): +def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 Args: @@ -523,31 +571,60 @@ def match_select_release(lines): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id + # Match by request_id / alt_id matched = [] unmatched_selects = [] release_used = set() - release_by_reqid = defaultdict(list) + release_by_key = defaultdict(list) for i, r in enumerate(releases): - rid = r["tags"].get("request_id", "") - if rid: - release_by_reqid[rid].append(i) - + _, key = _select_match_key(r.get("tags", {})) + if key: + release_by_key[key].append(i) + + # 请求 ID 覆盖(按 select 事件近似请求数) + total_req_est = len(selects) + with_request_id = 0 + with_alt_id = 0 + without_any_id = 0 + + pending_selects = [] + untracked_selects = [] for s in selects: - rid = s["tags"].get("request_id", "") + key_type, key = _select_match_key(s.get("tags", {})) + if key_type == "request_id": + with_request_id += 1 + elif key_type == "alt_id": + with_alt_id += 1 + else: + without_any_id += 1 + found = False - if rid and rid in release_by_reqid: - for ri in release_by_reqid[rid]: + if not key: + # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) + untracked_selects.append( + { + "worker": s["worker"], + "select_ts": s["ts"], + "type": s["type"], + "tags": s["tags"], + "note": "no correlatable id (request_id/req_id/trace_id/session_id)", + } + ) + continue + + if key and key in release_by_key: + for ri in release_by_key[key]: if ri not in release_used: r = releases[ri] matched.append( { - "request_id": rid, + "request_id": s["tags"].get("request_id", ""), "worker": s["worker"], "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], + "match_method": key_type or "id", } ) release_used.add(ri) @@ -555,13 +632,50 @@ def match_select_release(lines): break if not found: + 
pending_selects.append(s) + + # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 + for s in pending_selects: + sdt = _parse_ts_safe(s["ts"]) + best_idx = None + best_delta = None + for ri, r in enumerate(releases): + if ri in release_used: + continue + if r.get("worker") != s.get("worker"): + continue + rdt = _parse_ts_safe(r.get("ts")) + if sdt and rdt: + delta = (rdt - sdt).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + else: + delta = 0 + if best_delta is None or delta < best_delta: + best_delta = delta + best_idx = ri + + if best_idx is not None: + r = releases[best_idx] + matched.append( + { + "request_id": s["tags"].get("request_id", ""), + "worker": s["worker"], + "select_ts": s["ts"], + "release_ts": r["ts"], + "type": s["type"], + "match_method": "worker_time_fallback", + } + ) + release_used.add(best_idx) + else: unmatched_selects.append( { "worker": s["worker"], "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found", + "note": "no matching release found (request_id/worker-time)", } ) @@ -583,8 +697,16 @@ def match_select_release(lines): return { "matched": matched, "unmatched_selects": unmatched_selects, + "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, + "id_coverage": { + "total_requests_estimated": total_req_est, + "with_request_id": with_request_id, + "without_request_id": total_req_est - with_request_id, + "with_alt_id": with_alt_id, + "without_any_id": without_any_id, + }, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 4e64a2092b3..5096c5b294a 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -34,7 +34,7 @@ from analyzers.latency import analyze_latency, format_latency_report from 
analyzers.load import analyze_load, format_load_report from analyzers.trace import analyze_trace, format_trace_report -from log_parser import complete_time_arg, filter_file_by_time_range +from log_parser import complete_time_arg, filter_file_by_recent_minutes, filter_file_by_time_range def determine_log_file(user_path=None): @@ -71,10 +71,8 @@ def parse_tail_arg(tail_str): if tail_str is None: return None if tail_str.endswith("m"): - # 分钟模式:转换为大致行数(假设 ~20 行/秒) - minutes = int(tail_str[:-1]) - return minutes * 60 * 20 - return int(tail_str) + return {"type": "minutes", "value": int(tail_str[:-1])} + return {"type": "lines", "value": int(tail_str)} def determine_status(results): @@ -265,6 +263,18 @@ def main(): log_file = filtered_path print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + tail_arg = parse_tail_arg(args.tail) + tail = None + # --tail Nm 采用真实时间窗口过滤,再全量分析过滤后的临时文件 + if tail_arg and tail_arg["type"] == "minutes": + filtered_path, is_temp = filter_file_by_recent_minutes(log_file, tail_arg["value"]) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f"--tail {tail_arg['value']}m: 使用日志时间戳过滤最近窗口", file=sys.stderr) + elif tail_arg and tail_arg["type"] == "lines": + tail = tail_arg["value"] + # 确定分析模式 any_mode = args.errors or args.latency or args.health or args.cache or args.load or args.trace run_errors = args.errors or (not any_mode) @@ -274,8 +284,6 @@ def main(): run_cache = args.cache or (not any_mode) run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 - tail = parse_tail_arg(args.tail) - results = {} step = 0 total_steps = sum([run_errors, run_latency, run_health, run_cache, run_load, run_trace]) From 4ced999b66bd760e9fa89e7a6b5b70113e6cdc4b Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 20:09:29 +0800 Subject: [PATCH 03/40] fix(stat-cache-hitrate): include dated span and markdown summary output --- 
.../stat-cache-hitrate/scripts/log_parser.py | 11 +- .../scripts/stat_cache_hitrate.py | 218 +++++++++++++++--- .../troubleshoot/scripts/analyzers/cache.py | 8 +- .../troubleshoot/scripts/analyzers/errors.py | 2 + .../troubleshoot/scripts/analyzers/health.py | 25 +- .../troubleshoot/scripts/analyzers/load.py | 102 +++++--- .../troubleshoot/scripts/analyzers/trace.py | 23 +- .../skills/troubleshoot/scripts/log_parser.py | 168 ++++++++++++-- .../troubleshoot/scripts/troubleshoot.py | 22 +- 9 files changed, 461 insertions(+), 118 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py index 0b7377b4865..d43d6909c64 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py @@ -125,7 +125,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -139,7 +139,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -204,9 +204,10 @@ def extract_tags(line): # Cache-Aware 策略行解析(类别 A) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" 
STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -271,7 +272,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index c193e99d47c..6d63a565fe2 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -14,6 +14,7 @@ import argparse import json import os +import re import subprocess import sys from collections import defaultdict @@ -32,6 +33,10 @@ ) from stats import compute_statistics, count_by, time_bucket + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -235,7 +240,7 @@ def compute_per_worker_stats(strategies): avg_hr = round(sum(data["hit_ratios"]) / len(data["hit_ratios"]), 1) if data["hit_ratios"] else 0 result.append( { - "Worker": worker.replace("http://", ""), + "Worker": _strip_scheme(worker), "Selected": data["selected_count"], "Select%": f"{round(data['selected_count'] / total_scoring * 100, 1)}%", "AvgHitRatio": f"{avg_hr}%", @@ -339,7 
+344,7 @@ def _quartile_trend(trend, value_field): return f"Q1={quartiles[0]}% \u2192 Q2={quartiles[1]}% \u2192 Q3={quartiles[2]}% \u2192 Q4={quartiles[3]}% {arrow}" -def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None): +def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None, window_rows=None): """格式化完整终端报告。""" parts = [] @@ -361,6 +366,7 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, dist_data = [ {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] ] + parts.append(" Unicode 柱状图(Prefix HR 分布):") parts.append(render_bar(dist_data, show_count=True)) parts.append(f' 冷启动率: {prefix_hr["cold_start_rate"]}%') @@ -375,6 +381,7 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] ] parts.append("") + parts.append(" ASCII 折线图(Prefix HR 趋势):") parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) else: parts.append(" (无 cache_aware_scoring 数据)") @@ -391,6 +398,7 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, if session_hr["trend"]: parts.append("") + parts.append(" ASCII 折线图(Session HR 趋势):") parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) parts.append("") @@ -428,6 +436,18 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, parts.append(f' {diagnosis["icon"]} {diagnosis["summary"]}') parts.append(f' {diagnosis["detail"]}') + # 6. 每窗口明细预览 + if window_rows: + parts.append("") + parts.append("### 6. 
每5s窗口明细预览(前10行)") + parts.append( + render_table( + window_rows[:10], + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], + right_align={"Scoring", "Fallback", "Total Running"}, + ) + ) + return "\n".join(parts) @@ -458,7 +478,77 @@ def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): return "\n".join(parts) -def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir): +def build_per_window_rows(strategies, stats_recs): + """构建每窗口明细行,用于终端预览和 details 导出。""" + time_data = defaultdict( + lambda: { + "prefix_vals": [], + "hits": 0, + "total": 0, + "scoring": 0, + "fallback": 0, + "running": 0, + "has_running": False, + } + ) + for r in strategies: + ts = r.get("ts", "") + if r.get("strategy") == "cache_aware_scoring": + time_data[ts]["scoring"] += 1 + time_data[ts]["prefix_vals"].append(r.get("selected_hitRatio", 0)) + else: + time_data[ts]["fallback"] += 1 + + for r in stats_recs: + ts = r.get("ts", "") + time_data[ts]["hits"] += r.get("hits", 0) + time_data[ts]["total"] += r.get("total", 0) + if "total_running" in r: + time_data[ts]["running"] += r.get("total_running", 0) + time_data[ts]["has_running"] = True + + rows = [] + for ts in sorted(time_data.keys()): + d = time_data[ts] + short_ts = ts.split(" ")[-1] if " " in ts else ts + if d["prefix_vals"]: + prefix_mean = round(sum(d["prefix_vals"]) / len(d["prefix_vals"]), 1) + prefix_hr = f"{prefix_mean}%" + else: + prefix_hr = "-" + + if d["total"] > 0: + session_val = round(d["hits"] / d["total"] * 100, 1) + session_hr = f'{session_val}% ({d["hits"]}/{d["total"]})' + else: + session_hr = "-" + + running = str(d["running"]) if d["has_running"] else "-" + rows.append( + { + "Time": short_ts, + "Prefix HR": prefix_hr, + "Session HR": session_hr, + "Scoring": str(d["scoring"]), + "Fallback": str(d["fallback"]), + "Total Running": running, + } + ) + return rows + + +def save_detailed_report( + 
filepath, + strategies, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=None, +): """导出详细数据 Markdown 文件。 主报告包含 Per-Worker 统计和 Fallback 明细。 @@ -471,10 +561,63 @@ def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr parts.append("# Cache Hit Rate Detailed Report") parts.append(f'**Generated**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') parts.append(f"**Source**: {filepath}") + if time_span: + parts.append(f"**Span**: {time_span}") + parts.append("") + + # 1) 主指标摘要(与终端一致,避免“只在终端可见”) + parts.append("## 1. Key Metrics Summary") + parts.append("") + parts.append("### Prefix Hit Ratio") + if prefix_hr["stats"]: + parts.append(f'- 累计平均: **{prefix_hr["mean"]}%** (N={prefix_hr["count"]})') + parts.append(f'- 冷启动率: **{prefix_hr["cold_start_rate"]}%**') + trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") + if trend_str: + parts.append(f"- 趋势: {trend_str}") + dist_data = [{"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"]] + parts.append("") + parts.append("```text") + parts.append("Unicode 柱状图(Prefix HR 分布)") + parts.append(render_bar(dist_data, show_count=True)) + if prefix_hr["trend"]: + sparkline_data = [{"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"]] + parts.append("") + parts.append("ASCII 折线图(Prefix HR 趋势)") + parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) + parts.append("```") + else: + parts.append("- (无 cache_aware_scoring 数据)") + parts.append("") + + parts.append("### Session Hit Rate") + parts.append(f'- 累计: **{session_hr["rate"]}%** (hits={session_hr["hits"]}/total={session_hr["total"]})') + parts.append(f'- 覆盖率: **{session_hr["coverage"]}%**') + trend_str = _quartile_trend(session_hr["trend"], "value") + if trend_str: + parts.append(f"- 趋势: {trend_str}") + if session_hr["trend"]: + 
parts.append("") + parts.append("```text") + parts.append("ASCII 折线图(Session HR 趋势)") + parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) + parts.append("```") + parts.append("") + + parts.append("### Scheduling Strategy") + parts.append( + f'- cache_aware_scoring: **{scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)**' + f' | fallback: **{scheduling["fallback_count"]}**' + ) + parts.append( + f'- 非最优命中选择: **{scheduling["suboptimal_pct"]}%**' + f' ({scheduling.get("suboptimal_count", 0)} 次, 负载均衡优先于命中率)' + ) + parts.append(f'- Diagnosis: {diagnosis["icon"]} {diagnosis["summary"]};{diagnosis["detail"]}') parts.append("") - # Per-Worker 完整统计 - parts.append("## 1. Per-Worker 完整统计") + # 2) Per-Worker 完整统计 + parts.append("## 2. Per-Worker 完整统计") parts.append("") if per_worker: parts.append( @@ -486,49 +629,34 @@ def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr ) parts.append("") - # Fallback 明细 + # 3) Fallback 明细 if scheduling["fallback_reasons"]: - parts.append("## 2. Fallback 明细") + parts.append("## 3. 
Fallback 明细") for reason in scheduling["fallback_reasons"]: parts.append(f'- **{reason["value"]}**: {reason["count"]} 次 ({reason["pct"]}%)') parts.append("") # 每窗口明细 → 拆分到 details/ - time_data = defaultdict(lambda: {"prefix_hr": "-", "session_hr": "-", "scoring": 0, "fallback": 0, "running": "-"}) - for r in strategies: - ts = r.get("ts", "") - if r.get("strategy") == "cache_aware_scoring": - time_data[ts]["scoring"] += 1 - else: - time_data[ts]["fallback"] += 1 - - for r in stats_recs: - ts = r.get("ts", "") - h = r.get("hits", 0) - t = r.get("total", 0) - time_data[ts]["session_hr"] = f"{round(h / t * 100, 1)}% ({h}/{t})" if t else "0%" - time_data[ts]["running"] = str(r.get("total_running", "-")) + window_rows = build_per_window_rows(strategies, stats_recs) - if time_data: + if window_rows: # 主报告中添加引用 parts.append( - f"> 每窗口明细数据 ({len(time_data)} 条): [details/per_window_data.md](details/per_window_data.md)" + f"> 每5s窗口明细数据 ({len(window_rows)} 条): [details/per_window_data.md](details/per_window_data.md)" ) parts.append("") # 写入 details 子目录 details_dir = os.path.join(output_dir, "details") os.makedirs(details_dir, exist_ok=True) - detail_parts = ["# 每窗口明细数据", ""] - detail_parts.append("| Time | Prefix HR | Session HR | Scoring | Fallback | Total Running |") - detail_parts.append("|------|-----------|------------|---------|----------|---------------|") - for ts in sorted(time_data.keys()): - d = time_data[ts] - short_ts = ts.split(" ")[-1] if " " in ts else ts - detail_parts.append( - f'| {short_ts} | {d["prefix_hr"]} | {d["session_hr"]} ' - f'| {d["scoring"]} | {d["fallback"]} | {d["running"]} |' + detail_parts = ["# 每5s窗口明细数据", ""] + detail_parts.append( + render_table( + window_rows, + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], + right_align={"Scoring", "Fallback", "Total Running"}, ) + ) detail_parts.append("") detail_path = os.path.join(details_dir, "per_window_data.md") @@ -564,8 +692,8 @@ def 
compute_time_span(strategies, stats_recs): duration = t_max - t_min hours = int(duration.total_seconds() // 3600) minutes = int((duration.total_seconds() % 3600) // 60) - start = t_min.strftime("%H:%M:%S") - end = t_max.strftime("%H:%M:%S") + start = t_min.strftime("%Y-%m-%d %H:%M:%S") + end = t_max.strftime("%Y-%m-%d %H:%M:%S") if hours > 0: return f"{start} ~ {end} ({hours}h{minutes}m)" return f"{start} ~ {end} ({minutes}m)" @@ -642,9 +770,18 @@ def main(): print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) else: time_span = compute_time_span(strategy_recs, stats_recs) + window_rows = build_per_window_rows(strategy_recs, stats_recs) print( format_full_report( - args.log_file, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span + args.log_file, + line_count, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + time_span, + window_rows=window_rows, ) ) @@ -657,7 +794,16 @@ def main(): run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) report_path = save_detailed_report( - args.log_file, strategy_recs, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir + args.log_file, + strategy_recs, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=time_span, ) print(f"\n\U0001f4c4 详细数据见: {report_path}") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3a18b668a41..3fca296f4d6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -26,6 +26,10 @@ TOKENIZER_WARN_RE = re.compile(r"tokenizer failed, fallback to char tokens") +def _strip_scheme(url): + return 
re.sub(r"^https?://", "", url) + + def classify_fallback(record, tokenizer_degraded_ts=None): """对 process_tokens 策略行分类 fallback 原因。 @@ -210,9 +214,9 @@ def _analyze_suboptimal(records, hr_weight, lb_weight): suboptimal.append( { "ts": r.get("ts", ""), - "selected": selected.replace("http://", ""), + "selected": _strip_scheme(selected), "selected_hr": sel_hr, - "best_hr_worker": best_by_hr.replace("http://", ""), + "best_hr_worker": _strip_scheme(best_by_hr), "best_hr": max_hr, "reason": reason, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index 0817e280aa5..b8217a5ffa4 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -44,6 +44,8 @@ ("No available", "FD 后端"), ("request failed", "FD 后端"), ("Removed unhealthy", "FD 后端"), + ("is not healthy", "FD 后端"), + ("is healthy", "FD 后端"), ("Backend request failed", "FD 后端"), ("Decode request failed", "FD 后端"), ("Prefill request failed", "FD 后端"), diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index d2d7ca77acb..ca01d718dbc 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -21,11 +21,16 @@ # 健康事件解析 # ════════════════════════════════════════════════════════════════ -NOT_HEALTHY_RE = re.compile(r"(http://\S+)\s+is not healthy") -REMOVED_RE = re.compile(r"Removed unhealthy \w+ instance:\s*(http://\S+)") -IS_HEALTHY_RE = re.compile(r"(http://\S+)\s+is healthy") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -CLEANUP_UNHEALTHY_RE = re.compile(r"cleanup unhealthy.*?(http://\S+)") 
+WORKER_URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +NOT_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is not healthy") +REMOVED_RE = re.compile(rf"Removed unhealthy \w+ instance:\s*{WORKER_URL_RE}") +IS_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is healthy") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{WORKER_URL_RE}") +CLEANUP_UNHEALTHY_RE = re.compile(rf"cleanup unhealthy.*?{WORKER_URL_RE}") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_health_event(line): @@ -110,7 +115,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): # IP → worker URL 映射 ip_to_urls = defaultdict(set) for url in worker_urls: - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) if ip_m: ip_to_urls[ip_m.group(1)].add(url) @@ -130,7 +135,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): workers = {} for url in sorted(worker_urls): events = sorted(worker_events[url], key=lambda e: e["ts"] or "") - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) worker_ip = ip_m.group(1) if ip_m else "" # 恢复检测:REMOVED 后有 register @@ -237,7 +242,7 @@ def _diagnose(workers): ) for url, w in workers.items(): - s = url.replace("http://", "") + s = _strip_scheme(url) if w["down_count"] > 3: diagnoses.append( { @@ -326,7 +331,7 @@ def format_health_report(result): ) table_data.append( { - "Worker": url.replace("http://", ""), + "Worker": _strip_scheme(url), "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", @@ -358,7 +363,7 @@ def format_health_report(result): for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True - detail_parts.append(f'## {url.replace("http://", "")}') + detail_parts.append(f"## {_strip_scheme(url)}") detail_parts.append("") for evt in w["events"]: detail_parts.append(f' [{evt["ts"]}] 
{evt["type"]}') diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index e712011d932..9be82357494 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -21,14 +21,19 @@ # Counter 异常检测正则 # ════════════════════════════════════════════════════════════════ -DOUBLE_RELEASE_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?double-release") -COUNTER_CLEANED_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?counter already cleaned up") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -TOKEN_PRESERVED_RE = re.compile(r"token counter preserved.*?(http://\S+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +DOUBLE_RELEASE_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?double-release") +COUNTER_CLEANED_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?counter already cleaned up") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{URL_RE}") +TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}") # Token 事件 -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://\S+),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*{URL_RE},\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_counter_anomaly(line): @@ -89,7 +94,7 @@ def analyze_load(log_file, tail=None): avg = sum(vals) / len(vals) if vals else 0 worker_load.append( { - "worker": w_url.replace("http://", ""), + "worker": _strip_scheme(w_url), "avg_running": round(avg, 1), "max_running": max(vals) if vals else 0, 
"samples": len(vals), @@ -121,9 +126,9 @@ def analyze_load(log_file, tail=None): # Select/Release 匹配 sr_result = ( - match_select_release(h3_lines) + match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "failed_selects": [], "per_worker": {}} + else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} ) # Token 统计 @@ -133,7 +138,7 @@ def analyze_load(log_file, tail=None): pileup = _detect_pileup(stats_records) # 诊断 - diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup) + diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup) return { "load_stats": load_stats, @@ -170,7 +175,7 @@ def _analyze_tokens(h3_lines, h11_lines): releases = token_release.get(w, []) result.append( { - "worker": w.replace("http://", ""), + "worker": _strip_scheme(w), "alloc_count": len(allocs), "alloc_avg": round(sum(allocs) / len(allocs), 0) if allocs else 0, "release_count": len(releases), @@ -195,7 +200,7 @@ def _detect_pileup(stats_records): return max_consecutive >= 5 -def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): +def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup): """生成负载诊断。""" diagnoses = [] @@ -236,16 +241,20 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) - # Select/Release 不一致 - for w_url, pw in sr_result.get("per_worker", {}).items(): - if pw.get("delta", 0) > 0: - diagnoses.append( - { - "severity": "HIGH", - "message": f'{w_url.replace("http://","")} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', - "source_layer": "FD 后端", - } - ) + id_cov = sr_result.get("id_coverage", {}) + has_correlatable_ids = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) > 0 + + # Select/Release 不一致(仅在存在可关联 ID 时启用,避免无 ID 场景误报) + if has_correlatable_ids: + for w_url, pw in sr_result.get("per_worker", {}).items(): + if 
pw.get("delta", 0) > 0: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{_strip_scheme(w_url)} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "source_layer": "FD 后端", + } + ) # 卡住的请求 if sr_result.get("unmatched_selects"): @@ -257,6 +266,17 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) + # Token 计数器潜在泄漏 + for t in token_stats: + if t.get("alloc_count", 0) > t.get("release_count", 0): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{t["worker"]} token alloc/release 不平衡 ({t["alloc_count"]}/{t["release_count"]})', + "source_layer": "Router", + } + ) + return diagnoses @@ -316,23 +336,44 @@ def format_load_report(result): sections.append("### 计数器异常") sections.append("") for a in result["counter_anomalies"]: - workers_str = ", ".join(f'{w.replace("http://","")}({c})' for w, c in a["workers"].items()) + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') sections.append("") + id_cov = result.get("select_release", {}).get("id_coverage", {}) + if id_cov: + sections.append("### 请求标识覆盖(基于 select 近似请求数)") + sections.append("") + sections.append( + " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " + "with_alt_id={with_alt} | without_any_id={without_any}".format( + total=id_cov.get("total_requests_estimated", 0), + with_rid=id_cov.get("with_request_id", 0), + without_rid=id_cov.get("without_request_id", 0), + with_alt=id_cov.get("with_alt_id", 0), + without_any=id_cov.get("without_any_id", 0), + ) + ) + if id_cov.get("without_any_id", 0) > 0: + sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") + sections.append("") + # Select/Release 匹配 sr = result.get("select_release", {}) if sr.get("per_worker"): sections.append("### Select/Release 匹配") sections.append("") + id_cov = sr.get("id_coverage", {}) + no_correlatable_id = (id_cov.get("with_request_id", 0) + 
id_cov.get("with_alt_id", 0)) == 0 table_data = [] for w_url, pw in sorted(sr["per_worker"].items()): + delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) table_data.append( { - "Worker": w_url.replace("http://", ""), + "Worker": _strip_scheme(w_url), "Select": str(pw["selects"]), "Release": str(pw["releases"]), - "Delta": str(pw["delta"]), + "Delta": delta_display, } ) sections.append( @@ -343,11 +384,20 @@ def format_load_report(result): ) ) sections.append("") + if no_correlatable_id: + sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") + sections.append("") if sr.get("unmatched_selects"): sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') for u in sr["unmatched_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {u["worker"].replace("http://","")} ({u["type"]})') + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append("") + + if sr.get("untracked_selects"): + sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') + for u in sr["untracked_selects"][:5]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') sections.append("") # Token 统计 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 45a5056616e..6c9a0323724 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -25,21 +25,26 @@ # ════════════════════════════════════════════════════════════════ PARSING_COMPLETE_RE = re.compile(r"Parsing completed.*worker selection") -SELECT_WORKER_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_WORKER_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill 
tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +SELECT_WORKER_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_WORKER_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") REQUEST_COMPLETE_RE = re.compile(r"Request completed successfully") TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") # Prefill 事件 -PREFILL_FIRST_CHUNK_RE = re.compile(r"\[prefill\] first chunk received.*?(http://\S+)") -PREFILL_DONE_RE = re.compile(r"\[prefill\] non-stream prefill response done.*?(http://\S+)") -PREFILL_ERROR_RE = re.compile(r"\[prefill\] (scanner error|copy error).*?(http://\S+)") -PREFILL_DEFER_RE = re.compile(r"\[prefill\] release in defer.*?(http://\S+)") -PREFILL_ERR_PATH_RE = re.compile(r"\[prefill\] release in CommonCompletions defer \(error path\).*?(http://\S+)") +PREFILL_FIRST_CHUNK_RE = re.compile(rf"\[prefill\] first chunk received.*?{URL_RE}") +PREFILL_DONE_RE = re.compile(rf"\[prefill\] non-stream prefill response done.*?{URL_RE}") +PREFILL_ERROR_RE = re.compile(rf"\[prefill\] (scanner error|copy error).*?{URL_RE}") +PREFILL_DEFER_RE = re.compile(rf"\[prefill\] release in defer.*?{URL_RE}") +PREFILL_ERR_PATH_RE = re.compile(rf"\[prefill\] release in CommonCompletions defer \(error path\).*?{URL_RE}") FAILED_SELECT_RE = re.compile(r"Failed to select") +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -342,7 +347,7 @@ def format_trace_report(result): for evt in trace["events"]: line = f' [{evt.get("ts","")}] {evt["type"]}' if evt.get("worker"): - line += f' → {evt["worker"].replace("http://","")}' + line += f' → {_strip_scheme(evt["worker"])}' if evt.get("status"): line += f' [{evt["status"]}]' if 
evt.get("latency_ms"): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 2a90d39b632..44f5cdebd94 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -14,7 +14,7 @@ import re import sys from collections import defaultdict -from datetime import datetime +from datetime import datetime, timedelta # ════════════════════════════════════════════════════════════════ # 通用解析原语 @@ -152,7 +152,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -166,7 +166,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -218,6 +218,30 @@ def filter_file_by_time_range(log_file, start_str=None, end_str=None): return (tmp.name, True) +def filter_file_by_recent_minutes(log_file, minutes): + """按日志末时间戳向前过滤最近 N 分钟日志。 + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if minutes is None or minutes <= 0: + return (log_file, False) + + last_ts = _get_log_boundary_ts(log_file, "last") + if not last_ts: + return (log_file, False) + + try: + end_dt = parse_ts(last_ts) + except ValueError: + return (log_file, False) + + start_dt = end_dt - timedelta(minutes=minutes) + start_str = start_dt.strftime("%Y/%m/%d %H:%M:%S") + end_str = end_dt.strftime("%Y/%m/%d 
%H:%M:%S") + return filter_file_by_time_range(log_file, start_str=start_str, end_str=end_str) + + # Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") @@ -228,7 +252,7 @@ def extract_tags(line): # Log level -LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN)\]") +LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN|DEBUG)\]") def extract_level(line): @@ -294,9 +318,10 @@ def parse_http_line(line, inference_only=False): # Cache-Aware 策略行解析(类别 H6) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -351,7 +376,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") @@ -438,14 +463,37 @@ def parse_error_line(line): # Select/Release 事件匹配 # ════════════════════════════════════════════════════════════════ -SELECT_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") -RELEASE_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") +SELECT_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*({URL_RE})") +RELEASE_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*({URL_RE})") FAILED_SELECT_RE = re.compile(r"Failed to select") -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") 
-RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*({URL_RE}),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") + + +def _parse_ts_safe(ts): + if not ts: + return None + try: + return parse_ts(ts) + except ValueError: + return None + + +def _select_match_key(tags): + """构建请求关联 key,优先 request_id,其次 req_id/trace_id/session_id。""" + if not tags: + return (None, None) + rid = tags.get("request_id") + if rid: + return ("request_id", f"request_id:{rid}") + for k in ("req_id", "trace_id", "session_id"): + v = tags.get(k) + if v: + return ("alt_id", f"{k}:{v}") + return (None, None) -def match_select_release(lines): +def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 Args: @@ -523,31 +571,60 @@ def match_select_release(lines): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id + # Match by request_id / alt_id matched = [] unmatched_selects = [] release_used = set() - release_by_reqid = defaultdict(list) + release_by_key = defaultdict(list) for i, r in enumerate(releases): - rid = r["tags"].get("request_id", "") - if rid: - release_by_reqid[rid].append(i) - + _, key = _select_match_key(r.get("tags", {})) + if key: + release_by_key[key].append(i) + + # 请求 ID 覆盖(按 select 事件近似请求数) + total_req_est = len(selects) + with_request_id = 0 + with_alt_id = 0 + without_any_id = 0 + + pending_selects = [] + untracked_selects = [] for s in selects: - rid = s["tags"].get("request_id", "") + key_type, key = _select_match_key(s.get("tags", {})) + if key_type == "request_id": + with_request_id += 1 + elif key_type == "alt_id": + with_alt_id += 1 + else: + without_any_id += 1 + found = False - if rid and rid in release_by_reqid: - for ri in release_by_reqid[rid]: + if not key: + # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) + 
untracked_selects.append( + { + "worker": s["worker"], + "select_ts": s["ts"], + "type": s["type"], + "tags": s["tags"], + "note": "no correlatable id (request_id/req_id/trace_id/session_id)", + } + ) + continue + + if key and key in release_by_key: + for ri in release_by_key[key]: if ri not in release_used: r = releases[ri] matched.append( { - "request_id": rid, + "request_id": s["tags"].get("request_id", ""), "worker": s["worker"], "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], + "match_method": key_type or "id", } ) release_used.add(ri) @@ -555,13 +632,50 @@ def match_select_release(lines): break if not found: + pending_selects.append(s) + + # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 + for s in pending_selects: + sdt = _parse_ts_safe(s["ts"]) + best_idx = None + best_delta = None + for ri, r in enumerate(releases): + if ri in release_used: + continue + if r.get("worker") != s.get("worker"): + continue + rdt = _parse_ts_safe(r.get("ts")) + if sdt and rdt: + delta = (rdt - sdt).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + else: + delta = 0 + if best_delta is None or delta < best_delta: + best_delta = delta + best_idx = ri + + if best_idx is not None: + r = releases[best_idx] + matched.append( + { + "request_id": s["tags"].get("request_id", ""), + "worker": s["worker"], + "select_ts": s["ts"], + "release_ts": r["ts"], + "type": s["type"], + "match_method": "worker_time_fallback", + } + ) + release_used.add(best_idx) + else: unmatched_selects.append( { "worker": s["worker"], "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found", + "note": "no matching release found (request_id/worker-time)", } ) @@ -583,8 +697,16 @@ def match_select_release(lines): return { "matched": matched, "unmatched_selects": unmatched_selects, + "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, + "id_coverage": { + "total_requests_estimated": total_req_est, + 
"with_request_id": with_request_id, + "without_request_id": total_req_est - with_request_id, + "with_alt_id": with_alt_id, + "without_any_id": without_any_id, + }, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 4e64a2092b3..5096c5b294a 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -34,7 +34,7 @@ from analyzers.latency import analyze_latency, format_latency_report from analyzers.load import analyze_load, format_load_report from analyzers.trace import analyze_trace, format_trace_report -from log_parser import complete_time_arg, filter_file_by_time_range +from log_parser import complete_time_arg, filter_file_by_recent_minutes, filter_file_by_time_range def determine_log_file(user_path=None): @@ -71,10 +71,8 @@ def parse_tail_arg(tail_str): if tail_str is None: return None if tail_str.endswith("m"): - # 分钟模式:转换为大致行数(假设 ~20 行/秒) - minutes = int(tail_str[:-1]) - return minutes * 60 * 20 - return int(tail_str) + return {"type": "minutes", "value": int(tail_str[:-1])} + return {"type": "lines", "value": int(tail_str)} def determine_status(results): @@ -265,6 +263,18 @@ def main(): log_file = filtered_path print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + tail_arg = parse_tail_arg(args.tail) + tail = None + # --tail Nm 采用真实时间窗口过滤,再全量分析过滤后的临时文件 + if tail_arg and tail_arg["type"] == "minutes": + filtered_path, is_temp = filter_file_by_recent_minutes(log_file, tail_arg["value"]) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f"--tail {tail_arg['value']}m: 使用日志时间戳过滤最近窗口", file=sys.stderr) + elif tail_arg and tail_arg["type"] == "lines": + tail = tail_arg["value"] + # 确定分析模式 any_mode = args.errors or 
args.latency or args.health or args.cache or args.load or args.trace run_errors = args.errors or (not any_mode) @@ -274,8 +284,6 @@ def main(): run_cache = args.cache or (not any_mode) run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 - tail = parse_tail_arg(args.tail) - results = {} step = 0 total_steps = sum([run_errors, run_latency, run_health, run_cache, run_load, run_trace]) From be5c4f5fa08cd15ecb7454d6cb10f44e9d26078a Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 20:32:40 +0800 Subject: [PATCH 04/40] Fix stat-cache-hitrate path links for terminal output --- .../references/report_templates.md | 11 ++++++- .../scripts/stat_cache_hitrate.py | 33 ++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index dcef9c47498..7f060cacb6a 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -62,7 +62,16 @@ ### 5. 
Diagnosis ✅/⚠/❌ <综合诊断> -📄 详细数据见: skill_output/stat-cache-hitrate//cache_hitrate_report_.md +### 图表说明(Legend) + - Unicode 柱状图:每个区间的请求占比,条越长占比越高 + - ASCII 折线图:横轴是时间窗口,纵轴是命中率(0-100%) + - Q1→Q4 趋势:按时间四等分后的均值变化(↑/↓/→) + +📄 详细数据见: + - 报告文件: /abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md + - 窗口明细: /abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md ``` --- diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 6d63a565fe2..6d09bc1915d 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -17,6 +17,8 @@ import re import subprocess import sys +from pathlib import Path +from urllib.parse import quote from collections import defaultdict from datetime import datetime @@ -37,6 +39,13 @@ def _strip_scheme(url): return re.sub(r"^https?://", "", url) + +def _build_path_links(path): + """返回绝对路径与 file URI,兼容空格/中文路径。""" + abs_path = str(Path(path).resolve()) + file_uri = "file://" + quote(abs_path, safe="/:-._~") + return abs_path, file_uri + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -356,6 +365,13 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, parts.append(f"**Span**: {span_str}") parts.append("") + # 图表说明 + parts.append("### 图表说明(如何解读)") + parts.append(" - Unicode 柱状图:每行代表一个 Prefix HR 区间(如 60-80%),条越长表示该区间请求占比越高。") + parts.append(" - ASCII 折线图:横轴是时间窗口,纵轴是命中率(0-100%);越靠上表示命中率越高。") + parts.append(" - 趋势 Q1→Q4:把时间均分为四段,比较首尾;↑ 上升,↓ 下降,→ 基本稳定。") 
+ parts.append("") + # 1. Prefix Hit Ratio parts.append("### 1. Prefix Hit Ratio (KV Cache 内容复用度)") if prefix_hr["stats"]: @@ -474,6 +490,7 @@ def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] ] parts.append(render_sparkline(sparkline_data, title="Recent Prefix HR", y_label="%", y_range=(0, 100))) + parts.append(" 说明: 折线越靠上表示对应时间窗口 Prefix HR 越高。") return "\n".join(parts) @@ -565,6 +582,12 @@ def save_detailed_report( parts.append(f"**Span**: {time_span}") parts.append("") + parts.append("## 图表说明(Legend)") + parts.append("- **Unicode 柱状图**: 展示 Prefix HR 分布,`█` 越多说明该命中率区间占比越高。") + parts.append("- **ASCII 折线图**: 展示命中率随时间变化,横轴为时间窗口,纵轴为命中率(0-100%)。") + parts.append("- **Q1~Q4 趋势**: 将观察区间均分四段,反映整体走向(↑/↓/→)。") + parts.append("") + # 1) 主指标摘要(与终端一致,避免“只在终端可见”) parts.append("## 1. Key Metrics Summary") parts.append("") @@ -805,7 +828,15 @@ def main(): output_dir, time_span=time_span, ) - print(f"\n\U0001f4c4 详细数据见: {report_path}") + print("\n\U0001f4c4 详细数据见:") + report_abs, report_uri = _build_path_links(report_path) + print(f" - 报告文件: {report_abs}") + print(f" URI: {report_uri}") + details_path = os.path.join(os.path.dirname(report_path), "details", "per_window_data.md") + if os.path.exists(details_path): + details_abs, details_uri = _build_path_links(details_path) + print(f" - 窗口明细: {details_abs}") + print(f" URI: {details_uri}") if args.watch: print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") From 5d2984999fb485d3cd4ca3cad42ca1cef7f4f6c7 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:01:20 +0800 Subject: [PATCH 05/40] split session and window logic out of stat_cache_hitrate --- .../skills/stat-cache-hitrate/SKILL.md | 11 +- .../references/report_templates.md | 3 + .../scripts/session_analysis.py | 116 ++++++++++++++++++ .../scripts/stat_cache_hitrate.py | 65 +++++++++- 
.../scripts/window_utils.py | 80 ++++++++++++ 5 files changed, 268 insertions(+), 7 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 6534fb332f2..f9c5156ca69 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -23,10 +23,10 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 运行脚本前,Claude 必须先向用户确认以下参数: ### 1. 日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项 + Other 自定义输入(支持绝对路径和相对路径): - 选项 1: `logs/router.log`(默认) -- 选项 2: `fd-router.log`(golang_router 根目录) -- 选项 3: 用户通过 Other 输入自定义路径 +- 选项 2: `fd-router.log`(golang_router 根目录常用文件名) +- 选项 3: Other(用户直接输入任意路径,例如 `logs/fd-router.log`、`/home/user/logs/router.log`) **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -75,7 +75,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" ``` -默认日志路径:`logs/router.log` 或 `fd-router.log`(相对于 `fastdeploy/golang_router/`)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 +默认日志路径:`logs/router.log`(相对于 `fastdeploy/golang_router/`)。常用备选:`fd-router.log`(根目录)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 脚本会自动根据文件大小选择解析策略:小文件(<5000 行)在内存中处理,大文件用 grep + 管道流式处理。 @@ -94,7 +94,8 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 - 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 
每5s窗口的完整明细数据(Prefix HR / Session HR / Scoring / Fallback / Running) +- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `details/session_hit_details.md` — 每个 session 的命中明细(`session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits`),并附带 `prefill_urls`、prefill URL 切换前后 request_id(或 req_id/trace_id)以及命中率突降 request_id ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index 7f060cacb6a..f5a0def5f55 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -72,6 +72,9 @@ URI: file:///abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md - 窗口明细: /abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + - Session 命中详情: /abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + (含 prefill_urls、worker 切换前后 request_id,以及命中率突降 request_id) ``` --- diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py new file mode 100644 index 00000000000..355ba8fc947 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Session 维度分析:聚合每个 session 的命中率、worker 切换与突降请求。 +""" + +from collections import defaultdict + + +def compute_session_details(strategies, strip_scheme): + """按 session 统计命中详情。""" + + def _req_id_from_tags(tags, fallback): + return tags.get("request_id") or tags.get("req_id") or 
tags.get("trace_id") or fallback + + session_records = defaultdict(list) + for idx, rec in enumerate(strategies): + if rec.get("strategy") != "cache_aware_scoring": + continue + tags = rec.get("tags", {}) or {} + session_id = tags.get("session_id") + if not session_id: + continue + session_records[session_id].append((idx, rec)) + + rows = [] + for session_id, items in session_records.items(): + items.sort(key=lambda x: (x[1].get("ts_ms", ""), x[1].get("ts", ""), x[0])) + recs = [r for _, r in items] + hits = [int(r.get("selected_hitRatio", 0)) for r in recs] + if not hits: + continue + + non_first = hits[1:] + avg_excl_first = round(sum(non_first) / len(non_first), 1) if non_first else "-" + workers = {r.get("selected", "") for r in recs if r.get("selected")} + + prefill_urls = [] + for r in recs: + u = r.get("selected", "") + if u and u not in prefill_urls: + prefill_urls.append(u) + + switch_events = [] + sharp_drop_req_ids = [] + for i in range(1, len(recs)): + prev_r = recs[i - 1] + curr_r = recs[i] + prev_url = prev_r.get("selected", "") + curr_url = curr_r.get("selected", "") + prev_tags = prev_r.get("tags", {}) or {} + curr_tags = curr_r.get("tags", {}) or {} + prev_req = _req_id_from_tags(prev_tags, f"idx#{i}") + curr_req = _req_id_from_tags(curr_tags, f"idx#{i+1}") + + if prev_url and curr_url and prev_url != curr_url: + switch_events.append(f"{prev_req}->{curr_req} ({strip_scheme(prev_url)}→{strip_scheme(curr_url)})") + + prev_hit = int(prev_r.get("selected_hitRatio", 0)) + curr_hit = int(curr_r.get("selected_hitRatio", 0)) + if curr_hit - prev_hit <= -30: + sharp_drop_req_ids.append(f"{curr_req} ({prev_hit}%→{curr_hit}%)") + + rows.append( + { + "session": session_id, + "req_count": len(hits), + "first_hit": f"{hits[0]}%", + "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", + "max_hit": f"{max(hits)}%", + "min_hit": f"{min(hits)}%", + "all_hits": ", ".join(f"{h}%" for h in hits), + "sticky": "yes" if len(workers) <= 1 else 
"no", + "unique_workers": len(workers), + "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), + "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", + "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", + } + ) + + rows.sort(key=lambda r: (r["req_count"], r["session"]), reverse=True) + return rows + + +def summarize_session_details(rows): + """生成 session 级摘要指标。""" + if not rows: + return { + "total_sessions": 0, + "multi_req": 0, + "single_req": 0, + "sticky_multi": 0, + "non_sticky_multi": 0, + "non_first_avg": 0, + "non_first_total": 0, + } + + multi_req_rows = [r for r in rows if r["req_count"] > 1] + sticky_multi = [r for r in multi_req_rows if r["sticky"] == "yes"] + non_sticky_multi = [r for r in multi_req_rows if r["sticky"] == "no"] + + non_first_vals = [] + for r in rows: + hit_tokens = [h.strip().rstrip("%") for h in r["all_hits"].split(",") if h.strip()] + nums = [int(x) for x in hit_tokens if x.isdigit()] + if len(nums) > 1: + non_first_vals.extend(nums[1:]) + + return { + "total_sessions": len(rows), + "multi_req": len(multi_req_rows), + "single_req": len(rows) - len(multi_req_rows), + "sticky_multi": len(sticky_multi), + "non_sticky_multi": len(non_sticky_multi), + "non_first_avg": round(sum(non_first_vals) / len(non_first_vals), 2) if non_first_vals else 0, + "non_first_total": len(non_first_vals), + } diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 6d09bc1915d..066c15330d7 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -33,7 +33,9 @@ parse_stats_line, parse_ts, ) +from session_analysis import compute_session_details, summarize_session_details from stats import compute_statistics, 
count_by, time_bucket +from window_utils import merge_blank_window_rows def _strip_scheme(url): @@ -661,11 +663,15 @@ def save_detailed_report( # 每窗口明细 → 拆分到 details/ window_rows = build_per_window_rows(strategies, stats_recs) + window_rows_merged = merge_blank_window_rows(window_rows) + session_rows = compute_session_details(strategies, _strip_scheme) + session_summary = summarize_session_details(session_rows) if window_rows: # 主报告中添加引用 parts.append( - f"> 每5s窗口明细数据 ({len(window_rows)} 条): [details/per_window_data.md](details/per_window_data.md)" + f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" + " [details/per_window_data.md](details/per_window_data.md)" ) parts.append("") @@ -673,9 +679,13 @@ def save_detailed_report( details_dir = os.path.join(output_dir, "details") os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] + detail_parts.append( + "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" + ) + detail_parts.append("") detail_parts.append( render_table( - window_rows, + window_rows_merged, columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], right_align={"Scoring", "Fallback", "Total Running"}, ) @@ -686,6 +696,57 @@ def save_detailed_report( with open(detail_path, "w") as f: f.write("\n".join(detail_parts)) + if session_rows: + parts.append( + f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" + ) + parts.append("") + + session_parts = ["# Session 命中详情", ""] + session_parts.append("## 概览") + session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') + session_parts.append( + f'- Sessions with >1 request: **{session_summary["multi_req"]}**' + f' | single request: **{session_summary["single_req"]}**' + ) + if session_summary["multi_req"] > 0: + sticky_pct = round(session_summary["sticky_multi"] / session_summary["multi_req"] * 100, 1) + session_parts.append( + f'- 
Sticky (multi-request): **{session_summary["sticky_multi"]} ({sticky_pct}%)**' + f' | non-sticky: **{session_summary["non_sticky_multi"]}**' + ) + session_parts.append( + f'- Non-first request avg hit: **{session_summary["non_first_avg"]}%**' + f' (N={session_summary["non_first_total"]})' + ) + session_parts.append("") + session_parts.append("## 明细表") + session_parts.append( + render_table( + session_rows, + columns=[ + "session", + "req_count", + "first_hit", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "all_hits", + "prefill_urls", + "switch_req_pairs", + "sharp_drop_request_ids", + "sticky", + "unique_workers", + ], + right_align={"req_count", "first_hit", "avg_hit(excl_first)", "max_hit", "min_hit", "unique_workers"}, + ) + ) + session_parts.append("") + + session_path = os.path.join(details_dir, "session_hit_details.md") + with open(session_path, "w") as f: + f.write("\n".join(session_parts)) + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py new file mode 100644 index 00000000000..526fe2382ce --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +窗口明细压缩工具:合并连续空窗口,降低 per_window_data.md 噪声。 +""" + + +def _is_blank_window_row(row): + """判断是否为空窗口(无 Prefix/Session 明细值)。""" + return ( + row.get("Prefix HR") == "-" + and row.get("Session HR") == "-" + and row.get("Scoring") in {"0", 0} + and row.get("Fallback") in {"0", 0} + ) + + +def merge_blank_window_rows(rows, min_merge_len=5): + """合并连续空窗口,避免明细表被大量 '-' 行淹没。 + + 对于连续空窗口段(长度 >= min_merge_len),压缩成 3 行: + 1) 起始时间行 + 2) 合并说明行(含窗口数量) + 3) 结束时间行 + """ + if not rows: + return rows + + merged = [] + i = 0 + while i < len(rows): + if not _is_blank_window_row(rows[i]): 
+ merged.append(rows[i]) + i += 1 + continue + + j = i + while j < len(rows) and _is_blank_window_row(rows[j]): + j += 1 + + seg_len = j - i + if seg_len < min_merge_len: + merged.extend(rows[i:j]) + i = j + continue + + start_t = rows[i]["Time"] + end_t = rows[j - 1]["Time"] + merged.append( + { + "Time": start_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[i].get("Total Running", "-"), + } + ) + merged.append( + { + "Time": f"... {start_t} ~ {end_t} merged ({seg_len} windows) ...", + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": "-", + } + ) + merged.append( + { + "Time": end_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[j - 1].get("Total Running", "-"), + } + ) + i = j + + return merged From c32897d235520bbc713be8f4105bb0631e8feaab Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:11:17 +0800 Subject: [PATCH 06/40] refine merged-window format and print session detail link --- .../skills/stat-cache-hitrate/SKILL.md | 11 +- .../references/report_templates.md | 3 + .../scripts/session_analysis.py | 116 ++++++++++++++++++ .../scripts/stat_cache_hitrate.py | 70 ++++++++++- .../scripts/window_utils.py | 80 ++++++++++++ 5 files changed, 273 insertions(+), 7 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 6534fb332f2..f9c5156ca69 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -23,10 +23,10 @@ IMPORTANT: 执行前阅读 references/log_formats.md 
了解日志格式和解析 运行脚本前,Claude 必须先向用户确认以下参数: ### 1. 日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项 + Other 自定义输入(支持绝对路径和相对路径): - 选项 1: `logs/router.log`(默认) -- 选项 2: `fd-router.log`(golang_router 根目录) -- 选项 3: 用户通过 Other 输入自定义路径 +- 选项 2: `fd-router.log`(golang_router 根目录常用文件名) +- 选项 3: Other(用户直接输入任意路径,例如 `logs/fd-router.log`、`/home/user/logs/router.log`) **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -75,7 +75,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" ``` -默认日志路径:`logs/router.log` 或 `fd-router.log`(相对于 `fastdeploy/golang_router/`)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 +默认日志路径:`logs/router.log`(相对于 `fastdeploy/golang_router/`)。常用备选:`fd-router.log`(根目录)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 脚本会自动根据文件大小选择解析策略:小文件(<5000 行)在内存中处理,大文件用 grep + 管道流式处理。 @@ -94,7 +94,8 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 - 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 每5s窗口的完整明细数据(Prefix HR / Session HR / Scoring / Fallback / Running) +- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `details/session_hit_details.md` — 每个 session 的命中明细(`session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits`),并附带 `prefill_urls`、prefill URL 切换前后 request_id(或 req_id/trace_id)以及命中率突降 request_id ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index 7f060cacb6a..f5a0def5f55 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -72,6 +72,9 @@ URI: file:///abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md - 窗口明细: /abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + - Session 命中详情: /abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + (含 prefill_urls、worker 切换前后 request_id,以及命中率突降 request_id) ``` --- diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py new file mode 100644 index 00000000000..355ba8fc947 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Session 维度分析:聚合每个 session 的命中率、worker 切换与突降请求。 +""" + +from collections import defaultdict + + +def compute_session_details(strategies, strip_scheme): + """按 session 统计命中详情。""" + + def _req_id_from_tags(tags, fallback): + return tags.get("request_id") or tags.get("req_id") or tags.get("trace_id") or fallback + + session_records = defaultdict(list) + for idx, rec in enumerate(strategies): + if rec.get("strategy") != "cache_aware_scoring": + continue + tags = rec.get("tags", {}) or {} + session_id = tags.get("session_id") + if not session_id: + continue + session_records[session_id].append((idx, rec)) + + rows = [] + for session_id, items in session_records.items(): + items.sort(key=lambda x: (x[1].get("ts_ms", ""), x[1].get("ts", ""), x[0])) + recs = [r for _, r in items] + hits = [int(r.get("selected_hitRatio", 0)) for r in recs] + if not hits: + continue + + non_first = hits[1:] + avg_excl_first = round(sum(non_first) / len(non_first), 1) if non_first else "-" + workers = 
{r.get("selected", "") for r in recs if r.get("selected")} + + prefill_urls = [] + for r in recs: + u = r.get("selected", "") + if u and u not in prefill_urls: + prefill_urls.append(u) + + switch_events = [] + sharp_drop_req_ids = [] + for i in range(1, len(recs)): + prev_r = recs[i - 1] + curr_r = recs[i] + prev_url = prev_r.get("selected", "") + curr_url = curr_r.get("selected", "") + prev_tags = prev_r.get("tags", {}) or {} + curr_tags = curr_r.get("tags", {}) or {} + prev_req = _req_id_from_tags(prev_tags, f"idx#{i}") + curr_req = _req_id_from_tags(curr_tags, f"idx#{i+1}") + + if prev_url and curr_url and prev_url != curr_url: + switch_events.append(f"{prev_req}->{curr_req} ({strip_scheme(prev_url)}→{strip_scheme(curr_url)})") + + prev_hit = int(prev_r.get("selected_hitRatio", 0)) + curr_hit = int(curr_r.get("selected_hitRatio", 0)) + if curr_hit - prev_hit <= -30: + sharp_drop_req_ids.append(f"{curr_req} ({prev_hit}%→{curr_hit}%)") + + rows.append( + { + "session": session_id, + "req_count": len(hits), + "first_hit": f"{hits[0]}%", + "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", + "max_hit": f"{max(hits)}%", + "min_hit": f"{min(hits)}%", + "all_hits": ", ".join(f"{h}%" for h in hits), + "sticky": "yes" if len(workers) <= 1 else "no", + "unique_workers": len(workers), + "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), + "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", + "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", + } + ) + + rows.sort(key=lambda r: (r["req_count"], r["session"]), reverse=True) + return rows + + +def summarize_session_details(rows): + """生成 session 级摘要指标。""" + if not rows: + return { + "total_sessions": 0, + "multi_req": 0, + "single_req": 0, + "sticky_multi": 0, + "non_sticky_multi": 0, + "non_first_avg": 0, + "non_first_total": 0, + } + + multi_req_rows = [r for r in rows if r["req_count"] > 1] + sticky_multi = [r for r in 
multi_req_rows if r["sticky"] == "yes"] + non_sticky_multi = [r for r in multi_req_rows if r["sticky"] == "no"] + + non_first_vals = [] + for r in rows: + hit_tokens = [h.strip().rstrip("%") for h in r["all_hits"].split(",") if h.strip()] + nums = [int(x) for x in hit_tokens if x.isdigit()] + if len(nums) > 1: + non_first_vals.extend(nums[1:]) + + return { + "total_sessions": len(rows), + "multi_req": len(multi_req_rows), + "single_req": len(rows) - len(multi_req_rows), + "sticky_multi": len(sticky_multi), + "non_sticky_multi": len(non_sticky_multi), + "non_first_avg": round(sum(non_first_vals) / len(non_first_vals), 2) if non_first_vals else 0, + "non_first_total": len(non_first_vals), + } diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 6d09bc1915d..7adc0b97d02 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -33,7 +33,9 @@ parse_stats_line, parse_ts, ) +from session_analysis import compute_session_details, summarize_session_details from stats import compute_statistics, count_by, time_bucket +from window_utils import merge_blank_window_rows def _strip_scheme(url): @@ -661,11 +663,15 @@ def save_detailed_report( # 每窗口明细 → 拆分到 details/ window_rows = build_per_window_rows(strategies, stats_recs) + window_rows_merged = merge_blank_window_rows(window_rows) + session_rows = compute_session_details(strategies, _strip_scheme) + session_summary = summarize_session_details(session_rows) if window_rows: # 主报告中添加引用 parts.append( - f"> 每5s窗口明细数据 ({len(window_rows)} 条): [details/per_window_data.md](details/per_window_data.md)" + f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" + " [details/per_window_data.md](details/per_window_data.md)" ) parts.append("") 
@@ -673,9 +679,13 @@ def save_detailed_report( details_dir = os.path.join(output_dir, "details") os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] + detail_parts.append( + "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" + ) + detail_parts.append("") detail_parts.append( render_table( - window_rows, + window_rows_merged, columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], right_align={"Scoring", "Fallback", "Total Running"}, ) @@ -686,6 +696,57 @@ def save_detailed_report( with open(detail_path, "w") as f: f.write("\n".join(detail_parts)) + if session_rows: + parts.append( + f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" + ) + parts.append("") + + session_parts = ["# Session 命中详情", ""] + session_parts.append("## 概览") + session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') + session_parts.append( + f'- Sessions with >1 request: **{session_summary["multi_req"]}**' + f' | single request: **{session_summary["single_req"]}**' + ) + if session_summary["multi_req"] > 0: + sticky_pct = round(session_summary["sticky_multi"] / session_summary["multi_req"] * 100, 1) + session_parts.append( + f'- Sticky (multi-request): **{session_summary["sticky_multi"]} ({sticky_pct}%)**' + f' | non-sticky: **{session_summary["non_sticky_multi"]}**' + ) + session_parts.append( + f'- Non-first request avg hit: **{session_summary["non_first_avg"]}%**' + f' (N={session_summary["non_first_total"]})' + ) + session_parts.append("") + session_parts.append("## 明细表") + session_parts.append( + render_table( + session_rows, + columns=[ + "session", + "req_count", + "first_hit", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "all_hits", + "prefill_urls", + "switch_req_pairs", + "sharp_drop_request_ids", + "sticky", + "unique_workers", + ], + right_align={"req_count", "first_hit", "avg_hit(excl_first)", "max_hit", 
"min_hit", "unique_workers"}, + ) + ) + session_parts.append("") + + session_path = os.path.join(details_dir, "session_hit_details.md") + with open(session_path, "w") as f: + f.write("\n".join(session_parts)) + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) @@ -837,6 +898,11 @@ def main(): details_abs, details_uri = _build_path_links(details_path) print(f" - 窗口明细: {details_abs}") print(f" URI: {details_uri}") + session_detail_path = os.path.join(os.path.dirname(report_path), "details", "session_hit_details.md") + if os.path.exists(session_detail_path): + session_abs, session_uri = _build_path_links(session_detail_path) + print(f" - Session 明细: {session_abs}") + print(f" URI: {session_uri}") if args.watch: print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py new file mode 100644 index 00000000000..4ff6aa666d5 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +窗口明细压缩工具:合并连续空窗口,降低 per_window_data.md 噪声。 +""" + + +def _is_blank_window_row(row): + """判断是否为空窗口(无 Prefix/Session 明细值)。""" + return ( + row.get("Prefix HR") == "-" + and row.get("Session HR") == "-" + and row.get("Scoring") in {"0", 0} + and row.get("Fallback") in {"0", 0} + ) + + +def merge_blank_window_rows(rows, min_merge_len=5): + """合并连续空窗口,避免明细表被大量 '-' 行淹没。 + + 对于连续空窗口段(长度 >= min_merge_len),压缩成 3 行: + 1) 起始时间行 + 2) 合并说明行(含窗口数量) + 3) 结束时间行 + """ + if not rows: + return rows + + merged = [] + i = 0 + while i < len(rows): + if not _is_blank_window_row(rows[i]): + merged.append(rows[i]) + i += 1 + continue + + j = i + while j < len(rows) and _is_blank_window_row(rows[j]): + j += 1 + + seg_len = j - i + if seg_len < min_merge_len: + 
merged.extend(rows[i:j]) + i = j + continue + + start_t = rows[i]["Time"] + end_t = rows[j - 1]["Time"] + merged.append( + { + "Time": start_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[i].get("Total Running", "-"), + } + ) + merged.append( + { + "Time": "|", + "Prefix HR": "-", + "Session HR": f"merged {seg_len} windows", + "Scoring": "0", + "Fallback": "0", + "Total Running": "-", + } + ) + merged.append( + { + "Time": end_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[j - 1].get("Total Running", "-"), + } + ) + i = j + + return merged From 37dbd7886b11f526ad0289e379d05f04000dfcc3 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:22:25 +0800 Subject: [PATCH 07/40] Improve stat-cache-hitrate UX and running metric normalization --- .../skills/stat-cache-hitrate/SKILL.md | 5 +- .../scripts/stat_cache_hitrate.py | 71 ++++++++++++------- .../scripts/window_utils.py | 8 ++- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index f9c5156ca69..e7925127dec 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -23,10 +23,9 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 运行脚本前,Claude 必须先向用户确认以下参数: ### 1. 
日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项 + Other 自定义输入(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项(客户端会自动提供 Other 自定义输入): - 选项 1: `logs/router.log`(默认) - 选项 2: `fd-router.log`(golang_router 根目录常用文件名) -- 选项 3: Other(用户直接输入任意路径,例如 `logs/fd-router.log`、`/home/user/logs/router.log`) **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -95,7 +94,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 - 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 - `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `details/session_hit_details.md` — 每个 session 的命中明细(`session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits`),并附带 `prefill_urls`、prefill URL 切换前后 request_id(或 req_id/trace_id)以及命中率突降 request_id +- `details/session_hit_details.md` — 每个 session 的命中明细(TSV 单行格式,便于横向滚动查看),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 7adc0b97d02..1476e61d724 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -48,6 +48,28 @@ def _build_path_links(path): file_uri = "file://" + quote(abs_path, safe="/:-._~") return abs_path, file_uri + +def _format_half_running(total_running): + """将 stats.total_running 归一化为 prefill 口径(decode+prefill 合计 / 2)。""" + normalized = total_running / 2 + if float(normalized).is_integer(): + return str(int(normalized)) + return f"{normalized:.1f}" + + +def _render_scrollable_tsv(data, columns): + """渲染单行 TSV 文本,适合在 Markdown 查看器里横向滚动。""" + if not data: + return "```tsv\n(no data)\n```" + + def _escape(v): + 
return str(v).replace("\t", " ").replace("\n", "\\n") + + lines = ["\t".join(columns)] + for row in data: + lines.append("\t".join(_escape(row.get(col, "")) for col in columns)) + return "```tsv\n" + "\n".join(lines) + "\n```" + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -461,8 +483,8 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, parts.append( render_table( window_rows[:10], - columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], - right_align={"Scoring", "Fallback", "Total Running"}, + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running (prefill≈stats/2)"], + right_align={"Scoring", "Fallback", "Total Running (prefill≈stats/2)"}, ) ) @@ -542,7 +564,7 @@ def build_per_window_rows(strategies, stats_recs): else: session_hr = "-" - running = str(d["running"]) if d["has_running"] else "-" + running = _format_half_running(d["running"]) if d["has_running"] else "-" rows.append( { "Time": short_ts, @@ -550,7 +572,7 @@ def build_per_window_rows(strategies, stats_recs): "Session HR": session_hr, "Scoring": str(d["scoring"]), "Fallback": str(d["fallback"]), - "Total Running": running, + "Total Running (prefill≈stats/2)": running, } ) return rows @@ -686,8 +708,8 @@ def save_detailed_report( detail_parts.append( render_table( window_rows_merged, - columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], - right_align={"Scoring", "Fallback", "Total Running"}, + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running (prefill≈stats/2)"], + right_align={"Scoring", "Fallback", "Total Running (prefill≈stats/2)"}, ) ) detail_parts.append("") @@ -720,27 +742,22 @@ def save_detailed_report( f' (N={session_summary["non_first_total"]})' ) session_parts.append("") - session_parts.append("## 明细表") - session_parts.append( - 
render_table( - session_rows, - columns=[ - "session", - "req_count", - "first_hit", - "avg_hit(excl_first)", - "max_hit", - "min_hit", - "all_hits", - "prefill_urls", - "switch_req_pairs", - "sharp_drop_request_ids", - "sticky", - "unique_workers", - ], - right_align={"req_count", "first_hit", "avg_hit(excl_first)", "max_hit", "min_hit", "unique_workers"}, - ) - ) + session_columns = [ + "session", + "req_count", + "first_hit", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "all_hits", + "prefill_urls", + "switch_req_pairs", + "sharp_drop_request_ids", + "sticky", + "unique_workers", + ] + session_parts.append("## 明细(单行 TSV,可横向滚动)") + session_parts.append(_render_scrollable_tsv(session_rows, session_columns)) session_parts.append("") session_path = os.path.join(details_dir, "session_hit_details.md") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py index 4ff6aa666d5..4e09710f6f9 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py @@ -3,6 +3,8 @@ 窗口明细压缩工具:合并连续空窗口,降低 per_window_data.md 噪声。 """ +RUNNING_COL = "Total Running (prefill≈stats/2)" + def _is_blank_window_row(row): """判断是否为空窗口(无 Prefix/Session 明细值)。""" @@ -52,7 +54,7 @@ def merge_blank_window_rows(rows, min_merge_len=5): "Session HR": "-", "Scoring": "0", "Fallback": "0", - "Total Running": rows[i].get("Total Running", "-"), + RUNNING_COL: rows[i].get(RUNNING_COL, "-"), } ) merged.append( @@ -62,7 +64,7 @@ def merge_blank_window_rows(rows, min_merge_len=5): "Session HR": f"merged {seg_len} windows", "Scoring": "0", "Fallback": "0", - "Total Running": "-", + RUNNING_COL: "-", } ) merged.append( @@ -72,7 +74,7 @@ def merge_blank_window_rows(rows, min_merge_len=5): "Session HR": "-", "Scoring": "0", "Fallback": "0", - "Total Running": rows[j - 
1].get("Total Running", "-"), + RUNNING_COL: rows[j - 1].get(RUNNING_COL, "-"), } ) i = j From 5791a801135df0ff05a497b4f5824b34c78bbb20 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:34:37 +0800 Subject: [PATCH 08/40] Improve skill reports: markdown session tables and timestamped output layout --- .../skills/stat-cache-hitrate/SKILL.md | 8 +- .../scripts/stat_cache_hitrate.py | 96 +++++++++++++++---- .../.claude/skills/troubleshoot/SKILL.md | 8 +- .../troubleshoot/scripts/troubleshoot.py | 26 +++-- 4 files changed, 100 insertions(+), 38 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e7925127dec..150a15a4dd7 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -50,7 +50,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 3. 
输出目录 分析结果默认保存到 `skill_output/stat-cache-hitrate//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 使用方式 @@ -92,9 +92,9 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 -- 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `details/session_hit_details.md` — 每个 session 的命中明细(TSV 单行格式,便于横向滚动查看),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` +- `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 +- `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `detail/session_hit_details.md` — 每个 session 的命中明细(Markdown 表格),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 1476e61d724..c338e46030b 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -70,6 +70,29 @@ def _escape(v): lines.append("\t".join(_escape(row.get(col, "")) for col in columns)) return "```tsv\n" + "\n".join(lines) + "\n```" + +def _render_markdown_table(data, columns, align_right=None): + """渲染 Markdown 表格,便于在终端/文档中直接阅读。""" + if not data: + return "_(no data)_" + + align_right = align_right or set() + + def _escape_md(v): + return str(v).replace("\n", "
").replace("|", "\\|") + + header = "| " + " | ".join(columns) + " |" + align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" + rows = [] + for row in data: + rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + return "\n".join([header, align] + rows) + + +def _truncate_text(v, limit=72): + s = str(v) + return s if len(s) <= limit else s[: limit - 1] + "…" + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -595,8 +618,11 @@ def save_detailed_report( 主报告包含 Per-Worker 统计和 Fallback 明细。 每窗口明细数据拆分到 details/per_window_data.md。 """ - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = os.path.join(output_dir, f"cache_hitrate_report_{timestamp}.md") + summary_dir = os.path.join(output_dir, "summary") + details_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(details_dir, exist_ok=True) + output_path = os.path.join(summary_dir, "cache_hitrate_report.md") parts = [] parts.append("# Cache Hit Rate Detailed Report") @@ -693,13 +719,11 @@ def save_detailed_report( # 主报告中添加引用 parts.append( f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" - " [details/per_window_data.md](details/per_window_data.md)" + " [../detail/per_window_data.md](../detail/per_window_data.md)" ) parts.append("") # 写入 details 子目录 - details_dir = os.path.join(output_dir, "details") - os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] detail_parts.append( "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" @@ -719,9 +743,7 @@ def save_detailed_report( f.write("\n".join(detail_parts)) if session_rows: - parts.append( - f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" - ) + parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): 
[../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") session_parts = ["# Session 命中详情", ""] @@ -742,6 +764,42 @@ def save_detailed_report( f' (N={session_summary["non_first_total"]})' ) session_parts.append("") + focus_columns = [ + "session", + "req_count", + "sticky", + "unique_workers", + "avg_hit(excl_first)", + "min_hit", + "switch_req_pairs", + "sharp_drop_request_ids", + ] + session_parts.append("## 优先排查 Session(Top 20)") + prioritized_rows = sorted( + session_rows, + key=lambda r: ( + 0 if r.get("sticky") == "no" else 1, + int(str(r.get("min_hit", "0")).rstrip("%") or 0), + -int(r.get("req_count", 0)), + ), + )[:20] + compact_rows = [] + for r in prioritized_rows: + compact_rows.append( + { + "session": r["session"], + "req_count": r["req_count"], + "sticky": r["sticky"], + "unique_workers": r["unique_workers"], + "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "min_hit": r["min_hit"], + "switch_req_pairs": _truncate_text(r["switch_req_pairs"]), + "sharp_drop_request_ids": _truncate_text(r["sharp_drop_request_ids"]), + } + ) + session_parts.append(_render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"})) + session_parts.append("") + session_columns = [ "session", "req_count", @@ -756,15 +814,20 @@ def save_detailed_report( "sticky", "unique_workers", ] - session_parts.append("## 明细(单行 TSV,可横向滚动)") - session_parts.append(_render_scrollable_tsv(session_rows, session_columns)) + session_parts.append("## 全量明细(Markdown 表格)") + session_parts.append( + _render_markdown_table( + session_rows, + session_columns, + align_right={"req_count", "unique_workers"}, + ) + ) session_parts.append("") session_path = os.path.join(details_dir, "session_hit_details.md") with open(session_path, "w") as f: f.write("\n".join(session_parts)) - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) @@ -887,13 +950,14 @@ def main(): ) # 
导出详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") + output_dir = os.path.join(output_base, run_timestamp) report_path = save_detailed_report( args.log_file, strategy_recs, @@ -910,12 +974,12 @@ def main(): report_abs, report_uri = _build_path_links(report_path) print(f" - 报告文件: {report_abs}") print(f" URI: {report_uri}") - details_path = os.path.join(os.path.dirname(report_path), "details", "per_window_data.md") + details_path = os.path.join(output_dir, "detail", "per_window_data.md") if os.path.exists(details_path): details_abs, details_uri = _build_path_links(details_path) print(f" - 窗口明细: {details_abs}") print(f" URI: {details_uri}") - session_detail_path = os.path.join(os.path.dirname(report_path), "details", "session_hit_details.md") + session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") if os.path.exists(session_detail_path): session_abs, session_uri = _build_path_links(session_detail_path) print(f" - Session 明细: {session_abs}") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index ab0c3ce7219..43ee91a46b1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -64,7 +64,7 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 ### 4. 
输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 用法 @@ -107,9 +107,9 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro ## 输出 - **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 -- **文件**:详细报告导出到 `skill_output/troubleshoot//troubleshoot_report_.md` - - 逐分钟事件详情拆分到 `details/health_events.md` - - 请求追踪事件链拆分到 `details/trace_.md` +- **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` + - 逐分钟事件详情拆分到 `detail/health_events.md` + - 请求追踪事件链拆分到 `detail/trace_.md` - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 5096c5b294a..30b9df0f443 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -190,30 +190,27 @@ def save_detailed_report(report_text, output_dir, details=None): output_dir: 输出目录 details: 详情数据 dict(来自 format_full_report) """ - os.makedirs(output_dir, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"troubleshoot_report_{timestamp}.md" - filepath = os.path.join(output_dir, filename) + summary_dir = os.path.join(output_dir, "summary") + detail_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(detail_dir, exist_ok=True) + filepath = os.path.join(summary_dir, "troubleshoot_report.md") with open(filepath, "w", encoding="utf-8") as f: f.write("# Router Troubleshooting Report\n") f.write(f'> Generated at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n') f.write(report_text) - # 保存详情到 details/ 子目录 + # 保存详情到 detail/ 子目录 if details: - details_dir = os.path.join(output_dir, "details") - if details.get("health_events"): - 
os.makedirs(details_dir, exist_ok=True) - health_path = os.path.join(details_dir, "health_events.md") + health_path = os.path.join(detail_dir, "health_events.md") with open(health_path, "w", encoding="utf-8") as f: f.write(details["health_events"]) for trace_id, trace_text in details.get("trace_files", {}).items(): - os.makedirs(details_dir, exist_ok=True) safe_id = trace_id.replace("/", "_") - trace_path = os.path.join(details_dir, f"trace_{safe_id}.md") + trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") with open(trace_path, "w", encoding="utf-8") as f: f.write(trace_text) @@ -327,13 +324,14 @@ def main(): print(report) # 保存详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "troubleshoot", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "troubleshoot") + output_dir = os.path.join(output_base, run_timestamp) filepath = save_detailed_report(report, output_dir, details=details) print(f"\n详细报告已保存到: {filepath}", file=sys.stderr) From fe6403165257a1c7443bb19aedfa863ffd1902de Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:50:01 +0800 Subject: [PATCH 09/40] Refine session detail output: indexed IDs, trace fallback, and switch links --- .../skills/stat-cache-hitrate/SKILL.md | 8 +- .../scripts/session_analysis.py | 13 +- .../scripts/stat_cache_hitrate.py | 147 ++++++++++++++---- .../.claude/skills/troubleshoot/SKILL.md | 8 +- .../troubleshoot/scripts/troubleshoot.py | 26 ++-- 5 files changed, 147 insertions(+), 55 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e7925127dec..e07281576a6 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -50,7 +50,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 3. 输出目录 分析结果默认保存到 `skill_output/stat-cache-hitrate//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 使用方式 @@ -92,9 +92,9 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 -- 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `details/session_hit_details.md` — 每个 session 的命中明细(TSV 单行格式,便于横向滚动查看),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` +- `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 +- `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(可跳转)」。 ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py index 355ba8fc947..f7b4caed542 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -7,7 +7,7 @@ def compute_session_details(strategies, strip_scheme): - """按 session 统计命中详情。""" + """按 session_id(优先)或 trace_id(兜底)统计命中详情。""" def _req_id_from_tags(tags, 
fallback): return tags.get("request_id") or tags.get("req_id") or tags.get("trace_id") or fallback @@ -18,12 +18,14 @@ def _req_id_from_tags(tags, fallback): continue tags = rec.get("tags", {}) or {} session_id = tags.get("session_id") - if not session_id: + trace_id = tags.get("trace_id") + identity = session_id or trace_id + if not identity: continue - session_records[session_id].append((idx, rec)) + session_records[identity].append((idx, rec)) rows = [] - for session_id, items in session_records.items(): + for identity, items in session_records.items(): items.sort(key=lambda x: (x[1].get("ts_ms", ""), x[1].get("ts", ""), x[0])) recs = [r for _, r in items] hits = [int(r.get("selected_hitRatio", 0)) for r in recs] @@ -62,7 +64,8 @@ def _req_id_from_tags(tags, fallback): rows.append( { - "session": session_id, + "session": identity, + "id_type": "session_id" if recs[0].get("tags", {}).get("session_id") else "trace_id", "req_count": len(hits), "first_hit": f"{hits[0]}%", "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 1476e61d724..b5adcb9bd5f 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -70,6 +70,33 @@ def _escape(v): lines.append("\t".join(_escape(row.get(col, "")) for col in columns)) return "```tsv\n" + "\n".join(lines) + "\n```" + +def _render_markdown_table(data, columns, align_right=None): + """渲染 Markdown 表格,便于在终端/文档中直接阅读。""" + if not data: + return "_(no data)_" + + align_right = align_right or set() + + def _escape_md(v): + return str(v).replace("\n", "
").replace("|", "\\|") + + header = "| " + " | ".join(columns) + " |" + align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" + rows = [] + for row in data: + rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + return "\n".join([header, align] + rows) + + +def _truncate_text(v, limit=72): + s = str(v) + return s if len(s) <= limit else s[: limit - 1] + "…" + + +def _seq_label(n): + return f"S{n:03d}" + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -234,7 +261,6 @@ def compute_session_hitrate(stats_recs, inference_count): total_total = sum(r.get("total", 0) for r in stats_recs) session_hr = round(total_hits / total_total * 100, 1) if total_total else 0 - coverage = round(total_total / inference_count * 100, 1) if inference_count else 0 # 趋势:每个窗口的 hits/total trend = time_bucket(stats_recs, "auto", [("hits", "sum"), ("total", "sum")]) @@ -247,7 +273,6 @@ def compute_session_hitrate(stats_recs, inference_count): "rate": session_hr, "hits": total_hits, "total": total_total, - "coverage": coverage, "inference_count": inference_count, "trend": trend, } @@ -430,8 +455,6 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, # 2. Session Hit Rate parts.append("### 2. 
Session Hit Rate (请求级路由粘性)") parts.append(f' 累计: {session_hr["rate"]}% (hits={session_hr["hits"]} / total={session_hr["total"]})') - parts.append(f' 覆盖率: {session_hr["coverage"]}% 的推理请求带 session_id') - trend_str = _quartile_trend(session_hr["trend"], "value") if trend_str: parts.append(f" 趋势: {trend_str}") @@ -498,10 +521,7 @@ def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): parts.append(f"**File**: {filepath} | **tail {line_count} lines**") parts.append("") parts.append(f' Prefix Hit Ratio: {prefix_hr["mean"]}% (avg) | Cold start: {prefix_hr["cold_start_rate"]}%') - parts.append( - f' Session Hit Rate: {session_hr["rate"]}% (hits={session_hr["hits"]}/total={session_hr["total"]})' - f' | Coverage: {session_hr["coverage"]}%' - ) + parts.append(f' Session Hit Rate: {session_hr["rate"]}% (hits={session_hr["hits"]}/total={session_hr["total"]})') parts.append( f' Strategy: scoring {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' f' | fallback {scheduling["fallback_count"]}' @@ -595,8 +615,11 @@ def save_detailed_report( 主报告包含 Per-Worker 统计和 Fallback 明细。 每窗口明细数据拆分到 details/per_window_data.md。 """ - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = os.path.join(output_dir, f"cache_hitrate_report_{timestamp}.md") + summary_dir = os.path.join(output_dir, "summary") + details_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(details_dir, exist_ok=True) + output_path = os.path.join(summary_dir, "cache_hitrate_report.md") parts = [] parts.append("# Cache Hit Rate Detailed Report") @@ -639,7 +662,6 @@ def save_detailed_report( parts.append("### Session Hit Rate") parts.append(f'- 累计: **{session_hr["rate"]}%** (hits={session_hr["hits"]}/total={session_hr["total"]})') - parts.append(f'- 覆盖率: **{session_hr["coverage"]}%**') trend_str = _quartile_trend(session_hr["trend"], "value") if trend_str: parts.append(f"- 趋势: {trend_str}") @@ -693,13 +715,11 @@ def 
save_detailed_report( # 主报告中添加引用 parts.append( f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" - " [details/per_window_data.md](details/per_window_data.md)" + " [../detail/per_window_data.md](../detail/per_window_data.md)" ) parts.append("") # 写入 details 子目录 - details_dir = os.path.join(output_dir, "details") - os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] detail_parts.append( "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" @@ -719,9 +739,7 @@ def save_detailed_report( f.write("\n".join(detail_parts)) if session_rows: - parts.append( - f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" - ) + parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") session_parts = ["# Session 命中详情", ""] @@ -742,29 +760,101 @@ def save_detailed_report( f' (N={session_summary["non_first_total"]})' ) session_parts.append("") + focus_columns = [ + "id", + "req_count", + "id_type", + "sticky", + "unique_workers", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "switch_reqids", + ] + session_parts.append("## 优先排查 Session(Top 20)") + prioritized_rows = sorted( + session_rows, + key=lambda r: ( + 0 if r.get("sticky") == "no" else 1, + int(str(r.get("min_hit", "0")).rstrip("%") or 0), + -int(r.get("req_count", 0)), + ), + )[:20] + compact_rows = [] + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + + for r in prioritized_rows: + sid = seq_map.get(r["session"], "-") + compact_rows.append( + { + "id": sid, + "req_count": r["req_count"], + "id_type": r.get("id_type", "session_id"), + "sticky": r["sticky"], + "unique_workers": r["unique_workers"], + "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "max_hit": 
r["max_hit"], + "min_hit": r["min_hit"], + "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", + } + ) + session_parts.append( + _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"}) + ) + session_parts.append("") + session_columns = [ - "session", + "id", "req_count", + "id_type", "first_hit", "avg_hit(excl_first)", "max_hit", "min_hit", "all_hits", "prefill_urls", - "switch_req_pairs", - "sharp_drop_request_ids", "sticky", "unique_workers", ] - session_parts.append("## 明细(单行 TSV,可横向滚动)") - session_parts.append(_render_scrollable_tsv(session_rows, session_columns)) + session_parts.append("## 全量明细(Markdown 表格)") + session_parts.append( + _render_markdown_table( + all_rows_with_seq, + session_columns, + align_right={"req_count", "unique_workers"}, + ) + ) + session_parts.append("") + + session_parts.append("## 序号与会话ID映射") + map_rows = [ + { + "id": r["id"], + "id_type": r.get("id_type", "session_id"), + "session_or_trace_id": r["session"], + } + for r in all_rows_with_seq + ] + session_parts.append(_render_markdown_table(map_rows, ["id", "id_type", "session_or_trace_id"])) session_parts.append("") + session_parts.append("## 切换 reqid 明细(可跳转)") + for r in all_rows_with_seq: + session_parts.append(f'### switch-{r["id"].lower()}') + session_parts.append(f'- ID: **{r["id"]}**') + session_parts.append(f'- 会话标识: `{r["session"]}` ({r.get("id_type", "session_id")})') + session_parts.append(f'- switch_req_pairs: {r["switch_req_pairs"]}') + session_parts.append(f'- sharp_drop_request_ids: {r["sharp_drop_request_ids"]}') + session_parts.append("") + session_path = os.path.join(details_dir, "session_hit_details.md") with open(session_path, "w") as f: f.write("\n".join(session_parts)) - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) @@ -887,13 +977,14 @@ def main(): ) # 导出详细报告 + run_timestamp = 
datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") + output_dir = os.path.join(output_base, run_timestamp) report_path = save_detailed_report( args.log_file, strategy_recs, @@ -910,12 +1001,12 @@ def main(): report_abs, report_uri = _build_path_links(report_path) print(f" - 报告文件: {report_abs}") print(f" URI: {report_uri}") - details_path = os.path.join(os.path.dirname(report_path), "details", "per_window_data.md") + details_path = os.path.join(output_dir, "detail", "per_window_data.md") if os.path.exists(details_path): details_abs, details_uri = _build_path_links(details_path) print(f" - 窗口明细: {details_abs}") print(f" URI: {details_uri}") - session_detail_path = os.path.join(os.path.dirname(report_path), "details", "session_hit_details.md") + session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") if os.path.exists(session_detail_path): session_abs, session_uri = _build_path_links(session_detail_path) print(f" - Session 明细: {session_abs}") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index ab0c3ce7219..43ee91a46b1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -64,7 +64,7 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 ### 4. 
输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 用法 @@ -107,9 +107,9 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro ## 输出 - **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 -- **文件**:详细报告导出到 `skill_output/troubleshoot//troubleshoot_report_.md` - - 逐分钟事件详情拆分到 `details/health_events.md` - - 请求追踪事件链拆分到 `details/trace_.md` +- **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` + - 逐分钟事件详情拆分到 `detail/health_events.md` + - 请求追踪事件链拆分到 `detail/trace_.md` - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 5096c5b294a..30b9df0f443 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -190,30 +190,27 @@ def save_detailed_report(report_text, output_dir, details=None): output_dir: 输出目录 details: 详情数据 dict(来自 format_full_report) """ - os.makedirs(output_dir, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"troubleshoot_report_{timestamp}.md" - filepath = os.path.join(output_dir, filename) + summary_dir = os.path.join(output_dir, "summary") + detail_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(detail_dir, exist_ok=True) + filepath = os.path.join(summary_dir, "troubleshoot_report.md") with open(filepath, "w", encoding="utf-8") as f: f.write("# Router Troubleshooting Report\n") f.write(f'> Generated at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n') f.write(report_text) - # 保存详情到 details/ 子目录 + # 保存详情到 detail/ 子目录 if details: - details_dir = os.path.join(output_dir, "details") - if details.get("health_events"): - 
os.makedirs(details_dir, exist_ok=True) - health_path = os.path.join(details_dir, "health_events.md") + health_path = os.path.join(detail_dir, "health_events.md") with open(health_path, "w", encoding="utf-8") as f: f.write(details["health_events"]) for trace_id, trace_text in details.get("trace_files", {}).items(): - os.makedirs(details_dir, exist_ok=True) safe_id = trace_id.replace("/", "_") - trace_path = os.path.join(details_dir, f"trace_{safe_id}.md") + trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") with open(trace_path, "w", encoding="utf-8") as f: f.write(trace_text) @@ -327,13 +324,14 @@ def main(): print(report) # 保存详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "troubleshoot", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "troubleshoot") + output_dir = os.path.join(output_base, run_timestamp) filepath = save_detailed_report(report, output_dir, details=details) print(f"\n详细报告已保存到: {filepath}", file=sys.stderr) From 81be2a24d2db96470480475457f01ad60581df63 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 22:50:29 +0800 Subject: [PATCH 10/40] Improve session detail markdown id_type summary and table alignment --- .../scripts/stat_cache_hitrate.py | 86 +++++++++++++++++-- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index b5adcb9bd5f..bd055393637 100644 --- 
a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -81,11 +81,34 @@ def _render_markdown_table(data, columns, align_right=None): def _escape_md(v): return str(v).replace("\n", "
").replace("|", "\\|") - header = "| " + " | ".join(columns) + " |" - align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" - rows = [] + matrix = [] for row in data: - rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + matrix.append([_escape_md(row.get(c, "")) for c in columns]) + + widths = [] + for i, col in enumerate(columns): + max_cell = max((len(r[i]) for r in matrix), default=0) + widths.append(max(len(col), max_cell)) + + def _format_cell(text, width, right=False): + return text.rjust(width) if right else text.ljust(width) + + header_cells = [_format_cell(c, widths[i]) for i, c in enumerate(columns)] + header = "| " + " | ".join(header_cells) + " |" + + align_cells = [] + for i, c in enumerate(columns): + w = max(widths[i], 3) + if c in align_right: + align_cells.append("-" * (w - 1) + ":") + else: + align_cells.append(":" + "-" * (w - 1)) + align = "| " + " | ".join(align_cells) + " |" + + rows = [] + for row_cells in matrix: + padded = [_format_cell(cell, widths[i], right=(columns[i] in align_right)) for i, cell in enumerate(row_cells)] + rows.append("| " + " | ".join(padded) + " |") return "\n".join([header, align] + rows) @@ -97,6 +120,36 @@ def _truncate_text(v, limit=72): def _seq_label(n): return f"S{n:03d}" + +def _extract_seq_num(seq_id): + return int(str(seq_id).lstrip("S") or 0) + + +def _summarize_id_type_ranges(rows_with_seq): + """基于序号连续区间汇总 id_type,便于在报告开头快速识别口径。""" + if not rows_with_seq: + return [] + + ranges = [] + current_type = rows_with_seq[0].get("id_type", "session_id") + start_id = rows_with_seq[0]["id"] + end_id = start_id + + for row in rows_with_seq[1:]: + row_type = row.get("id_type", "session_id") + row_id = row["id"] + if row_type == current_type and _extract_seq_num(row_id) == _extract_seq_num(end_id) + 1: + end_id = row_id + continue + + ranges.append((start_id, end_id, current_type)) + current_type = row_type + start_id = row_id + end_id = row_id + + 
ranges.append((start_id, end_id, current_type)) + return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -742,7 +795,27 @@ def save_detailed_report( parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + id_type_ranges = _summarize_id_type_ranges(all_rows_with_seq) + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + session_parts = ["# Session 命中详情", ""] + session_parts.append("## id_type 摘要") + if len(id_type_ranges) == 1: + start_id, end_id, id_type = id_type_ranges[0] + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}`") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}`") + else: + for start_id, end_id, id_type in id_type_ranges: + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}`") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}`") + session_parts.append("") session_parts.append("## 概览") session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') session_parts.append( @@ -781,11 +854,6 @@ def save_detailed_report( ), )[:20] compact_rows = [] - all_rows_with_seq = [] - for i, r in enumerate(session_rows, start=1): - all_rows_with_seq.append({**r, "id": _seq_label(i)}) - - seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} for r in prioritized_rows: sid = seq_map.get(r["session"], "-") From b8f12c42917150ba5ebb48f5b0e067534f42698e Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 23:13:06 +0800 Subject: [PATCH 11/40] Simplify time-range prompt flow in stat-cache-hitrate skill --- .../skills/stat-cache-hitrate/SKILL.md | 19 +-- .../scripts/session_analysis.py | 
3 + .../scripts/stat_cache_hitrate.py | 133 +++++++++++++++--- 3 files changed, 124 insertions(+), 31 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e07281576a6..251cbb04c2a 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -4,7 +4,7 @@ description: > 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 - 持续监控模式。 + 持续监控模式、指定时间段统计(--start/--end)。 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 @@ -35,12 +35,15 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 如果用户直接确认或未指定路径,使用默认值 `logs/router.log`。 ### 2. 分析模式 -向用户询问分析模式: -> "请选择分析模式: -> 1. **全量统计**(默认)— 扫描完整日志 -> 2. **快速查看尾部** — 只看最近的数据(可指定行数如 2000 或时间如 30m) -> 3. **持续监控** — 全量分析后提示监控命令 -> 4. 
**指定时间段** — 分析特定时间范围(如 `--start "16:00" --end "17:00"`)" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): +- 选项 1: `全量统计(默认)` — 扫描完整日志 +- 选项 2: `快速查看尾部` — 只看最近的数据(可指定行数如 2000 或时间如 30m) +- 选项 3: `持续监控` — 全量分析后提示监控命令 +- 选项 4: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) + +若用户选择“指定时间段”,直接让用户填写: +- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +- 然后映射为 `--start/--end` 参数执行。 如果用户未选择,默认使用全量统计。 @@ -94,7 +97,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 - `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 - `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(可跳转)」。 +- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg-hit(=去首请求平均命中率) / max_hit / min_hit / all_hits / purl_cnt / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(含 session 时间段,可跳转)」。 ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py index f7b4caed542..7de5b7f6042 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -66,6 +66,8 @@ def _req_id_from_tags(tags, fallback): { "session": identity, "id_type": "session_id" if recs[0].get("tags", {}).get("session_id") else "trace_id", + "first_ts": recs[0].get("ts", "-"), + "last_ts": recs[-1].get("ts", "-"), "req_count": len(hits), "first_hit": f"{hits[0]}%", "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", @@ -74,6 +76,7 @@ def _req_id_from_tags(tags, fallback): "all_hits": ", 
".join(f"{h}%" for h in hits), "sticky": "yes" if len(workers) <= 1 else "no", "unique_workers": len(workers), + "prefill_url_count": len(prefill_urls), "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index b5adcb9bd5f..fb6b45b56fa 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -81,11 +81,34 @@ def _render_markdown_table(data, columns, align_right=None): def _escape_md(v): return str(v).replace("\n", "
").replace("|", "\\|") - header = "| " + " | ".join(columns) + " |" - align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" - rows = [] + matrix = [] for row in data: - rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + matrix.append([_escape_md(row.get(c, "")) for c in columns]) + + widths = [] + for i, col in enumerate(columns): + max_cell = max((len(r[i]) for r in matrix), default=0) + widths.append(max(len(col), max_cell)) + + def _format_cell(text, width, right=False): + return text.rjust(width) if right else text.ljust(width) + + header_cells = [_format_cell(c, widths[i]) for i, c in enumerate(columns)] + header = "| " + " | ".join(header_cells) + " |" + + align_cells = [] + for i, c in enumerate(columns): + w = max(widths[i], 3) + if c in align_right: + align_cells.append("-" * (w - 1) + ":") + else: + align_cells.append(":" + "-" * (w - 1)) + align = "| " + " | ".join(align_cells) + " |" + + rows = [] + for row_cells in matrix: + padded = [_format_cell(cell, widths[i], right=(columns[i] in align_right)) for i, cell in enumerate(row_cells)] + rows.append("| " + " | ".join(padded) + " |") return "\n".join([header, align] + rows) @@ -97,6 +120,41 @@ def _truncate_text(v, limit=72): def _seq_label(n): return f"S{n:03d}" + +def _extract_seq_num(seq_id): + return int(str(seq_id).lstrip("S") or 0) + + +def _summarize_id_type_ranges(rows_with_seq): + """基于序号连续区间汇总 id_type,便于在报告开头快速识别口径。""" + if not rows_with_seq: + return [] + + ranges = [] + current_type = rows_with_seq[0].get("id_type", "session_id") + start_id = rows_with_seq[0]["id"] + end_id = start_id + start_ts = rows_with_seq[0].get("first_ts", "-") + end_ts = rows_with_seq[0].get("last_ts", "-") + + for row in rows_with_seq[1:]: + row_type = row.get("id_type", "session_id") + row_id = row["id"] + if row_type == current_type and _extract_seq_num(row_id) == _extract_seq_num(end_id) + 1: + end_id = row_id + end_ts = row.get("last_ts", 
end_ts) + continue + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + current_type = row_type + start_id = row_id + end_id = row_id + start_ts = row.get("first_ts", "-") + end_ts = row.get("last_ts", "-") + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -742,8 +800,36 @@ def save_detailed_report( parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + id_type_ranges = _summarize_id_type_ranges(all_rows_with_seq) + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + ts_starts = [r.get("first_ts", "-") for r in all_rows_with_seq if r.get("first_ts", "-") != "-"] + ts_ends = [r.get("last_ts", "-") for r in all_rows_with_seq if r.get("last_ts", "-") != "-"] + session_parts = ["# Session 命中详情", ""] + overall_start_ts = min(ts_starts) if ts_starts else "-" + overall_end_ts = max(ts_ends) if ts_ends else "-" + session_parts.append("## 时间范围") + session_parts.append(f"- 分析覆盖时间段: `{overall_start_ts} ~ {overall_end_ts}`") + session_parts.append("") + session_parts.append("## id_type 摘要") + if len(id_type_ranges) == 1: + start_id, end_id, id_type, range_start_ts, range_end_ts = id_type_ranges[0] + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + for start_id, end_id, id_type, range_start_ts, range_end_ts in id_type_ranges: + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + 
session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + session_parts.append("") session_parts.append("## 概览") + session_parts.append("- 字段说明:`avg-hit` = `avg_hit(excl_first)`(去除首请求后的平均命中率)") session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') session_parts.append( f'- Sessions with >1 request: **{session_summary["multi_req"]}**' @@ -763,10 +849,9 @@ def save_detailed_report( focus_columns = [ "id", "req_count", - "id_type", "sticky", - "unique_workers", - "avg_hit(excl_first)", + "purl_cnt", + "avg-hit", "max_hit", "min_hit", "switch_reqids", @@ -781,11 +866,6 @@ def save_detailed_report( ), )[:20] compact_rows = [] - all_rows_with_seq = [] - for i, r in enumerate(session_rows, start=1): - all_rows_with_seq.append({**r, "id": _seq_label(i)}) - - seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} for r in prioritized_rows: sid = seq_map.get(r["session"], "-") @@ -793,39 +873,46 @@ def save_detailed_report( { "id": sid, "req_count": r["req_count"], - "id_type": r.get("id_type", "session_id"), "sticky": r["sticky"], - "unique_workers": r["unique_workers"], - "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], "max_hit": r["max_hit"], "min_hit": r["min_hit"], "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", } ) session_parts.append( - _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"}) + _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "purl_cnt"}) ) session_parts.append("") session_columns = [ "id", "req_count", - "id_type", "first_hit", - "avg_hit(excl_first)", + "avg-hit", "max_hit", "min_hit", "all_hits", + "purl_cnt", "prefill_urls", "sticky", - "unique_workers", ] + all_rows_for_table = [] + for r in all_rows_with_seq: + all_rows_for_table.append( + { + **r, + "avg-hit": 
r["avg_hit(excl_first)"], + "purl_cnt": r.get("prefill_url_count", 0), + } + ) session_parts.append("## 全量明细(Markdown 表格)") session_parts.append( _render_markdown_table( - all_rows_with_seq, + all_rows_for_table, session_columns, - align_right={"req_count", "unique_workers"}, + align_right={"req_count", "purl_cnt"}, ) ) session_parts.append("") @@ -834,12 +921,11 @@ def save_detailed_report( map_rows = [ { "id": r["id"], - "id_type": r.get("id_type", "session_id"), "session_or_trace_id": r["session"], } for r in all_rows_with_seq ] - session_parts.append(_render_markdown_table(map_rows, ["id", "id_type", "session_or_trace_id"])) + session_parts.append(_render_markdown_table(map_rows, ["id", "session_or_trace_id"])) session_parts.append("") session_parts.append("## 切换 reqid 明细(可跳转)") @@ -847,6 +933,7 @@ def save_detailed_report( session_parts.append(f'### switch-{r["id"].lower()}') session_parts.append(f'- ID: **{r["id"]}**') session_parts.append(f'- 会话标识: `{r["session"]}` ({r.get("id_type", "session_id")})') + session_parts.append(f'- 时间段: `{r.get("first_ts", "-")} ~ {r.get("last_ts", "-")}`') session_parts.append(f'- switch_req_pairs: {r["switch_req_pairs"]}') session_parts.append(f'- sharp_drop_request_ids: {r["sharp_drop_request_ids"]}') session_parts.append("") From 8564f9cd62ed26f32559d115483a3ca1796d9049 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 23:19:42 +0800 Subject: [PATCH 12/40] Unify full session detail table columns with Top20 --- .../skills/stat-cache-hitrate/SKILL.md | 19 ++- .../scripts/session_analysis.py | 3 + .../scripts/stat_cache_hitrate.py | 148 ++++++++++++++---- 3 files changed, 129 insertions(+), 41 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e07281576a6..251cbb04c2a 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -4,7 +4,7 @@ description: > 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 - 持续监控模式。 + 持续监控模式、指定时间段统计(--start/--end)。 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 @@ -35,12 +35,15 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 如果用户直接确认或未指定路径,使用默认值 `logs/router.log`。 ### 2. 分析模式 -向用户询问分析模式: -> "请选择分析模式: -> 1. **全量统计**(默认)— 扫描完整日志 -> 2. **快速查看尾部** — 只看最近的数据(可指定行数如 2000 或时间如 30m) -> 3. **持续监控** — 全量分析后提示监控命令 -> 4. **指定时间段** — 分析特定时间范围(如 `--start "16:00" --end "17:00"`)" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): +- 选项 1: `全量统计(默认)` — 扫描完整日志 +- 选项 2: `快速查看尾部` — 只看最近的数据(可指定行数如 2000 或时间如 30m) +- 选项 3: `持续监控` — 全量分析后提示监控命令 +- 选项 4: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) + +若用户选择“指定时间段”,直接让用户填写: +- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +- 然后映射为 `--start/--end` 参数执行。 如果用户未选择,默认使用全量统计。 @@ -94,7 +97,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 - `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 - `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(可跳转)」。 +- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg-hit(=去首请求平均命中率) / max_hit / min_hit / all_hits / purl_cnt / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(含 session 时间段,可跳转)」。 ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py index f7b4caed542..7de5b7f6042 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -66,6 +66,8 @@ def _req_id_from_tags(tags, fallback): { "session": identity, "id_type": "session_id" if recs[0].get("tags", {}).get("session_id") else "trace_id", + "first_ts": recs[0].get("ts", "-"), + "last_ts": recs[-1].get("ts", "-"), "req_count": len(hits), "first_hit": f"{hits[0]}%", "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", @@ -74,6 +76,7 @@ def _req_id_from_tags(tags, fallback): "all_hits": ", ".join(f"{h}%" for h in hits), "sticky": "yes" if len(workers) <= 1 else "no", "unique_workers": len(workers), + "prefill_url_count": len(prefill_urls), "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index b5adcb9bd5f..bd85730b7d1 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -81,11 +81,34 @@ def _render_markdown_table(data, columns, align_right=None): def _escape_md(v): return str(v).replace("\n", "
").replace("|", "\\|") - header = "| " + " | ".join(columns) + " |" - align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" - rows = [] + matrix = [] for row in data: - rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + matrix.append([_escape_md(row.get(c, "")) for c in columns]) + + widths = [] + for i, col in enumerate(columns): + max_cell = max((len(r[i]) for r in matrix), default=0) + widths.append(max(len(col), max_cell)) + + def _format_cell(text, width, right=False): + return text.rjust(width) if right else text.ljust(width) + + header_cells = [_format_cell(c, widths[i]) for i, c in enumerate(columns)] + header = "| " + " | ".join(header_cells) + " |" + + align_cells = [] + for i, c in enumerate(columns): + w = max(widths[i], 3) + if c in align_right: + align_cells.append("-" * (w - 1) + ":") + else: + align_cells.append(":" + "-" * (w - 1)) + align = "| " + " | ".join(align_cells) + " |" + + rows = [] + for row_cells in matrix: + padded = [_format_cell(cell, widths[i], right=(columns[i] in align_right)) for i, cell in enumerate(row_cells)] + rows.append("| " + " | ".join(padded) + " |") return "\n".join([header, align] + rows) @@ -97,6 +120,41 @@ def _truncate_text(v, limit=72): def _seq_label(n): return f"S{n:03d}" + +def _extract_seq_num(seq_id): + return int(str(seq_id).lstrip("S") or 0) + + +def _summarize_id_type_ranges(rows_with_seq): + """基于序号连续区间汇总 id_type,便于在报告开头快速识别口径。""" + if not rows_with_seq: + return [] + + ranges = [] + current_type = rows_with_seq[0].get("id_type", "session_id") + start_id = rows_with_seq[0]["id"] + end_id = start_id + start_ts = rows_with_seq[0].get("first_ts", "-") + end_ts = rows_with_seq[0].get("last_ts", "-") + + for row in rows_with_seq[1:]: + row_type = row.get("id_type", "session_id") + row_id = row["id"] + if row_type == current_type and _extract_seq_num(row_id) == _extract_seq_num(end_id) + 1: + end_id = row_id + end_ts = row.get("last_ts", 
end_ts) + continue + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + current_type = row_type + start_id = row_id + end_id = row_id + start_ts = row.get("first_ts", "-") + end_ts = row.get("last_ts", "-") + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -742,8 +800,36 @@ def save_detailed_report( parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + id_type_ranges = _summarize_id_type_ranges(all_rows_with_seq) + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + ts_starts = [r.get("first_ts", "-") for r in all_rows_with_seq if r.get("first_ts", "-") != "-"] + ts_ends = [r.get("last_ts", "-") for r in all_rows_with_seq if r.get("last_ts", "-") != "-"] + session_parts = ["# Session 命中详情", ""] + overall_start_ts = min(ts_starts) if ts_starts else "-" + overall_end_ts = max(ts_ends) if ts_ends else "-" + session_parts.append("## 时间范围") + session_parts.append(f"- 分析覆盖时间段: `{overall_start_ts} ~ {overall_end_ts}`") + session_parts.append("") + session_parts.append("## id_type 摘要") + if len(id_type_ranges) == 1: + start_id, end_id, id_type, range_start_ts, range_end_ts = id_type_ranges[0] + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + for start_id, end_id, id_type, range_start_ts, range_end_ts in id_type_ranges: + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + 
session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + session_parts.append("") session_parts.append("## 概览") + session_parts.append("- 字段说明:`avg-hit` = `avg_hit(excl_first)`(去除首请求后的平均命中率)") session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') session_parts.append( f'- Sessions with >1 request: **{session_summary["multi_req"]}**' @@ -763,10 +849,9 @@ def save_detailed_report( focus_columns = [ "id", "req_count", - "id_type", "sticky", - "unique_workers", - "avg_hit(excl_first)", + "purl_cnt", + "avg-hit", "max_hit", "min_hit", "switch_reqids", @@ -781,11 +866,6 @@ def save_detailed_report( ), )[:20] compact_rows = [] - all_rows_with_seq = [] - for i, r in enumerate(session_rows, start=1): - all_rows_with_seq.append({**r, "id": _seq_label(i)}) - - seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} for r in prioritized_rows: sid = seq_map.get(r["session"], "-") @@ -793,39 +873,41 @@ def save_detailed_report( { "id": sid, "req_count": r["req_count"], - "id_type": r.get("id_type", "session_id"), "sticky": r["sticky"], - "unique_workers": r["unique_workers"], - "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], "max_hit": r["max_hit"], "min_hit": r["min_hit"], "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", } ) session_parts.append( - _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"}) + _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "purl_cnt"}) ) session_parts.append("") - session_columns = [ - "id", - "req_count", - "id_type", - "first_hit", - "avg_hit(excl_first)", - "max_hit", - "min_hit", - "all_hits", - "prefill_urls", - "sticky", - "unique_workers", - ] + session_columns = focus_columns + all_rows_for_table = [] + for r in all_rows_with_seq: + sid = r["id"] + 
all_rows_for_table.append( + { + "id": sid, + "req_count": r["req_count"], + "sticky": r["sticky"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], + "max_hit": r["max_hit"], + "min_hit": r["min_hit"], + "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", + } + ) session_parts.append("## 全量明细(Markdown 表格)") session_parts.append( _render_markdown_table( - all_rows_with_seq, + all_rows_for_table, session_columns, - align_right={"req_count", "unique_workers"}, + align_right={"req_count", "purl_cnt"}, ) ) session_parts.append("") @@ -834,12 +916,11 @@ def save_detailed_report( map_rows = [ { "id": r["id"], - "id_type": r.get("id_type", "session_id"), "session_or_trace_id": r["session"], } for r in all_rows_with_seq ] - session_parts.append(_render_markdown_table(map_rows, ["id", "id_type", "session_or_trace_id"])) + session_parts.append(_render_markdown_table(map_rows, ["id", "session_or_trace_id"])) session_parts.append("") session_parts.append("## 切换 reqid 明细(可跳转)") @@ -847,6 +928,7 @@ def save_detailed_report( session_parts.append(f'### switch-{r["id"].lower()}') session_parts.append(f'- ID: **{r["id"]}**') session_parts.append(f'- 会话标识: `{r["session"]}` ({r.get("id_type", "session_id")})') + session_parts.append(f'- 时间段: `{r.get("first_ts", "-")} ~ {r.get("last_ts", "-")}`') session_parts.append(f'- switch_req_pairs: {r["switch_req_pairs"]}') session_parts.append(f'- sharp_drop_request_ids: {r["sharp_drop_request_ids"]}') session_parts.append("") From 322b98b0abd910f63a260e89285889222bde6bb6 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 00:04:20 +0800 Subject: [PATCH 13/40] infer mixed token-select counts from router semantics --- .../.claude/skills/troubleshoot/SKILL.md | 21 +- .../troubleshoot/scripts/analyzers/load.py | 148 +---------- .../scripts/analyzers/load_report.py | 243 ++++++++++++++++++ .../skills/troubleshoot/scripts/log_parser.py | 80 
+++++- .../troubleshoot/scripts/troubleshoot.py | 13 +- 5 files changed, 342 insertions(+), 163 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 43ee91a46b1..7f7a5793e91 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -24,10 +24,9 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 运行脚本前,Claude 必须按以下顺序向用户确认参数: ### 1. 日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项(客户端会自动提供 Other 自定义输入): - 选项 1: `logs/router.log`(默认) - 选项 2: `fd-router.log`(golang_router 根目录) -- 选项 3: 用户通过 Other 输入自定义路径 **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -37,11 +36,10 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 如果用户直接确认或未指定路径,使用脚本的自动发现逻辑。 ### 2. 分析范围 -向用户询问分析范围: -> "请选择分析范围: -> 1. **全量分析**(默认)— 分析整个日志文件 -> 2. **尾部分析** — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) -> 3. **指定时间段** — 分析特定时间范围内的日志" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): +- 选项 1: `全量分析(默认)` — 分析整个日志文件 +- 选项 2: `尾部分析` — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) +- 选项 3: `指定时间段` — 分析特定时间范围内的日志 如果用户未选择,默认使用全量分析。 @@ -54,11 +52,10 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 `--start/--end` 与 `--tail` 互斥。 ### 3. 分析模式 -向用户询问分析模式: -> "请选择分析模式: -> 1. **完整分析**(默认)— 运行所有维度(errors + latency + health + cache + load) -> 2. **单维度/多维度分析** — 选择特定维度(errors / latency / health / cache / load),可选多个 -> 3. 
**请求追踪** — 追踪特定请求 ID(需提供 ID)" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): +- 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) +- 选项 2: `单维度/多维度分析` — 选择特定维度(errors / latency / health / cache / load),可选多个 +- 选项 3: `请求追踪` — 追踪特定请求 ID(需提供 ID) 如果用户未选择,默认使用完整分析。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index 9be82357494..c38b0b80953 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -13,9 +13,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from chart import render_bar, render_sparkline, render_table from log_parser import extract_ts, match_select_release, parse_stats_line from stats import compute_statistics, time_bucket +from analyzers.load_report import format_load_report # ════════════════════════════════════════════════════════════════ # Counter 异常检测正则 @@ -28,14 +28,21 @@ TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}") # Token 事件 -SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*{URL_RE},\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") def _strip_scheme(url): return re.sub(r"^https?://", "", url) +def _normalize_worker_type(worker_type): + t = (worker_type or "unknown").lower() + if t in ("prefill", "decode", "mixed"): + return t + return "unknown" + + def parse_counter_anomaly(line): """解析 H5 counter 异常行。""" ts = extract_ts(line) @@ -73,7 +80,7 @@ def analyze_load(log_file, tail=None): r"counter preserved|cleanup unhealthy|removed counters|counter 
already|double-release|preserved counters", tail, ) - h11_lines = _grep_lines(log_file, r"release prefill tokens", tail) + h11_lines = _grep_lines(log_file, r"release (?:[a-zA-Z_]+\s+)?tokens", tail) # 解析 stats 行 stats_records = [r for line in h7_lines for r in [parse_stats_line(line)] if r] @@ -161,12 +168,12 @@ def _analyze_tokens(h3_lines, h11_lines): for line in h3_lines: m = SELECT_TOKENS_RE.search(line) if m: - token_alloc[m.group(1)].append(int(m.group(2))) + token_alloc[m.group(2)].append(int(m.group(3))) for line in h11_lines: m = RELEASE_TOKENS_RE.search(line) if m: - token_release[m.group(1)].append(int(m.group(2))) + token_release[m.group(2)].append(int(m.group(3))) result = [] all_workers = set(token_alloc.keys()) | set(token_release.keys()) @@ -285,135 +292,6 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, # ════════════════════════════════════════════════════════════════ -def format_load_report(result): - """将分析结果格式化为终端报告。""" - sections = ["## 负载与计数器分析", ""] - sections.append(f' {result["summary"]}') - sections.append("") - - if result["diagnoses"]: - sections.append("### 诊断") - sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') - sections.append("") - - # 负载概览 - ls = result.get("load_stats", {}) - if ls: - sections.append("### 负载概览 (total_running)") - sections.append("") - sections.append( - f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' - f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' - ) - sections.append("") - - # Per-Worker 负载 - if result["worker_load"]: - sections.append("### Per-Worker 负载") - sections.append("") - bar_data = [ - {"label": w["worker"][:25], "value": min(100, w["avg_running"] * 5), "count": w["avg_running"]} - for w in result["worker_load"] - ] - sections.append(render_bar(bar_data, show_count=True)) - sections.append("") - - # 负载趋势 - if result["load_trend"] and 
len(result["load_trend"]) > 1: - sections.append("### 负载趋势") - sections.append("") - sections.append( - render_sparkline( - result["load_trend"], value_field="total_running_mean", title="Total Running", y_label="req" - ) - ) - sections.append("") - - # Counter 异常 - if result["counter_anomalies"]: - sections.append("### 计数器异常") - sections.append("") - for a in result["counter_anomalies"]: - workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) - sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') - sections.append("") - - id_cov = result.get("select_release", {}).get("id_coverage", {}) - if id_cov: - sections.append("### 请求标识覆盖(基于 select 近似请求数)") - sections.append("") - sections.append( - " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " - "with_alt_id={with_alt} | without_any_id={without_any}".format( - total=id_cov.get("total_requests_estimated", 0), - with_rid=id_cov.get("with_request_id", 0), - without_rid=id_cov.get("without_request_id", 0), - with_alt=id_cov.get("with_alt_id", 0), - without_any=id_cov.get("without_any_id", 0), - ) - ) - if id_cov.get("without_any_id", 0) > 0: - sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") - sections.append("") - - # Select/Release 匹配 - sr = result.get("select_release", {}) - if sr.get("per_worker"): - sections.append("### Select/Release 匹配") - sections.append("") - id_cov = sr.get("id_coverage", {}) - no_correlatable_id = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) == 0 - table_data = [] - for w_url, pw in sorted(sr["per_worker"].items()): - delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) - table_data.append( - { - "Worker": _strip_scheme(w_url), - "Select": str(pw["selects"]), - "Release": str(pw["releases"]), - "Delta": delta_display, - } - ) - sections.append( - render_table( - table_data, - columns=["Worker", "Select", "Release", "Delta"], - right_align={"Select", "Release", 
"Delta"}, - ) - ) - sections.append("") - if no_correlatable_id: - sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") - sections.append("") - - if sr.get("unmatched_selects"): - sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') - for u in sr["unmatched_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append("") - - if sr.get("untracked_selects"): - sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') - for u in sr["untracked_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append("") - - # Token 统计 - if result.get("token_stats"): - sections.append("### Token 计数器") - sections.append("") - sections.append( - render_table( - result["token_stats"], - columns=["worker", "alloc_count", "alloc_avg", "release_count"], - right_align={"alloc_count", "alloc_avg", "release_count"}, - ) - ) - sections.append("") - - return "\n".join(sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py new file mode 100644 index 00000000000..e118c4e1af3 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +"""Load report formatter.""" + +from chart import render_bar, render_sparkline, render_table + + +def _strip_scheme(url): + import re + return re.sub(r"^https?://", "", url) + + +def format_load_report(result): + """将分析结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_text) + """ + sections = ["## 负载与计数器分析", ""] + sections.append(f' {result["summary"]}') + sections.append("") + detail_sections = ["# 负载与计数器详情", ""] + detail_sections.append(f'总结: {result["summary"]}') + detail_sections.append("") + + if 
result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") + + # 负载概览 + ls = result.get("load_stats", {}) + if ls: + sections.append("### 负载概览 (total_running)") + sections.append("") + sections.append( + f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' + f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' + ) + sections.append("") + + # Per-Worker 负载 + if result["worker_load"]: + sections.append("### Per-Worker 负载") + sections.append("") + bar_data = [ + {"label": w["worker"][:25], "value": min(100, w["avg_running"] * 5), "count": w["avg_running"]} + for w in result["worker_load"] + ] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 负载趋势 + if result["load_trend"] and len(result["load_trend"]) > 1: + sections.append("### 负载趋势") + sections.append("") + sections.append( + render_sparkline( + result["load_trend"], value_field="total_running_mean", title="Total Running", y_label="req" + ) + ) + sections.append("") + + # Counter 异常 + if result["counter_anomalies"]: + sections.append("### 计数器异常") + sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) + sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') + sections.append("") + detail_sections.append("## 计数器异常") + detail_sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) + detail_sections.append(f'- {a["type"]}: {a["total"]} 次 [{workers_str}]') + detail_sections.append("") + + # 按 prefill / decode 
/ mixed 分类统计 + type_summary = result.get("select_release", {}).get("type_summary", {}) + if type_summary: + sections.append("### 按类型统计(prefill / decode / mixed)") + sections.append("") + type_rows = [] + for t in ("prefill", "decode", "mixed", "unknown"): + s = type_summary.get(t) + if not s: + continue + token_display = "-" + if t == "prefill": + token_display = f'{s.get("token_selects",0)}/{s.get("token_releases",0)}' + elif t == "mixed" and (s.get("token_selects", 0) > 0 or s.get("token_releases", 0) > 0): + token_display = f'{s.get("token_selects",0)}/{s.get("token_releases",0)}' + type_rows.append( + { + "type": t, + "counter(S/R)": f'{s.get("counter_selects",0)}/{s.get("counter_releases",0)}', + "token(S/R)": token_display, + } + ) + if type_rows: + sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) + sections.append("") + sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append("") + detail_sections.append("## 按类型统计") + detail_sections.append("") + detail_sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) + detail_sections.append("") + + id_cov = result.get("select_release", {}).get("id_coverage", {}) + if id_cov: + sections.append("### 请求标识覆盖(基于 select 近似请求数)") + sections.append("") + sections.append( + " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " + "with_alt_id={with_alt} | without_any_id={without_any}".format( + total=id_cov.get("total_requests_estimated", 0), + with_rid=id_cov.get("with_request_id", 0), + without_rid=id_cov.get("without_request_id", 0), + with_alt=id_cov.get("with_alt_id", 0), + without_any=id_cov.get("without_any_id", 0), + ) + ) + if id_cov.get("without_any_id", 0) > 0: + sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") + sections.append(" 字段说明: total=select 事件总数估算;with_request_id=含 request_id;without_request_id=不含 
request_id;with_alt_id=含 req_id/trace_id/session_id;without_any_id=四类 ID 都缺失。") + sections.append("") + detail_sections.append("## 请求标识覆盖字段说明") + detail_sections.append("") + detail_sections.append( + "- total: select 事件总数(近似请求数)\n" + "- with_request_id: 携带 request_id 的 select 数\n" + "- without_request_id: 未携带 request_id 的 select 数\n" + "- with_alt_id: 无 request_id 但携带 req_id/trace_id/session_id 的 select 数\n" + "- without_any_id: 四类 ID 都没有,无法做请求级关联" + ) + detail_sections.append("") + + # Select/Release 匹配 + sr = result.get("select_release", {}) + if sr.get("per_worker"): + sections.append("### Select/Release 匹配") + sections.append("") + id_cov = sr.get("id_coverage", {}) + no_correlatable_id = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) == 0 + table_data = [] + for w_url, pw in sorted(sr["per_worker"].items()): + delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) + table_data.append( + { + "Worker": _strip_scheme(w_url), + "ReqSelect": str(pw["selects"]), + "ReqRelease": str(pw["releases"]), + "ReqDelta": delta_display, + "TokenSelect": str(pw.get("token_selects", 0)), + "TokenSelInf": str(pw.get("token_selects_inferred", 0)), + "TokenRelease": str(pw.get("token_releases", 0)), + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + ) + ) + sections.append("") + if no_correlatable_id: + sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") + sections.append("") + sections.append(" 说明: prefill/mixed 在运行时都会同时增加 request 与 token 计数器;其中 mixed 的 TokenSelect 可能来自推断(TokenSelInf)。") + sections.append("") + detail_sections.append("## Select/Release Per-Worker") + detail_sections.append("") + detail_sections.append( + render_table( + table_data, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", 
"TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + ) + ) + detail_sections.append("") + + if sr.get("unmatched_selects"): + sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') + sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") + for u in sr["unmatched_selects"][:3]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 select(完整)") + detail_sections.append("") + for u in sr["unmatched_selects"]: + detail_sections.append( + f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' + ) + detail_sections.append("") + + if sr.get("untracked_selects"): + sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') + for u in sr["untracked_selects"][:3]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append("") + detail_sections.append("## Untracked selects(缺少可关联 ID)") + detail_sections.append("") + for u in sr["untracked_selects"]: + detail_sections.append( + f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' + ) + detail_sections.append("") + + if sr.get("failed_selects"): + sections.append(f' ⚠ Failed to select: {len(sr["failed_selects"])} 次') + sections.append(" 解释: 路由在该时刻未能选出可用 worker,通常意味着可用池不足或健康状态异常。") + sections.append("") + detail_sections.append("## Failed to select") + detail_sections.append("") + for f in sr["failed_selects"]: + detail_sections.append(f'- [{f.get("ts","")}] line={f.get("line","")}') + detail_sections.append("") + + # Token 统计 + if 
result.get("token_stats"): + sections.append("### Token 计数器") + sections.append("") + sections.append( + render_table( + result["token_stats"], + columns=["worker", "alloc_count", "alloc_avg", "release_count"], + right_align={"alloc_count", "alloc_avg", "release_count"}, + ) + ) + sections.append("") + + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 44f5cdebd94..1bb11ddaa5e 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -466,8 +466,8 @@ def parse_error_line(line): SELECT_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*({URL_RE})") RELEASE_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*({URL_RE})") FAILED_SELECT_RE = re.compile(r"Failed to select") -SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*({URL_RE}),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*({URL_RE}),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") def _parse_ts_safe(ts): @@ -493,6 +493,14 @@ def _select_match_key(tags): return (None, None) +def _normalize_worker_type(worker_type): + """归一化 worker type。""" + t = (worker_type or "unknown").lower() + if t in ("prefill", "decode", "mixed"): + return t + return "unknown" + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -516,10 +524,10 @@ def match_select_release(lines, fallback_window_s=120): selects.append( { "ts": ts, - "worker": tm.group(1), - "type": "prefill", + "worker": tm.group(2), + "type": _normalize_worker_type(tm.group(1)), "tags": tags, - "tokens": int(tm.group(2)), + 
"tokens": int(tm.group(3)), "line": line_no, } ) @@ -528,13 +536,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: + token_type = trm.group(1) or "prefill" releases.append( { "ts": ts, - "worker": trm.group(1), - "type": "prefill_tokens", + "worker": trm.group(2), + "type": f'{_normalize_worker_type(token_type)}_tokens', "tags": tags, - "tokens": int(trm.group(2)), + "tokens": int(trm.group(3)), "line": line_no, } ) @@ -546,7 +555,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": sm.group(2), - "type": sm.group(1) or "unknown", + "type": _normalize_worker_type(sm.group(1)), "tags": tags, "tokens": None, "line": line_no, @@ -560,7 +569,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": rm.group(2), - "type": rm.group(1) or "unknown", + "type": _normalize_worker_type(rm.group(1)), "tags": tags, "tokens": None, "line": line_no, @@ -576,8 +585,11 @@ def match_select_release(lines, fallback_window_s=120): unmatched_selects = [] release_used = set() + # 请求生命周期匹配只使用 request counter release(排除 token release) + counter_release_indexes = [i for i, r in enumerate(releases) if not str(r.get("type", "")).endswith("_tokens")] release_by_key = defaultdict(list) - for i, r in enumerate(releases): + for i in counter_release_indexes: + r = releases[i] _, key = _select_match_key(r.get("tags", {})) if key: release_by_key[key].append(i) @@ -639,7 +651,8 @@ def match_select_release(lines, fallback_window_s=120): sdt = _parse_ts_safe(s["ts"]) best_idx = None best_delta = None - for ri, r in enumerate(releases): + for ri in counter_release_indexes: + r = releases[ri] if ri in release_used: continue if r.get("worker") != s.get("worker"): @@ -680,11 +693,25 @@ def match_select_release(lines, fallback_window_s=120): ) # Per-worker summary - per_worker = defaultdict(lambda: {"selects": 0, "releases": 0}) + # 对照 golang_router SelectWorker 语义: + # - 
prefill: request counter + token counter 同时增加(日志通常带 tokens) + # - mixed: request counter + token counter 同时增加(日志通常不带 tokens,需要推断) + per_worker = defaultdict( + lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_selects_inferred": 0, "token_releases": 0} + ) for s in selects: + s_type = _normalize_worker_type(s.get("type")) per_worker[s["worker"]]["selects"] += 1 + if s.get("tokens") is not None: + per_worker[s["worker"]]["token_selects"] += 1 + elif s_type == "mixed": + per_worker[s["worker"]]["token_selects"] += 1 + per_worker[s["worker"]]["token_selects_inferred"] += 1 for r in releases: - per_worker[r["worker"]]["releases"] += 1 + if str(r.get("type", "")).endswith("_tokens"): + per_worker[r["worker"]]["token_releases"] += 1 + else: + per_worker[r["worker"]]["releases"] += 1 pw_result = {} for w, counts in per_worker.items(): @@ -692,7 +719,31 @@ def match_select_release(lines, fallback_window_s=120): "selects": counts["selects"], "releases": counts["releases"], "delta": counts["selects"] - counts["releases"], + "token_selects": counts["token_selects"], + "token_selects_inferred": counts["token_selects_inferred"], + "token_releases": counts["token_releases"], + } + + # 按 worker type 分类统计(prefill/decode/mixed) + type_summary = defaultdict( + lambda: { + "counter_selects": 0, + "counter_releases": 0, + "token_selects": 0, + "token_releases": 0, } + ) + for s in selects: + s_type = _normalize_worker_type(s.get("type")) + type_summary[s_type]["counter_selects"] += 1 + if s.get("tokens") is not None or s_type == "mixed": + type_summary[s_type]["token_selects"] += 1 + for r in releases: + r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) + if str(r.get("type", "")).endswith("_tokens"): + type_summary[r_type]["token_releases"] += 1 + else: + type_summary[r_type]["counter_releases"] += 1 return { "matched": matched, @@ -707,6 +758,7 @@ def match_select_release(lines, fallback_window_s=120): "with_alt_id": with_alt_id, 
"without_any_id": without_any_id, }, + "type_summary": dict(type_summary), } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 30b9df0f443..a818d31150f 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -144,10 +144,11 @@ def format_full_report(results, status, status_reason): report_text: 主报告文本(总结 + 可视化) details: dict 包含需要拆分到独立文件的详情数据 - 'health_events': str 或 None + - 'load_select_release': str 或 None - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "trace_files": {}} + details = {"health_events": None, "load_select_release": None, "trace_files": {}} # 状态行 parts.append(f"STATUS: {status} — {status_reason}") @@ -168,7 +169,10 @@ def format_full_report(results, status, status_reason): details["health_events"] = detail if "load" in results: - parts.append(format_load_report(results["load"])) + summary, detail = format_load_report(results["load"]) + parts.append(summary) + if detail: + details["load_select_release"] = detail if "cache" in results: parts.append(format_cache_report(results["cache"])) @@ -208,6 +212,11 @@ def save_detailed_report(report_text, output_dir, details=None): with open(health_path, "w", encoding="utf-8") as f: f.write(details["health_events"]) + if details.get("load_select_release"): + load_path = os.path.join(detail_dir, "load_select_release.md") + with open(load_path, "w", encoding="utf-8") as f: + f.write(details["load_select_release"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From 2df1eec9ecfa6785cdc3645a05d84c4b3c7956f2 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 00:07:57 +0800 Subject: 
[PATCH 14/40] count token-select by prefill/mixed worker type only --- .../scripts/analyzers/load_report.py | 11 +++++------ .../skills/troubleshoot/scripts/log_parser.py | 18 +++++------------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index e118c4e1af3..86ba1f0d94f 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -160,30 +160,29 @@ def format_load_report(result): "ReqRelease": str(pw["releases"]), "ReqDelta": delta_display, "TokenSelect": str(pw.get("token_selects", 0)), - "TokenSelInf": str(pw.get("token_selects_inferred", 0)), "TokenRelease": str(pw.get("token_releases", 0)), } ) sections.append( render_table( table_data, - columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"], - right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"}, ) ) sections.append("") if no_correlatable_id: sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") sections.append("") - sections.append(" 说明: prefill/mixed 在运行时都会同时增加 request 与 token 计数器;其中 mixed 的 TokenSelect 可能来自推断(TokenSelInf)。") + sections.append(" 说明: TokenSelect 按 worker type 统计(prefill + mixed 的 select 都计入),不依赖日志里是否出现 tokens 字段。") sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") detail_sections.append( render_table( table_data, - columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"], - right_align={"ReqSelect", 
"ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"}, ) ) detail_sections.append("") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 1bb11ddaa5e..200f976f2ff 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -692,21 +692,14 @@ def match_select_release(lines, fallback_window_s=120): } ) - # Per-worker summary - # 对照 golang_router SelectWorker 语义: - # - prefill: request counter + token counter 同时增加(日志通常带 tokens) - # - mixed: request counter + token counter 同时增加(日志通常不带 tokens,需要推断) - per_worker = defaultdict( - lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_selects_inferred": 0, "token_releases": 0} - ) + # Per-worker summary(按 worker type 统计,不依赖日志中的 tokens 字段) + # 规则:prefill/mixed 的 select 均计入 token_selects。 + per_worker = defaultdict(lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_releases": 0}) for s in selects: s_type = _normalize_worker_type(s.get("type")) per_worker[s["worker"]]["selects"] += 1 - if s.get("tokens") is not None: - per_worker[s["worker"]]["token_selects"] += 1 - elif s_type == "mixed": + if s_type in ("prefill", "mixed"): per_worker[s["worker"]]["token_selects"] += 1 - per_worker[s["worker"]]["token_selects_inferred"] += 1 for r in releases: if str(r.get("type", "")).endswith("_tokens"): per_worker[r["worker"]]["token_releases"] += 1 @@ -720,7 +713,6 @@ def match_select_release(lines, fallback_window_s=120): "releases": counts["releases"], "delta": counts["selects"] - counts["releases"], "token_selects": counts["token_selects"], - "token_selects_inferred": 
counts["token_selects_inferred"], "token_releases": counts["token_releases"], } @@ -736,7 +728,7 @@ def match_select_release(lines, fallback_window_s=120): for s in selects: s_type = _normalize_worker_type(s.get("type")) type_summary[s_type]["counter_selects"] += 1 - if s.get("tokens") is not None or s_type == "mixed": + if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 for r in releases: r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) From 83279b7001264114508c64138766410ce25c1234 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 00:33:09 +0800 Subject: [PATCH 15/40] troubleshoot: clarify DEGRADED meaning in report header --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 ++ .../references/report_templates.md | 1 + .../troubleshoot/scripts/analyzers/errors.py | 9 ++ .../scripts/analyzers/load_report.py | 9 +- .../skills/troubleshoot/scripts/chart.py | 3 +- .../skills/troubleshoot/scripts/log_parser.py | 113 +++++++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 18 ++- 8 files changed, 156 insertions(+), 9 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed 
to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..5eec70d1514 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..1f3f63fcc45 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -282,6 +283,14 @@ def format_errors_report(result): render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) ) sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..b06d26883aa 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,11 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: + max_diag_in_summary = 8 + for d in result["diagnoses"][:max_diag_in_summary]: sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + if len(result["diagnoses"]) > max_diag_in_summary: + sections.append(f' ... 
其余 {len(result["diagnoses"]) - max_diag_in_summary} 项见 detail 报告') sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +42,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +112,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: token-release 由同 worker 邻近 select 推断到 prefill/mixed,不直接依赖 `release prefill tokens` 文本。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 
200f976f2ff..a5d646dc029 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,77 @@ def _normalize_worker_type(worker_type): return "unknown" +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is 
None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -536,12 +607,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + # 不直接信任日志里的 token type 文本("release prefill tokens" 也可能来自 mixed) + "type": "unknown_tokens", + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -716,7 +789,24 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + if base_t == "unknown": + # token release 的 worker type 由同 worker 邻近 select 推断(prefill/mixed) + base_t = _infer_token_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,9 +820,10 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): 
type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 @@ -949,6 +1040,16 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..3b80cd45c5e 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -128,7 +128,20 @@ def determine_status(results): reasons.append(d["message"]) if reasons: - return "DEGRADED", ", ".join(reasons) + # 去重并限制长度,避免状态行过长难读 + deduped = [] + seen = set() + for r in reasons: 
+ if r not in seen: + deduped.append(r) + seen.add(r) + max_reasons = 4 + shown = deduped[:max_reasons] + extra = len(deduped) - len(shown) + summary = ";".join(shown) + if extra > 0: + summary += f";另有 {extra} 项诊断见各维度 detail 报告" + return "DEGRADED", summary if not results: return "HEALTHY", "无分析数据" @@ -152,6 +165,9 @@ def format_full_report(results, status, status_reason): # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") From 55657a4405354be0e491ba9a71a4dba778c03beb Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 10:53:12 +0800 Subject: [PATCH 16/40] troubleshoot: revert trend windows to auto and split detail outputs by responsibility --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 ++ .../references/report_templates.md | 11 +- .../troubleshoot/scripts/analyzers/cache.py | 131 ++++++++++++++++- .../troubleshoot/scripts/analyzers/errors.py | 48 +++++- .../troubleshoot/scripts/analyzers/health.py | 38 ++++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 64 +++++++- .../scripts/analyzers/load_report.py | 42 +++++- .../troubleshoot/scripts/analyzers/trace.py | 25 +++- .../skills/troubleshoot/scripts/chart.py | 3 +- .../skills/troubleshoot/scripts/log_parser.py | 139 +++++++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 121 ++++++++++++++- 13 files changed, 605 insertions(+), 37 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ 
-61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..c02f55c2d65 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / 
CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,14 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..0d146c9b43c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, 
strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ + { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] 
[{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") 
reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: + detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", 
"fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](detail/cache_eviction.md) | " + "[detail/cache_fallback.md](detail/cache_fallback.md) | " + "[detail/cache_cross.md](detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..7f519806225 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 
+85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -276,12 +308,26 @@ def format_errors_report(result): "占比": f'{e["pct"]}%', "级别": e["level"], "来源层": e["source_layer"], + "影响": e.get("impact", "-"), + "URLs": ",".join(e.get("urls", [])[:2]) if e.get("urls") else "-", } ) sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) + render_table( + table_data, + columns=["模板", "数量", "占比", "级别", "来源层", "影响", "URLs"], + right_align={"数量", "占比"}, + ) ) sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file 
config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..8fc4e88cc72 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { "events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - 
parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + 
sections.append("> 完整事件详情: [detail/health_events.md](detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..57af094661e 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](detail/latency_diagnoses.md)") sections.append("") return "\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..0d92ae56a32 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker 
\((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,21 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "untracked_selects": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +166,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, "token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +204,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = defaultdict( + lambda: { + "req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + 
state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..8c375e25e57 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,10 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append( + f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](detail/load_diagnoses.md);' + '匹配明细见 [detail/load_select_release.md](detail/load_select_release.md)' + ) sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +41,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +111,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 
同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: token-release 由同 worker 邻近 select 推断到 prefill/mixed,不直接依赖 `release prefill tokens` 文本。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -192,7 +198,7 @@ def format_load_report(result): sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") for u in sr["unmatched_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](detail/load_select_release.md)") sections.append("") detail_sections.append("## 未匹配 select(完整)") detail_sections.append("") @@ -202,11 +208,23 @@ def format_load_report(result): ) detail_sections.append("") + if sr.get("unmatched_releases"): + sections.append(f' ⚠ {len(sr["unmatched_releases"])} 个未匹配 release(已区分 req/token)') + sections.append(" > 完整列表见: [detail/load_select_release.md](detail/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 release(按 release_kind 分类)") + detail_sections.append("") + for r in sr["unmatched_releases"]: + detail_sections.append( + f'- [{r.get("release_ts","")}] worker={_strip_scheme(r["worker"])} release_kind={r.get("release_kind","")} type={r.get("type","")}' + ) + detail_sections.append("") + if sr.get("untracked_selects"): sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') for u in sr["untracked_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: 
[detail/load_select_release.md](detail/load_select_release.md)") sections.append("") detail_sections.append("## Untracked selects(缺少可关联 ID)") detail_sections.append("") @@ -239,4 +257,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + sections.append(" 末状态详情见: [detail/load_counter_state.md](detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..a792da8002f 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses = _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": 
sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def _diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ -294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" 
+ "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..b99e75c37c5 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,77 @@ def _normalize_worker_type(worker_type): return "unknown" +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + 
+ if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -536,12 +607,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + # 不直接信任日志里的 token type 文本("release prefill tokens" 也可能来自 mixed) + "type": "unknown_tokens", + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -716,7 +789,24 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + if base_t == "unknown": + # token release 的 worker type 由同 worker 邻近 select 推断(prefill/mixed) + base_t = _infer_token_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 
分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,16 +820,43 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + unmatched_releases = [] + for i, r in enumerate(releases): + if str(r.get("type", "")).endswith("_tokens"): + # token release: 近邻存在 prefill/mixed select 则视为可解释,不计入 unmatched + inferred_token_type = _normalize_worker_type(str(inferred_release_types.get(i, "unknown_tokens")).replace("_tokens", "")) + if inferred_token_type == "unknown": + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": inferred_token_type, + "release_kind": "token_release", + } + ) + continue + if i not in release_used: + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": _normalize_worker_type(inferred_release_types.get(i, "unknown")), + "release_kind": "request_release", + } + ) + return { "matched": matched, "unmatched_selects": unmatched_selects, + "unmatched_releases": unmatched_releases, "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, @@ -949,6 +1066,16 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 
logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..7dd2e153a64 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -128,7 +128,20 @@ def determine_status(results): reasons.append(d["message"]) if reasons: - return "DEGRADED", ", ".join(reasons) + # 去重并限制长度,避免状态行过长难读 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + max_reasons = 4 + shown = deduped[:max_reasons] + extra = len(deduped) - len(shown) + summary = ";".join(shown) + if extra > 0: + summary += f";另有 {extra} 项诊断见各维度 detail 报告" + return "DEGRADED", summary if not results: return "HEALTHY", "无分析数据" @@ -148,10 +161,26 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": 
None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") @@ -161,6 +190,12 @@ def format_full_report(results, status, status_reason): if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +208,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + 
if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +301,37 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if 
details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From 5ac7cb3afd399b6372884dad17f5714b07ea96e4 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 11:21:35 +0800 Subject: [PATCH 17/40] troubleshoot: map token-release type by worker URL instead of time-neighbor inference --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 ++ .../references/report_templates.md | 12 +- .../troubleshoot/scripts/analyzers/cache.py | 131 +++++++++++++- 
.../troubleshoot/scripts/analyzers/errors.py | 56 ++++-- .../troubleshoot/scripts/analyzers/health.py | 38 +++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 65 ++++++- .../scripts/analyzers/load_report.py | 65 ++++++- .../troubleshoot/scripts/analyzers/trace.py | 25 ++- .../skills/troubleshoot/scripts/chart.py | 3 +- .../skills/troubleshoot/scripts/log_parser.py | 163 +++++++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 142 ++++++++++++++- 13 files changed, 669 insertions(+), 51 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 
日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..cd705d02816 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,15 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/errors_topn.md` — ERROR/WARN 模板明细(数量/级别/来源层/影响 + URLs) + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..3a5c19ad00b 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ 
+ { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 
if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: 
+ detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..f0e4c352b6c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 +85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, 
pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -266,22 +298,16 @@ def format_errors_report(result): sections.append(render_bar(bar_data, show_count=True)) sections.append("") - # 来源层表格 - table_data = [] - for e in result["error_top_n"][:10]: - table_data.append( - { - "模板": e["template"][:60], - "数量": e["count"], - "占比": f'{e["pct"]}%', - "级别": e["level"], - "来源层": e["source_layer"], - } - ) - sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) - ) + sections.append(" 具体模板表见: [../detail/errors_topn.md](../detail/errors_topn.md)") sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..5d1994d9405 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + 
all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { "events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 
else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("> 完整事件详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..508cf3824d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 
输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](../detail/latency_diagnoses.md)") sections.append("") return "\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..2e03ba1ce63 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,22 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "untracked_selects": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + "worker_type_profile": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +167,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, 
"token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +205,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = defaultdict( + lambda: { + "req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..9d4e9b51496 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 
+25,10 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append( + f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](../detail/load_diagnoses.md);' + '匹配明细见 [detail/load_select_release.md](../detail/load_select_release.md)' + ) sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +41,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +111,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: `release prefill tokens` 会被识别为 token-release;worker type 按该 worker URL 在 select 中的类型映射(prefill/decode/mixed)。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -178,6 +184,29 @@ def format_load_report(result): sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") + + if sr.get("worker_type_profile"): + sections.append("### Worker URL 类型画像(基于 select)") + sections.append("") + rows = [] + for w, p in sorted(sr["worker_type_profile"].items()): + rows.append( + { + "Worker": _strip_scheme(w), + "Dominant": p.get("dominant_type", "unknown"), + "Prefill": p.get("prefill", 0), + "Decode": p.get("decode", 0), + "Mixed": 
p.get("mixed", 0), + } + ) + sections.append( + render_table( + rows, + columns=["Worker", "Dominant", "Prefill", "Decode", "Mixed"], + right_align={"Prefill", "Decode", "Mixed"}, + ) + ) + sections.append("") detail_sections.append( render_table( table_data, @@ -192,7 +221,7 @@ def format_load_report(result): sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") for u in sr["unmatched_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## 未匹配 select(完整)") detail_sections.append("") @@ -202,11 +231,23 @@ def format_load_report(result): ) detail_sections.append("") + if sr.get("unmatched_releases"): + sections.append(f' ⚠ {len(sr["unmatched_releases"])} 个未匹配 release(已区分 req/token)') + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 release(按 release_kind 分类)") + detail_sections.append("") + for r in sr["unmatched_releases"]: + detail_sections.append( + f'- [{r.get("release_ts","")}] worker={_strip_scheme(r["worker"])} release_kind={r.get("release_kind","")} type={r.get("type","")}' + ) + detail_sections.append("") + if sr.get("untracked_selects"): sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') for u in sr["untracked_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## Untracked selects(缺少可关联 ID)") detail_sections.append("") @@ 
-239,4 +280,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + sections.append(" 末状态详情见: [detail/load_counter_state.md](../detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..24af9a23500 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses = _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def 
_diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ -294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..548c29ebc29 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,77 @@ def _normalize_worker_type(worker_type): return "unknown" +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - 
s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -536,12 +607,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + # 文本默认按 prefill 记,再结合同 worker 邻近 select 做纠偏(mixed 场景) + "type": f'{_normalize_worker_type(token_type or "prefill")}_tokens', + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -716,7 +789,33 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 基于 select 构建 worker URL -> dominant type 映射 + per_worker_type_counts = defaultdict(lambda: defaultdict(int)) + for s in selects: + per_worker_type_counts[s["worker"]][_normalize_worker_type(s.get("type"))] += 1 + worker_dominant_type = {} + for w, counts in per_worker_type_counts.items(): + worker_dominant_type[w] = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] if counts else "unknown" + + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 + mapped_t = worker_dominant_type.get(r.get("worker", ""), "unknown") + if mapped_t in ("prefill", "decode", "mixed"): + base_t = mapped_t + inferred_release_types[i] = 
f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,16 +829,57 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + # 每个 worker URL 的类型画像(基于 select) + worker_type_profile = {} + for w, counts in per_worker_type_counts.items(): + dominant = "unknown" + if counts: + dominant = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] + worker_type_profile[w] = { + "dominant_type": dominant, + "prefill": counts.get("prefill", 0), + "decode": counts.get("decode", 0), + "mixed": counts.get("mixed", 0), + "unknown": counts.get("unknown", 0), + } + + unmatched_releases = [] + for i, r in enumerate(releases): + if str(r.get("type", "")).endswith("_tokens"): + # token release: 近邻存在 prefill/mixed select 则视为可解释,不计入 unmatched + inferred_token_type = _normalize_worker_type(str(inferred_release_types.get(i, "unknown_tokens")).replace("_tokens", "")) + if inferred_token_type == "unknown": + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": inferred_token_type, + "release_kind": "token_release", + } 
+ ) + continue + if i not in release_used: + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": _normalize_worker_type(inferred_release_types.get(i, "unknown")), + "release_kind": "request_release", + } + ) + return { "matched": matched, "unmatched_selects": unmatched_selects, + "unmatched_releases": unmatched_releases, "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, @@ -751,6 +891,7 @@ def match_select_release(lines, fallback_window_s=120): "without_any_id": without_any_id, }, "type_summary": dict(type_summary), + "worker_type_profile": worker_type_profile, } @@ -949,6 +1090,16 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..641c5106bee 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -128,7 +128,14 @@ def determine_status(results): 
reasons.append(d["message"]) if reasons: - return "DEGRADED", ", ".join(reasons) + # 去重但保留完整信息 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + return "DEGRADED", ";".join(deduped) if not results: return "HEALTHY", "无分析数据" @@ -148,19 +155,65 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "errors_topn": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") # 各维度报告 if "errors" in results: parts.append(format_errors_report(results["errors"])) + if results["errors"].get("error_top_n"): + lines = [ + "# Errors TopN 详情", + "", + "| 模板 | 数量 | 级别 | 来源层 | 影响 |", + "|:--|--:|:--|:--|:--|", + ] + for e in results["errors"]["error_top_n"]: + lines.append( + f'| {e.get("template","")} | {e.get("count",0)} | {e.get("level","")} | {e.get("source_layer","")} | {e.get("impact","-")} |' + ) + lines.append("") + lines.append("## 涉及 URLs") + lines.append("") + for e in results["errors"]["error_top_n"]: + urls = e.get("urls") or [] + if not urls: + continue + lines.append(f'- 模板: {e.get("template","")}') + for u in urls: + lines.append(f' - {u}') + lines.append("") + details["errors_topn"] = "\n".join(lines) if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: 
+ lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +226,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} 
reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +319,40 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if 
details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + if details.get("errors_topn"): + with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: + f.write(details["errors_topn"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From c8a3fd75a649e8ba14b13fdc09f7441a2cc27093 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 12:28:52 +0800 Subject: [PATCH 18/40] fix(troubleshoot): add FIFO+ID consistency checks and quote-safe hint --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 + .../references/report_templates.md | 12 +- .../troubleshoot/scripts/analyzers/cache.py | 131 +++++++- .../troubleshoot/scripts/analyzers/errors.py | 56 +++- .../troubleshoot/scripts/analyzers/health.py | 38 ++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 75 ++++- .../scripts/analyzers/load_report.py | 102 +++++- .../troubleshoot/scripts/analyzers/trace.py | 25 +- .../skills/troubleshoot/scripts/chart.py | 3 +- 
.../skills/troubleshoot/scripts/log_parser.py | 303 ++++++++++++++---- .../troubleshoot/scripts/troubleshoot.py | 198 +++++++++++- 13 files changed, 850 insertions(+), 113 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 
统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..cd705d02816 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,15 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/errors_topn.md` — ERROR/WARN 模板明细(数量/级别/来源层/影响 + URLs) + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..3a5c19ad00b 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def 
analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ + { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ 
def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in 
sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: + detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + 
detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..f0e4c352b6c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role 
is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 +85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 
200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -266,22 +298,16 @@ def format_errors_report(result): sections.append(render_bar(bar_data, show_count=True)) sections.append("") - # 来源层表格 - table_data = [] - for e in result["error_top_n"][:10]: - table_data.append( - { - "模板": e["template"][:60], - "数量": e["count"], - "占比": f'{e["pct"]}%', - "级别": e["level"], - "来源层": e["source_layer"], - } - ) - sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) - ) + sections.append(" 具体模板表见: [../detail/errors_topn.md](../detail/errors_topn.md)") sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..5d1994d9405 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { 
"events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + 
right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("> 完整事件详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..508cf3824d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](../detail/latency_diagnoses.md)") sections.append("") return 
"\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..5b9e3271f07 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,22 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "untracked_selects": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + "worker_type_profile": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +167,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, "token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +205,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = 
defaultdict( + lambda: { + "req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: @@ -273,6 +336,16 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, } ) + id_mismatch_count = sr_result.get("id_consistency", {}).get("both_present_but_mismatch", 0) + if id_mismatch_count > 0: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"{id_mismatch_count} 个 select/release 在 FIFO 命中后 ID 不一致(疑似串流或日志错配)", + "source_layer": "FD 后端", + } + ) + # Token 计数器潜在泄漏 for t in token_stats: if t.get("alloc_count", 0) > t.get("release_count", 0): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..74358b6e72d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py 
+++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,7 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](../detail/load_diagnoses.md)') sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +38,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +108,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: `release prefill tokens` 会被识别为 token-release;worker type 按该 worker URL 在 select 中的类型映射(prefill/decode/mixed)。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -178,6 +181,56 @@ def format_load_report(result): sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") + + id_consistency = sr.get("id_consistency", {}) + if id_consistency: + sections.append("### FIFO × ID 一致性校验") + sections.append("") + sections.append( + " matched={ok}, mismatch={mismatch}, select_only={so}, release_only={ro}, both_missing={bm}".format( + ok=id_consistency.get("both_present_and_equal", 0), + 
mismatch=id_consistency.get("both_present_but_mismatch", 0), + so=id_consistency.get("only_select_has_id", 0), + ro=id_consistency.get("only_release_has_id", 0), + bm=id_consistency.get("both_missing", 0), + ) + ) + sections.append("") + sections.append(" 说明: 主匹配按 worker FIFO,随后检查 matched 对中的 ID 是否一致。") + sections.append("") + detail_sections.append("## FIFO × ID 一致性") + detail_sections.append("") + detail_sections.append( + "- both_present_and_equal: select/release 都有可关联 ID 且相等\n" + "- both_present_but_mismatch: select/release 都有 ID 但不一致(需要重点排查)\n" + "- only_select_has_id: 仅 select 有 ID\n" + "- only_release_has_id: 仅 release 有 ID\n" + "- both_missing: 两边都没有可关联 ID" + ) + detail_sections.append("") + + if sr.get("worker_type_profile"): + sections.append("### Worker URL 类型画像(基于 select)") + sections.append("") + rows = [] + for w, p in sorted(sr["worker_type_profile"].items()): + rows.append( + { + "Worker": _strip_scheme(w), + "Dominant": p.get("dominant_type", "unknown"), + "Prefill": p.get("prefill", 0), + "Decode": p.get("decode", 0), + "Mixed": p.get("mixed", 0), + } + ) + sections.append( + render_table( + rows, + columns=["Worker", "Dominant", "Prefill", "Decode", "Mixed"], + right_align={"Prefill", "Decode", "Mixed"}, + ) + ) + sections.append("") detail_sections.append( render_table( table_data, @@ -192,7 +245,7 @@ def format_load_report(result): sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") for u in sr["unmatched_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## 未匹配 select(完整)") detail_sections.append("") @@ -202,14 +255,39 @@ def format_load_report(result): ) detail_sections.append("") + if sr.get("unmatched_releases"): + 
sections.append(f' ⚠ {len(sr["unmatched_releases"])} 个未匹配 release(已区分 req/token)') + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 release(按 release_kind 分类)") + detail_sections.append("") + for r in sr["unmatched_releases"]: + detail_sections.append( + f'- [{r.get("release_ts","")}] worker={_strip_scheme(r["worker"])} release_kind={r.get("release_kind","")} type={r.get("type","")}' + ) + detail_sections.append("") + if sr.get("untracked_selects"): sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') for u in sr["untracked_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## Untracked selects(缺少可关联 ID)") detail_sections.append("") + + if sr.get("id_mismatched_matches"): + sections.append(f' ⚠ {len(sr["id_mismatched_matches"])} 个 FIFO 匹配对存在 ID 不一致') + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") + sections.append("") + detail_sections.append("## FIFO 匹配但 ID 不一致(完整)") + detail_sections.append("") + for m in sr["id_mismatched_matches"]: + detail_sections.append( + f'- [{m.get("select_ts","")}] worker={_strip_scheme(m.get("worker",""))} ' + f'select_id={m.get("select_id","")} release_id={m.get("release_id","")} note={m.get("note","")}' + ) + detail_sections.append("") for u in sr["untracked_selects"]: detail_sections.append( f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' @@ -239,4 +317,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + 
sections.append(" 末状态详情见: [detail/load_counter_state.md](../detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..24af9a23500 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses = _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def _diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ 
-294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..4dc5832c103 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,83 @@ def _normalize_worker_type(worker_type): return "unknown" +def _normalize_worker_url_key(url): + if not url: + return "" + return re.sub(r"^https?://", "", str(url).strip().rstrip("/")) + + +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - 
s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -525,6 +602,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": tm.group(2), + "worker_key": _normalize_worker_url_key(tm.group(2)), "type": _normalize_worker_type(tm.group(1)), "tags": tags, "tokens": int(tm.group(3)), @@ -536,12 +614,15 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + "worker_key": _normalize_worker_url_key(trm.group(2)), + # 文本默认按 prefill 记,再结合同 worker 邻近 select 做纠偏(mixed 场景) + "type": f'{_normalize_worker_type(token_type or "prefill")}_tokens', + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -555,6 +636,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": sm.group(2), + "worker_key": _normalize_worker_url_key(sm.group(2)), "type": _normalize_worker_type(sm.group(1)), "tags": tags, "tokens": None, @@ -569,6 +651,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": rm.group(2), + "worker_key": _normalize_worker_url_key(rm.group(2)), "type": _normalize_worker_type(rm.group(1)), "tags": tags, "tokens": None, @@ -580,28 +663,22 @@ def match_select_release(lines, fallback_window_s=120): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id / alt_id + # Match by worker FIFO(select -> 同 worker 下一条 release) 
matched = [] unmatched_selects = [] release_used = set() # 请求生命周期匹配只使用 request counter release(排除 token release) + # 说明:request_id 只用于覆盖率观测,不参与 select/release 配对条件。 counter_release_indexes = [i for i, r in enumerate(releases) if not str(r.get("type", "")).endswith("_tokens")] - release_by_key = defaultdict(list) - for i in counter_release_indexes: - r = releases[i] - _, key = _select_match_key(r.get("tags", {})) - if key: - release_by_key[key].append(i) - # 请求 ID 覆盖(按 select 事件近似请求数) total_req_est = len(selects) with_request_id = 0 with_alt_id = 0 without_any_id = 0 - pending_selects = [] untracked_selects = [] + pending_selects = [] for s in selects: key_type, key = _select_match_key(s.get("tags", {})) if key_type == "request_id": @@ -611,7 +688,6 @@ def match_select_release(lines, fallback_window_s=120): else: without_any_id += 1 - found = False if not key: # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) untracked_selects.append( @@ -623,53 +699,74 @@ def match_select_release(lines, fallback_window_s=120): "note": "no correlatable id (request_id/req_id/trace_id/session_id)", } ) - continue - - if key and key in release_by_key: - for ri in release_by_key[key]: - if ri not in release_used: - r = releases[ri] - matched.append( - { - "request_id": s["tags"].get("request_id", ""), - "worker": s["worker"], - "select_ts": s["ts"], - "release_ts": r["ts"], - "type": s["type"], - "match_method": key_type or "id", - } - ) - release_used.add(ri) - found = True - break - - if not found: - pending_selects.append(s) + pending_selects.append(s) + + # worker FIFO + ID 一致性联合校验: + # 1) 主匹配仍按 worker FIFO,保证在缺失 request_id 场景可工作 + # 2) 对已匹配对追加 ID 一致性检查(request_id/req_id/trace_id/session_id) + id_consistency = { + "both_present_and_equal": 0, + "both_present_but_mismatch": 0, + "only_select_has_id": 0, + "only_release_has_id": 0, + "both_missing": 0, + } + id_mismatched_matches = [] - # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 for s in pending_selects: - sdt = _parse_ts_safe(s["ts"]) + sdt = 
_parse_ts_safe(s.get("ts")) best_idx = None - best_delta = None + best_ts = None for ri in counter_release_indexes: - r = releases[ri] if ri in release_used: continue - if r.get("worker") != s.get("worker"): + r = releases[ri] + if r.get("worker_key") != s.get("worker_key"): continue rdt = _parse_ts_safe(r.get("ts")) - if sdt and rdt: - delta = (rdt - sdt).total_seconds() - if delta < 0 or delta > fallback_window_s: - continue - else: - delta = 0 - if best_delta is None or delta < best_delta: - best_delta = delta + # 优先选择时间不早于 select 的最早 release;解析失败则按出现顺序 + if sdt and rdt and rdt < sdt: + continue + if best_idx is None: + best_idx = ri + best_ts = rdt + elif rdt and best_ts and rdt < best_ts: best_idx = ri + best_ts = rdt if best_idx is not None: r = releases[best_idx] + s_key_type, s_key = _select_match_key(s.get("tags", {})) + r_key_type, r_key = _select_match_key(r.get("tags", {})) + if s_key and r_key: + if s_key == r_key: + id_check = "match" + id_consistency["both_present_and_equal"] += 1 + else: + id_check = "mismatch" + id_consistency["both_present_but_mismatch"] += 1 + id_mismatched_matches.append( + { + "worker": s["worker"], + "select_ts": s["ts"], + "release_ts": r["ts"], + "select_id_key": s_key_type, + "select_id": s_key, + "release_id_key": r_key_type, + "release_id": r_key, + "note": "worker FIFO matched, but ID mismatched", + } + ) + elif s_key and not r_key: + id_check = "select_only" + id_consistency["only_select_has_id"] += 1 + elif (not s_key) and r_key: + id_check = "release_only" + id_consistency["only_release_has_id"] += 1 + else: + id_check = "both_missing" + id_consistency["both_missing"] += 1 + matched.append( { "request_id": s["tags"].get("request_id", ""), @@ -677,7 +774,8 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], - "match_method": "worker_time_fallback", + "match_method": "worker_fifo", + "id_check": id_check, } ) release_used.add(best_idx) @@ -688,7 
+786,7 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found (request_id/worker-time)", + "note": "no matching release found (worker FIFO)", } ) @@ -697,14 +795,16 @@ def match_select_release(lines, fallback_window_s=120): per_worker = defaultdict(lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_releases": 0}) for s in selects: s_type = _normalize_worker_type(s.get("type")) - per_worker[s["worker"]]["selects"] += 1 + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker[wkey]["selects"] += 1 if s_type in ("prefill", "mixed"): - per_worker[s["worker"]]["token_selects"] += 1 + per_worker[wkey]["token_selects"] += 1 for r in releases: + wkey = r.get("worker_key") or _normalize_worker_url_key(r.get("worker")) if str(r.get("type", "")).endswith("_tokens"): - per_worker[r["worker"]]["token_releases"] += 1 + per_worker[wkey]["token_releases"] += 1 else: - per_worker[r["worker"]]["releases"] += 1 + per_worker[wkey]["releases"] += 1 pw_result = {} for w, counts in per_worker.items(): @@ -716,7 +816,34 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 基于 select 构建 worker URL -> dominant type 映射 + per_worker_type_counts = defaultdict(lambda: defaultdict(int)) + for s in selects: + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker_type_counts[wkey][_normalize_worker_type(s.get("type"))] += 1 + worker_dominant_type = {} + for w, counts in per_worker_type_counts.items(): + worker_dominant_type[w] = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] if counts else "unknown" + + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = 
_normalize_worker_type(r_type_raw.replace("_tokens", "")) + # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 + mapped_t = worker_dominant_type.get(r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown") + if mapped_t in ("prefill", "decode", "mixed"): + base_t = mapped_t + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,16 +857,57 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + # 每个 worker URL 的类型画像(基于 select) + worker_type_profile = {} + for w, counts in per_worker_type_counts.items(): + dominant = "unknown" + if counts: + dominant = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] + worker_type_profile[w] = { + "dominant_type": dominant, + "prefill": counts.get("prefill", 0), + "decode": counts.get("decode", 0), + "mixed": counts.get("mixed", 0), + "unknown": counts.get("unknown", 0), + } + + unmatched_releases = [] + for i, r in enumerate(releases): + if str(r.get("type", "")).endswith("_tokens"): + # token release: 近邻存在 prefill/mixed select 则视为可解释,不计入 unmatched + 
inferred_token_type = _normalize_worker_type(str(inferred_release_types.get(i, "unknown_tokens")).replace("_tokens", "")) + if inferred_token_type == "unknown": + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": inferred_token_type, + "release_kind": "token_release", + } + ) + continue + if i not in release_used: + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": _normalize_worker_type(inferred_release_types.get(i, "unknown")), + "release_kind": "request_release", + } + ) + return { "matched": matched, "unmatched_selects": unmatched_selects, + "unmatched_releases": unmatched_releases, "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, @@ -750,7 +918,10 @@ def match_select_release(lines, fallback_window_s=120): "with_alt_id": with_alt_id, "without_any_id": without_any_id, }, + "id_consistency": id_consistency, + "id_mismatched_matches": id_mismatched_matches, "type_summary": dict(type_summary), + "worker_type_profile": worker_type_profile, } @@ -949,6 +1120,24 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + check("id consistency exact match", 
msr["id_consistency"].get("both_present_and_equal", 0), 1) + + mismatch_lines = [ + "[INFO] 2026/04/12 10:01:00 logger.go:1: [request_id:r2] select worker (decode): http://10.0.0.2:9965, count: 1", + "[INFO] 2026/04/12 10:01:01 logger.go:1: [request_id:r3] release worker: http://10.0.0.2:9965, count: 0", + ] + mm = match_select_release(mismatch_lines) + check("id mismatch detected", mm["id_consistency"].get("both_present_but_mismatch", 0), 1) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..803bf6fba43 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -24,6 +24,7 @@ import os import sys from datetime import datetime +from pathlib import Path # 确保能 import 同级模块 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -46,23 +47,56 @@ def determine_log_file(user_path=None): 3. 
fd-router.log(golang_router 根目录) """ if user_path: - if os.path.isfile(user_path): - return user_path + p = Path(user_path).expanduser() + if p.is_file(): + return str(p) print(f"ERROR: 文件不存在: {user_path}", file=sys.stderr) + print( + "提示: 若路径含空格/括号,请使用引号,例如: " + "python3 scripts/troubleshoot.py 'fastdeploy/golang_router/logs/fd-router (2).log' --load", + file=sys.stderr, + ) sys.exit(1) - # 尝试不同 CWD 下的候选路径 - candidates = [ - "logs/router.log", # CWD = golang_router/ - "fd-router.log", # CWD = golang_router/ - "fastdeploy/golang_router/logs/router.log", # CWD = 项目根 - "fastdeploy/golang_router/fd-router.log", # CWD = 项目根 + # 统一基于脚本位置与当前工作目录搜索,避免 CWD 差异导致找不到日志。 + script_dir = Path(__file__).resolve().parent + golang_router_dir = script_dir.parents[2] # .../fastdeploy/golang_router + cwd = Path.cwd() + + # 精确候选(优先常见命名) + exact_candidates = [ + golang_router_dir / "logs" / "router.log", + golang_router_dir / "fd-router.log", + cwd / "logs" / "router.log", + cwd / "fd-router.log", + cwd / "fastdeploy" / "golang_router" / "logs" / "router.log", + cwd / "fastdeploy" / "golang_router" / "fd-router.log", ] - for path in candidates: - if os.path.isfile(path): - return path + for p in exact_candidates: + if p.is_file(): + return str(p) + + # 模糊候选:支持 fd-router (2).log 等命名 + pattern_roots = [ + golang_router_dir / "logs", + golang_router_dir, + cwd / "logs", + cwd, + cwd / "fastdeploy" / "golang_router" / "logs", + cwd / "fastdeploy" / "golang_router", + ] + dynamic_candidates = [] + for root in pattern_roots: + if not root.is_dir(): + continue + dynamic_candidates.extend(sorted(root.glob("fd-router*.log"))) + dynamic_candidates.extend(sorted(root.glob("router*.log"))) + + if dynamic_candidates: + return str(dynamic_candidates[0]) print("ERROR: 未找到日志文件。请指定路径或检查 logs/ 目录。", file=sys.stderr) + print("已搜索: logs/router.log, fd-router.log, fd-router*.log, router*.log", file=sys.stderr) sys.exit(1) @@ -128,7 +162,14 @@ def determine_status(results): reasons.append(d["message"]) if 
reasons: - return "DEGRADED", ", ".join(reasons) + # 去重但保留完整信息 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + return "DEGRADED", ";".join(deduped) if not results: return "HEALTHY", "无分析数据" @@ -148,19 +189,65 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "errors_topn": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") # 各维度报告 if "errors" in results: parts.append(format_errors_report(results["errors"])) + if results["errors"].get("error_top_n"): + lines = [ + "# Errors TopN 详情", + "", + "| 模板 | 数量 | 级别 | 来源层 | 影响 |", + "|:--|--:|:--|:--|:--|", + ] + for e in results["errors"]["error_top_n"]: + lines.append( + f'| {e.get("template","")} | {e.get("count",0)} | {e.get("level","")} | {e.get("source_layer","")} | {e.get("impact","-")} |' + ) + lines.append("") + lines.append("## 涉及 URLs") + lines.append("") + for e in results["errors"]["error_top_n"]: + urls = e.get("urls") or [] + if not urls: + continue + lines.append(f'- 模板: {e.get("template","")}') + for u in urls: + lines.append(f' - {u}') + lines.append("") + details["errors_topn"] = "\n".join(lines) if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: + 
lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +260,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} 
reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +353,40 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if 
details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + if details.get("errors_topn"): + with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: + f.write(details["errors_topn"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From a317cecf8df73eaa751ff1cd8e4612e9818899d1 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 12:56:38 +0800 Subject: [PATCH 19/40] fix(load): treat positive req delta as possible in-flight requests --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 + .../references/report_templates.md | 12 +- .../troubleshoot/scripts/analyzers/cache.py | 131 +++++++++- .../troubleshoot/scripts/analyzers/errors.py | 56 +++-- .../troubleshoot/scripts/analyzers/health.py | 38 ++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 81 ++++-- .../scripts/analyzers/load_report.py | 77 +++--- .../troubleshoot/scripts/analyzers/trace.py | 25 +- .../skills/troubleshoot/scripts/chart.py | 3 +- 
.../skills/troubleshoot/scripts/log_parser.py | 234 ++++++++++++------ .../troubleshoot/scripts/troubleshoot.py | 198 +++++++++++++-- 13 files changed, 710 insertions(+), 165 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 
统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..cd705d02816 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,15 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/errors_topn.md` — ERROR/WARN 模板明细(数量/级别/来源层/影响 + URLs) + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..3a5c19ad00b 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def 
analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ + { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ 
def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in 
sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: + detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + 
detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..f0e4c352b6c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role 
is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 +85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 
200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -266,22 +298,16 @@ def format_errors_report(result): sections.append(render_bar(bar_data, show_count=True)) sections.append("") - # 来源层表格 - table_data = [] - for e in result["error_top_n"][:10]: - table_data.append( - { - "模板": e["template"][:60], - "数量": e["count"], - "占比": f'{e["pct"]}%', - "级别": e["level"], - "来源层": e["source_layer"], - } - ) - sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) - ) + sections.append(" 具体模板表见: [../detail/errors_topn.md](../detail/errors_topn.md)") sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..5d1994d9405 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { 
"events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + 
right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("> 完整事件详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..508cf3824d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](../detail/latency_diagnoses.md)") sections.append("") return 
"\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..7b59b6c5f01 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,21 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + "worker_type_profile": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +166,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, "token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +204,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = defaultdict( + lambda: { + 
"req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: @@ -254,25 +316,16 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, # Select/Release 不一致(仅在存在可关联 ID 时启用,避免无 ID 场景误报) if has_correlatable_ids: for w_url, pw in sr_result.get("per_worker", {}).items(): - if pw.get("delta", 0) > 0: + delta = pw.get("delta", 0) + if delta >= 3: diagnoses.append( { - "severity": "HIGH", - "message": f'{_strip_scheme(w_url)} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "severity": "MEDIUM", + "message": f'{_strip_scheme(w_url)} select-release 差值 {delta}(可能存在在途请求堆积)', "source_layer": "FD 后端", } ) - # 卡住的请求 - if sr_result.get("unmatched_selects"): - diagnoses.append( - { - "severity": "HIGH", - "message": f'{len(sr_result["unmatched_selects"])} 个 select 无对应 release(疑似卡住)', - "source_layer": "FD 后端", - } - ) - # Token 计数器潜在泄漏 for t in token_stats: if 
t.get("alloc_count", 0) > t.get("release_count", 0): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..5cbdc829bf6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,7 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](../detail/load_diagnoses.md)') sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +38,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +108,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: `release prefill tokens` 会被识别为 token-release;worker type 按该 worker URL 在 select 中的类型映射(prefill/decode/mixed)。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -174,10 +177,35 @@ def format_load_report(result): if no_correlatable_id: sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") 
sections.append("") + sections.append(" ℹ ReqDelta>0 可能仅表示仍有在途请求(尚未完成推理),需结合时间窗口观察。") + sections.append("") sections.append(" 说明: TokenSelect 按 worker type 统计(prefill + mixed 的 select 都计入),不依赖日志里是否出现 tokens 字段。") sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") + + if sr.get("worker_type_profile"): + sections.append("### Worker URL 类型画像(基于 select)") + sections.append("") + rows = [] + for w, p in sorted(sr["worker_type_profile"].items()): + rows.append( + { + "Worker": _strip_scheme(w), + "Dominant": p.get("dominant_type", "unknown"), + "Prefill": p.get("prefill", 0), + "Decode": p.get("decode", 0), + "Mixed": p.get("mixed", 0), + } + ) + sections.append( + render_table( + rows, + columns=["Worker", "Dominant", "Prefill", "Decode", "Mixed"], + right_align={"Prefill", "Decode", "Mixed"}, + ) + ) + sections.append("") detail_sections.append( render_table( table_data, @@ -187,35 +215,6 @@ def format_load_report(result): ) detail_sections.append("") - if sr.get("unmatched_selects"): - sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') - sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") - for u in sr["unmatched_selects"][:3]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") - sections.append("") - detail_sections.append("## 未匹配 select(完整)") - detail_sections.append("") - for u in sr["unmatched_selects"]: - detail_sections.append( - f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' - ) - detail_sections.append("") - - if sr.get("untracked_selects"): - sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') - for u in sr["untracked_selects"][:3]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - 
sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") - sections.append("") - detail_sections.append("## Untracked selects(缺少可关联 ID)") - detail_sections.append("") - for u in sr["untracked_selects"]: - detail_sections.append( - f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' - ) - detail_sections.append("") - if sr.get("failed_selects"): sections.append(f' ⚠ Failed to select: {len(sr["failed_selects"])} 次') sections.append(" 解释: 路由在该时刻未能选出可用 worker,通常意味着可用池不足或健康状态异常。") @@ -239,4 +238,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + sections.append(" 末状态详情见: [detail/load_counter_state.md](../detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..24af9a23500 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses 
= _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def _diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ -294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..2f98511a811 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,83 @@ def _normalize_worker_type(worker_type): return "unknown" +def _normalize_worker_url_key(url): + if not url: + return "" + return re.sub(r"^https?://", "", str(url).strip().rstrip("/")) + + +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return 
_normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -525,6 +602,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": tm.group(2), + "worker_key": _normalize_worker_url_key(tm.group(2)), "type": _normalize_worker_type(tm.group(1)), "tags": tags, "tokens": int(tm.group(3)), @@ -536,12 +614,15 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + "worker_key": _normalize_worker_url_key(trm.group(2)), + # 文本默认按 prefill 记,再结合同 worker 邻近 select 做纠偏(mixed 场景) + "type": f'{_normalize_worker_type(token_type or "prefill")}_tokens', + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -555,6 
+636,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": sm.group(2), + "worker_key": _normalize_worker_url_key(sm.group(2)), "type": _normalize_worker_type(sm.group(1)), "tags": tags, "tokens": None, @@ -569,6 +651,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": rm.group(2), + "worker_key": _normalize_worker_url_key(rm.group(2)), "type": _normalize_worker_type(rm.group(1)), "tags": tags, "tokens": None, @@ -580,20 +663,14 @@ def match_select_release(lines, fallback_window_s=120): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id / alt_id + # Match by worker FIFO(select -> 同 worker 下一条 release) matched = [] unmatched_selects = [] release_used = set() # 请求生命周期匹配只使用 request counter release(排除 token release) + # 说明:request_id 只用于覆盖率观测,不参与 select/release 配对条件。 counter_release_indexes = [i for i, r in enumerate(releases) if not str(r.get("type", "")).endswith("_tokens")] - release_by_key = defaultdict(list) - for i in counter_release_indexes: - r = releases[i] - _, key = _select_match_key(r.get("tags", {})) - if key: - release_by_key[key].append(i) - # 请求 ID 覆盖(按 select 事件近似请求数) total_req_est = len(selects) with_request_id = 0 @@ -601,7 +678,6 @@ def match_select_release(lines, fallback_window_s=120): without_any_id = 0 pending_selects = [] - untracked_selects = [] for s in selects: key_type, key = _select_match_key(s.get("tags", {})) if key_type == "request_id": @@ -611,62 +687,28 @@ def match_select_release(lines, fallback_window_s=120): else: without_any_id += 1 - found = False - if not key: - # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) - untracked_selects.append( - { - "worker": s["worker"], - "select_ts": s["ts"], - "type": s["type"], - "tags": s["tags"], - "note": "no correlatable id (request_id/req_id/trace_id/session_id)", - } - ) - continue + pending_selects.append(s) - if key and key in release_by_key: - for ri in release_by_key[key]: - 
if ri not in release_used: - r = releases[ri] - matched.append( - { - "request_id": s["tags"].get("request_id", ""), - "worker": s["worker"], - "select_ts": s["ts"], - "release_ts": r["ts"], - "type": s["type"], - "match_method": key_type or "id", - } - ) - release_used.add(ri) - found = True - break - - if not found: - pending_selects.append(s) - - # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 for s in pending_selects: - sdt = _parse_ts_safe(s["ts"]) + sdt = _parse_ts_safe(s.get("ts")) best_idx = None - best_delta = None + best_ts = None for ri in counter_release_indexes: - r = releases[ri] if ri in release_used: continue - if r.get("worker") != s.get("worker"): + r = releases[ri] + if r.get("worker_key") != s.get("worker_key"): continue rdt = _parse_ts_safe(r.get("ts")) - if sdt and rdt: - delta = (rdt - sdt).total_seconds() - if delta < 0 or delta > fallback_window_s: - continue - else: - delta = 0 - if best_delta is None or delta < best_delta: - best_delta = delta + # 优先选择时间不早于 select 的最早 release;解析失败则按出现顺序 + if sdt and rdt and rdt < sdt: + continue + if best_idx is None: best_idx = ri + best_ts = rdt + elif rdt and best_ts and rdt < best_ts: + best_idx = ri + best_ts = rdt if best_idx is not None: r = releases[best_idx] @@ -677,7 +719,7 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], - "match_method": "worker_time_fallback", + "match_method": "worker_fifo", } ) release_used.add(best_idx) @@ -688,7 +730,7 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found (request_id/worker-time)", + "note": "no matching release found (worker FIFO)", } ) @@ -697,14 +739,16 @@ def match_select_release(lines, fallback_window_s=120): per_worker = defaultdict(lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_releases": 0}) for s in selects: s_type = _normalize_worker_type(s.get("type")) - 
per_worker[s["worker"]]["selects"] += 1 + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker[wkey]["selects"] += 1 if s_type in ("prefill", "mixed"): - per_worker[s["worker"]]["token_selects"] += 1 + per_worker[wkey]["token_selects"] += 1 for r in releases: + wkey = r.get("worker_key") or _normalize_worker_url_key(r.get("worker")) if str(r.get("type", "")).endswith("_tokens"): - per_worker[r["worker"]]["token_releases"] += 1 + per_worker[wkey]["token_releases"] += 1 else: - per_worker[r["worker"]]["releases"] += 1 + per_worker[wkey]["releases"] += 1 pw_result = {} for w, counts in per_worker.items(): @@ -716,7 +760,34 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 基于 select 构建 worker URL -> dominant type 映射 + per_worker_type_counts = defaultdict(lambda: defaultdict(int)) + for s in selects: + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker_type_counts[wkey][_normalize_worker_type(s.get("type"))] += 1 + worker_dominant_type = {} + for w, counts in per_worker_type_counts.items(): + worker_dominant_type[w] = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] if counts else "unknown" + + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 + mapped_t = worker_dominant_type.get(r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown") + if mapped_t in ("prefill", "decode", "mixed"): + base_t = mapped_t + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = 
_infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,17 +801,32 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + # 每个 worker URL 的类型画像(基于 select) + worker_type_profile = {} + for w, counts in per_worker_type_counts.items(): + dominant = "unknown" + if counts: + dominant = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] + worker_type_profile[w] = { + "dominant_type": dominant, + "prefill": counts.get("prefill", 0), + "decode": counts.get("decode", 0), + "mixed": counts.get("mixed", 0), + "unknown": counts.get("unknown", 0), + } + return { "matched": matched, "unmatched_selects": unmatched_selects, - "untracked_selects": untracked_selects, + "unmatched_releases": [], "failed_selects": failed_selects, "per_worker": pw_result, "id_coverage": { @@ -751,6 +837,7 @@ def match_select_release(lines, fallback_window_s=120): "without_any_id": without_any_id, }, "type_summary": dict(type_summary), + "worker_type_profile": worker_type_profile, } @@ -949,6 +1036,15 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker 
(mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..803bf6fba43 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -24,6 +24,7 @@ import os import sys from datetime import datetime +from pathlib import Path # 确保能 import 同级模块 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -46,23 +47,56 @@ def determine_log_file(user_path=None): 3. 
fd-router.log(golang_router 根目录) """ if user_path: - if os.path.isfile(user_path): - return user_path + p = Path(user_path).expanduser() + if p.is_file(): + return str(p) print(f"ERROR: 文件不存在: {user_path}", file=sys.stderr) + print( + "提示: 若路径含空格/括号,请使用引号,例如: " + "python3 scripts/troubleshoot.py 'fastdeploy/golang_router/logs/fd-router (2).log' --load", + file=sys.stderr, + ) sys.exit(1) - # 尝试不同 CWD 下的候选路径 - candidates = [ - "logs/router.log", # CWD = golang_router/ - "fd-router.log", # CWD = golang_router/ - "fastdeploy/golang_router/logs/router.log", # CWD = 项目根 - "fastdeploy/golang_router/fd-router.log", # CWD = 项目根 + # 统一基于脚本位置与当前工作目录搜索,避免 CWD 差异导致找不到日志。 + script_dir = Path(__file__).resolve().parent + golang_router_dir = script_dir.parents[2] # .../fastdeploy/golang_router + cwd = Path.cwd() + + # 精确候选(优先常见命名) + exact_candidates = [ + golang_router_dir / "logs" / "router.log", + golang_router_dir / "fd-router.log", + cwd / "logs" / "router.log", + cwd / "fd-router.log", + cwd / "fastdeploy" / "golang_router" / "logs" / "router.log", + cwd / "fastdeploy" / "golang_router" / "fd-router.log", ] - for path in candidates: - if os.path.isfile(path): - return path + for p in exact_candidates: + if p.is_file(): + return str(p) + + # 模糊候选:支持 fd-router (2).log 等命名 + pattern_roots = [ + golang_router_dir / "logs", + golang_router_dir, + cwd / "logs", + cwd, + cwd / "fastdeploy" / "golang_router" / "logs", + cwd / "fastdeploy" / "golang_router", + ] + dynamic_candidates = [] + for root in pattern_roots: + if not root.is_dir(): + continue + dynamic_candidates.extend(sorted(root.glob("fd-router*.log"))) + dynamic_candidates.extend(sorted(root.glob("router*.log"))) + + if dynamic_candidates: + return str(dynamic_candidates[0]) print("ERROR: 未找到日志文件。请指定路径或检查 logs/ 目录。", file=sys.stderr) + print("已搜索: logs/router.log, fd-router.log, fd-router*.log, router*.log", file=sys.stderr) sys.exit(1) @@ -128,7 +162,14 @@ def determine_status(results): reasons.append(d["message"]) if 
reasons: - return "DEGRADED", ", ".join(reasons) + # 去重但保留完整信息 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + return "DEGRADED", ";".join(deduped) if not results: return "HEALTHY", "无分析数据" @@ -148,19 +189,65 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "errors_topn": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") # 各维度报告 if "errors" in results: parts.append(format_errors_report(results["errors"])) + if results["errors"].get("error_top_n"): + lines = [ + "# Errors TopN 详情", + "", + "| 模板 | 数量 | 级别 | 来源层 | 影响 |", + "|:--|--:|:--|:--|:--|", + ] + for e in results["errors"]["error_top_n"]: + lines.append( + f'| {e.get("template","")} | {e.get("count",0)} | {e.get("level","")} | {e.get("source_layer","")} | {e.get("impact","-")} |' + ) + lines.append("") + lines.append("## 涉及 URLs") + lines.append("") + for e in results["errors"]["error_top_n"]: + urls = e.get("urls") or [] + if not urls: + continue + lines.append(f'- 模板: {e.get("template","")}') + for u in urls: + lines.append(f' - {u}') + lines.append("") + details["errors_topn"] = "\n".join(lines) if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: + 
lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +260,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} 
reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +353,40 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if 
details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + if details.get("errors_topn"): + with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: + f.write(details["errors_topn"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From 0afcff2beef44a81e350280f6f6d279d756b8e51 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 14:35:02 +0800 Subject: [PATCH 20/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- .../troubleshoot/scripts/analyzers/load.py | 7 +- .../skills/troubleshoot/scripts/log_parser.py | 33 +- fastdeploy/golang_router/cmd/main.go | 10 +- .../config/config.example.yaml | 3 + .../config/config.example.yaml | 3 + .../golang_router/internal/config/config.go | 16 +- .../internal/gateway/completions_test.go | 2 +- .../internal/manager/health_test.go | 2 +- .../internal/middleware/logger_test.go | 2 +- .../scheduler/handler/prefill_cache_aware.go | 3 + .../scheduler/handler/tokenizer_test.go | 12 +- fastdeploy/golang_router/pkg/logger/logger.go | 288 +++++++++++++++++- .../golang_router/pkg/logger/logger_test.go | 15 +- 13 files 
changed, 335 insertions(+), 61 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index 7b59b6c5f01..83b9c8a05e1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -15,7 +15,6 @@ from log_parser import extract_ts, match_select_release, parse_stats_line from stats import compute_statistics, time_bucket -from analyzers.load_report import format_load_report # ════════════════════════════════════════════════════════════════ # Counter 异常检测正则 @@ -82,7 +81,7 @@ def analyze_load(log_file, tail=None): r"counter preserved|cleanup unhealthy|removed counters|counter already|double-release|preserved counters", tail, ) - h11_lines = _grep_lines(log_file, r"release (?:[a-zA-Z_]+\s+)?tokens", tail) + h11_lines = _grep_lines(log_file, r"release [a-zA-Z_]+ tokens:", tail) # 解析 stats 行 stats_records = [r for line in h7_lines for r in [parse_stats_line(line)] if r] @@ -321,7 +320,7 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, diagnoses.append( { "severity": "MEDIUM", - "message": f'{_strip_scheme(w_url)} select-release 差值 {delta}(可能存在在途请求堆积)', + "message": f"{_strip_scheme(w_url)} select-release 差值 {delta}(可能存在在途请求堆积)", "source_layer": "FD 后端", } ) @@ -345,8 +344,6 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, # ════════════════════════════════════════════════════════════════ - - # ════════════════════════════════════════════════════════════════ # Grep 工具 # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 44a0f285fe5..99864e1de16 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -506,6 +506,7 @@ def _normalize_worker_url_key(url): return "" return re.sub(r"^https?://", "", str(url).strip().rstrip("/")) + def _infer_release_worker_type(release, selects, fallback_window_s=120): """为未显式标注 type 的 release 近似推断 worker type。 @@ -554,7 +555,11 @@ def _infer_token_release_worker_type(release, selects, fallback_window_s=120): return "unknown" r_ts = _parse_ts_safe(release.get("ts")) - candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + candidates = [ + s + for s in selects + if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed") + ] if not candidates: return "unknown" @@ -716,31 +721,14 @@ def match_select_release(lines, fallback_window_s=120): if s_key and r_key: if s_key == r_key: id_check = "match" - id_consistency["both_present_and_equal"] += 1 else: id_check = "mismatch" - id_consistency["both_present_but_mismatch"] += 1 - id_mismatched_matches.append( - { - "worker": s["worker"], - "select_ts": s["ts"], - "release_ts": r["ts"], - "select_id_key": s_key_type, - "select_id": s_key, - "release_id_key": r_key_type, - "release_id": r_key, - "note": "worker FIFO matched, but ID mismatched", - } - ) elif s_key and not r_key: id_check = "select_only" - id_consistency["only_select_has_id"] += 1 elif (not s_key) and r_key: id_check = "release_only" - id_consistency["only_release_has_id"] += 1 else: id_check = "both_missing" - id_consistency["both_missing"] += 1 matched.append( { @@ -750,6 +738,7 @@ def match_select_release(lines, fallback_window_s=120): "release_ts": r["ts"], "type": s["type"], "match_method": "worker_fifo", + "id_check": id_check, } ) release_used.add(best_idx) @@ -806,7 +795,9 @@ def match_select_release(lines, fallback_window_s=120): if 
r_type_raw.endswith("_tokens"): base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 - mapped_t = worker_dominant_type.get(r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown") + mapped_t = worker_dominant_type.get( + r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown" + ) if mapped_t in ("prefill", "decode", "mixed"): base_t = mapped_t inferred_release_types[i] = f"{base_t}_tokens" @@ -866,8 +857,6 @@ def match_select_release(lines, fallback_window_s=120): "with_alt_id": with_alt_id, "without_any_id": without_any_id, }, - "id_consistency": id_consistency, - "id_mismatched_matches": id_mismatched_matches, "type_summary": dict(type_summary), "worker_type_profile": worker_type_profile, } @@ -1077,7 +1066,7 @@ def check(name, got, expected): msr = match_select_release(sample_lines) check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) - + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/cmd/main.go b/fastdeploy/golang_router/cmd/main.go index e0e8c98e137..6664436823c 100644 --- a/fastdeploy/golang_router/cmd/main.go +++ b/fastdeploy/golang_router/cmd/main.go @@ -41,7 +41,14 @@ func main() { } // Initialize logger - logger.Init(cfg.Log.Level, cfg.Log.Output) + logCfg := logger.Config{ + Level: cfg.Log.Level, + Output: cfg.Log.Output, + MaxAgeDays: cfg.Log.MaxAgeDays, + MaxTotalSizeMB: cfg.Log.MaxTotalSizeMB, + CleanupIntervalSecs: cfg.Log.CleanupIntervalSecs, + } + logger.Init(logCfg) defer logger.CloseLogFile() // Initialize manager @@ -59,6 +66,7 @@ func main() { go scheduler_handler.StartBackupCleanupTask(context.Background(), intervalCleanupSecs) statsIntervalSecs := cfg.Scheduler.StatsIntervalSecs go 
scheduler_handler.StartStatsReporter(context.Background(), statsIntervalSecs) + go logger.StartLogCleanup(context.Background(), logCfg) // Start server addr := ":" + cfg.Server.Port diff --git a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml index be4b11227d2..5e1091b0eef 100644 --- a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml @@ -29,3 +29,6 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + max-age-days: 7 # max days to keep log files; default: 7 + max-total-size-mb: 500 # max total log size in MB; default: 500 + cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml index be4b11227d2..5e1091b0eef 100644 --- a/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml @@ -29,3 +29,6 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + max-age-days: 7 # max days to keep log files; default: 7 + max-total-size-mb: 500 # max total log size in MB; default: 500 + cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/internal/config/config.go b/fastdeploy/golang_router/internal/config/config.go index 2cb8226961d..f184a5b16da 100644 --- a/fastdeploy/golang_router/internal/config/config.go +++ b/fastdeploy/golang_router/internal/config/config.go @@ -49,8 +49,11 @@ type SchedulerConfig struct { } type LogConfig struct { - Level string `yaml:"level"` // debug, info, warn, error - Output string 
`yaml:"output"` // stdout, file + Level string `yaml:"level"` // debug, info, warn, error + Output string `yaml:"output"` // stdout, file + MaxAgeDays int `yaml:"max-age-days"` // max days to keep log files; 0 = use default (7) + MaxTotalSizeMB int `yaml:"max-total-size-mb"` // max total log size in MB; 0 = use default (500) + CleanupIntervalSecs float64 `yaml:"cleanup-interval-secs"` // cleanup check interval in seconds; 0 = use default (3600) } func Load(configPath, listenPort string, isSplitwise bool) (*Config, error) { @@ -81,6 +84,15 @@ func Load(configPath, listenPort string, isSplitwise bool) (*Config, error) { if cfg.Log.Level == "" { cfg.Log.Level = "info" } + if cfg.Log.MaxAgeDays == 0 { + cfg.Log.MaxAgeDays = 7 + } + if cfg.Log.MaxTotalSizeMB == 0 { + cfg.Log.MaxTotalSizeMB = 500 + } + if cfg.Log.CleanupIntervalSecs == 0 { + cfg.Log.CleanupIntervalSecs = 3600 + } if cfg.Manager.HealthCheckEndpoint == "" { cfg.Manager.HealthCheckEndpoint = "/health" } diff --git a/fastdeploy/golang_router/internal/gateway/completions_test.go b/fastdeploy/golang_router/internal/gateway/completions_test.go index 825544ff5e3..4fea9736ad6 100644 --- a/fastdeploy/golang_router/internal/gateway/completions_test.go +++ b/fastdeploy/golang_router/internal/gateway/completions_test.go @@ -20,7 +20,7 @@ import ( ) func TestMain(m *testing.M) { - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) gin.SetMode(gin.TestMode) os.Exit(m.Run()) } diff --git a/fastdeploy/golang_router/internal/manager/health_test.go b/fastdeploy/golang_router/internal/manager/health_test.go index bc42031d85f..f50ea2d00b2 100644 --- a/fastdeploy/golang_router/internal/manager/health_test.go +++ b/fastdeploy/golang_router/internal/manager/health_test.go @@ -15,7 +15,7 @@ import ( func init() { // Initialize logger for all tests - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) } func TestCheckServiceHealth(t *testing.T) { 
diff --git a/fastdeploy/golang_router/internal/middleware/logger_test.go b/fastdeploy/golang_router/internal/middleware/logger_test.go index da9c7290567..47b63742547 100644 --- a/fastdeploy/golang_router/internal/middleware/logger_test.go +++ b/fastdeploy/golang_router/internal/middleware/logger_test.go @@ -12,7 +12,7 @@ import ( func init() { // Initialize logger to avoid nil pointer dereference in recovery middleware - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) } func TestLoggerMiddleware(t *testing.T) { diff --git a/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go b/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go index 48737c03c72..2259087d619 100644 --- a/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go +++ b/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go @@ -384,6 +384,9 @@ func (c *radixPrefixCache) Record(tokens []int, worker string) { // evictionWorker periodically evicts inactive nodes func (c *radixPrefixCache) evictionWorker(interval time.Duration) { + if interval <= 0 { + return + } ticker := time.NewTicker(interval) defer ticker.Stop() for { diff --git a/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go b/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go index d3b6dacfdc4..e1155e3686b 100644 --- a/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go +++ b/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go @@ -586,13 +586,13 @@ func TestParseTokensFromBody(t *testing.T) { name: "invalid JSON format", input: []byte(`invalid json`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, { name: "empty body", input: []byte(``), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal 
failed"), }, { name: "large array of tokens", @@ -610,13 +610,13 @@ func TestParseTokensFromBody(t *testing.T) { name: "non-array input_ids", input: []byte(`{"input_ids": "not an array"}`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, { name: "malformed array", input: []byte(`{"input_ids": [1, "two", 3]}`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, } @@ -629,8 +629,8 @@ func TestParseTokensFromBody(t *testing.T) { t.Errorf("parseTokensFromBody() error = %v, wantErr %v", err, tt.err) return } - if err != nil && tt.err != nil && err.Error() != tt.err.Error() { - t.Errorf("parseTokensFromBody() error message = %v, want %v", err.Error(), tt.err.Error()) + if err != nil && tt.err != nil && !strings.Contains(err.Error(), tt.err.Error()) { + t.Errorf("parseTokensFromBody() error message = %v, want containing %v", err.Error(), tt.err.Error()) return } diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 8e213fc0c9f..07412670628 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -1,12 +1,26 @@ package logger import ( + "context" + "fmt" "log" "os" + "path/filepath" + "sort" + "strings" "sync" - "context" + "time" ) +// Config holds logger configuration. +type Config struct { + Level string + Output string + MaxAgeDays int + MaxTotalSizeMB int + CleanupIntervalSecs float64 +} + var ( infoLogger *log.Logger errorLogger *log.Logger @@ -14,37 +28,166 @@ var ( debugLogger *log.Logger level string once sync.Once - logFile *os.File + writer *rotatingWriter // nil when output is stdout ) +// nowFunc is overridable in tests for time-dependent logic. 
+var nowFunc = time.Now + type contextKey string + const TraceIDKey contextKey = "trace_id" const ReqIDKey contextKey = "req_id" const RequestIDKey contextKey = "request_id" const SessionIDKey contextKey = "session_id" -// Init initialize logger -func Init(logLevel, output string) { - once.Do(func() { - level = logLevel +// gracePeriod is how long we keep the previous day's file open after rotation. +const gracePeriod = 5 * time.Minute + +// rotatingWriter implements io.Writer with day-level rotation and dual-file writes. +// Current day's log is always "router.log"; on day change it is renamed to +// "router-YYYY-MM-DD.log" and a new "router.log" is created. During a short +// grace period after rotation, log lines whose timestamp belongs to the previous +// day are written to the archived file. +type rotatingWriter struct { + mu sync.Mutex + currentFile *os.File // today's router.log + prevFile *os.File // previous day's router-.log during grace period (may be nil) + currentDate string // "2006-01-02" + prevDate string // previous date during grace period + graceUntil time.Time // when to close prevFile + logDir string +} + +func newRotatingWriter(logDir string) (*rotatingWriter, error) { + today := nowFunc().Format("2006-01-02") + f, err := os.OpenFile(filepath.Join(logDir, "router.log"), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + return nil, err + } + return &rotatingWriter{ + currentFile: f, + currentDate: today, + logDir: logDir, + }, nil +} + +func (w *rotatingWriter) Write(p []byte) (n int, err error) { + w.mu.Lock() + defer w.mu.Unlock() + + today := nowFunc().Format("2006-01-02") + + // Detect day change and rotate. + if today != w.currentDate { + w.rotateLocked(today) + } + + // Close previous file if grace period expired. + if w.prevFile != nil && nowFunc().After(w.graceUntil) { + w.prevFile.Close() + w.prevFile = nil + w.prevDate = "" + } + + // During grace period, route log lines to the correct file based on timestamp. 
+ target := w.currentFile + if w.prevFile != nil { + if logDate := parseLogDate(p); logDate == w.prevDate { + target = w.prevFile + } + } + + return target.Write(p) +} + +func (w *rotatingWriter) Close() error { + w.mu.Lock() + defer w.mu.Unlock() + if w.prevFile != nil { + w.prevFile.Close() + w.prevFile = nil + } + if w.currentFile != nil { + return w.currentFile.Close() + } + return nil +} + +// rotateLocked performs the actual file rotation. Must be called with w.mu held. +func (w *rotatingWriter) rotateLocked(newDate string) { + // Close any lingering previous file. + if w.prevFile != nil { + w.prevFile.Close() + w.prevFile = nil + } + + // Close current router.log so we can rename it. + if w.currentFile != nil { + w.currentFile.Close() + } + + // Rename router.log -> router-.log + oldPath := filepath.Join(w.logDir, "router.log") + archivePath := filepath.Join(w.logDir, "router-"+w.currentDate+".log") + if err := os.Rename(oldPath, archivePath); err != nil { + // Rename failed; try to reopen router.log and continue without rotation. + w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + return + } + + // Open the archived file for dual-write grace period. + w.prevFile, _ = os.OpenFile(archivePath, os.O_WRONLY|os.O_APPEND, 0666) + w.prevDate = w.currentDate + w.graceUntil = nowFunc().Add(gracePeriod) + + // Create new router.log for the new day. + w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + w.currentDate = newDate +} + +// parseLogDate extracts the date from a log line produced by log.LstdFlags. +// Format: "[LEVEL] 2006/01/02 15:04:05 ..." +// Returns "2006-01-02" or empty string on parse failure. +func parseLogDate(p []byte) string { + // Find the date pattern "YYYY/MM/DD" in the log prefix. + // log.LstdFlags produces: "2006/01/02 15:04:05" after the logger prefix. + // The prefix is like "[INFO] " (7 chars), so the date starts around index 7. 
+ s := string(p) + for i := 0; i+10 <= len(s); i++ { + c := s[i] + if c >= '1' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { + // Found a candidate "YYYY/MM/DD" + year := s[i : i+4] + month := s[i+5 : i+7] + day := s[i+8 : i+10] + return year + "-" + month + "-" + day + } + } + return "" +} +// Init initializes the logger. +func Init(cfg Config) { + once.Do(func() { + level = cfg.Level flags := log.LstdFlags | log.Lshortfile - if output == "file" { - // Check if logs directory exists + if cfg.Output == "file" { if _, err := os.Stat("logs"); os.IsNotExist(err) { if err := os.MkdirAll("logs", 0755); err != nil { log.Fatalln("Failed to create logs directory:", err) } } - logFile, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + var err error + writer, err = newRotatingWriter("logs") if err != nil { - log.Fatalln("Failed to open log file:", err) + log.Fatalln("Failed to create rotating log writer:", err) } - infoLogger = log.New(logFile, "[INFO] ", flags) - errorLogger = log.New(logFile, "[ERROR] ", flags) - warnLogger = log.New(logFile, "[WARN] ", flags) - debugLogger = log.New(logFile, "[DEBUG] ", flags) + infoLogger = log.New(writer, "[INFO] ", flags) + errorLogger = log.New(writer, "[ERROR] ", flags) + warnLogger = log.New(writer, "[WARN] ", flags) + debugLogger = log.New(writer, "[DEBUG] ", flags) } else { infoLogger = log.New(os.Stdout, "[INFO] ", flags) errorLogger = log.New(os.Stderr, "[ERROR] ", flags) @@ -54,9 +197,122 @@ func Init(logLevel, output string) { }) } +// CloseLogFile closes the log file if in file output mode. func CloseLogFile() { - if logFile != nil { - logFile.Close() + if writer != nil { + writer.Close() + } +} + +// StartLogCleanup runs periodic log cleanup in a background goroutine. +// It deletes archived log files older than MaxAgeDays and trims total log size +// to stay under MaxTotalSizeMB. 
+func StartLogCleanup(ctx context.Context, cfg Config) { + if cfg.Output != "file" { + return + } + if cfg.CleanupIntervalSecs <= 0 { + return + } + + ticker := time.NewTicker(time.Duration(cfg.CleanupIntervalSecs * float64(time.Second))) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + cleanupLogs("logs", cfg.MaxAgeDays, cfg.MaxTotalSizeMB) + } + } +} + +type logFileInfo struct { + name string + path string + date time.Time + size int64 +} + +// cleanupLogs removes archived log files based on age and total size limits. +func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { + entries, err := os.ReadDir(logDir) + if err != nil { + fmt.Fprintf(os.Stderr, "[WARN] Failed to read log directory for cleanup: %v\n", err) + return + } + + now := nowFunc() + var archives []logFileInfo + var routerLogSize int64 + + for _, entry := range entries { + if entry.IsDir() { + continue + } + name := entry.Name() + info, err := entry.Info() + if err != nil { + continue + } + + // Count router.log size but never delete it. + if name == "router.log" { + routerLogSize = info.Size() + continue + } + + // Match archived files: router-YYYY-MM-DD.log + if !strings.HasPrefix(name, "router-") || !strings.HasSuffix(name, ".log") { + continue + } + dateStr := strings.TrimPrefix(name, "router-") + dateStr = strings.TrimSuffix(dateStr, ".log") + fileDate, err := time.Parse("2006-01-02", dateStr) + if err != nil { + continue + } + archives = append(archives, logFileInfo{ + name: name, + path: filepath.Join(logDir, name), + date: fileDate, + size: info.Size(), + }) + } + + // Sort by date ascending (oldest first). + sort.Slice(archives, func(i, j int) bool { + return archives[i].date.Before(archives[j].date) + }) + + // Phase 1: Age-based cleanup. 
+ if maxAgeDays > 0 { + cutoff := now.AddDate(0, 0, -maxAgeDays) + remaining := archives[:0] + for _, f := range archives { + if f.date.Before(cutoff) { + os.Remove(f.path) + } else { + remaining = append(remaining, f) + } + } + archives = remaining + } + + // Phase 2: Size-based cleanup. + if maxTotalSizeMB > 0 { + maxBytes := int64(maxTotalSizeMB) * 1024 * 1024 + var totalSize int64 = routerLogSize + for _, f := range archives { + totalSize += f.size + } + for len(archives) > 0 && totalSize > maxBytes { + oldest := archives[0] + os.Remove(oldest.path) + totalSize -= oldest.size + archives = archives[1:] + } } } diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index 59faeee2a4d..fea0b853cf7 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -10,7 +10,7 @@ import ( func TestLoggerInit(t *testing.T) { t.Run("stdout output", func(t *testing.T) { - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) if infoLogger == nil || errorLogger == nil || warnLogger == nil || debugLogger == nil { t.Error("Loggers should be initialized") @@ -117,7 +117,7 @@ func TestLogLevels(t *testing.T) { func TestLogFunctions(t *testing.T) { var buf bytes.Buffer - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) level = "debug" // Redirect output @@ -132,7 +132,7 @@ func TestLogFunctions(t *testing.T) { } func TestContextPrefix(t *testing.T) { - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) level = "debug" t.Run("nil context produces no prefix", func(t *testing.T) { @@ -151,7 +151,7 @@ func TestContextPrefix(t *testing.T) { } }) - t.Run("context without request_id produces [request_id:null]", func(t *testing.T) { + t.Run("context without request_id produces no request_id prefix", func(t *testing.T) { var buf bytes.Buffer oldOutput := infoLogger.Writer() defer func() { 
infoLogger.SetOutput(oldOutput) }() @@ -160,8 +160,11 @@ func TestContextPrefix(t *testing.T) { ctx := context.Background() Info(ctx, "mixed mode log") output := buf.String() - if !strings.Contains(output, "[request_id:null]") { - t.Errorf("context without request_id should produce [request_id:null], got: %s", output) + if strings.Contains(output, "[request_id:") { + t.Errorf("context without request_id should not produce request_id prefix, got: %s", output) + } + if !strings.Contains(output, "mixed mode log") { + t.Errorf("message should be present, got: %s", output) } }) From e03b69f559e50fa25c3c0101042f99e8c18aabfb Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 14:58:29 +0800 Subject: [PATCH 21/40] docs: add router troubleshoot playbook with skill workflow --- docs/zh/online_serving/router_faq.md | 1 + .../router_troubleshoot_playbook.md | 190 ++++++++++++++++++ .../.claude/skills/troubleshoot/SKILL.md | 5 +- .../troubleshoot/scripts/analyzers/trace.py | 141 ++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 2 + 5 files changed, 328 insertions(+), 11 deletions(-) create mode 100644 docs/zh/online_serving/router_troubleshoot_playbook.md diff --git a/docs/zh/online_serving/router_faq.md b/docs/zh/online_serving/router_faq.md index 9c32726f4dc..a431065dbf0 100644 --- a/docs/zh/online_serving/router_faq.md +++ b/docs/zh/online_serving/router_faq.md @@ -5,6 +5,7 @@ 本文档基于 [Golang Router](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/golang_router) 的代码实现,汇总了 Router 在使用过程中常见的日志信息、返回输出及问题排查方法,帮助用户快速定位和解决问题。 Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 +如需按“日志定位 + troubleshoot skill”流程化排查,请参考 [Router 问题排查实战手册](router_troubleshoot_playbook.md)。 ## 常见日志分析 diff --git a/docs/zh/online_serving/router_troubleshoot_playbook.md b/docs/zh/online_serving/router_troubleshoot_playbook.md new file mode 100644 index 00000000000..0ccee9c6d55 --- /dev/null +++ b/docs/zh/online_serving/router_troubleshoot_playbook.md @@ 
-0,0 +1,190 @@ +# Router 问题排查实战手册(日志定位 + troubleshoot skill) + +本文档结合以下两部分信息整理: +- Router 常见问题与日志语义:[`docs/zh/online_serving/router_faq.md`](router_faq.md) +- `fastdeploy/golang_router/.claude/skills/troubleshoot` 的脚本能力与使用方式 + +目标:给出一套可落地的排查流程,帮助你从“现象”快速定位到“日志证据”和“处理建议”。 + +--- + +## 1. 先定范围:全量 / 尾部 / 指定时间段 + +建议先根据问题发生时间选择分析范围(这是和分析模式并列的维度): + +- **全量分析**:适合历史慢性问题、趋势问题。 +- **尾部分析(`--tail`)**:适合刚发生的故障,优先看最近 N 行或 N 分钟。 +- **指定时间段(`--start/--end`)**:适合已知故障窗口(例如 14:05~14:20)。 + +> 说明:`--tail` 与 `--start/--end` 互斥,二选一。 + +--- + +## 2. 先看健康与注册,再看调度与请求 + +根据 `router_faq.md` 的建议,先确认“有没有可用实例”,再看“请求是否调度成功”。 + +### 2.1 健康与注册检查(必做) + +```bash +# 已注册实例列表 +curl -X GET http://{router_url}/registered + +# 已注册实例数量 +curl -X GET http://{router_url}/registered_number + +# 从 Router 机器检查后端健康 +curl -X GET http://{server_url}/health +``` + +重点日志关键词: +- 健康移除:`Removed unhealthy ... instance` +- 注册失败:`Failed to register instance` +- 健康检查失败:`failed to send request to ...` / `Server ... is not healthy` + +若实例都不健康或未注册,后续 502/503 多数是结果,不是根因。 + +### 2.2 调度失败检查 + +常见错误: +- `Failed to select worker` +- `Failed to select worker pair` +- `No available prefill/decode workers` + +这类问题先确认: +1) 注册数量是否为 0; +2) 调度策略与部署模式是否匹配; +3) `fd_metrics_score` 依赖的 `/metrics` 是否可访问。 + +### 2.3 请求链路与后端请求失败 + +常见日志: +- `Failed to connect to backend service` +- `Request failed (attempt n/max)` +- `Decode/Prefill/Backend request failed for {url}` +- `Panic recovered` + +这类问题通常需要结合 trace(ID 级别)看完整链路。 + +--- + +## 3. 
使用 troubleshoot skill 的标准方式 + +脚本入口(在 `fastdeploy/golang_router/` 下): + +```bash +SCRIPTS=.claude/skills/troubleshoot/scripts +python3 $SCRIPTS/troubleshoot.py [options] +``` + +### 3.1 全量体检(默认推荐首轮) + +```bash +python3 $SCRIPTS/troubleshoot.py +``` + +会同时输出:errors / latency / health / cache / load 的综合结果。 + +### 3.2 指定维度分析(精准打点) + +```bash +python3 $SCRIPTS/troubleshoot.py --errors +python3 $SCRIPTS/troubleshoot.py --latency +python3 $SCRIPTS/troubleshoot.py --health +python3 $SCRIPTS/troubleshoot.py --cache +python3 $SCRIPTS/troubleshoot.py --load +``` + +### 3.3 请求追踪(ID 级排查) + +```bash +# 单个 ID +python3 $SCRIPTS/troubleshoot.py --trace + +# 多个 ID +python3 $SCRIPTS/troubleshoot.py --trace "id1,id2,id3" +``` + +trace 会展示: +- 匹配到的 tag 类型(request_id / trace_id / session_id / req_id) +- 生命周期完整性 +- 事件链(含原始日志 RAW) +- 仅 request_id / 仅 session_id / 仅 trace_id 的统计 +- 各标签组合形式(detail 中给出组合与对应 ID) + +### 3.4 范围过滤与 trace 组合 + +当你要“在某个时间窗内追踪某个 ID”时,使用范围参数和 trace 组合: + +```bash +python3 $SCRIPTS/troubleshoot.py --start "2026/04/13 14:05:00" --end "2026/04/13 14:20:00" --trace "" +``` + +这符合“范围维度(全量/尾部/时间段)”与“模式维度(含 trace)”分离的使用方式。 + +--- + +## 4. 一套可复制的故障定位流程 + +### 步骤 A:确认故障窗口与错误现象 +- 收集用户报错时间、HTTP 状态码(502/503/500/400)和请求路径。 + +### 步骤 B:先跑时间窗综合分析 +```bash +python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" +``` +- 看 STATUS(HEALTHY / DEGRADED / CRITICAL)。 +- 优先看 errors、health 章节,判断是否是后端健康/注册问题。 + +### 步骤 C:按症状进入专项 +- 502/503:`--errors --health --load` +- 延迟突增:`--latency --load --cache` +- 单请求失败:`--trace `(可叠加步骤 B 的时间窗) + +### 步骤 D:在 detail 文件中取证 +报告目录默认: +`skill_output/troubleshoot//` + +重点文件: +- `summary/troubleshoot_report.md` +- `detail/trace_.md` +- `detail/health_events.md` +- `detail/load_select_release.md` + +--- + +## 5. 
现象到日志的快速映射 + +| 现象 | 优先看日志/关键词 | 推荐命令 | +|---|---|---| +| 503 无可用 worker | `No available prefill/decode workers`, `Removed unhealthy ...` | `--health --errors` | +| 502 调度失败 | `Failed to select worker`, `Failed to select worker pair` | `--errors --health --load` | +| 502 后端连接失败 | `Failed to connect to backend service`, `Request failed (attempt ...)` | `--errors --trace ` | +| 请求卡住/链路不完整 | 有 select 无 release、无 `Request completed successfully.` | `--trace ` | +| 延迟抖动 | HTTP latency、`[stats] total_running...` | `--latency --load --cache` | + +--- + +## 6. 常见误区 + +1. **只看 502/503 响应,不看健康与注册日志**:容易把“结果”当“根因”。 +2. **不限定时间窗口**:日志噪音大,容易误判。 +3. **trace 只看结构化事件,不看 RAW**:可能漏掉关键上下文(例如同一秒的 WARN/ERROR 细节)。 +4. **把范围维度和模式维度混在一起**:建议先定范围(全量/尾部/时间段),再定模式(完整/多维/trace)。 + +--- + +## 7. 推荐排查命令模板 + +```bash +# 模板 1:故障窗口综合体检 +python3 $SCRIPTS/troubleshoot.py --start "YYYY/MM/DD HH:MM:SS" --end "YYYY/MM/DD HH:MM:SS" + +# 模板 2:最近 30 分钟快速巡检 +python3 $SCRIPTS/troubleshoot.py --tail 30m + +# 模板 3:单请求深挖(配合时间窗) +python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" --trace "" +``` + +如果你已经知道故障集中在特定 ID,优先从模板 3 入手,然后回到模板 1 看全局背景。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 7f7a5793e91..2ea74156c82 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -55,10 +55,13 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) - 选项 2: `单维度/多维度分析` — 选择特定维度(errors / latency / health / cache / load),可选多个 -- 选项 3: `请求追踪` — 追踪特定请求 ID(需提供 ID) +- 选项 3: `请求追踪` — 追踪特定请求 ID 如果用户未选择,默认使用完整分析。 +当用户选择“请求追踪”选项时,AskUserQuestion 的选项文案应直接提示可输入: +- `trace_id/request_id/session_id`(逗号分隔多 ID) + ### 4. 
输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 24af9a23500..37006121994 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -112,15 +112,21 @@ def analyze_trace(log_file, trace_ids, tail=None): sr_check = match_select_release(all_lines) diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) + tag_coverage = _build_id_coverage_stats(all_lines) + tag_combos = _build_id_combo_stats(all_lines) + matched_tags = _detect_matched_tags(all_lines, tid) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, "sr_check": sr_check, - "matched_tag": "session_id" if is_session else "request_id/trace_id", + "matched_tag": _format_matched_tag(matched_tags), + "matched_tags": matched_tags, "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], }, + "id_coverage": tag_coverage, + "id_combos": tag_combos, } total_traced = len(traces) @@ -152,13 +158,14 @@ def _parse_event_chain(lines): "path": http["path"], "status": http["status"], "latency_ms": http["latency_ms"], + "raw": line.strip(), } ) continue # Parsing completed if PARSING_COMPLETE_RE.search(line): - events.append({"ts": ts, "type": "PARSING_COMPLETE", "tags": tags}) + events.append({"ts": ts, "type": "PARSING_COMPLETE", "tags": tags, "raw": line.strip()}) continue # Cache-aware strategy @@ -172,6 +179,7 @@ def _parse_event_chain(lines): "strategy": strategy.get("strategy"), "selected": strategy.get("selected", ""), "selected_hitRatio": strategy.get("selected_hitRatio", 0), + "raw": line.strip(), } ) continue @@ -186,6 +194,7 @@ def 
_parse_event_chain(lines): "tags": tags, "worker_type": m.group(1) or "unknown", "worker": m.group(2), + "raw": line.strip(), } ) continue @@ -200,6 +209,7 @@ def _parse_event_chain(lines): "tags": tags, "worker_type": m.group(1) or "unknown", "worker": m.group(2), + "raw": line.strip(), } ) continue @@ -214,6 +224,7 @@ def _parse_event_chain(lines): "tags": tags, "worker": m.group(1), "tokens": int(m.group(2)), + "raw": line.strip(), } ) continue @@ -221,39 +232,45 @@ def _parse_event_chain(lines): # Prefill events m = PREFILL_FIRST_CHUNK_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1)}) + events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1), "raw": line.strip()}) continue m = PREFILL_DONE_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_DONE", "tags": tags, "worker": m.group(1)}) + events.append({"ts": ts, "type": "PREFILL_DONE", "tags": tags, "worker": m.group(1), "raw": line.strip()}) continue m = PREFILL_ERROR_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2)}) + events.append( + {"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2), "raw": line.strip()} + ) continue m = PREFILL_DEFER_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_DEFER_RELEASE", "tags": tags, "worker": m.group(1)}) + events.append( + {"ts": ts, "type": "PREFILL_DEFER_RELEASE", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) continue m = PREFILL_ERR_PATH_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1)}) + events.append( + {"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) continue # Request completed if REQUEST_COMPLETE_RE.search(line): - events.append({"ts": ts, "type": "REQUEST_COMPLETE", 
"tags": tags}) + events.append({"ts": ts, "type": "REQUEST_COMPLETE", "tags": tags, "raw": line.strip()}) continue # ts_ms m = TS_MS_RE.search(line) if m: - events.append({"ts": ts, "type": "TS_MS", "tags": tags, "ts_ms": m.group(1)}) + events.append({"ts": ts, "type": "TS_MS", "tags": tags, "ts_ms": m.group(1), "raw": line.strip()}) continue # Failed to select if FAILED_SELECT_RE.search(line): - events.append({"ts": ts, "type": "FAILED_SELECT", "tags": tags}) + events.append({"ts": ts, "type": "FAILED_SELECT", "tags": tags, "raw": line.strip()}) continue # 按时间排序 @@ -339,6 +356,12 @@ def format_trace_report(result): sections.append(f"### ID: {tid}") if trace.get("matched_tag"): sections.append(f' 匹配类型: {trace["matched_tag"]}') + if trace.get("id_coverage"): + c = trace["id_coverage"] + sections.append( + " ID统计: " + f'request_only={c["request_only"]}, session_only={c["session_only"]}, trace_only={c["trace_only"]}' + ) if trace.get("related_ids", {}).get("request_ids"): sections.append(f' 关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') @@ -357,6 +380,19 @@ def format_trace_report(result): detail_lines = [f"# 请求追踪事件链: {tid}", ""] if trace.get("matched_tag"): detail_lines.append(f'匹配类型: {trace["matched_tag"]}') + if trace.get("id_coverage"): + c = trace["id_coverage"] + detail_lines.append("ID覆盖统计:") + detail_lines.append( + f'- only_request_id: {c["request_only"]} | only_session_id: {c["session_only"]} | only_trace_id: {c["trace_only"]}' + ) + if trace.get("id_combos"): + detail_lines.append("") + detail_lines.append("标签组合明细(按唯一ID计数):") + for item in trace["id_combos"]: + detail_lines.append( + f'- combo={item["combo"]} | count={item["count"]} | ids={", ".join(item["ids"])}' + ) if trace.get("related_ids", {}).get("request_ids"): detail_lines.append(f'关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') detail_lines.append(f"生命周期: {status}") @@ -379,7 +415,11 @@ def format_trace_report(result): line += f' tokens={evt["tokens"]}' if 
evt.get("error"): line += f' error={evt["error"]}' + if evt.get("ts_ms"): + line += f' ts_ms={evt["ts_ms"]}' detail_lines.append(line) + if evt.get("raw"): + detail_lines.append(f' RAW: {evt["raw"]}') detail_lines.append("") detail_dict[tid] = "\n".join(detail_lines) @@ -413,3 +453,84 @@ def _grep_lines(log_file, pattern, tail=None): def _shell_quote(s): return "'" + s.replace("'", "'\\''") + "'" + + +def _detect_matched_tags(lines, target_id): + matched = set() + for line in lines: + tags = extract_tags(line) + for key in ("request_id", "trace_id", "session_id", "req_id"): + if tags.get(key) == target_id: + matched.add(key) + return sorted(matched) + + +def _format_matched_tag(matched_tags): + if not matched_tags: + return "unknown" + if len(matched_tags) == 1: + return matched_tags[0] + return "+".join(matched_tags) + + +def _build_id_coverage_stats(lines): + request_only_ids = set() + session_only_ids = set() + trace_only_ids = set() + + for line in lines: + tags = extract_tags(line) + req_val = tags.get("request_id") or tags.get("req_id") + session_val = tags.get("session_id") + trace_val = tags.get("trace_id") + has_request = bool(req_val) + has_session = bool(session_val) + has_trace = bool(trace_val) + + if has_request and not has_session and not has_trace: + request_only_ids.add(req_val) + if has_session and not has_request and not has_trace: + session_only_ids.add(session_val) + if has_trace and not has_request and not has_session: + trace_only_ids.add(trace_val) + + return { + "request_only": len(request_only_ids), + "session_only": len(session_only_ids), + "trace_only": len(trace_only_ids), + } + + +def _build_id_combo_stats(lines): + combo_to_ids = {} + for line in lines: + tags = extract_tags(line) + keys = [] + if tags.get("request_id"): + keys.append("request_id") + if tags.get("req_id"): + keys.append("req_id") + if tags.get("session_id"): + keys.append("session_id") + if tags.get("trace_id"): + keys.append("trace_id") + combo = "+".join(keys) if 
keys else "no_id_tag" + + ids = [] + if tags.get("request_id"): + ids.append(tags["request_id"]) + if tags.get("req_id"): + ids.append(tags["req_id"]) + if tags.get("session_id"): + ids.append(tags["session_id"]) + if tags.get("trace_id"): + ids.append(tags["trace_id"]) + id_key = "|".join(ids) if ids else "" + + combo_to_ids.setdefault(combo, set()).add(id_key) + + rows = [] + for combo, ids in combo_to_ids.items(): + rows.append({"combo": combo, "count": len(ids), "ids": sorted(ids)}) + rows.sort(key=lambda x: x["count"], reverse=True) + return rows diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 803bf6fba43..8378cbe20a1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -430,6 +430,8 @@ def main(): # 时间范围预过滤(--start 和 --end 可单独或同时指定) import atexit + start_ts = None + end_ts = None if args.start or args.end: start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None From 09c18242446243c966f9d54f334c4cc164149496 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 15:29:26 +0800 Subject: [PATCH 22/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- .../troubleshoot/scripts/analyzers/cache.py | 41 +++++++++++-------- .../troubleshoot/scripts/troubleshoot.py | 34 +++++++++++---- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3a5c19ad00b..57a1490d3fd 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -456,7 +456,7 @@ def format_cache_report(result): sections.append("") table_data = [ { - "Session": sid[:16], + "Session": sid, "请求数": str(s["total_requests"]), "粘性率": f'{s["stickiness_pct"]}%', "切换次数": str(s["switches"]), @@ -530,28 +530,35 @@ def format_cache_report(result): detail_sections.append( render_table( result["cross_diagnosis"], - columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + columns=[ + "avg_stickiness_pct", + "mean_hitRatio_pct", + "fallback_pct", + "evicted_after_timeout", + "diagnosis", + "action", + ], right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, ) ) detail_sections.append("") - if any( - [ - result.get("session_stickiness"), - result.get("suboptimal_selections"), - result.get("eviction_impact"), - result.get("cross_diagnosis"), - result.get("diagnoses"), - ] - ): + # 只显示实际生成了文件的链接 + detail_links = [] + if result.get("session_stickiness"): + detail_links.append("[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") + if result.get("suboptimal_selections"): + detail_links.append("[detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") + if result.get("eviction_impact"): + detail_links.append("[detail/cache_eviction.md](../detail/cache_eviction.md)") + if result.get("fallback_reasons"): + detail_links.append("[detail/cache_fallback.md](../detail/cache_fallback.md)") + if result.get("cross_diagnosis"): + detail_links.append("[detail/cache_cross.md](../detail/cache_cross.md)") + + if detail_links: sections.append( - "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " - "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " - "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " - "[detail/cache_eviction.md](../detail/cache_eviction.md) | " - 
"[detail/cache_fallback.md](../detail/cache_fallback.md) | " - "[detail/cache_cross.md](../detail/cache_cross.md)" + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + " | ".join(detail_links) ) sections.append("") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 8378cbe20a1..d869f9c71cc 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -33,9 +33,14 @@ from analyzers.errors import analyze_errors, format_errors_report from analyzers.health import analyze_health, format_health_report from analyzers.latency import analyze_latency, format_latency_report -from analyzers.load import analyze_load, format_load_report +from analyzers.load import analyze_load +from analyzers.load_report import format_load_report from analyzers.trace import analyze_trace, format_trace_report -from log_parser import complete_time_arg, filter_file_by_recent_minutes, filter_file_by_time_range +from log_parser import ( + complete_time_arg, + filter_file_by_recent_minutes, + filter_file_by_time_range, +) def determine_log_file(user_path=None): @@ -236,7 +241,7 @@ def format_full_report(results, status, status_reason): continue lines.append(f'- 模板: {e.get("template","")}') for u in urls: - lines.append(f' - {u}') + lines.append(f" - {u}") lines.append("") details["errors_topn"] = "\n".join(lines) @@ -268,7 +273,12 @@ def format_full_report(results, status, status_reason): details["load_diagnoses"] = "\n".join(lines) if results["load"].get("counter_last_state"): rows = results["load"]["counter_last_state"] - lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + lines = [ + "# Load Counter 末状态", + "", + "| worker | 
req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", + "|:--|:--|--:|:--|--:|:--|", + ] for r in rows: lines.append( f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' @@ -285,19 +295,25 @@ def format_full_report(results, status, status_reason): if c.get("session_stickiness"): lines = ["# Cache Session 粘性详情", ""] for sid, s in c["session_stickiness"].items(): - lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append( + f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}' + ) lines.append("") details["cache_session_stickiness"] = "\n".join(lines) if c.get("suboptimal_selections"): lines = ["# Cache 非最优选择详情", ""] for x in c["suboptimal_selections"][:200]: - lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}') + lines.append( + f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}' + ) lines.append("") details["cache_suboptimal"] = "\n".join(lines) if c.get("eviction_impact"): lines = ["# Cache 驱逐影响详情", ""] for x in c["eviction_impact"][:200]: - lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append( + f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}' + ) lines.append("") details["cache_eviction"] = "\n".join(lines) if c.get("fallback_reasons"): @@ -309,7 +325,9 @@ def format_full_report(results, status, status_reason): if c.get("cross_diagnosis"): lines = ["# Cache 交叉诊断详情", ""] for x in 
c["cross_diagnosis"]: - lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append( + f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%' + ) lines.append("") details["cache_cross"] = "\n".join(lines) From fe3c0d1d0bece7395b9da49ead94533e969689b8 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 16:05:55 +0800 Subject: [PATCH 23/40] Adjust trace input flow to direct prompt instead of AskUserQuestion --- .../.claude/skills/troubleshoot/SKILL.md | 25 ++++- .../references/report_templates.md | 5 +- .../troubleshoot/scripts/analyzers/cache.py | 97 +++++++++++-------- .../troubleshoot/scripts/analyzers/trace.py | 69 ++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 60 +++++++----- 5 files changed, 184 insertions(+), 72 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 2ea74156c82..919e25a1101 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -11,14 +11,14 @@ description: > 关键词:troubleshoot、排查、router 问题、全量扫描、综合分析、error、502、latency、 health、load、cache、trace、/troubleshoot。 -IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格式和提取规则。 -错误分类时参考 references/error_catalog.md。涉及后端问题时参考 references/fastdeploy_cross_reference.md。 --- # Router Troubleshooting 综合排查 FastDeploy Go Router 问题,输出完整诊断报告。 +> IMPORTANT: 执行前务必先读取 `references/log_patterns.md` 了解日志格式和提取规则。错误分类时参考 `references/error_catalog.md`。涉及后端问题时参考 `references/fastdeploy_cross_reference.md`。 + ## 执行前交互 运行脚本前,Claude 必须按以下顺序向用户确认参数: @@ -51,6 +51,16 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 `--start/--end` 与 `--tail` 互斥。 +当用户选择“指定时间段”时,必须再发起一次 **AskUserQuestion**(离散选项)引导时间输入: +- 选项 1: 
`当天(00:00:00 到当前)`(推荐) +- 选项 2: `最近半小时`(自动换算为 `--start now-30m --end now` 语义) + +用户若通过客户端默认 `Other` 输入时间,则将该输入直接作为时间范围参数解析。 +可补充一条简短示例引导: +- 示例 1:`16:00-16:30` +- 示例 2:`03/31 16:00 ~ 03/31 18:00` +- 示例 3:`2026/03/31 16:00:00`(仅起始) + ### 3. 分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) @@ -59,8 +69,12 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 如果用户未选择,默认使用完整分析。 -当用户选择“请求追踪”选项时,AskUserQuestion 的选项文案应直接提示可输入: -- `trace_id/request_id/session_id`(逗号分隔多 ID) +当用户选择“请求追踪”后,**不要再发 AskUserQuestion** 收集 trace ID。 +直接发一条提示并等待用户输入完成后再继续执行即可。 + +提示文案建议: +- `请输入要追踪的 ID(支持 trace_id / request_id / session_id,多个用逗号分隔;输入 all 可全量追踪)` +- 示例:`a1b2c3d4` / `trace-001,trace-002` / `session-abc-123` / `all` ### 4. 输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 @@ -86,6 +100,7 @@ python3 $SCRIPTS/troubleshoot.py --load # 请求追踪(需指定 ID,支持逗号分隔多 ID) python3 $SCRIPTS/troubleshoot.py --trace python3 $SCRIPTS/troubleshoot.py --trace "id1,id2" +python3 $SCRIPTS/troubleshoot.py --trace all # 尾部分析 python3 $SCRIPTS/troubleshoot.py --tail 5000 @@ -110,6 +125,8 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro - **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` - 逐分钟事件详情拆分到 `detail/health_events.md` - 请求追踪事件链拆分到 `detail/trace_.md` +- **Cache 明细要求**:`cache_session_stickiness.md` / `cache_suboptimal.md` / `cache_eviction.md` / `cache_fallback.md` / `cache_cross.md` + 必须始终生成(即使无异常也写“未发现/样本不足”总结,避免链接缺失) - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index cd705d02816..2ec683f2299 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -102,15 
+102,18 @@ Worker 可用性时间线 可用性统计 ``` -### Cache(调度诊断)— 待实现 +### Cache(调度诊断) ``` 调度策略分布 Session 粘性分析 非最优选择分析 Fallback 原因分类 +驱逐影响与交叉诊断 ``` +要求:即使某项计数为 0(例如“非最优选择”),也要输出该小节并给出“未发现/样本不足”总结,保证 detail 链接稳定存在。 + ### Load(负载分析)— 待实现 ``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 57a1490d3fd..a12341967a0 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -449,11 +449,11 @@ def format_cache_report(result): # Session 粘性 stickiness = result.get("session_stickiness", {}) + sections.append("### Session 粘性") + sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") + sections.append("") if stickiness: - sections.append("### Session 粘性") - sections.append("") - sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") table_data = [ { "Session": sid, @@ -473,14 +473,21 @@ def format_cache_report(result): ) ) detail_sections.append("") + else: + sections.append(" 未检测到可计算粘性的多请求 Session。") + sections.append("") + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + detail_sections.append("") # 非最优选择 - if result.get("suboptimal_selections"): - subs = result["suboptimal_selections"] - sections.append(f"### 非最优选择 ({len(subs)} 次)") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + subs = result.get("suboptimal_selections") or [] + sections.append(f"### 非最优选择 ({len(subs)} 次)") + sections.append("") + sections.append(" 详情见: [detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") + sections.append("") + if subs: reason_counts = 
defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 @@ -494,15 +501,22 @@ def format_cache_report(result): f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' ) detail_sections.append("") + else: + sections.append(" 未发现非最优选择(selected_hitRatio 始终为当次最高)。") + sections.append("") + detail_sections.append("## 非最优选择") + detail_sections.append("") + detail_sections.append("- 未发现非最优选择。") + detail_sections.append("") # 驱逐影响 - if result.get("eviction_impact"): - evictions = result["eviction_impact"] - evicted = [e for e in evictions if e["evicted"]] - sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + evictions = result.get("eviction_impact") or [] + evicted = [e for e in evictions if e["evicted"]] + sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") + sections.append("") + sections.append(" 详情见: [detail/cache_eviction.md](../detail/cache_eviction.md)") + sections.append("") + if evictions: detail_sections.append("## 驱逐影响") detail_sections.append("") for e in evictions[:50]: @@ -510,6 +524,13 @@ def format_cache_report(result): f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' ) detail_sections.append("") + else: + sections.append(" 未检测到超时导致的潜在驱逐影响。") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + detail_sections.append("- 未检测到超时驱逐样本。") + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: @@ -520,11 +541,11 @@ def format_cache_report(result): detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') detail_sections.append("") + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: 
[detail/cache_cross.md](../detail/cache_cross.md)") + sections.append("") if result.get("cross_diagnosis"): - sections.append("### 交叉诊断") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") detail_sections.append("## 交叉诊断") detail_sections.append("") detail_sections.append( @@ -542,25 +563,23 @@ def format_cache_report(result): ) ) detail_sections.append("") - - # 只显示实际生成了文件的链接 - detail_links = [] - if result.get("session_stickiness"): - detail_links.append("[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") - if result.get("suboptimal_selections"): - detail_links.append("[detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") - if result.get("eviction_impact"): - detail_links.append("[detail/cache_eviction.md](../detail/cache_eviction.md)") - if result.get("fallback_reasons"): - detail_links.append("[detail/cache_fallback.md](../detail/cache_fallback.md)") - if result.get("cross_diagnosis"): - detail_links.append("[detail/cache_cross.md](../detail/cache_cross.md)") - - if detail_links: - sections.append( - "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + " | ".join(detail_links) - ) + else: + sections.append(" 样本不足,未生成交叉诊断。") sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append("- 样本不足,未生成交叉诊断。") + detail_sections.append("") + + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") return "\n".join(sections), "\n".join(detail_sections) diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 37006121994..d9a599b305c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -62,8 +62,13 @@ def analyze_trace(log_file, trace_ids, tail=None): Returns: dict: {traces: {id: {events, lifecycle_complete, diagnoses}}, summary} """ + auto_discovery_summary = "" if isinstance(trace_ids, str): - trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] + normalized = trace_ids.strip().lower() + if normalized in ("all", "full", "all_ids", "全部", "全量"): + trace_ids, auto_discovery_summary = _discover_full_trace_targets(log_file, tail=tail) + else: + trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] if not trace_ids: return {"traces": {}, "summary": "未指定追踪 ID"} @@ -132,10 +137,64 @@ def analyze_trace(log_file, trace_ids, tail=None): total_traced = len(traces) complete = sum(1 for t in traces.values() if t["lifecycle_complete"]) - return { - "traces": traces, - "summary": f"{total_traced} ID(s) 追踪, {complete} 生命周期完整", - } + summary = f"{total_traced} ID(s) 追踪, {complete} 生命周期完整" + if auto_discovery_summary: + summary += f" | {auto_discovery_summary}" + + return {"traces": traces, "summary": summary} + + +def _discover_full_trace_targets(log_file, tail=None): + """全量追踪目标发现。 + + 规则: + 1) 有 session_id 的优先按 session_id 追踪 + 2) 无 session 但有 trace_id 的按 trace_id 追踪 + 3) 剩余“孤立”的 request_id/req_id 单独追踪 + """ + lines = _grep_lines(log_file, r"session_id:|trace_id:|request_id:|req_id:", tail=tail) + if not lines: + return [], "全量追踪未发现任何可用 ID" + + session_ids = set() + trace_ids = set() + all_request_ids = set() + request_ids_with_session_or_trace = set() + + for line in lines: + tags = extract_tags(line) + sid = tags.get("session_id") + tid = 
tags.get("trace_id") + rid = tags.get("request_id") or tags.get("req_id") + has_session = bool(sid) + has_trace = bool(tid) + has_request = bool(rid) + + if has_session: + session_ids.add(sid) + if has_trace: + trace_ids.add(tid) + if has_request: + all_request_ids.add(rid) + if has_session or has_trace: + request_ids_with_session_or_trace.add(rid) + + standalone_request_ids = all_request_ids - request_ids_with_session_or_trace + + targets = [] + chosen = set() + for bucket in (sorted(session_ids), sorted(trace_ids), sorted(standalone_request_ids)): + for _id in bucket: + if _id and _id not in chosen: + chosen.add(_id) + targets.append(_id) + + summary = ( + "全量ID发现: " + f"session={len(session_ids)}, trace={len(trace_ids)}, " + f"standalone_request={len(standalone_request_ids)}, total_targets={len(targets)}" + ) + return targets, summary def _parse_event_chain(lines): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index d869f9c71cc..96a37ff9577 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -11,7 +11,7 @@ --health 仅分析 Worker 健康 --cache 仅分析 Cache 调度 --load 仅分析负载与计数器 - --trace ID 追踪指定请求(支持逗号分隔多 ID) + --trace ID 追踪指定请求(支持逗号分隔多 ID;传 all 可全量追踪) --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") @@ -292,44 +292,58 @@ def format_full_report(results, status, status_reason): if detail: details["cache_diagnosis"] = detail c = results["cache"] + lines = ["# Cache Session 粘性详情", ""] if c.get("session_stickiness"): - lines = ["# Cache Session 粘性详情", ""] for sid, s in c["session_stickiness"].items(): lines.append( f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}' ) - lines.append("") 
- details["cache_session_stickiness"] = "\n".join(lines) + else: + lines.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + + lines = ["# Cache 非最优选择详情", ""] if c.get("suboptimal_selections"): - lines = ["# Cache 非最优选择详情", ""] for x in c["suboptimal_selections"][:200]: lines.append( f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}' ) - lines.append("") - details["cache_suboptimal"] = "\n".join(lines) + else: + lines.append("- 未发现非最优选择。") + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + + lines = ["# Cache 驱逐影响详情", ""] if c.get("eviction_impact"): - lines = ["# Cache 驱逐影响详情", ""] for x in c["eviction_impact"][:200]: lines.append( f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}' ) - lines.append("") - details["cache_eviction"] = "\n".join(lines) + else: + lines.append("- 未检测到超时驱逐样本。") + lines.append("") + details["cache_eviction"] = "\n".join(lines) + + lines = ["# Cache Fallback 原因详情", ""] if c.get("fallback_reasons"): - lines = ["# Cache Fallback 原因详情", ""] for x in c["fallback_reasons"]: lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') - lines.append("") - details["cache_fallback"] = "\n".join(lines) + else: + lines.append("- 未出现 fallback 记录。") + lines.append("") + details["cache_fallback"] = "\n".join(lines) + + lines = ["# Cache 交叉诊断详情", ""] if c.get("cross_diagnosis"): - lines = ["# Cache 交叉诊断详情", ""] for x in c["cross_diagnosis"]: lines.append( f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%' ) - lines.append("") - details["cache_cross"] = "\n".join(lines) + else: + lines.append("- 样本不足,未生成交叉诊断。") + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = 
format_trace_report(results["trace"]) @@ -386,19 +400,19 @@ def save_detailed_report(report_text, output_dir, details=None): if details.get("load_counter_state"): with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: f.write(details["load_counter_state"]) - if details.get("cache_session_stickiness"): + if details.get("cache_session_stickiness") is not None: with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: f.write(details["cache_session_stickiness"]) - if details.get("cache_suboptimal"): + if details.get("cache_suboptimal") is not None: with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: f.write(details["cache_suboptimal"]) - if details.get("cache_eviction"): + if details.get("cache_eviction") is not None: with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: f.write(details["cache_eviction"]) - if details.get("cache_fallback"): + if details.get("cache_fallback") is not None: with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: f.write(details["cache_fallback"]) - if details.get("cache_cross"): + if details.get("cache_cross") is not None: with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: f.write(details["cache_cross"]) if details.get("errors_topn"): @@ -426,7 +440,7 @@ def main(): parser.add_argument("--health", action="store_true", help="仅分析 Worker 健康") parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度") parser.add_argument("--load", action="store_true", help="仅分析负载与计数器") - parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID)") + parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID;传 all 可全量追踪)") parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)") parser.add_argument( "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' @@ -478,7 +492,7 @@ def main(): 
run_health = args.health or (not any_mode) run_load = args.load or (not any_mode) run_cache = args.cache or (not any_mode) - run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 + run_trace = bool(args.trace) # trace 需要指定 ID(支持 all),全量扫描不自动调用 results = {} step = 0 From b65a31f03b2decd979b4de641ebca88aecddeb43 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 16:20:57 +0800 Subject: [PATCH 24/40] Store trace detail markdowns under detail/trace subfolder --- .../.claude/skills/troubleshoot/SKILL.md | 27 +++++- .../references/report_templates.md | 7 +- .../troubleshoot/scripts/analyzers/cache.py | 97 +++++++++++-------- .../troubleshoot/scripts/analyzers/trace.py | 73 ++++++++++++-- .../troubleshoot/scripts/troubleshoot.py | 67 ++++++++----- 5 files changed, 194 insertions(+), 77 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 2ea74156c82..00c94a2f487 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -11,14 +11,14 @@ description: > 关键词:troubleshoot、排查、router 问题、全量扫描、综合分析、error、502、latency、 health、load、cache、trace、/troubleshoot。 -IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格式和提取规则。 -错误分类时参考 references/error_catalog.md。涉及后端问题时参考 references/fastdeploy_cross_reference.md。 --- # Router Troubleshooting 综合排查 FastDeploy Go Router 问题,输出完整诊断报告。 +> IMPORTANT: 执行前务必先读取 `references/log_patterns.md` 了解日志格式和提取规则。错误分类时参考 `references/error_catalog.md`。涉及后端问题时参考 `references/fastdeploy_cross_reference.md`。 + ## 执行前交互 运行脚本前,Claude 必须按以下顺序向用户确认参数: @@ -51,6 +51,16 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 `--start/--end` 与 `--tail` 互斥。 +当用户选择“指定时间段”时,必须再发起一次 **AskUserQuestion**(离散选项)引导时间输入: +- 选项 1: `当天(00:00:00 到当前)`(推荐) +- 选项 2: `最近半小时`(自动换算为 `--start now-30m --end now` 语义) + +用户若通过客户端默认 `Other` 
输入时间,则将该输入直接作为时间范围参数解析。 +可补充一条简短示例引导: +- 示例 1:`16:00-16:30` +- 示例 2:`03/31 16:00 ~ 03/31 18:00` +- 示例 3:`2026/03/31 16:00:00`(仅起始) + ### 3. 分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) @@ -59,8 +69,12 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 如果用户未选择,默认使用完整分析。 -当用户选择“请求追踪”选项时,AskUserQuestion 的选项文案应直接提示可输入: -- `trace_id/request_id/session_id`(逗号分隔多 ID) +当用户选择“请求追踪”后,**不要再发 AskUserQuestion** 收集 trace ID。 +直接发一条提示并等待用户输入完成后再继续执行即可。 + +提示文案建议: +- `请输入要追踪的 ID(支持 trace_id / request_id / session_id,多个用逗号分隔;输入 all 可全量追踪)` +- 示例:`a1b2c3d4` / `trace-001,trace-002` / `session-abc-123` / `all` ### 4. 输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 @@ -86,6 +100,7 @@ python3 $SCRIPTS/troubleshoot.py --load # 请求追踪(需指定 ID,支持逗号分隔多 ID) python3 $SCRIPTS/troubleshoot.py --trace python3 $SCRIPTS/troubleshoot.py --trace "id1,id2" +python3 $SCRIPTS/troubleshoot.py --trace all # 尾部分析 python3 $SCRIPTS/troubleshoot.py --tail 5000 @@ -109,7 +124,9 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro - **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 - **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` - 逐分钟事件详情拆分到 `detail/health_events.md` - - 请求追踪事件链拆分到 `detail/trace_.md` + - 请求追踪事件链拆分到 `detail/trace/trace_.md` +- **Cache 明细要求**:`cache_session_stickiness.md` / `cache_suboptimal.md` / `cache_eviction.md` / `cache_fallback.md` / `cache_cross.md` + 必须始终生成(即使无异常也写“未发现/样本不足”总结,避免链接缺失) - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index cd705d02816..61db59ec7e6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -62,7 
+62,7 @@ - `detail/latency_diagnoses.md` — 延迟诊断详情 - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 - - `detail/trace_.md` — 请求追踪事件链 + - `detail/trace/trace_.md` — 请求追踪事件链 --- @@ -102,15 +102,18 @@ Worker 可用性时间线 可用性统计 ``` -### Cache(调度诊断)— 待实现 +### Cache(调度诊断) ``` 调度策略分布 Session 粘性分析 非最优选择分析 Fallback 原因分类 +驱逐影响与交叉诊断 ``` +要求:即使某项计数为 0(例如“非最优选择”),也要输出该小节并给出“未发现/样本不足”总结,保证 detail 链接稳定存在。 + ### Load(负载分析)— 待实现 ``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 57a1490d3fd..a12341967a0 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -449,11 +449,11 @@ def format_cache_report(result): # Session 粘性 stickiness = result.get("session_stickiness", {}) + sections.append("### Session 粘性") + sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") + sections.append("") if stickiness: - sections.append("### Session 粘性") - sections.append("") - sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") table_data = [ { "Session": sid, @@ -473,14 +473,21 @@ def format_cache_report(result): ) ) detail_sections.append("") + else: + sections.append(" 未检测到可计算粘性的多请求 Session。") + sections.append("") + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + detail_sections.append("") # 非最优选择 - if result.get("suboptimal_selections"): - subs = result["suboptimal_selections"] - sections.append(f"### 非最优选择 ({len(subs)} 次)") - 
sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + subs = result.get("suboptimal_selections") or [] + sections.append(f"### 非最优选择 ({len(subs)} 次)") + sections.append("") + sections.append(" 详情见: [detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") + sections.append("") + if subs: reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 @@ -494,15 +501,22 @@ def format_cache_report(result): f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' ) detail_sections.append("") + else: + sections.append(" 未发现非最优选择(selected_hitRatio 始终为当次最高)。") + sections.append("") + detail_sections.append("## 非最优选择") + detail_sections.append("") + detail_sections.append("- 未发现非最优选择。") + detail_sections.append("") # 驱逐影响 - if result.get("eviction_impact"): - evictions = result["eviction_impact"] - evicted = [e for e in evictions if e["evicted"]] - sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + evictions = result.get("eviction_impact") or [] + evicted = [e for e in evictions if e["evicted"]] + sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") + sections.append("") + sections.append(" 详情见: [detail/cache_eviction.md](../detail/cache_eviction.md)") + sections.append("") + if evictions: detail_sections.append("## 驱逐影响") detail_sections.append("") for e in evictions[:50]: @@ -510,6 +524,13 @@ def format_cache_report(result): f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' ) detail_sections.append("") + else: + sections.append(" 未检测到超时导致的潜在驱逐影响。") + sections.append("") + 
detail_sections.append("## 驱逐影响") + detail_sections.append("") + detail_sections.append("- 未检测到超时驱逐样本。") + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: @@ -520,11 +541,11 @@ def format_cache_report(result): detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') detail_sections.append("") + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_cross.md](../detail/cache_cross.md)") + sections.append("") if result.get("cross_diagnosis"): - sections.append("### 交叉诊断") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") detail_sections.append("## 交叉诊断") detail_sections.append("") detail_sections.append( @@ -542,25 +563,23 @@ def format_cache_report(result): ) ) detail_sections.append("") - - # 只显示实际生成了文件的链接 - detail_links = [] - if result.get("session_stickiness"): - detail_links.append("[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") - if result.get("suboptimal_selections"): - detail_links.append("[detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") - if result.get("eviction_impact"): - detail_links.append("[detail/cache_eviction.md](../detail/cache_eviction.md)") - if result.get("fallback_reasons"): - detail_links.append("[detail/cache_fallback.md](../detail/cache_fallback.md)") - if result.get("cross_diagnosis"): - detail_links.append("[detail/cache_cross.md](../detail/cache_cross.md)") - - if detail_links: - sections.append( - "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + " | ".join(detail_links) - ) + else: + sections.append(" 样本不足,未生成交叉诊断。") sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append("- 样本不足,未生成交叉诊断。") + detail_sections.append("") + + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + 
"[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 37006121994..d0dcbdca6d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -62,8 +62,13 @@ def analyze_trace(log_file, trace_ids, tail=None): Returns: dict: {traces: {id: {events, lifecycle_complete, diagnoses}}, summary} """ + auto_discovery_summary = "" if isinstance(trace_ids, str): - trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] + normalized = trace_ids.strip().lower() + if normalized in ("all", "full", "all_ids", "全部", "全量"): + trace_ids, auto_discovery_summary = _discover_full_trace_targets(log_file, tail=tail) + else: + trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] if not trace_ids: return {"traces": {}, "summary": "未指定追踪 ID"} @@ -132,10 +137,64 @@ def analyze_trace(log_file, trace_ids, tail=None): total_traced = len(traces) complete = sum(1 for t in traces.values() if t["lifecycle_complete"]) - return { - "traces": traces, - "summary": f"{total_traced} ID(s) 追踪, {complete} 生命周期完整", - } + summary = f"{total_traced} ID(s) 追踪, {complete} 生命周期完整" + if auto_discovery_summary: + summary += f" | {auto_discovery_summary}" + + return {"traces": traces, "summary": summary} + + +def _discover_full_trace_targets(log_file, tail=None): + """全量追踪目标发现。 + + 规则: + 1) 有 session_id 的优先按 session_id 追踪 + 2) 无 session 
但有 trace_id 的按 trace_id 追踪 + 3) 剩余“孤立”的 request_id/req_id 单独追踪 + """ + lines = _grep_lines(log_file, r"session_id:|trace_id:|request_id:|req_id:", tail=tail) + if not lines: + return [], "全量追踪未发现任何可用 ID" + + session_ids = set() + trace_ids = set() + all_request_ids = set() + request_ids_with_session_or_trace = set() + + for line in lines: + tags = extract_tags(line) + sid = tags.get("session_id") + tid = tags.get("trace_id") + rid = tags.get("request_id") or tags.get("req_id") + has_session = bool(sid) + has_trace = bool(tid) + has_request = bool(rid) + + if has_session: + session_ids.add(sid) + if has_trace: + trace_ids.add(tid) + if has_request: + all_request_ids.add(rid) + if has_session or has_trace: + request_ids_with_session_or_trace.add(rid) + + standalone_request_ids = all_request_ids - request_ids_with_session_or_trace + + targets = [] + chosen = set() + for bucket in (sorted(session_ids), sorted(trace_ids), sorted(standalone_request_ids)): + for _id in bucket: + if _id and _id not in chosen: + chosen.add(_id) + targets.append(_id) + + summary = ( + "全量ID发现: " + f"session={len(session_ids)}, trace={len(trace_ids)}, " + f"standalone_request={len(standalone_request_ids)}, total_targets={len(targets)}" + ) + return targets, summary def _parse_event_chain(lines): @@ -426,7 +485,9 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") + sections.append( + f" > 完整事件链: [detail/trace/trace_{safe_tid}.md](../detail/trace/trace_{safe_tid}.md)" + ) sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index d869f9c71cc..251a21c7e81 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -11,7 +11,7 @@ --health 仅分析 Worker 健康 --cache 仅分析 Cache 调度 --load 仅分析负载与计数器 - --trace ID 追踪指定请求(支持逗号分隔多 ID) + --trace ID 追踪指定请求(支持逗号分隔多 ID;传 all 可全量追踪) --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") @@ -191,7 +191,7 @@ def format_full_report(results, status, status_reason): details: dict 包含需要拆分到独立文件的详情数据 - 'health_events': str 或 None - 'load_select_release': str 或 None - - 'trace_files': {trace_id: text} 或 {} + - 'trace_files': {trace_id: text} 或 {}(写入 detail/trace/) """ parts = [] details = { @@ -292,44 +292,58 @@ def format_full_report(results, status, status_reason): if detail: details["cache_diagnosis"] = detail c = results["cache"] + lines = ["# Cache Session 粘性详情", ""] if c.get("session_stickiness"): - lines = ["# Cache Session 粘性详情", ""] for sid, s in c["session_stickiness"].items(): lines.append( f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}' ) - lines.append("") - details["cache_session_stickiness"] = "\n".join(lines) + else: + lines.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + + lines = ["# Cache 非最优选择详情", ""] if c.get("suboptimal_selections"): - lines = ["# Cache 非最优选择详情", ""] for x in c["suboptimal_selections"][:200]: lines.append( f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}' ) - lines.append("") - details["cache_suboptimal"] = "\n".join(lines) + else: + lines.append("- 未发现非最优选择。") + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + + lines = ["# Cache 驱逐影响详情", ""] if c.get("eviction_impact"): - lines = ["# Cache 驱逐影响详情", ""] for x in c["eviction_impact"][:200]: lines.append( f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m 
hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}' ) - lines.append("") - details["cache_eviction"] = "\n".join(lines) + else: + lines.append("- 未检测到超时驱逐样本。") + lines.append("") + details["cache_eviction"] = "\n".join(lines) + + lines = ["# Cache Fallback 原因详情", ""] if c.get("fallback_reasons"): - lines = ["# Cache Fallback 原因详情", ""] for x in c["fallback_reasons"]: lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') - lines.append("") - details["cache_fallback"] = "\n".join(lines) + else: + lines.append("- 未出现 fallback 记录。") + lines.append("") + details["cache_fallback"] = "\n".join(lines) + + lines = ["# Cache 交叉诊断详情", ""] if c.get("cross_diagnosis"): - lines = ["# Cache 交叉诊断详情", ""] for x in c["cross_diagnosis"]: lines.append( f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%' ) - lines.append("") - details["cache_cross"] = "\n".join(lines) + else: + lines.append("- 样本不足,未生成交叉诊断。") + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -386,28 +400,31 @@ def save_detailed_report(report_text, output_dir, details=None): if details.get("load_counter_state"): with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: f.write(details["load_counter_state"]) - if details.get("cache_session_stickiness"): + if details.get("cache_session_stickiness") is not None: with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: f.write(details["cache_session_stickiness"]) - if details.get("cache_suboptimal"): + if details.get("cache_suboptimal") is not None: with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: f.write(details["cache_suboptimal"]) - if details.get("cache_eviction"): + if details.get("cache_eviction") is not None: with open(os.path.join(detail_dir, 
"cache_eviction.md"), "w", encoding="utf-8") as f: f.write(details["cache_eviction"]) - if details.get("cache_fallback"): + if details.get("cache_fallback") is not None: with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: f.write(details["cache_fallback"]) - if details.get("cache_cross"): + if details.get("cache_cross") is not None: with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: f.write(details["cache_cross"]) if details.get("errors_topn"): with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: f.write(details["errors_topn"]) + trace_detail_dir = os.path.join(detail_dir, "trace") + if details.get("trace_files"): + os.makedirs(trace_detail_dir, exist_ok=True) for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") - trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") + trace_path = os.path.join(trace_detail_dir, f"trace_{safe_id}.md") with open(trace_path, "w", encoding="utf-8") as f: f.write(trace_text) @@ -426,7 +443,7 @@ def main(): parser.add_argument("--health", action="store_true", help="仅分析 Worker 健康") parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度") parser.add_argument("--load", action="store_true", help="仅分析负载与计数器") - parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID)") + parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID;传 all 可全量追踪)") parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)") parser.add_argument( "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' @@ -478,7 +495,7 @@ def main(): run_health = args.health or (not any_mode) run_load = args.load or (not any_mode) run_cache = args.cache or (not any_mode) - run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 + run_trace = bool(args.trace) # trace 需要指定 ID(支持 all),全量扫描不自动调用 results = {} step = 0 From 41f56f75932a6e90d68790a1faa19bf2acdb7d9e Mon 
Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 17:07:36 +0800 Subject: [PATCH 25/40] stat-cache-hitrate: remove watch mode and loop guidance --- .../skills/stat-cache-hitrate/SKILL.md | 15 ++-- .../references/report_templates.md | 10 --- .../scripts/stat_cache_hitrate.py | 68 ++++++++++++++++--- 3 files changed, 64 insertions(+), 29 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 251cbb04c2a..3d52bc4b2be 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -4,7 +4,7 @@ description: > 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 - 持续监控模式、指定时间段统计(--start/--end)。 + 指定时间段统计(--start/--end)。 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 @@ -37,9 +37,8 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 2. 
分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): - 选项 1: `全量统计(默认)` — 扫描完整日志 -- 选项 2: `快速查看尾部` — 只看最近的数据(可指定行数如 2000 或时间如 30m) -- 选项 3: `持续监控` — 全量分析后提示监控命令 -- 选项 4: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) +- 选项 2: `快速查看尾部` — 只看最近的数据(可指定 `2000/2k` 行,或 `30m/2h/1d` 时间窗口) +- 选项 3: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) 若用户选择“指定时间段”,直接让用户填写: - 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); @@ -66,10 +65,10 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 # 快速查看尾部数据 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail # 默认最后 2000 行 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 5000 # 指定行数 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 指定时间 - -# 持续监控 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --watch +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2k # 行数缩写 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 分钟窗口 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2h # 小时窗口 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 1d # 天窗口 # 指定时间段(--start 和 --end 可单独或同时使用) python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "16:00:00" --end "17:00:00" diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index f5a0def5f55..ebca39be2c4 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -198,14 +198,4 @@ output = f"{bar} {percentage}% (N={count})" +---+---+---+---+---→ -5m -4m -3m -2m -1m 
-💡 持续跟踪: /loop 30s /analyze-cache-hitrate --tail -``` - -## --watch 持续监控模板 - -`--watch` 模式先输出完整报告(同终端概览报告模板),末尾额外提示: - -``` -💡 全量分析完成。持续跟踪后续变化: - /loop 30s /analyze-cache-hitrate --tail ``` diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index bd85730b7d1..fc729af47c7 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -8,11 +8,12 @@ 3. Per-Worker Stats — 各 worker 缓存利用排名 用法: - python3 stat_cache_hitrate.py [--tail N|Nm] [--watch] [--output DIR] + python3 stat_cache_hitrate.py [--tail N|2k|30m|2h|1d] [--output DIR] """ import argparse import json +import math import os import re import subprocess @@ -170,7 +171,7 @@ def count_lines(filepath): def read_lines(filepath, tail=None): """读取日志文件,支持 tail 模式。""" - if tail: + if tail is not None: if isinstance(tail, str) and tail.endswith("m"): # 按时间 tail:读取全部行,过滤最近 N 分钟 minutes = int(tail[:-1]) @@ -282,7 +283,7 @@ def extract_data(filepath, tail=None): strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy", tail) stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats", tail) inference_count = grep_count(filepath, r"\] \[POST\] /v1/chat/completions |\] \[POST\] /v1/completions ", tail) - line_count = int(tail) if tail and not (isinstance(tail, str) and tail.endswith("m")) else total + line_count = int(tail) if tail is not None and not (isinstance(tail, str) and tail.endswith("m")) else total return strategy_recs, stats_recs, inference_count, line_count @@ -984,8 +985,12 @@ def parse_args(): epilog=__doc__, ) parser.add_argument("log_file", help="日志文件路径") - parser.add_argument("--tail", nargs="?", const="2000", help="只分析尾部数据(行数如 2000,或时间如 30m)") - parser.add_argument("--watch", action="store_true", 
help="全量分析后提示持续监控命令") + parser.add_argument( + "--tail", + nargs="?", + const="2000", + help="只分析尾部数据(支持 2000/2k 行,或 30m/2h/1d 时间窗口)", + ) parser.add_argument( "--output", default=None, help="详细报告输出目录(默认:skill_output/stat-cache-hitrate//)" ) @@ -996,6 +1001,45 @@ def parse_args(): return parser.parse_args() +def parse_tail_arg(tail_str): + """解析 --tail 参数,返回 int(行数) 或 'm'(时间窗口)。""" + if tail_str is None: + return None + + s = str(tail_str).strip().lower() + if not s: + raise ValueError("--tail 不能为空") + + # 行数: 2000 + if re.fullmatch(r"\d+", s): + value = int(s) + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return value + + # 行数缩写: 2k => 2000 + m = re.fullmatch(r"(\d+)k", s) + if m: + value = int(m.group(1)) * 1000 + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return value + + # 时间窗口: 30m/2h/1d(最终统一成分钟) + m = re.fullmatch(r"(\d+)(m|h|d)", s) + if m: + num = int(m.group(1)) + unit = m.group(2) + if num <= 0: + raise ValueError("--tail 时间窗口必须 > 0") + factor = {"m": 1, "h": 60, "d": 1440}[unit] + minutes = num * factor + minutes = max(1, math.ceil(minutes)) + return f"{minutes}m" + + raise ValueError("不支持的 --tail 格式:请使用 2000/2k 或 30m/2h/1d") + + def main(): args = parse_args() @@ -1009,6 +1053,12 @@ def main(): print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr) sys.exit(1) + try: + tail = parse_tail_arg(args.tail) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + # 时间范围预过滤(--start 和 --end 可单独或同时指定) import atexit @@ -1023,7 +1073,7 @@ def main(): print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) # Phase 2: 提取 + 解析 - strategy_recs, stats_recs, inference_count, line_count = extract_data(log_file, args.tail) + strategy_recs, stats_recs, inference_count, line_count = extract_data(log_file, tail) if not strategy_recs and not stats_recs: print( @@ -1039,7 +1089,7 @@ def main(): diagnosis = cross_diagnose(prefix_hr, session_hr) # Phase 4: 输出 - if args.tail: + if tail is not None: 
print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) else: time_span = compute_time_span(strategy_recs, stats_recs) @@ -1094,9 +1144,5 @@ def main(): print(f" - Session 明细: {session_abs}") print(f" URI: {session_uri}") - if args.watch: - print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") - - if __name__ == "__main__": main() From 984a925ef3d5b1d7e670445cd46adc86cb1c3214 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 17:38:32 +0800 Subject: [PATCH 26/40] skills: generalize tail shorthand parsing for line counts --- .../skills/stat-cache-hitrate/SKILL.md | 11 +-- .../scripts/stat_cache_hitrate.py | 99 +++++-------------- .../.claude/skills/troubleshoot/SKILL.md | 9 +- .../troubleshoot/scripts/troubleshoot.py | 33 ++++--- 4 files changed, 50 insertions(+), 102 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 3d52bc4b2be..ad9b3f29fd2 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -37,7 +37,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 2. 
分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): - 选项 1: `全量统计(默认)` — 扫描完整日志 -- 选项 2: `快速查看尾部` — 只看最近的数据(可指定 `2000/2k` 行,或 `30m/2h/1d` 时间窗口) +- 选项 2: `快速查看尾部` — 只看最近的数据(支持 `2000`、`1k`、`1w` 等行数写法) - 选项 3: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) 若用户选择“指定时间段”,直接让用户填写: @@ -47,6 +47,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 如果用户未选择,默认使用全量统计。 `--start/--end` 与 `--tail` 互斥。`--start` 和 `--end` 可单独或同时指定。 +`--tail` 仅支持“行数”语义(如 `2000`,也兼容 `1k/1w` 自动换算),不再支持 `30m/2h/1d` 这类时间窗口;按时间请使用 `--start/--end`。 时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 缺失部分自动从日志首末行推断。 @@ -65,12 +66,8 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 # 快速查看尾部数据 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail # 默认最后 2000 行 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 5000 # 指定行数 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2k # 行数缩写 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 分钟窗口 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2h # 小时窗口 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 1d # 天窗口 - -# 指定时间段(--start 和 --end 可单独或同时使用) +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 1k # 行数缩写(自动换算) +# 指定时间段(需要按时间筛选时使用;--start 和 --end 可单独或同时使用) python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "16:00:00" --end "17:00:00" python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "2026/03/31 16:00:00" python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index fc729af47c7..1e27f96a476 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -8,12 +8,11 @@ 3. Per-Worker Stats — 各 worker 缓存利用排名 用法: - python3 stat_cache_hitrate.py [--tail N|2k|30m|2h|1d] [--output DIR] + python3 stat_cache_hitrate.py [--tail N|Nk|Nw] [--output DIR] """ import argparse import json -import math import os import re import subprocess @@ -28,7 +27,6 @@ from chart import render_bar, render_sparkline, render_table from log_parser import ( complete_time_arg, - extract_ts, filter_file_by_time_range, parse_cache_strategy_line, parse_stats_line, @@ -172,16 +170,10 @@ def count_lines(filepath): def read_lines(filepath, tail=None): """读取日志文件,支持 tail 模式。""" if tail is not None: - if isinstance(tail, str) and tail.endswith("m"): - # 按时间 tail:读取全部行,过滤最近 N 分钟 - minutes = int(tail[:-1]) - all_lines = _read_file_lines(filepath) - return _filter_by_time(all_lines, minutes) - else: - # 按行数 tail - n = int(tail) - result = subprocess.run(["tail", "-n", str(n), filepath], capture_output=True, text=True) - return result.stdout.splitlines() if result.returncode == 0 else [] + # 按行数 tail + n = int(tail) + result = subprocess.run(["tail", "-n", str(n), filepath], capture_output=True, text=True) + return result.stdout.splitlines() if result.returncode == 0 else [] return _read_file_lines(filepath) @@ -190,35 +182,6 @@ def _read_file_lines(filepath): return f.readlines() -def _filter_by_time(lines, minutes): - """过滤最近 N 分钟的日志行。""" - # 找最后一行的时间戳作为基准 - last_ts = None - for line in reversed(lines): - ts = extract_ts(line) - if ts: - last_ts = parse_ts(ts) - break - if not last_ts: - return lines - - from datetime import timedelta - - cutoff = last_ts - timedelta(minutes=minutes) - result = [] - for line in lines: - ts = extract_ts(line) 
- if ts: - try: - if parse_ts(ts) >= cutoff: - result.append(line) - except ValueError: - result.append(line) - else: - result.append(line) - return result - - # ════════════════════════════════════════════════════════════════ # Phase 2: 日志提取与解析 # ════════════════════════════════════════════════════════════════ @@ -237,7 +200,7 @@ def grep_and_parse(filepath, grep_pattern, parse_cmd, tail=None): """大文件模式:grep 过滤 + log_parser.py CLI 管道解析。""" parser_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log_parser.py") - if tail and not (isinstance(tail, str) and tail.endswith("m")): + if tail: grep_cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -F {_shell_quote(grep_pattern)} | python3 {_shell_quote(parser_path)} {parse_cmd}" else: grep_cmd = f"grep -F {_shell_quote(grep_pattern)} {_shell_quote(filepath)} | python3 {_shell_quote(parser_path)} {parse_cmd}" @@ -255,7 +218,7 @@ def grep_and_parse(filepath, grep_pattern, parse_cmd, tail=None): def grep_count(filepath, grep_pattern, tail=None): """大文件模式:grep 计数。""" - if tail and not (isinstance(tail, str) and tail.endswith("m")): + if tail: cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -cE {_shell_quote(grep_pattern)}" else: cmd = f"grep -cE {_shell_quote(grep_pattern)} {_shell_quote(filepath)}" @@ -283,7 +246,7 @@ def extract_data(filepath, tail=None): strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy", tail) stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats", tail) inference_count = grep_count(filepath, r"\] \[POST\] /v1/chat/completions |\] \[POST\] /v1/completions ", tail) - line_count = int(tail) if tail is not None and not (isinstance(tail, str) and tail.endswith("m")) else total + line_count = int(tail) if tail is not None else total return strategy_recs, stats_recs, inference_count, line_count @@ -989,7 +952,7 @@ def parse_args(): "--tail", nargs="?", const="2000", - help="只分析尾部数据(支持 2000/2k 行,或 30m/2h/1d 时间窗口)", + help="只分析尾部数据(支持 
2000、1k、1w 等行数写法)。按时间请使用 --start/--end", ) parser.add_argument( "--output", default=None, help="详细报告输出目录(默认:skill_output/stat-cache-hitrate//)" @@ -1002,7 +965,7 @@ def parse_args(): def parse_tail_arg(tail_str): - """解析 --tail 参数,返回 int(行数) 或 'm'(时间窗口)。""" + """解析 --tail 参数,返回行数 int。支持数字及 k/w 缩写。""" if tail_str is None: return None @@ -1010,34 +973,20 @@ def parse_tail_arg(tail_str): if not s: raise ValueError("--tail 不能为空") - # 行数: 2000 - if re.fullmatch(r"\d+", s): - value = int(s) - if value <= 0: - raise ValueError("--tail 行数必须 > 0") - return value - - # 行数缩写: 2k => 2000 - m = re.fullmatch(r"(\d+)k", s) - if m: - value = int(m.group(1)) * 1000 - if value <= 0: - raise ValueError("--tail 行数必须 > 0") - return value - - # 时间窗口: 30m/2h/1d(最终统一成分钟) - m = re.fullmatch(r"(\d+)(m|h|d)", s) - if m: - num = int(m.group(1)) - unit = m.group(2) - if num <= 0: - raise ValueError("--tail 时间窗口必须 > 0") - factor = {"m": 1, "h": 60, "d": 1440}[unit] - minutes = num * factor - minutes = max(1, math.ceil(minutes)) - return f"{minutes}m" - - raise ValueError("不支持的 --tail 格式:请使用 2000/2k 或 30m/2h/1d") + m = re.fullmatch(r"(\d+)([kw])?", s) + if not m: + raise ValueError("不支持的 --tail 格式:请使用 2000、1k、1w 等行数写法。按时间请改用 --start/--end") + + value = int(m.group(1)) + unit = m.group(2) + if unit == "k": + value *= 1000 + elif unit == "w": + value *= 10000 + + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return value def main(): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 00c94a2f487..ecb27c1436a 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -38,7 +38,7 @@ description: > ### 2. 
分析范围 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `全量分析(默认)` — 分析整个日志文件 -- 选项 2: `尾部分析` — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) +- 选项 2: `尾部分析` — 只分析最近数据(仅支持行数,如 `--tail 5000`) - 选项 3: `指定时间段` — 分析特定时间范围内的日志 如果用户未选择,默认使用全量分析。 @@ -50,10 +50,11 @@ description: > 时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 `--start/--end` 与 `--tail` 互斥。 +`--tail` 仅支持“行数”语义(如 `5000`,也兼容 `1k/1w` 自动换算),不再支持 `30m` 这类时间写法;凡是按时间筛选都使用 `--start/--end`。 当用户选择“指定时间段”时,必须再发起一次 **AskUserQuestion**(离散选项)引导时间输入: - 选项 1: `当天(00:00:00 到当前)`(推荐) -- 选项 2: `最近半小时`(自动换算为 `--start now-30m --end now` 语义) +- 选项 2: `自定义时间段`(由用户直接输入起止时间) 用户若通过客户端默认 `Other` 输入时间,则将该输入直接作为时间范围参数解析。 可补充一条简短示例引导: @@ -104,9 +105,7 @@ python3 $SCRIPTS/troubleshoot.py --trace all # 尾部分析 python3 $SCRIPTS/troubleshoot.py --tail 5000 -python3 $SCRIPTS/troubleshoot.py --tail 30m - -# 指定时间段(--start 和 --end 可单独或同时使用) +# 指定时间段(需要按时间筛选时使用;--start 和 --end 可单独或同时使用) python3 $SCRIPTS/troubleshoot.py --start "16:00:00" --end "17:00:00" python3 $SCRIPTS/troubleshoot.py --start "2026/03/31 16:00:00" python3 $SCRIPTS/troubleshoot.py --start "03/31" --end "03/31 18:00" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 251a21c7e81..b00521e6b01 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -12,7 +12,7 @@ --cache 仅分析 Cache 调度 --load 仅分析负载与计数器 --trace ID 追踪指定请求(支持逗号分隔多 ID;传 all 可全量追踪) - --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) + --tail N 仅分析尾部 N 行(支持 5000/1k/1w 等行数写法) --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") --output DIR 详细报告导出目录(默认: skill_output/troubleshoot//) @@ -21,6 +21,7 @@ """ import argparse +import re import os import sys from datetime import datetime @@ -38,7 
+39,6 @@ from analyzers.trace import analyze_trace, format_trace_report from log_parser import ( complete_time_arg, - filter_file_by_recent_minutes, filter_file_by_time_range, ) @@ -106,12 +106,22 @@ def determine_log_file(user_path=None): def parse_tail_arg(tail_str): - """解析 --tail 参数:支持纯数字(行数)或 Nm(分钟)格式。""" + """解析 --tail 参数:支持数字及 k/w 缩写。""" if tail_str is None: return None - if tail_str.endswith("m"): - return {"type": "minutes", "value": int(tail_str[:-1])} - return {"type": "lines", "value": int(tail_str)} + s = str(tail_str).strip().lower() + m = re.fullmatch(r"(\d+)([kw])?", s) + if not m: + raise ValueError("--tail 仅支持行数(如 5000、1k、1w)。按时间请改用 --start/--end") + value = int(m.group(1)) + unit = m.group(2) + if unit == "k": + value *= 1000 + elif unit == "w": + value *= 10000 + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return {"type": "lines", "value": value} def determine_status(results): @@ -444,7 +454,7 @@ def main(): parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度") parser.add_argument("--load", action="store_true", help="仅分析负载与计数器") parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID;传 all 可全量追踪)") - parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)") + parser.add_argument("--tail", help="尾部行数(如 5000、1k、1w)。按时间请使用 --start/--end") parser.add_argument( "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' ) @@ -478,14 +488,7 @@ def main(): tail_arg = parse_tail_arg(args.tail) tail = None - # --tail Nm 采用真实时间窗口过滤,再全量分析过滤后的临时文件 - if tail_arg and tail_arg["type"] == "minutes": - filtered_path, is_temp = filter_file_by_recent_minutes(log_file, tail_arg["value"]) - if is_temp: - atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) - log_file = filtered_path - print(f"--tail {tail_arg['value']}m: 使用日志时间戳过滤最近窗口", file=sys.stderr) - elif tail_arg and tail_arg["type"] == "lines": + if tail_arg and tail_arg["type"] == "lines": tail = 
tail_arg["value"] # 确定分析模式 From b68181da133e19b3e406d4d03d2792b31db75390 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 16:09:51 +0800 Subject: [PATCH 27/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- .../troubleshoot/scripts/analyzers/trace.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index d0dcbdca6d9..ba4c7bd1051 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -291,7 +291,9 @@ def _parse_event_chain(lines): # Prefill events m = PREFILL_FIRST_CHUNK_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1), "raw": line.strip()}) + events.append( + {"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) continue m = PREFILL_DONE_RE.search(line) if m: @@ -300,7 +302,14 @@ def _parse_event_chain(lines): m = PREFILL_ERROR_RE.search(line) if m: events.append( - {"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2), "raw": line.strip()} + { + "ts": ts, + "type": "PREFILL_ERROR", + "tags": tags, + "error": m.group(1), + "worker": m.group(2), + "raw": line.strip(), + } ) continue m = PREFILL_DEFER_RE.search(line) @@ -312,7 +321,13 @@ def _parse_event_chain(lines): m = PREFILL_ERR_PATH_RE.search(line) if m: events.append( - {"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1), "raw": line.strip()} + { + "ts": ts, + "type": "PREFILL_ERROR_PATH_RELEASE", + "tags": tags, + "worker": m.group(1), + "raw": line.strip(), + } ) continue @@ -456,7 +471,7 @@ def format_trace_report(result): detail_lines.append(f'关联 request_ids: {", 
".join(trace["related_ids"]["request_ids"])}') detail_lines.append(f"生命周期: {status}") detail_lines.append("") - detail_lines.append("## 事件链") + detail_lines.append("## 事件链(整理)") detail_lines.append("") for evt in trace["events"]: line = f' [{evt.get("ts","")}] {evt["type"]}' @@ -477,8 +492,12 @@ def format_trace_report(result): if evt.get("ts_ms"): line += f' ts_ms={evt["ts_ms"]}' detail_lines.append(line) + detail_lines.append("") + detail_lines.append("## 原始日志 RAW") + detail_lines.append("") + for evt in trace["events"]: if evt.get("raw"): - detail_lines.append(f' RAW: {evt["raw"]}') + detail_lines.append(evt["raw"]) detail_lines.append("") detail_dict[tid] = "\n".join(detail_lines) From 87a79104b09d12bbacd900ff4d6f61bc9ff2e28f Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 17:47:47 +0800 Subject: [PATCH 28/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- fastdeploy/golang_router/pkg/logger/logger.go | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 07412670628..c14565e348d 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -45,13 +45,13 @@ const SessionIDKey contextKey = "session_id" const gracePeriod = 5 * time.Minute // rotatingWriter implements io.Writer with day-level rotation and dual-file writes. -// Current day's log is always "router.log"; on day change it is renamed to -// "router-YYYY-MM-DD.log" and a new "router.log" is created. During a short -// grace period after rotation, log lines whose timestamp belongs to the previous -// day are written to the archived file. +// Current day's log is written to "router-YYYY-MM-DD.log" and "router.log" is a +// symlink pointing to the current day's file. On day change a new date file is +// created and the symlink is updated. 
During a short grace period after rotation, +// log lines whose timestamp belongs to the previous day are written to the old file. type rotatingWriter struct { mu sync.Mutex - currentFile *os.File // today's router.log + currentFile *os.File // today's router-.log prevFile *os.File // previous day's router-.log during grace period (may be nil) currentDate string // "2006-01-02" prevDate string // previous date during grace period @@ -61,10 +61,24 @@ type rotatingWriter struct { func newRotatingWriter(logDir string) (*rotatingWriter, error) { today := nowFunc().Format("2006-01-02") - f, err := os.OpenFile(filepath.Join(logDir, "router.log"), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + datePath := filepath.Join(logDir, "router-"+today+".log") + symlinkPath := filepath.Join(logDir, "router.log") + + // Migration: if router.log is a regular file (legacy), rename it to the date file. + if info, err := os.Lstat(symlinkPath); err == nil && info.Mode().IsRegular() { + os.Rename(symlinkPath, datePath) + } + + // Open the date file (append mode). + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) if err != nil { return nil, err } + + // Create/update symlink: router.log -> router-.log + os.Remove(symlinkPath) + os.Symlink("router-"+today+".log", symlinkPath) + return &rotatingWriter{ currentFile: f, currentDate: today, @@ -122,28 +136,20 @@ func (w *rotatingWriter) rotateLocked(newDate string) { w.prevFile = nil } - // Close current router.log so we can rename it. - if w.currentFile != nil { - w.currentFile.Close() - } - - // Rename router.log -> router-.log - oldPath := filepath.Join(w.logDir, "router.log") - archivePath := filepath.Join(w.logDir, "router-"+w.currentDate+".log") - if err := os.Rename(oldPath, archivePath); err != nil { - // Rename failed; try to reopen router.log and continue without rotation. 
- w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - return - } - - // Open the archived file for dual-write grace period. - w.prevFile, _ = os.OpenFile(archivePath, os.O_WRONLY|os.O_APPEND, 0666) + // Keep the old date file open for grace period writes. + w.prevFile = w.currentFile w.prevDate = w.currentDate w.graceUntil = nowFunc().Add(gracePeriod) - // Create new router.log for the new day. - w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + // Open new date file for the new day. + datePath := filepath.Join(w.logDir, "router-"+newDate+".log") + w.currentFile, _ = os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) w.currentDate = newDate + + // Update symlink: router.log -> router-.log + symlinkPath := filepath.Join(w.logDir, "router.log") + os.Remove(symlinkPath) + os.Symlink("router-"+newDate+".log", symlinkPath) } // parseLogDate extracts the date from a log line produced by log.LstdFlags. @@ -244,22 +250,17 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { } now := nowFunc() + today := now.Format("2006-01-02") var archives []logFileInfo - var routerLogSize int64 for _, entry := range entries { if entry.IsDir() { continue } name := entry.Name() - info, err := entry.Info() - if err != nil { - continue - } - // Count router.log size but never delete it. + // router.log is now a symlink; skip it. if name == "router.log" { - routerLogSize = info.Size() continue } @@ -273,6 +274,14 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { if err != nil { continue } + // Never delete today's active date file. + if dateStr == today { + continue + } + info, err := entry.Info() + if err != nil { + continue + } archives = append(archives, logFileInfo{ name: name, path: filepath.Join(logDir, name), @@ -303,7 +312,7 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { // Phase 2: Size-based cleanup. 
if maxTotalSizeMB > 0 { maxBytes := int64(maxTotalSizeMB) * 1024 * 1024 - var totalSize int64 = routerLogSize + var totalSize int64 for _, f := range archives { totalSize += f.size } From 109a8e5b2aee056ba17a95e59c22924fe9bd9e23 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 19:22:06 +0800 Subject: [PATCH 29/40] [Feature] Add skills and Add logging cleanup --- .../skills/stat-cache-hitrate/SKILL.md | 14 ++- .../stat-cache-hitrate/scripts/log_parser.py | 65 +++++++++++ .../scripts/stat_cache_hitrate.py | 102 ++++++++++-------- 3 files changed, 133 insertions(+), 48 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index ad9b3f29fd2..097a10f8163 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -40,15 +40,21 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 - 选项 2: `快速查看尾部` — 只看最近的数据(支持 `2000`、`1k`、`1w` 等行数写法) - 选项 3: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) -若用户选择“指定时间段”,直接让用户填写: -- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +**若用户选择"快速查看尾部",必须再询问行数**,提供选项: +- 选项 1: `2000 行(默认)` +- 选项 2: `5000 行` +- 选项 3: `1万行` + +若用户选择”指定时间段”,直接让用户填写: +- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +- 支持相对时间写法:`30m`、`2h`、`1d`、`最后30分钟` 等(换算为绝对时间) - 然后映射为 `--start/--end` 参数执行。 如果用户未选择,默认使用全量统计。 `--start/--end` 与 `--tail` 互斥。`--start` 和 `--end` 可单独或同时指定。 -`--tail` 仅支持“行数”语义(如 `2000`,也兼容 `1k/1w` 自动换算),不再支持 `30m/2h/1d` 这类时间窗口;按时间请使用 `--start/--end`。 -时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 +`--tail` 仅支持”行数”语义(如 `2000`,也兼容 `1k/1w` 自动换算),不再支持 `30m/2h/1d` 这类时间窗口;按时间请使用 `--start/--end`。 +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`、相对时间(`30m`、`2h`、`1d`、`最后30分钟`)。 缺失部分自动从日志首末行推断。 ### 3. 
输出目录 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py index d43d6909c64..bb31235f3fa 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py @@ -76,6 +76,63 @@ def parse_ts(ts_str): _SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$") _TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$") +# 相对时间正则:支持 30m、30分钟、2h、2小时、1d、1天、last 30m、最后30分钟 +_RELATIVE_TIME_RE = re.compile(r"^(?:last|最后)?\s*(\d+)\s*(m|分钟|mins?|h|小时|hours?|d|天|days?)$", re.IGNORECASE) + + +def _parse_relative_time(time_str): + """解析相对时间字符串,返回 timedelta。 + + 支持格式:30m、30分钟、2h、2小时、1d、1天、last 30m、最后30分钟 + """ + m = _RELATIVE_TIME_RE.match(time_str.strip()) + if not m: + return None + + value = int(m.group(1)) + unit = m.group(2).lower() + + if unit.startswith("m") and "in" not in unit: # m, min, mins + from datetime import timedelta + + return timedelta(minutes=value) + elif unit.startswith("h"): # h, hour, hours + from datetime import timedelta + + return timedelta(hours=value) + else: # d, day, days + from datetime import timedelta + + return timedelta(days=value) + + +def _relative_to_absolute(time_str, log_file, is_end=False): + """将相对时间转换为绝对时间,基于日志文件的时间边界。 + + - start: 从日志末行时间往前推 + - end: 直接使用日志末行时间(或当前时间) + """ + relative_delta = _parse_relative_time(time_str) + if not relative_delta: + return None + + # 获取日志文件末行时间作为基准 + boundary_ts = _get_log_boundary_ts(log_file, "last") + if not boundary_ts: + return None + + # 解析为 datetime + dt = datetime.strptime(boundary_ts, "%Y/%m/%d %H:%M:%S") + + if is_end: + # end 时间:直接使用日志末行时间 + return boundary_ts + else: + # start 时间:末行时间减去 duration + + abs_time = dt - relative_delta + return abs_time.strftime("%Y/%m/%d %H:%M:%S") + def _get_log_boundary_ts(log_file, 
which="first"): """从日志文件首行或末行提取时间戳。""" @@ -93,11 +150,13 @@ def complete_time_arg(time_str, log_file, is_end=False): 支持格式: 'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 'YYYY/MM/DD', 'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM' + 相对时间:30m、2h、1d、最后30分钟 等(从日志末行时间算起) 补全规则: - 缺年份:从日志首行取 - 缺日期:从日志末行取 - 缺时间:start→00:00:00, end→23:59:59 + - 相对时间:start 从日志末行往前推,end 直接用日志末行时间 Returns: 'YYYY/MM/DD HH:MM:SS' 格式字符串 """ @@ -105,6 +164,12 @@ def complete_time_arg(time_str, log_file, is_end=False): return None time_str = time_str.strip() + # Case 0: 相对时间处理(如 "30m"、"最后30分钟"、"2h") + # 从日志文件末行时间开始算起 + relative_result = _relative_to_absolute(time_str, log_file, is_end) + if relative_result: + return relative_result + # Case 1: 完整日期时间 m = _FULL_DT_RE.match(time_str) if m: diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 1e27f96a476..7c6e0d40ecf 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -17,10 +17,10 @@ import re import subprocess import sys -from pathlib import Path -from urllib.parse import quote from collections import defaultdict from datetime import datetime +from pathlib import Path +from urllib.parse import quote # 同目录模块导入 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -154,6 +154,7 @@ def _summarize_id_type_ranges(rows_with_seq): ranges.append((start_id, end_id, current_type, start_ts, end_ts)) return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -424,7 +425,9 @@ def _quartile_trend(trend, value_field): return f"Q1={quartiles[0]}% \u2192 Q2={quartiles[1]}% \u2192 Q3={quartiles[2]}% \u2192 Q4={quartiles[3]}% {arrow}" -def 
format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None, window_rows=None): +def format_full_report( + filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None, window_rows=None +): """格式化完整终端报告。""" parts = [] @@ -667,13 +670,17 @@ def save_detailed_report( trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") if trend_str: parts.append(f"- 趋势: {trend_str}") - dist_data = [{"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"]] + dist_data = [ + {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] + ] parts.append("") parts.append("```text") parts.append("Unicode 柱状图(Prefix HR 分布)") parts.append(render_bar(dist_data, show_count=True)) if prefix_hr["trend"]: - sparkline_data = [{"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"]] + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] parts.append("") parts.append("ASCII 折线图(Prefix HR 趋势)") parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) @@ -761,7 +768,9 @@ def save_detailed_report( f.write("\n".join(detail_parts)) if session_rows: - parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") + parts.append( + f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)" + ) parts.append("") all_rows_with_seq = [] @@ -790,7 +799,9 @@ def save_detailed_report( if start_id == end_id: session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") else: - session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + session_parts.append( + f"- 
`{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)" + ) session_parts.append("") session_parts.append("## 概览") session_parts.append("- 字段说明:`avg-hit` = `avg_hit(excl_first)`(去除首请求后的平均命中率)") @@ -1038,11 +1049,24 @@ def main(): diagnosis = cross_diagnose(prefix_hr, session_hr) # Phase 4: 输出 + # 无论 tail 还是全量模式,都生成详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + if args.output: + output_base = args.output + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) + output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") + output_dir = os.path.join(output_base, run_timestamp) + + time_span = compute_time_span(strategy_recs, stats_recs) + window_rows = build_per_window_rows(strategy_recs, stats_recs) + if tail is not None: + # tail 精简模式:打印摘要 + 生成详细报告 print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) else: - time_span = compute_time_span(strategy_recs, stats_recs) - window_rows = build_per_window_rows(strategy_recs, stats_recs) + # 全量模式:打印完整报告 print( format_full_report( args.log_file, @@ -1057,41 +1081,31 @@ def main(): ) ) - # 导出详细报告 - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - if args.output: - output_base = args.output - else: - script_dir = os.path.dirname(os.path.abspath(__file__)) - golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") - output_dir = os.path.join(output_base, run_timestamp) - report_path = save_detailed_report( - args.log_file, - strategy_recs, - stats_recs, - prefix_hr, - session_hr, - per_worker, - scheduling, - diagnosis, - output_dir, - time_span=time_span, - ) - print("\n\U0001f4c4 详细数据见:") - report_abs, report_uri = _build_path_links(report_path) - print(f" - 报告文件: {report_abs}") - print(f" URI: {report_uri}") 
- details_path = os.path.join(output_dir, "detail", "per_window_data.md") - if os.path.exists(details_path): - details_abs, details_uri = _build_path_links(details_path) - print(f" - 窗口明细: {details_abs}") - print(f" URI: {details_uri}") - session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") - if os.path.exists(session_detail_path): - session_abs, session_uri = _build_path_links(session_detail_path) - print(f" - Session 明细: {session_abs}") - print(f" URI: {session_uri}") + # 导出详细报告(tail 和全量都生成) + report_path = save_detailed_report( + args.log_file, + strategy_recs, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=time_span, + ) + print("\n\U0001f4c4 详细数据见:") + report_abs, report_uri = _build_path_links(report_path) + print(f" - 报告文件: [{report_abs}]({report_uri})") + details_path = os.path.join(output_dir, "detail", "per_window_data.md") + if os.path.exists(details_path): + details_abs, details_uri = _build_path_links(details_path) + print(f" - 窗口明细: [{details_abs}]({details_uri})") + session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") + if os.path.exists(session_detail_path): + session_abs, session_uri = _build_path_links(session_detail_path) + print(f" - Session 明细: [{session_abs}]({session_uri})") + if __name__ == "__main__": main() From 888b0ac63d572b35702f9bb417a67a13c9062a3f Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 19:39:08 +0800 Subject: [PATCH 30/40] [Feature] Add skills and logging cleanup --- .../router_troubleshoot_playbook.md | 190 ------------------ 1 file changed, 190 deletions(-) delete mode 100644 docs/zh/online_serving/router_troubleshoot_playbook.md diff --git a/docs/zh/online_serving/router_troubleshoot_playbook.md b/docs/zh/online_serving/router_troubleshoot_playbook.md deleted file mode 100644 index 0ccee9c6d55..00000000000 --- a/docs/zh/online_serving/router_troubleshoot_playbook.md +++ /dev/null @@ -1,190 
+0,0 @@ -# Router 问题排查实战手册(日志定位 + troubleshoot skill) - -本文档结合以下两部分信息整理: -- Router 常见问题与日志语义:[`docs/zh/online_serving/router_faq.md`](router_faq.md) -- `fastdeploy/golang_router/.claude/skills/troubleshoot` 的脚本能力与使用方式 - -目标:给出一套可落地的排查流程,帮助你从“现象”快速定位到“日志证据”和“处理建议”。 - ---- - -## 1. 先定范围:全量 / 尾部 / 指定时间段 - -建议先根据问题发生时间选择分析范围(这是和分析模式并列的维度): - -- **全量分析**:适合历史慢性问题、趋势问题。 -- **尾部分析(`--tail`)**:适合刚发生的故障,优先看最近 N 行或 N 分钟。 -- **指定时间段(`--start/--end`)**:适合已知故障窗口(例如 14:05~14:20)。 - -> 说明:`--tail` 与 `--start/--end` 互斥,二选一。 - ---- - -## 2. 先看健康与注册,再看调度与请求 - -根据 `router_faq.md` 的建议,先确认“有没有可用实例”,再看“请求是否调度成功”。 - -### 2.1 健康与注册检查(必做) - -```bash -# 已注册实例列表 -curl -X GET http://{router_url}/registered - -# 已注册实例数量 -curl -X GET http://{router_url}/registered_number - -# 从 Router 机器检查后端健康 -curl -X GET http://{server_url}/health -``` - -重点日志关键词: -- 健康移除:`Removed unhealthy ... instance` -- 注册失败:`Failed to register instance` -- 健康检查失败:`failed to send request to ...` / `Server ... is not healthy` - -若实例都不健康或未注册,后续 502/503 多数是结果,不是根因。 - -### 2.2 调度失败检查 - -常见错误: -- `Failed to select worker` -- `Failed to select worker pair` -- `No available prefill/decode workers` - -这类问题先确认: -1) 注册数量是否为 0; -2) 调度策略与部署模式是否匹配; -3) `fd_metrics_score` 依赖的 `/metrics` 是否可访问。 - -### 2.3 请求链路与后端请求失败 - -常见日志: -- `Failed to connect to backend service` -- `Request failed (attempt n/max)` -- `Decode/Prefill/Backend request failed for {url}` -- `Panic recovered` - -这类问题通常需要结合 trace(ID 级别)看完整链路。 - ---- - -## 3. 
使用 troubleshoot skill 的标准方式 - -脚本入口(在 `fastdeploy/golang_router/` 下): - -```bash -SCRIPTS=.claude/skills/troubleshoot/scripts -python3 $SCRIPTS/troubleshoot.py [options] -``` - -### 3.1 全量体检(默认推荐首轮) - -```bash -python3 $SCRIPTS/troubleshoot.py -``` - -会同时输出:errors / latency / health / cache / load 的综合结果。 - -### 3.2 指定维度分析(精准打点) - -```bash -python3 $SCRIPTS/troubleshoot.py --errors -python3 $SCRIPTS/troubleshoot.py --latency -python3 $SCRIPTS/troubleshoot.py --health -python3 $SCRIPTS/troubleshoot.py --cache -python3 $SCRIPTS/troubleshoot.py --load -``` - -### 3.3 请求追踪(ID 级排查) - -```bash -# 单个 ID -python3 $SCRIPTS/troubleshoot.py --trace - -# 多个 ID -python3 $SCRIPTS/troubleshoot.py --trace "id1,id2,id3" -``` - -trace 会展示: -- 匹配到的 tag 类型(request_id / trace_id / session_id / req_id) -- 生命周期完整性 -- 事件链(含原始日志 RAW) -- 仅 request_id / 仅 session_id / 仅 trace_id 的统计 -- 各标签组合形式(detail 中给出组合与对应 ID) - -### 3.4 范围过滤与 trace 组合 - -当你要“在某个时间窗内追踪某个 ID”时,使用范围参数和 trace 组合: - -```bash -python3 $SCRIPTS/troubleshoot.py --start "2026/04/13 14:05:00" --end "2026/04/13 14:20:00" --trace "" -``` - -这符合“范围维度(全量/尾部/时间段)”与“模式维度(含 trace)”分离的使用方式。 - ---- - -## 4. 一套可复制的故障定位流程 - -### 步骤 A:确认故障窗口与错误现象 -- 收集用户报错时间、HTTP 状态码(502/503/500/400)和请求路径。 - -### 步骤 B:先跑时间窗综合分析 -```bash -python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" -``` -- 看 STATUS(HEALTHY / DEGRADED / CRITICAL)。 -- 优先看 errors、health 章节,判断是否是后端健康/注册问题。 - -### 步骤 C:按症状进入专项 -- 502/503:`--errors --health --load` -- 延迟突增:`--latency --load --cache` -- 单请求失败:`--trace `(可叠加步骤 B 的时间窗) - -### 步骤 D:在 detail 文件中取证 -报告目录默认: -`skill_output/troubleshoot//` - -重点文件: -- `summary/troubleshoot_report.md` -- `detail/trace_.md` -- `detail/health_events.md` -- `detail/load_select_release.md` - ---- - -## 5. 
现象到日志的快速映射 - -| 现象 | 优先看日志/关键词 | 推荐命令 | -|---|---|---| -| 503 无可用 worker | `No available prefill/decode workers`, `Removed unhealthy ...` | `--health --errors` | -| 502 调度失败 | `Failed to select worker`, `Failed to select worker pair` | `--errors --health --load` | -| 502 后端连接失败 | `Failed to connect to backend service`, `Request failed (attempt ...)` | `--errors --trace ` | -| 请求卡住/链路不完整 | 有 select 无 release、无 `Request completed successfully.` | `--trace ` | -| 延迟抖动 | HTTP latency、`[stats] total_running...` | `--latency --load --cache` | - ---- - -## 6. 常见误区 - -1. **只看 502/503 响应,不看健康与注册日志**:容易把“结果”当“根因”。 -2. **不限定时间窗口**:日志噪音大,容易误判。 -3. **trace 只看结构化事件,不看 RAW**:可能漏掉关键上下文(例如同一秒的 WARN/ERROR 细节)。 -4. **把范围维度和模式维度混在一起**:建议先定范围(全量/尾部/时间段),再定模式(完整/多维/trace)。 - ---- - -## 7. 推荐排查命令模板 - -```bash -# 模板 1:故障窗口综合体检 -python3 $SCRIPTS/troubleshoot.py --start "YYYY/MM/DD HH:MM:SS" --end "YYYY/MM/DD HH:MM:SS" - -# 模板 2:最近 30 分钟快速巡检 -python3 $SCRIPTS/troubleshoot.py --tail 30m - -# 模板 3:单请求深挖(配合时间窗) -python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" --trace "" -``` - -如果你已经知道故障集中在特定 ID,优先从模板 3 入手,然后回到模板 1 看全局背景。 From e16652d3a988d81f42437b1bb2c0676c6a5a4726 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 19:40:52 +0800 Subject: [PATCH 31/40] [Feature] Add skills and logging cleanup --- docs/zh/online_serving/router_faq.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/zh/online_serving/router_faq.md b/docs/zh/online_serving/router_faq.md index a431065dbf0..9c32726f4dc 100644 --- a/docs/zh/online_serving/router_faq.md +++ b/docs/zh/online_serving/router_faq.md @@ -5,7 +5,6 @@ 本文档基于 [Golang Router](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/golang_router) 的代码实现,汇总了 Router 在使用过程中常见的日志信息、返回输出及问题排查方法,帮助用户快速定位和解决问题。 Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 -如需按“日志定位 + troubleshoot skill”流程化排查,请参考 [Router 问题排查实战手册](router_troubleshoot_playbook.md)。 ## 常见日志分析 From 38b6ea050fe9886b44660bbd2f199abe6f6f58ca Mon Sep 17 
00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 11:28:53 +0800 Subject: [PATCH 32/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index c14565e348d..a3d64a0714d 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -143,7 +143,18 @@ func (w *rotatingWriter) rotateLocked(newDate string) { // Open new date file for the new day. datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - w.currentFile, _ = os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + log.Printf("[ERROR] failed to open new log file %s: %v, keeping current file", datePath, err) + if w.prevFile != nil { + w.currentFile = w.prevFile + w.currentDate = w.prevDate + w.prevFile = nil + w.prevDate = "" + } + return + } + w.currentFile = f w.currentDate = newDate // Update symlink: router.log -> router-.log @@ -162,7 +173,7 @@ func parseLogDate(p []byte) string { s := string(p) for i := 0; i+10 <= len(s); i++ { c := s[i] - if c >= '1' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { + if c >= '0' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { // Found a candidate "YYYY/MM/DD" year := s[i : i+4] month := s[i+5 : i+7] From 5582779cc96b61da523c40b980d326019fa01846 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 14:33:38 +0800 Subject: [PATCH 33/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 16 ++++++--- .../golang_router/pkg/logger/logger_test.go | 34 +++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 
a3d64a0714d..30cbc747ae0 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -76,7 +76,9 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { } // Create/update symlink: router.log -> router-.log - os.Remove(symlinkPath) + if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) + } os.Symlink("router-"+today+".log", symlinkPath) return &rotatingWriter{ @@ -159,7 +161,9 @@ func (w *rotatingWriter) rotateLocked(newDate string) { // Update symlink: router.log -> router-.log symlinkPath := filepath.Join(w.logDir, "router.log") - os.Remove(symlinkPath) + if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) + } os.Symlink("router-"+newDate+".log", symlinkPath) } @@ -312,7 +316,9 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { remaining := archives[:0] for _, f := range archives { if f.date.Before(cutoff) { - os.Remove(f.path) + if err := os.Remove(f.path); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove log file %s: %v\n", f.path, err) + } } else { remaining = append(remaining, f) } @@ -329,7 +335,9 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { } for len(archives) > 0 && totalSize > maxBytes { oldest := archives[0] - os.Remove(oldest.path) + if err := os.Remove(oldest.path); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove log file %s: %v\n", oldest.path, err) + } totalSize -= oldest.size archives = archives[1:] } diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index fea0b853cf7..b426bc00988 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -182,3 +182,37 @@ func 
TestContextPrefix(t *testing.T) { } }) } + +func TestParseLogDate(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {"standard INFO log line", "[INFO] 2024/03/15 10:30:45 some message", "2024-03-15"}, + {"standard ERROR log line", "[ERROR] 2024/01/02 09:00:00 error occurred", "2024-01-02"}, + {"standard WARN log line", "[WARN] 2025/12/31 23:59:59 warning msg", "2025-12-31"}, + {"standard DEBUG log line", "[DEBUG] 2024/06/01 00:00:00 debug info", "2024-06-01"}, + {"empty string", "", ""}, + {"no date pattern", "no date here at all", ""}, + {"incomplete date - only year", "2024/", ""}, + {"incomplete date - year and month", "[INFO] 2024/03", ""}, + {"short input", "abc", ""}, + {"date without log prefix", "2024/03/15 10:30:45 message", "2024-03-15"}, + {"date at different position", "prefix 2024/11/20 rest", "2024-11-20"}, + {"slash but not date", "path/to/file is not a date", ""}, + {"single character input", "x", ""}, + {"exactly 10 chars non-date", "abcdefghij", ""}, + {"boundary - first day of year", "[INFO] 2024/01/01 00:00:00 new year", "2024-01-01"}, + {"boundary - last day of year", "[INFO] 2024/12/31 23:59:59 year end", "2024-12-31"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := parseLogDate([]byte(tt.input)) + if got != tt.expected { + t.Errorf("parseLogDate(%q) = %q, want %q", tt.input, got, tt.expected) + } + }) + } +} From fd56b0ac357bc28a53fa55e879a7f00657d0bbdd Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 14:57:01 +0800 Subject: [PATCH 34/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 30cbc747ae0..c4e8191d598 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -79,7 +79,9 @@ func 
newRotatingWriter(logDir string) (*rotatingWriter, error) { if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) } - os.Symlink("router-"+today+".log", symlinkPath) + if err := os.Symlink("router-"+today+".log", symlinkPath); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + } return &rotatingWriter{ currentFile: f, @@ -164,7 +166,9 @@ func (w *rotatingWriter) rotateLocked(newDate string) { if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) } - os.Symlink("router-"+newDate+".log", symlinkPath) + if err := os.Symlink("router-"+newDate+".log", symlinkPath); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + } } // parseLogDate extracts the date from a log line produced by log.LstdFlags. 
From cc3864089d559f8129a85ea48cf33db919ffc2b3 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 15:18:22 +0800 Subject: [PATCH 35/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index c4e8191d598..0a0e50ca686 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -76,11 +76,8 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { } // Create/update symlink: router.log -> router-.log - if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) - } - if err := os.Symlink("router-"+today+".log", symlinkPath); err != nil { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + if err := updateSymlink(symlinkPath, "router-"+today+".log"); err != nil { + fmt.Fprintf(os.Stderr, "[WARN] Symlink %s may be stale: %v\n", symlinkPath, err) } return &rotatingWriter{ @@ -163,12 +160,32 @@ func (w *rotatingWriter) rotateLocked(newDate string) { // Update symlink: router.log -> router-.log symlinkPath := filepath.Join(w.logDir, "router.log") + if err := updateSymlink(symlinkPath, "router-"+newDate+".log"); err != nil { + fmt.Fprintf(os.Stderr, "[WARN] Symlink %s may be stale (points to old date): %v\n", symlinkPath, err) + } +} + +// updateSymlink atomically replaces symlinkPath to point to target. +// It tries os.Remove + os.Symlink first; if remove fails (e.g. permission denied) +// it falls back to a temp-symlink + os.Rename for an atomic swap attempt. +func updateSymlink(symlinkPath, target string) error { + // Fast path: remove old, create new. 
if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) + // Remove failed (e.g. permission issue). Try atomic rename as fallback. + tmp := symlinkPath + ".tmp" + if err2 := os.Symlink(target, tmp); err2 != nil { + return fmt.Errorf("remove old symlink: %w; create temp symlink: %v", err, err2) + } + if err2 := os.Rename(tmp, symlinkPath); err2 != nil { + os.Remove(tmp) // best-effort cleanup + return fmt.Errorf("remove old symlink: %w; rename temp symlink: %v", err, err2) + } + return nil } - if err := os.Symlink("router-"+newDate+".log", symlinkPath); err != nil { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + if err := os.Symlink(target, symlinkPath); err != nil { + return fmt.Errorf("create symlink: %w", err) } + return nil } // parseLogDate extracts the date from a log line produced by log.LstdFlags. From 06a886eaae52ddf5578f78d5a7daad2e5a8debe4 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 15:59:03 +0800 Subject: [PATCH 36/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 0a0e50ca686..1ec91533826 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -131,6 +131,16 @@ func (w *rotatingWriter) Close() error { // rotateLocked performs the actual file rotation. Must be called with w.mu held. func (w *rotatingWriter) rotateLocked(newDate string) { + // Open new date file for the new day first, before touching any state. 
+ datePath := filepath.Join(w.logDir, "router-"+newDate+".log") + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) + // Advance currentDate so we don't retry on every Write call. + w.currentDate = newDate + return + } + // Close any lingering previous file. if w.prevFile != nil { w.prevFile.Close() @@ -142,19 +152,6 @@ func (w *rotatingWriter) rotateLocked(newDate string) { w.prevDate = w.currentDate w.graceUntil = nowFunc().Add(gracePeriod) - // Open new date file for the new day. - datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - if err != nil { - log.Printf("[ERROR] failed to open new log file %s: %v, keeping current file", datePath, err) - if w.prevFile != nil { - w.currentFile = w.prevFile - w.currentDate = w.prevDate - w.prevFile = nil - w.prevDate = "" - } - return - } w.currentFile = f w.currentDate = newDate From ce775c9a9cfd0791ae82d0df6f3b7789b56cdb91 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 16:48:33 +0800 Subject: [PATCH 37/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 6 +++--- fastdeploy/golang_router/pkg/logger/logger_test.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 1ec91533826..7822095af36 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -70,7 +70,7 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { } // Open the date file (append mode). 
- f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { return nil, err } @@ -133,7 +133,7 @@ func (w *rotatingWriter) Close() error { func (w *rotatingWriter) rotateLocked(newDate string) { // Open new date file for the new day first, before touching any state. datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) // Advance currentDate so we don't retry on every Write call. @@ -243,7 +243,7 @@ func CloseLogFile() { } } -// StartLogCleanup runs periodic log cleanup in a background goroutine. +// StartLogCleanup blocks running periodic log cleanup; call it in a goroutine. // It deletes archived log files older than MaxAgeDays and trims total log size // to stay under MaxTotalSizeMB. 
func StartLogCleanup(ctx context.Context, cfg Config) { diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index b426bc00988..921f26aa166 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -24,7 +24,7 @@ func TestLoggerInit(t *testing.T) { defer os.RemoveAll("logs") // sync.Once prevents re-init, so manually verify file creation logic - f, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { t.Fatalf("Failed to create log file: %v", err) } From ee0162a5b09568f9b5d798ecf382bd227d64194a Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 17:31:47 +0800 Subject: [PATCH 38/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 7822095af36..bd1cfdcb7eb 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -56,6 +56,7 @@ type rotatingWriter struct { currentDate string // "2006-01-02" prevDate string // previous date during grace period graceUntil time.Time // when to close prevFile + retryAfter time.Time // earliest time to retry a failed rotation (backoff) logDir string } @@ -93,8 +94,8 @@ func (w *rotatingWriter) Write(p []byte) (n int, err error) { today := nowFunc().Format("2006-01-02") - // Detect day change and rotate. - if today != w.currentDate { + // Detect day change and rotate. Also retry failed rotations after backoff. 
+ if today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter)) { w.rotateLocked(today) } @@ -136,11 +137,15 @@ func (w *rotatingWriter) rotateLocked(newDate string) { f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) - // Advance currentDate so we don't retry on every Write call. - w.currentDate = newDate + // Don't advance currentDate — keep writing to the old file and retry + // after a backoff to avoid hammering the filesystem on every Write call. + w.retryAfter = nowFunc().Add(30 * time.Second) return } + // Rotation succeeded — clear any retry backoff. + w.retryAfter = time.Time{} + // Close any lingering previous file. if w.prevFile != nil { w.prevFile.Close() From cbdb5484a69fef0c3e2caebfb62ae240879bc847 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 19:06:44 +0800 Subject: [PATCH 39/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/cmd/main.go | 1 + .../config/config.example.yaml | 1 + .../golang_router/internal/config/config.go | 1 + fastdeploy/golang_router/pkg/logger/logger.go | 98 +++++++++++++++---- 4 files changed, 80 insertions(+), 21 deletions(-) diff --git a/fastdeploy/golang_router/cmd/main.go b/fastdeploy/golang_router/cmd/main.go index 6664436823c..c3670622ab2 100644 --- a/fastdeploy/golang_router/cmd/main.go +++ b/fastdeploy/golang_router/cmd/main.go @@ -44,6 +44,7 @@ func main() { logCfg := logger.Config{ Level: cfg.Log.Level, Output: cfg.Log.Output, + Dir: cfg.Log.Dir, MaxAgeDays: cfg.Log.MaxAgeDays, MaxTotalSizeMB: cfg.Log.MaxTotalSizeMB, CleanupIntervalSecs: cfg.Log.CleanupIntervalSecs, diff --git a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml index 5e1091b0eef..075d8eec5fd 100644 --- 
a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml @@ -29,6 +29,7 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + dir: "logs" # log directory; default: logs max-age-days: 7 # max days to keep log files; default: 7 max-total-size-mb: 500 # max total log size in MB; default: 500 cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/internal/config/config.go b/fastdeploy/golang_router/internal/config/config.go index f184a5b16da..7a6dc3fc504 100644 --- a/fastdeploy/golang_router/internal/config/config.go +++ b/fastdeploy/golang_router/internal/config/config.go @@ -51,6 +51,7 @@ type SchedulerConfig struct { type LogConfig struct { Level string `yaml:"level"` // debug, info, warn, error Output string `yaml:"output"` // stdout, file + Dir string `yaml:"dir"` // log directory; defaults to "logs" MaxAgeDays int `yaml:"max-age-days"` // max days to keep log files; 0 = use default (7) MaxTotalSizeMB int `yaml:"max-total-size-mb"` // max total log size in MB; 0 = use default (500) CleanupIntervalSecs float64 `yaml:"cleanup-interval-secs"` // cleanup check interval in seconds; 0 = use default (3600) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index bd1cfdcb7eb..daa23d55450 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -16,6 +16,7 @@ import ( type Config struct { Level string Output string + Dir string // log directory; defaults to "logs" MaxAgeDays int MaxTotalSizeMB int CleanupIntervalSecs float64 @@ -88,15 +89,53 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { }, nil } -func (w *rotatingWriter) Write(p []byte) (n int, err error) { +// needsRotate checks if rotation is needed under the lock. 
+func (w *rotatingWriter) needsRotate(today string) (bool, string) { w.mu.Lock() defer w.mu.Unlock() + needs := today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter)) + return needs, w.logDir +} + +// tryOpenRotateFile checks if rotation is needed and pre-opens the new log file +// outside the lock to avoid blocking other writers on slow file I/O. +func (w *rotatingWriter) tryOpenRotateFile(today string) *os.File { + needs, logDir := w.needsRotate(today) + if !needs { + return nil + } + + datePath := filepath.Join(logDir, "router-"+today+".log") + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) + return nil + } + return f +} +func (w *rotatingWriter) Write(p []byte) (n int, err error) { today := nowFunc().Format("2006-01-02") - // Detect day change and rotate. Also retry failed rotations after backoff. + // Pre-open new file outside the lock to reduce lock-held I/O time. + preOpened := w.tryOpenRotateFile(today) + + w.mu.Lock() + defer w.mu.Unlock() + + // Authoritative rotation check under lock. if today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter)) { - w.rotateLocked(today) + if preOpened != nil { + w.commitRotate(today, preOpened) + preOpened = nil // ownership transferred + } else { + // File open failed; set backoff so we don't retry on every Write. + w.retryAfter = nowFunc().Add(30 * time.Second) + } + } + // If another goroutine already rotated, close the unused pre-opened file. + if preOpened != nil { + preOpened.Close() } // Close previous file if grace period expired. @@ -130,19 +169,8 @@ func (w *rotatingWriter) Close() error { return nil } -// rotateLocked performs the actual file rotation. Must be called with w.mu held. 
-func (w *rotatingWriter) rotateLocked(newDate string) { - // Open new date file for the new day first, before touching any state. - datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) - if err != nil { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) - // Don't advance currentDate — keep writing to the old file and retry - // after a backoff to avoid hammering the filesystem on every Write call. - w.retryAfter = nowFunc().Add(30 * time.Second) - return - } - +// commitRotate finalises the rotation with a pre-opened file. Must be called with w.mu held. +func (w *rotatingWriter) commitRotate(newDate string, f *os.File) { // Rotation succeeded — clear any retry backoff. w.retryAfter = time.Time{} @@ -201,16 +229,35 @@ func parseLogDate(p []byte) string { for i := 0; i+10 <= len(s); i++ { c := s[i] if c >= '0' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { - // Found a candidate "YYYY/MM/DD" + // Found a candidate "YYYY/MM/DD" — validate it. year := s[i : i+4] month := s[i+5 : i+7] day := s[i+8 : i+10] + if !isAllDigits(month) || !isAllDigits(day) { + continue + } + m := (month[0]-'0')*10 + (month[1] - '0') + d := (day[0]-'0')*10 + (day[1] - '0') + if m < 1 || m > 12 || d < 1 || d > 31 { + continue + } + _ = year // year already starts with a digit; any 4-digit year is acceptable return year + "-" + month + "-" + day } } return "" } +// isAllDigits returns true if every byte in s is an ASCII digit. +func isAllDigits(s string) bool { + for i := 0; i < len(s); i++ { + if s[i] < '0' || s[i] > '9' { + return false + } + } + return true +} + // Init initializes the logger. 
func Init(cfg Config) { once.Do(func() { @@ -218,13 +265,17 @@ func Init(cfg Config) { flags := log.LstdFlags | log.Lshortfile if cfg.Output == "file" { - if _, err := os.Stat("logs"); os.IsNotExist(err) { - if err := os.MkdirAll("logs", 0755); err != nil { + logDir := cfg.Dir + if logDir == "" { + logDir = "logs" + } + if _, err := os.Stat(logDir); os.IsNotExist(err) { + if err := os.MkdirAll(logDir, 0755); err != nil { log.Fatalln("Failed to create logs directory:", err) } } var err error - writer, err = newRotatingWriter("logs") + writer, err = newRotatingWriter(logDir) if err != nil { log.Fatalln("Failed to create rotating log writer:", err) } @@ -259,6 +310,11 @@ func StartLogCleanup(ctx context.Context, cfg Config) { return } + logDir := cfg.Dir + if logDir == "" { + logDir = "logs" + } + ticker := time.NewTicker(time.Duration(cfg.CleanupIntervalSecs * float64(time.Second))) defer ticker.Stop() @@ -267,7 +323,7 @@ func StartLogCleanup(ctx context.Context, cfg Config) { case <-ctx.Done(): return case <-ticker.C: - cleanupLogs("logs", cfg.MaxAgeDays, cfg.MaxTotalSizeMB) + cleanupLogs(logDir, cfg.MaxAgeDays, cfg.MaxTotalSizeMB) } } } From fd3c013b90284d9c11181509e1485b60c5740926 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Tue, 14 Apr 2026 19:35:29 +0800 Subject: [PATCH 40/40] test(golang_router): cover cleanup loop and cross-day log rolling --- .../golang_router/pkg/logger/logger_test.go | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index 921f26aa166..1d9874ded6f 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -4,8 +4,10 @@ import ( "bytes" "context" "os" + "path/filepath" "strings" "testing" + "time" ) func TestLoggerInit(t *testing.T) { @@ -216,3 +218,142 @@ func TestParseLogDate(t *testing.T) { }) } } + +func TestStartLogCleanup(t 
*testing.T) { + t.Run("cleanup runs for file output and respects cancellation", func(t *testing.T) { + tmpDir := t.TempDir() + + originalNowFunc := nowFunc + fixedNow := time.Date(2026, 4, 10, 12, 0, 0, 0, time.UTC) + nowFunc = func() time.Time { return fixedNow } + defer func() { nowFunc = originalNowFunc }() + + // Create archived logs: one older than 1 day and one recent. + oldLog := filepath.Join(tmpDir, "router-2026-04-07.log") + recentLog := filepath.Join(tmpDir, "router-2026-04-09.log") + todayLog := filepath.Join(tmpDir, "router-2026-04-10.log") + for _, p := range []string{oldLog, recentLog, todayLog} { + if err := os.WriteFile(p, []byte("test"), 0644); err != nil { + t.Fatalf("failed to create test log %s: %v", p, err) + } + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + defer close(done) + StartLogCleanup(ctx, Config{ + Output: "file", + Dir: tmpDir, + MaxAgeDays: 2, + CleanupIntervalSecs: 0.01, + }) + }() + + waitForCondition(t, 500*time.Millisecond, func() bool { + _, err := os.Stat(oldLog) + return os.IsNotExist(err) + }, "old log should be removed by StartLogCleanup") + + if _, err := os.Stat(recentLog); err != nil { + t.Fatalf("recent log should be kept, stat err: %v", err) + } + if _, err := os.Stat(todayLog); err != nil { + t.Fatalf("today log should be kept, stat err: %v", err) + } + + cancel() + select { + case <-done: + case <-time.After(500 * time.Millisecond): + t.Fatal("StartLogCleanup did not stop after context cancellation") + } + }) + + t.Run("non-file output returns immediately", func(t *testing.T) { + done := make(chan struct{}) + go func() { + defer close(done) + StartLogCleanup(context.Background(), Config{Output: "stdout", CleanupIntervalSecs: 1}) + }() + select { + case <-done: + case <-time.After(200 * time.Millisecond): + t.Fatal("StartLogCleanup should return immediately for non-file output") + } + }) +} + +func TestRotatingWriterCrossDayGracePeriodIntegration(t 
*testing.T) { + tmpDir := t.TempDir() + + originalNowFunc := nowFunc + defer func() { nowFunc = originalNowFunc }() + + current := time.Date(2026, 4, 10, 23, 59, 59, 0, time.UTC) + nowFunc = func() time.Time { return current } + + w, err := newRotatingWriter(tmpDir) + if err != nil { + t.Fatalf("failed to create rotating writer: %v", err) + } + defer w.Close() + + if _, err = w.Write([]byte("[INFO] 2026/04/10 23:59:59 first day line\n")); err != nil { + t.Fatalf("failed to write day-1 line: %v", err) + } + + current = time.Date(2026, 4, 11, 0, 0, 1, 0, time.UTC) + if _, err = w.Write([]byte("[INFO] 2026/04/11 00:00:01 second day line\n")); err != nil { + t.Fatalf("failed to write day-2 line: %v", err) + } + + if _, err = w.Write([]byte("[INFO] 2026/04/10 23:59:58 late previous-day line\n")); err != nil { + t.Fatalf("failed to write late previous-day line: %v", err) + } + + day1Bytes, err := os.ReadFile(filepath.Join(tmpDir, "router-2026-04-10.log")) + if err != nil { + t.Fatalf("failed to read day-1 log: %v", err) + } + day1Content := string(day1Bytes) + if !strings.Contains(day1Content, "first day line") { + t.Fatalf("day-1 log missing initial line, content: %s", day1Content) + } + if !strings.Contains(day1Content, "late previous-day line") { + t.Fatalf("day-1 log missing late previous-day line, content: %s", day1Content) + } + + day2Bytes, err := os.ReadFile(filepath.Join(tmpDir, "router-2026-04-11.log")) + if err != nil { + t.Fatalf("failed to read day-2 log: %v", err) + } + day2Content := string(day2Bytes) + if !strings.Contains(day2Content, "second day line") { + t.Fatalf("day-2 log missing day-2 line, content: %s", day2Content) + } + if strings.Contains(day2Content, "late previous-day line") { + t.Fatalf("late previous-day line should not be in day-2 file, content: %s", day2Content) + } + + symlinkTarget, err := os.Readlink(filepath.Join(tmpDir, "router.log")) + if err != nil { + t.Fatalf("failed to read symlink: %v", err) + } + if symlinkTarget != 
"router-2026-04-11.log" { + t.Fatalf("router.log symlink target = %s, want router-2026-04-11.log", symlinkTarget) + } +} + +func waitForCondition(t *testing.T, timeout time.Duration, cond func() bool, msg string) { + t.Helper() + + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if cond() { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatal(msg) +}