From cad1e7acf7694842dba56d2af202fad5da263fe8 Mon Sep 17 00:00:00 2001 From: mouxin Date: Sun, 12 Apr 2026 15:31:35 +0800 Subject: [PATCH 01/40] [Feature] Add troubleshoot and stat-cache-hitrate skills --- docs/online_serving/router.md | 2 +- docs/online_serving/router_faq.md | 41 +- docs/zh/online_serving/router.md | 2 +- docs/zh/online_serving/router_faq.md | 41 +- .../skills/stat-cache-hitrate/SKILL.md | 119 +++ .../evals/trigger_eval.json | 18 + .../references/log_formats.md | 139 +++ .../references/report_templates.md | 199 +++++ .../stat-cache-hitrate/scripts/chart.py | 249 ++++++ .../stat-cache-hitrate/scripts/log_parser.py | 358 ++++++++ .../scripts/stat_cache_hitrate.py | 669 ++++++++++++++ .../stat-cache-hitrate/scripts/stats.py | 278 ++++++ .../.claude/skills/troubleshoot/SKILL.md | 148 ++++ .../troubleshoot/evals/trigger_eval.json | 18 + .../troubleshoot/references/error_catalog.md | 122 +++ .../references/fastdeploy_cross_reference.md | 102 +++ .../troubleshoot/references/log_patterns.md | 282 ++++++ .../references/report_templates.md | 120 +++ .../scripts/analyzers/__init__.py | 1 + .../troubleshoot/scripts/analyzers/cache.py | 458 ++++++++++ .../troubleshoot/scripts/analyzers/errors.py | 314 +++++++ .../troubleshoot/scripts/analyzers/health.py | 421 +++++++++ .../troubleshoot/scripts/analyzers/latency.py | 355 ++++++++ .../troubleshoot/scripts/analyzers/load.py | 389 ++++++++ .../troubleshoot/scripts/analyzers/trace.py | 391 ++++++++ .../skills/troubleshoot/scripts/chart.py | 351 ++++++++ .../skills/troubleshoot/scripts/log_parser.py | 832 ++++++++++++++++++ .../skills/troubleshoot/scripts/stats.py | 278 ++++++ .../troubleshoot/scripts/troubleshoot.py | 334 +++++++ fastdeploy/golang_router/.gitignore | 2 + 30 files changed, 7021 insertions(+), 12 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json
create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py create 
mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py create mode 100644 fastdeploy/golang_router/.gitignore diff --git a/docs/online_serving/router.md b/docs/online_serving/router.md index 82940e5680c..7abc9c06af3 100644 --- a/docs/online_serving/router.md +++ b/docs/online_serving/router.md @@ -194,7 +194,7 @@ scheduler: policy: "power_of_two" # Scheduling policy (optional): random, power_of_two, round_robin, process_tokens, request_num, cache_aware, remote_cache_aware, fd_metrics_score, fd_remote_metrics_score prefill-policy: "cache_aware" # Prefill scheduling policy in PD mode decode-policy: "request_num" # Decode scheduling policy in PD mode - eviction-interval-secs: 60 # Cache eviction interval for CacheAware scheduling + eviction-interval-secs: 60 # Counter eviction interval for CacheAware scheduling eviction-duration-mins: 30 # Eviction duration for cache-aware radix tree nodes (minutes); default: 30 balance-abs-threshold: 1 # Absolute threshold for CacheAware balancing balance-rel-threshold: 0.2 # Relative threshold for CacheAware balancing diff --git a/docs/online_serving/router_faq.md b/docs/online_serving/router_faq.md index 49083539d4c..c0fb8cba4bf 100644 --- a/docs/online_serving/router_faq.md +++ b/docs/online_serving/router_faq.md @@ -29,6 +29,24 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `empty baseURL provided` | Health check received an empty base URL | Health check cannot be performed | Registration parameters | | `failed to create request: {error}` | Failed to create health check request | The instance may be marked as unhealthy | Network environment | | `failed to read response body: {error}` | Failed to read health check response body | The instance may be marked as unhealthy | Backend 
instance status | +| `Failed to select mixed worker: {error}` | Failed to select Mixed worker in centralized mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to select prefill worker: {error}` | Failed to select Prefill worker in PD disaggregated mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to read register request body: {error}` | Failed to read registration request body | Registration request returns 400 | Request format | +| `Failed to unmarshal register request JSON: {error}` | Failed to parse registration request JSON | Registration request returns 400 | Request format | +| `Failed to create decode request for {url}: {error}` | Failed to create HTTP request to Decode instance | Current request fails | Network environment | +| `Failed to create prefill request for {url}: {error}` | Failed to create HTTP request to Prefill instance | Current request fails | Network environment | +| `Decode request failed for {url}: {error}` | Request to Decode instance failed | Current request fails | Backend instance status, network connectivity | +| `Prefill request failed for {url}: {error}` | Request to Prefill instance failed | Current request fails | Backend instance status, network connectivity | +| `Failed to read request body: {error}` | Failed to read inference request body | Current request returns 400 | Request format | +| `Failed to unmarshal request JSON: {error}` | Failed to parse inference request JSON | Current request returns 400 | Request format | +| `Failed to select worker pair: {error}` | Failed to select worker pair in PD disaggregated mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to build disaggregate_info: {error}` | Failed to build PD disaggregation communication info | Current request returns 500 | Registration parameters (connector_port, device_ids, etc.) 
| +| `Failed to encode modified request: {error}` | Failed to encode modified request body | Current request returns 500 | Request content | +| `Failed to select worker: {error}` | Failed to select worker in centralized mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to connect to backend service: {error}` | Failed to connect to backend inference instance (after 3 retries) | Current request returns 502 | Backend instance status, network connectivity | +| `Request failed (attempt {n}/{max}): {error}` | Request attempt {n} failed | If retries exhausted, request returns 502 | Backend instance status, network connectivity | +| `Failed to create backend request for {url}: {error}` | Failed to create HTTP request to backend | Current request fails | Network environment | +| `Backend request failed for {url}: {error}` | Request to backend instance failed | Current request fails | Backend instance status, network connectivity | ### Warn-Level Logs @@ -37,8 +55,9 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `Server {url} is not healthy` | The instance at this URL failed health check | Router cannot register the instance, or will remove it from the registered list | Health status | | `Instance {url} role is unknown` | Instance role cannot be recognized | The instance will not be added to the scheduling list | Registration parameters | | `cache-aware prefill: tokenizer failed, fallback to char tokens: {error}` | Tokenizer service call failed, automatically falling back to character-based tokenization | cache_aware strategy remains active, using character-based tokenization for cache matching instead of the Tokenizer; normal request processing is not affected | Tokenizer service status | -| `cache-aware prefill: tokenize failed, fallback to process_tokens: {error}` | Tokenization completely failed (e.g., empty input), falling back to process_tokens strategy | Prefill scheduling temporarily does not 
use cache_aware strategy; normal request processing is not affected | Request content, Tokenizer service status | -| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | Tokenization failed (new format), falling back to process_tokens strategy | Prefill scheduling temporarily does not use cache_aware strategy; normal request processing is not affected | Request content, Tokenizer service status | +| `GetRemoteMetrics failed for {url}, falling back to local counter: {error}` | Failed to fetch remote metrics, falling back to local counter | Scheduling accuracy may decrease; normal request processing is not affected | Backend instance metrics port, network connectivity | +| `release worker: {url} skipped, counter already cleaned up` | Worker counter was already cleaned up when trying to release | May occur when a worker is removed by health check while requests are still in-flight | Health status, request timing | +| `release worker: {url} skipped, counter already zero (possible double-release)` | Worker counter is already zero when trying to release | Possible duplicate counter release | Request processing logic | ### Info-Level Logs @@ -49,7 +68,6 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `Successfully registered instance from index {index}` | Instance from config file registered successfully | Normal startup log | | `No instances found in config file {path}` | No instances found in the registration config file | Check whether register.yaml is empty | | `Request completed successfully.` | Request processing completed | Normal operation log | -| `Request failed, retrying...` | Request failed, retrying | Router will retry up to 3 times | | `select worker (prefill): {url}, tokens: {tokens}` | Prefill scheduler selected a worker, showing current token processing count | Normal operation log | | `select worker ({type}): {url}, count: {count}` | Decode/Mixed scheduler selected a 
worker, showing current request concurrency | Normal operation log | | `release worker: {url}, count: {count}` | Request ended, worker counter released | Normal operation log | @@ -58,7 +76,6 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `removed counters for {count} unhealthy workers: {urls}` | Batch cleanup of counters for unhealthy workers | Normal operation log | | `[stats] total_running={n}, workers: [{loads}], cache_hit_rate={rate}% (hits={hits}/total={total})` | Periodic stats: total requests, worker loads, cache hit rate | Normal operation log, useful for monitoring and tuning | | `Parsing completed; starting worker selection.` | Request parsing completed, starting worker selection | Normal operation log | -| `Request completed with an error.` | Request processing completed with an error | Check backend instance status | | `[SelectWorkerPair] decode selection failed, releasing prefill counter url={url}` | Decode selection failed in PD disaggregated mode, releasing Prefill counter | Error handling log | | `[prefill] first chunk received, release counter url={url}` | Prefill streaming response received first chunk, counter released | Normal operation log | | `[prefill] non-stream prefill response done, release counter url={url}` | Prefill non-streaming response completed, counter released | Normal operation log | @@ -66,12 +83,17 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `[prefill] release in defer (fallback) url={url}, isStream={bool}` | Fallback resource release when Prefill request exits abnormally | Error handling log | | `[prefill] release in CommonCompletions defer (error path) url={url}` | Prefill resource release on error path | Error handling log | | `cache-aware prefill: final strategy: process_tokens, reason: strategy not initialized` | cache_aware strategy not initialized, falling back to process_tokens | Check cache_aware configuration | +| `cache-aware prefill: 
final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | Tokenization failed, falling back to process_tokens strategy | Prefill scheduling temporarily does not use cache_aware strategy; normal request processing is not affected | | `cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads={loads}. ts_ms={ts}` | Load imbalanced across instances, falling back to process_tokens strategy | Normal operation log, automatic load balancing switch | | `cache-aware prefill: final strategy: cache_aware_scoring, selected={url}, loads={loads}, hitRatios={ratios}. ts_ms={ts}` | cache_aware scoring strategy selected a worker | Normal operation log, showing loads and hit ratios | | `[{method}] {path} {proto} {status} {latency} {clientIP}` | HTTP request access log | Normal operation log, records basic info for each request | | `before SelectWorker prefill. ts_ms={ts}` | Starting Prefill worker selection in PD disaggregated mode | Normal operation log, for performance tracing | | `before SelectWorker decode, after prefill. ts_ms={ts}` | Starting Decode worker selection after Prefill selection | Normal operation log, for performance tracing | | `after SelectWorker decode, before return. 
ts_ms={ts}` | Decode worker selection completed | Normal operation log, for performance tracing | +| `unhealthy worker counter preserved (inflight requests): {url}, count: {count}` | Unhealthy worker still has in-flight requests, counter temporarily preserved | Normal operation log, will be auto-cleaned after in-flight requests complete | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {tokens}` | Unhealthy worker still has in-flight token load, token counter temporarily preserved | Normal operation log, will be auto-cleaned after in-flight requests complete | +| `cleanup unhealthy worker token counter: {url}` | Cleaned up token counter for unhealthy worker | Normal operation log | +| `preserved counters for {count} workers with inflight requests: {urls}` | Batch preserved counters for workers with in-flight requests | Normal operation log | ### Debug-Level Logs @@ -100,6 +122,10 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `{"error": "Failed to build disaggregate_info"}` | 500 | Failed to build PD disaggregation communication info | Registration parameters (connector_port, device_ids, etc.) | | `{"error": "Invalid request body"}` | 400 | Failed to read request body | Request format | | `{"error": "Invalid JSON format"}` | 400 | Failed to parse request body JSON | Request format | +| `{"error": "Failed to encode modified request: {error}"}` | 500 | Failed to encode modified request body | Request content | +| `{"code": 500, "msg": "Internal server error"}` | 500 | A panic occurred during request processing and was recovered | Backend instance status, request content | + +> **Note**: In PD disaggregated (splitwise) mode, the above error responses include an additional `request_id` field, e.g., `{"error": "...", "request_id": "xxx"}`. Additionally, `Invalid request body` and `Invalid JSON format` responses include specific error details, e.g., `{"error": "Invalid request body: EOF"}`. 
### Registration Request Errors (/register) @@ -111,6 +137,7 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `{"code": 400, "msg": "splitwise mode only supports PREFILL/DECODE instances"}` | 400 | MIXED instances are not allowed in PD disaggregated mode | Deployment mode, instance role | | `{"code": 400, "msg": "only MIXED instances are allowed"}` | 400 | Only MIXED instances are allowed in centralized mode | Deployment mode, instance role | | `{"code": 400, "msg": "invalid InstanceInfo format: {error}"}` | 400 | Instance registration info validation failed | Registration parameters | +| `{"code": 400, "msg": "DefaultManager is nil"}` | 400 | Router internal manager not initialized | Router startup status | | `{"code": 200, "msg": "Register success"}` | 200 | Registration successful | — | ### Common Registration Parameter Validation Errors @@ -124,6 +151,10 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `port is required` | Missing port field | Add the port field | | `invalid port: {port}` | port is not a valid port number | Provide a port number in the range 1-65535 | | `invalid protocol: {protocol}` | Invalid transfer protocol | Use a valid protocol value: ipc / rdma | +| `invalid connector_port: {port}` | connector_port is not a valid port number | Provide a port number in the range 1-65535 | +| `invalid engine_worker_queue_port: {port}` | engine_worker_queue_port is not a valid port number | Provide a port number in the range 1-65535 | +| `invalid metrics_port: {port}` | metrics_port is not a valid port number | Provide a port number in the range 1-65535 | +| `rdma_ports[{index}] invalid port: {port}` | Port at index {index} in RDMA ports list is not valid | Provide a port number in the range 1-65535 | ## Troubleshooting Guide @@ -236,7 +267,7 @@ If `Failed to start server` appears in startup logs, check: When using the `cache_aware` scheduling strategy, the Router calls a Tokenizer 
service to tokenize requests for cache hit ratio computation. When the Tokenizer service is unavailable, the Router has a two-level degradation mechanism: 1. **Fallback to character-based tokenization** (common case): The log will show `tokenizer failed, fallback to char tokens`. The cache_aware strategy remains active, using character-based tokenization for cache matching instead of the Tokenizer. Cache hit accuracy may decrease, but normal request processing is not affected. -2. **Fallback to process_tokens strategy** (extreme case): When tokenization completely fails (e.g., empty request content), the log will show `tokenize failed, fallback to process_tokens`. The cache_aware strategy temporarily becomes inactive, and scheduling falls back to token processing volume. Normal request processing is not affected. +2. **Fallback to process_tokens strategy** (extreme case): When tokenization completely fails (e.g., empty request content), the log will show `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` (Info level). The cache_aware strategy temporarily becomes inactive, and scheduling falls back to token processing volume. Normal request processing is not affected. 
To restore full cache_aware functionality: diff --git a/docs/zh/online_serving/router.md b/docs/zh/online_serving/router.md index 0ace28c2da1..375f036ad2c 100644 --- a/docs/zh/online_serving/router.md +++ b/docs/zh/online_serving/router.md @@ -194,7 +194,7 @@ scheduler: policy: "power_of_two" # 调度策略(可选): random, power_of_two, round_robin, process_tokens, request_num, cache_aware, remote_cache_aware, fd_metrics_score, fd_remote_metrics_score; 默认: request_num prefill-policy: "cache_aware" # pd分离模式下prefill节点调度策略; 默认: process_tokens decode-policy: "request_num" # pd分离模式下decode节点调度策略; 默认: request_num - eviction-interval-secs: 60 # cache-aware策略清理过期cache的间隔时间 + eviction-interval-secs: 60 # cache-aware策略清理过期计数器的间隔时间 eviction-duration-mins: 30 # cache-aware策略radix tree节点驱逐时间(分钟); 默认: 30 balance-abs-threshold: 1 # cache-aware策略绝对阈值 balance-rel-threshold: 0.2 # cache-aware策略相对阈值 diff --git a/docs/zh/online_serving/router_faq.md b/docs/zh/online_serving/router_faq.md index a42ed015283..9c32726f4dc 100644 --- a/docs/zh/online_serving/router_faq.md +++ b/docs/zh/online_serving/router_faq.md @@ -29,6 +29,24 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `empty baseURL provided` | 健康检查时传入了空的基础 URL | 健康检查无法执行 | 注册参数 | | `failed to create request: {error}` | 创建健康检查请求失败 | 该实例可能被判定为不健康 | 网络环境 | | `failed to read response body: {error}` | 读取健康检查响应体失败 | 该实例可能被判定为不健康 | 后端实例状态 | +| `Failed to select mixed worker: {error}` | 集中式模式下选择 Mixed Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to select prefill worker: {error}` | PD 分离模式下选择 Prefill Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to read register request body: {error}` | 读取注册请求体失败 | 该注册请求返回 400 | 请求格式 | +| `Failed to unmarshal register request JSON: {error}` | 解析注册请求 JSON 失败 | 该注册请求返回 400 | 请求格式 | +| `Failed to create decode request for {url}: {error}` | 创建发往 Decode 实例的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Failed to create prefill request for {url}: {error}` | 创建发往 Prefill 实例的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Decode request failed 
for {url}: {error}` | 发往 Decode 实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | +| `Prefill request failed for {url}: {error}` | 发往 Prefill 实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | +| `Failed to read request body: {error}` | 读取推理请求体失败 | 当前请求返回 400 | 请求格式 | +| `Failed to unmarshal request JSON: {error}` | 解析推理请求 JSON 失败 | 当前请求返回 400 | 请求格式 | +| `Failed to select worker pair: {error}` | PD 分离模式下选择 Worker 对失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to build disaggregate_info: {error}` | 构建 PD 分离通信信息失败 | 当前请求返回 500 | 注册参数(connector_port、device_ids 等) | +| `Failed to encode modified request: {error}` | 编码修改后的请求体失败 | 当前请求返回 500 | 请求内容 | +| `Failed to select worker: {error}` | 集中式模式下选择 Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to connect to backend service: {error}` | 连接后端推理实例失败(已重试 3 次仍失败) | 当前请求返回 502 | 后端实例状态、网络连通性 | +| `Request failed (attempt {n}/{max}): {error}` | 请求发送第 {n} 次尝试失败 | 若重试耗尽则请求返回 502 | 后端实例状态、网络连通性 | +| `Failed to create backend request for {url}: {error}` | 创建发往后端的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Backend request failed for {url}: {error}` | 发往后端实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | ### Warn 级别日志 @@ -37,8 +55,9 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `Server {url} is not healthy` | 该 URL 对应的实例未通过健康检查 | Router 无法注册该实例,或将该实例从已注册列表中移除 | 健康状况 | | `Instance {url} role is unknown` | 实例角色无法识别 | 该实例不会被加入调度列表 | 注册参数 | | `cache-aware prefill: tokenizer failed, fallback to char tokens: {error}` | Tokenizer 服务调用失败,已自动回退至字符级分词 | cache_aware 策略仍然生效,使用字符级分词代替 Tokenizer 进行缓存匹配,不影响正常请求处理 | Tokenizer 服务状态 | -| `cache-aware prefill: tokenize failed, fallback to process_tokens: {error}` | 分词彻底失败(如输入为空),回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | 请求内容、Tokenizer 服务状态 | -| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. 
ts_ms={ts}` | 分词失败(新格式),回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | 请求内容、Tokenizer 服务状态 | +| `GetRemoteMetrics failed for {url}, falling back to local counter: {error}` | 获取远程 metrics 失败,已回退至本地计数器 | 调度精度可能下降,不影响正常请求处理 | 后端实例 metrics 端口、网络连通性 | +| `release worker: {url} skipped, counter already cleaned up` | 释放 Worker 计数器时发现已被清理 | 可能是 Worker 被健康检查移除后仍有在途请求完成 | 健康状况、请求时序 | +| `release worker: {url} skipped, counter already zero (possible double-release)` | 释放 Worker 计数器时发现已归零 | 可能存在计数器重复释放 | 请求处理逻辑 | ### Info 级别日志 @@ -49,7 +68,6 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `Successfully registered instance from index {index}` | 配置文件中的实例注册成功 | 正常启动日志 | | `No instances found in config file {path}` | 注册配置文件中未找到实例信息 | 请检查 register.yaml 内容是否为空 | | `Request completed successfully.` | 请求处理完成 | 正常运行日志 | -| `Request failed, retrying...` | 请求失败,正在进行重试 | Router 最多重试 3 次 | | `select worker (prefill): {url}, tokens: {tokens}` | Prefill 调度选中 Worker,显示当前 token 处理量 | 正常运行日志 | | `select worker ({type}): {url}, count: {count}` | Decode/Mixed 调度选中 Worker,显示当前请求并发数 | 正常运行日志 | | `release worker: {url}, count: {count}` | 请求结束,释放 Worker 计数器 | 正常运行日志 | @@ -58,7 +76,6 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `removed counters for {count} unhealthy workers: {urls}` | 批量清理不健康 Worker 的计数器 | 正常运行日志 | | `[stats] total_running={n}, workers: [{loads}], cache_hit_rate={rate}% (hits={hits}/total={total})` | 周期性统计:总请求数、各 Worker 负载、缓存命中率 | 正常运行日志,用于监控调优 | | `Parsing completed; starting worker selection.` | 请求解析完成,开始选择 Worker | 正常运行日志 | -| `Request completed with an error.` | 请求处理完成但发生错误 | 请排查后端实例状态 | | `[SelectWorkerPair] decode selection failed, releasing prefill counter url={url}` | PD 分离模式下 Decode 选择失败,释放 Prefill 计数器 | 异常处理日志 | | `[prefill] first chunk received, release counter url={url}` | Prefill 流式响应收到首个数据块,释放计数器 | 正常运行日志 | | `[prefill] non-stream prefill response done, release counter url={url}` | Prefill 非流式响应完成,释放计数器 | 正常运行日志 | @@ -72,6 +89,11 @@ Router 
的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `before SelectWorker prefill. ts_ms={ts}` | PD 分离模式下开始选择 Prefill Worker | 正常运行日志,用于性能追踪 | | `before SelectWorker decode, after prefill. ts_ms={ts}` | Prefill 选择完成后开始选择 Decode Worker | 正常运行日志,用于性能追踪 | | `after SelectWorker decode, before return. ts_ms={ts}` | Decode Worker 选择完成 | 正常运行日志,用于性能追踪 | +| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | 分词失败,回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | +| `unhealthy worker counter preserved (inflight requests): {url}, count: {count}` | 不健康 Worker 仍有在途请求,计数器暂时保留 | 正常运行日志,待在途请求完成后自动清理 | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {tokens}` | 不健康 Worker 仍有在途 token 负载,token 计数器暂时保留 | 正常运行日志,待在途请求完成后自动清理 | +| `cleanup unhealthy worker token counter: {url}` | 清理不健康 Worker 的 token 计数器 | 正常运行日志 | +| `preserved counters for {count} workers with inflight requests: {urls}` | 批量保留仍有在途请求的 Worker 计数器 | 正常运行日志 | ### Debug 级别日志 @@ -100,6 +122,10 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `{"error": "Failed to build disaggregate_info"}` | 500 | 构建 PD 分离通信信息失败 | 注册参数(connector_port、device_ids 等) | | `{"error": "Invalid request body"}` | 400 | 请求体读取失败 | 请求格式 | | `{"error": "Invalid JSON format"}` | 400 | 请求体 JSON 解析失败 | 请求格式 | +| `{"error": "Failed to encode modified request: {error}"}` | 500 | 编码修改后的请求体失败 | 请求内容 | +| `{"code": 500, "msg": "Internal server error"}` | 500 | 请求处理过程中发生 panic 并被恢复 | 后端实例状态、请求内容 | + +> **说明**:在 PD 分离(splitwise)模式下,以上错误响应会额外包含 `request_id` 字段,如 `{"error": "...", "request_id": "xxx"}`。此外,`Invalid request body` 和 `Invalid JSON format` 的实际输出会包含具体的错误详情,如 `{"error": "Invalid request body: EOF"}`。 ### 注册请求错误(/register) @@ -112,6 +138,7 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `{"code": 400, "msg": "only MIXED instances are allowed"}` | 400 | 集中式模式下只允许注册 MIXED 实例 | 部署模式、实例角色 | | `{"code": 400, "msg": "invalid InstanceInfo format: {error}"}` | 400 | 
实例注册信息校验失败 | 注册参数 | | `{"code": 200, "msg": "Register success"}` | 200 | 注册成功 | — | +| `{"code": 400, "msg": "DefaultManager is nil"}` | 400 | Router 内部管理器未初始化 | Router 启动状态 | ### 常见注册参数校验错误 @@ -124,6 +151,10 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `port is required` | 缺少 port 字段 | 添加 port 字段 | | `invalid port: {port}` | port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | | `invalid protocol: {protocol}` | 传输协议不合法 | 使用合法的协议值:ipc / rdma | +| `invalid connector_port: {port}` | connector_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `invalid engine_worker_queue_port: {port}` | engine_worker_queue_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `invalid metrics_port: {port}` | metrics_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `rdma_ports[{index}] invalid port: {port}` | RDMA 端口列表中第 {index} 个端口号不合法 | 填写 1-65535 范围内的端口号 | ## 常见问题排查方式 @@ -236,7 +267,7 @@ PD 分离模式下建议完整配置以下参数,以确保 KV Cache 传输正 使用 `cache_aware` 调度策略时,Router 会调用 Tokenizer 服务对请求进行分词以计算缓存命中率。当 Tokenizer 服务不可用时,Router 内置了两级退化机制: 1. **回退至字符级分词**(常见情况):日志出现 `tokenizer failed, fallback to char tokens`。此时 cache_aware 策略仍然生效,只是使用字符级分词代替 Tokenizer 进行缓存匹配,缓存命中精度会有所下降,但不影响正常请求处理。 -2. **回退至 process_tokens 策略**(极端情况):当分词彻底失败(如请求内容为空)时,日志出现 `tokenize failed, fallback to process_tokens`。此时 cache_aware 策略暂时不生效,改为按 token 处理量进行调度,同样不影响正常请求处理。 +2. **回退至 process_tokens 策略**(极端情况):当分词彻底失败(如请求内容为空)时,日志出现 `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. 
ts_ms={ts}`(Info 级别)。此时 cache_aware 策略暂时不生效,改为按 token 处理量进行调度,同样不影响正常请求处理。 如需恢复 cache_aware 策略的完整功能: diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md new file mode 100644 index 00000000000..6534fb332f2 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -0,0 +1,119 @@ +--- +name: stat-cache-hitrate +description: > + 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 + 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 + Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 + 持续监控模式。 + + 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 + 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 + 关键词:cache 命中率、hitRatio、cache-aware、prefix hit、session hit rate、 + 缓存预热、/stat-cache-hitrate。 + +IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析规则。 +--- + +# Cache Hit Rate Statistics + +统计 FastDeploy Go Router 的三层 cache 命中率,生成可视化报告。 + +## 执行前交互 + +运行脚本前,Claude 必须先向用户确认以下参数: + +### 1. 日志文件路径 +使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +- 选项 1: `logs/router.log`(默认) +- 选项 2: `fd-router.log`(golang_router 根目录) +- 选项 3: 用户通过 Other 输入自定义路径 + +**重要规则**: +- 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 +- 用户指定路径后不要质疑、推荐替代文件、或以任何理由尝试切换到其他文件 +- 支持绝对路径(如 `/home/user/logs/xxx.log`)和相对路径(如 `logs/fd-router (2).log`) + +如果用户直接确认或未指定路径,使用默认值 `logs/router.log`。 + +### 2. 分析模式 +向用户询问分析模式: +> "请选择分析模式: +> 1. **全量统计**(默认)— 扫描完整日志 +> 2. **快速查看尾部** — 只看最近的数据(可指定行数如 2000 或时间如 30m) +> 3. **持续监控** — 全量分析后提示监控命令 +> 4. **指定时间段** — 分析特定时间范围(如 `--start "16:00" --end "17:00"`)" + +如果用户未选择,默认使用全量统计。 + +`--start/--end` 与 `--tail` 互斥。`--start` 和 `--end` 可单独或同时指定。 +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 +缺失部分自动从日志首末行推断。 + +### 3. 
输出目录 +分析结果默认保存到 `skill_output/stat-cache-hitrate/<timestamp>/`(自动按运行时间创建子目录)。 +用户可通过 `--output` 指定自定义目录。 + +## 使用方式 + +运行统计脚本(相对于 `fastdeploy/golang_router/` 目录): + +```bash +# 全量统计 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --output skill_output/stat-cache-hitrate/ + +# 快速查看尾部数据 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail # 默认最后 2000 行 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 5000 # 指定行数 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 指定时间 + +# 持续监控 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --watch + +# 指定时间段(--start 和 --end 可单独或同时使用) +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "16:00:00" --end "17:00:00" +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "2026/03/31 16:00:00" +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" +``` + +默认日志路径:`logs/router.log` 或 `fd-router.log`(相对于 `fastdeploy/golang_router/`)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate/<timestamp>/`。 + +脚本会自动根据文件大小选择解析策略:小文件(<5000 行)在内存中处理,大文件用 grep + 管道流式处理。 + +## 输出说明 + +### 三层指标 + +| 层级 | 指标 | 含义 | +|------|------|------| +| 第一层 | Prefix Hit Ratio | 被选中 worker 的 KV cache 命中率,反映内容级复用度 | +| 第二层 | Session Hit Rate | 带 session_id 的请求被路由到同一 worker 的比例 | +| 第三层 | Per-Worker Stats | 每个 prefill worker 被选中的次数和平均命中率排名 | + +### 输出文件位置 + +详细报告和图表输出到 `skill_output/stat-cache-hitrate/<timestamp>/` 目录,每次运行自动创建带时间戳的子目录。 + +- 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 +- `details/per_window_data.md` — 每5s窗口的完整明细数据(Prefix HR / Session HR / Scoring / Fallback / Running) + +### 交叉诊断矩阵 + +| Session HR | Prefix HR | 诊断 | +|------------|-----------|------| +| 高 | 高 | cache-aware 策略运行良好 | +| 高 | 低 | session 粘性好但 prompt 内容变化大,KV cache 实际复用低 | +| 低 | 高 | 换 worker 了但新 worker 
也有类似前缀缓存 | +| 低 | 低 | 负载均衡强制分散或缓存未预热 | + +## 重要规则 + +1. **`[stats]` 计数器 per-interval**:每 5s `atomic.Swap(0)` 重置,必须 sum 所有行计算累计值 +2. **Session HR 只统计带 session_id 的请求** +3. **Prefix HR 取 selected worker 的值**:不在 hitRatios map 中则为 0 +4. **此 skill 只关注 cache 命中率**:延迟/错误/健康等排查由 troubleshoot skill 负责 +5. **与 troubleshoot-cache 互补**:本 skill 做数值统计,troubleshoot-cache 做调度策略诊断 + +## 参考文件 + +- `references/log_formats.md` — 日志格式和解析规则 +- `references/report_templates.md` — 终端报告和详细导出的模板 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json new file mode 100644 index 00000000000..23c7f6d86aa --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json @@ -0,0 +1,18 @@ +[ + {"query": "帮我统计一下 router 的 cache 命中率,日志在 logs/fd-router.log", "should_trigger": true}, + {"query": "我想看看 cache-aware 调度的效果怎么样,hitRatio 数据是多少", "should_trigger": true}, + {"query": "prefix hit ratio 和 session hit rate 分别是多少?分析一下 logs/router.log", "should_trigger": true}, + {"query": "看一下最近30分钟的缓存预热情况,用 tail 模式快速扫一下", "should_trigger": true}, + {"query": "我刚部署了新的 cache-aware 策略,帮我跑一下 /stat-cache-hitrate 看看效果", "should_trigger": true}, + {"query": "每个 prefill worker 的缓存利用率排名是怎样的?哪个 worker 命中率最高", "should_trigger": true}, + {"query": "stat cache hitrate on our go router log, need to check the KV cache reuse rate", "should_trigger": true}, + {"query": "持续监控 cache 命中率变化趋势,我想看实时数据", "should_trigger": true}, + {"query": "router 最近老是返回 502,帮我排查一下什么问题", "should_trigger": false}, + {"query": "分析一下 router 的请求延迟,p99 是不是太高了", "should_trigger": false}, + {"query": "帮我 trace 一下这个请求 ID: abc-123-def,看看整个链路", "should_trigger": false}, + {"query": "Worker 健康状态怎么样?有没有频繁下线的", "should_trigger": false}, + {"query": "帮我写一个 Go 语言的 HTTP 路由框架", "should_trigger": false}, + {"query": "分析一下 nginx 的 access log,统计各个 URL 的访问量", "should_trigger": false}, + {"query": "router 负载不均衡,某些 
worker 的 running 计数异常高", "should_trigger": false}, + {"query": "帮我看看 FastDeploy 的部署文档,我想部署一个新模型", "should_trigger": false} +] diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md new file mode 100644 index 00000000000..bc29a4cbb25 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md @@ -0,0 +1,139 @@ +# 日志格式参考 + +本文件描述 FastDeploy Go Router 的日志格式和解析规则。统计 cache 命中率前必须阅读。 + +--- + +## 通用日志行格式 + +``` +[LEVEL] YYYY/MM/DD HH:MM:SS logger.go:: +``` + +- **Level**:`[INFO]`、`[ERROR]`、`[WARN]`、`[DEBUG]` +- **Timestamp**:`YYYY/MM/DD HH:MM:SS` +- **可选 context 前缀**:`[trace_id:...]`、`[req_id:...]`、`[session_id:...]`、`[request_id:...]` 可能出现在 `logger.go:XX:` 和实际消息之间,顺序固定(trace_id → req_id → session_id → request_id),但不一定全部出现 + +--- + +## 类别 A:Cache-Aware 策略行 + +### A1. cache_aware_scoring(正常走 cache-aware 路径) + +``` +[INFO] 2026/03/30 20:16:57 logger.go:79: [session_id:slimshetty/swebench-verified:sweb.eval.x86_64.psf__requests-1766] [request_id:565a594c-...] cache-aware prefill: final strategy: cache_aware_scoring, selected=http://10.52.95.17:9263, loads=map[http://10.52.95.146:9263:20 http://10.52.95.17:9263:20 ...], hitRatios=map[http://10.52.95.17:9263:100]. ts_ms=2026-03-30 20:16:57.021 +``` + +**提取字段**: +- `selected=` — 被选中的 worker URL,格式 `http://IP:PORT` +- `hitRatios=map[...]` — Go map 格式,详见下方解析规则 +- `loads=map[...]` — 各 worker 的负载 + +### A2. process_tokens fallback(未走 cache-aware 路径) + +``` +cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads=map[...] 
+cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: +cache-aware prefill: final strategy: process_tokens, reason: strategy not initialized +``` + +--- + +## 类别 B:Stats 行 + +``` +[INFO] 2026/03/30 20:14:38 logger.go:79: [stats] total_running=14, workers: [http://10.52.96.143:9867: running=0, http://10.52.95.26:9867: running=1, ...], cache_hit_rate=0.00% (hits=0/total=7) +``` + +**提取字段**: +- `total_running=` — 所有 worker 的运行请求总数 +- `workers: [...]` — 各 worker 的 `running=N` +- `cache_hit_rate=%` — 该窗口的命中率百分比 +- `(hits=/total=)` — 该 5s 窗口的命中次数和总次数 + +**关键**:`hits` 和 `total` 是 **per-interval** 的,代码使用 `atomic.Swap(0)` 每 5s 重置为 0。 + +--- + +## 类别 C:推理请求行 + +``` +[INFO] 2026/03/30 18:25:49 logger.go:79: [POST] /v1/chat/completions HTTP/1.1 200 2.798235ms 10.52.95.139 +``` + +格式:`[METHOD] /path HTTP/1.1 ` + +延迟单位可能是 `s`、`ms`、`µs`/`us`。 + +**注意**:仅 `POST /v1/chat/completions` 和 `POST /v1/completions` 为推理请求。其余路径(`/register`、`/registered_number`、`/registered`、`/health_generate`、`/metrics`)为管理/监控请求,统计推理吞吐量时应排除。 + +--- + +## Go Map 解析规则 + +Go 的 `fmt.Sprintf("%v", map)` 输出格式:`map[key1:val1 key2:val2 ...]` + +### hitRatios 的特殊挑战 + +Worker URL 包含 `:`(如 `http://10.52.95.17:9263`),而 Go map 的 key-value 分隔符也是 `:`。 +因此 `hitRatios=map[http://10.52.95.17:9263:100]` 中: +- URL = `http://10.52.95.17:9263` +- Ratio = `100` + +### 推荐解析方法 + +**方法 1:正则匹配**(推荐) + +提取 `hitRatios=map[` 和 `]` 之间的内容,然后用正则匹配每个 entry: + +``` +正则:(http://[^\s:]+:\d+):(\d+) +``` + +示例: +``` +输入:http://10.52.95.17:9263:100 http://10.52.96.143:9867:50 +匹配1:group1=http://10.52.95.17:9263, group2=100 +匹配2:group1=http://10.52.96.143:9867, group2=50 +``` + +**方法 2:从右分割** + +对 map 内容按空格分割每个 token,然后对每个 token 找最后一个 `:` 分割: +``` +token = "http://10.52.95.17:9263:100" +lastColon = 最后一个 ":" 的位置 +url = token[:lastColon] → "http://10.52.95.17:9263" +ratio = token[lastColon+1:] → "100" +``` + +### 空 map + +`hitRatios=map[]` 表示冷启动,没有任何 worker 有匹配的前缀缓存。 + +### loads map 解析 + +同样的规则适用于 `loads=map[...]`,value 
是负载数: +``` +loads=map[http://10.52.95.146:9263:20 http://10.52.95.17:9263:20] +``` + +### workers 列表解析(stats 行) + +`workers: [http://10.52.96.143:9867: running=0, ...]` 格式不同: +- 用 `,` 分割每个 entry +- 每个 entry 格式:`http://IP:PORT: running=N` +- 注意 URL 后面跟的是 `: running=`(带空格),不是 Go map 的 `:val` + +--- + +## 时间戳解析 + +日志时间戳格式:`YYYY/MM/DD HH:MM:SS` + +提取正则:`(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})` + +用于: +- 确定日志时间跨度 +- 按时间分窗口(5s、1min 等) +- 按 quartile 分段统计趋势 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md new file mode 100644 index 00000000000..dcef9c47498 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -0,0 +1,199 @@ +# 报告输出模板 + +本文件包含 cache 命中率分析报告的终端输出模板和详细数据导出模板。 + +--- + +## 终端概览报告模板 + +``` +## Cache Hit Rate Analysis Report +**File**: | **Lines**: N | **Span**: ~ () + +### 1. Prefix Hit Ratio (KV Cache 内容复用度) + 累计平均: XX.X% (被选中 worker) + 分布: + 0-20% ██░░░░░░░░░░░░░░░░░░ X% (N=xxx) + 20-40% ███░░░░░░░░░░░░░░░░░ X% (N=xxx) + 40-60% █████░░░░░░░░░░░░░░░ X% (N=xxx) + 60-80% ████████████░░░░░░░░ X% (N=xxx) + 80-100% ████████████████████ X% (N=xxx) + 冷启动率: X.X% + 趋势: Q1=X% → Q2=X% → Q3=X% → Q4=X% ↑/↓/→ + + Prefix Hit Ratio (5s 窗口): + 100%| ····················· + 80%| ····· ··· + 60%| ····· + 40%| ····· + 20%| ······ + 0%|···· + +---+---+---+---+---+---+---+---+---+---→ time + 18:25 18:26 18:27 18:28 18:29 18:30 + +### 2. Session Hit Rate (请求级路由粘性) + 累计: XX.X% (hits=N / total=N) + 覆盖率: X.X% 的推理请求带 session_id + 趋势: Q1=X% → Q2=X% → Q3=X% → Q4=X% + + Session Hit Rate (5s 窗口): + 100%| ···················· + 80%| ·········· + 60%| ··········· + 40%| + 20%| + 0%|······· + +---+---+---+---+---+---+---+---+---+---→ time + +### 3. 
Per-Worker Cache Stats + ┌───────────────────────────┬──────────┬──────────┬─────────────────┐ + │ Prefill Worker │ Selected │ Select % │ Avg Hit(Select) │ + ├───────────────────────────┼──────────┼──────────┼─────────────────┤ + │ http://10.52.95.17:9263 │ 1,234 │ 15.2% │ 82% │ + │ http://10.52.96.143:9867 │ 890 │ 11.0% │ 74% │ + │ ... │ ... │ ... │ ... │ + └───────────────────────────┴──────────┴──────────┴─────────────────┘ + +### 4. Scheduling Strategy + cache_aware_scoring: N (X%) | fallback: N (X%) + fallback reasons: load_imbalanced=N, tokenize_failed=N, not_initialized=N + 非最优命中选择: X% (负载均衡优先于命中率的比例) + +### 5. Diagnosis + ✅/⚠/❌ <综合诊断> + +📄 详细数据见: skill_output/stat-cache-hitrate//cache_hitrate_report_.md +``` + +--- + +## 格式规则 + +### Unicode 柱状图 + +- 总宽度 20 个字符 +- `█` 表示已填充部分,`░` 表示空白部分 +- 后跟百分比和绝对数量 + +``` +计算方法: +filled = round(percentage / 100 * 20) +bar = "█" * filled + "░" * (20 - filled) +output = f"{bar} {percentage}% (N={count})" +``` + +示例: +``` +████████████░░░░░░░░ 60% (N=1200) +██████████████████░░ 90% (N=1800) +██░░░░░░░░░░░░░░░░░░ 10% (N=200) +``` + +### ASCII 折线图 + +- Y 轴:0-100% 范围,6 行(0%, 20%, 40%, 60%, 80%, 100%) +- X 轴:时间,标注关键时间点 +- 数据点用 `·` 绘制 +- 坐标轴用 `|` `+` `─` `→` + +``` +时间粒度自动调整: +- 日志跨度 <30min → 5s 原始粒度 +- 日志跨度 <3h → 1min 粒度 +- 日志跨度 >3h → 5min 粒度 +``` + +图表宽度约 60 列。数据点太多时自动聚合到更粗的粒度。 + +### 表格 + +使用 Unicode box-drawing 字符: + +``` +┌ ─ ┬ ─ ┐ 顶部 +│ │ │ 数据行 +├ ─ ┼ ─ ┤ 分隔行 +│ │ │ 数据行 +└ ─ ┴ ─ ┘ 底部 +``` + +### 趋势箭头 + +- `↑` — 上升趋势(Q4 > Q1 + 10%) +- `↓` — 下降趋势(Q4 < Q1 - 10%) +- `→` — 稳定(变化 < 10%) + +--- + +## 详细数据导出模板 + +主报告:`skill_output/stat-cache-hitrate//cache_hitrate_report_.md` +每窗口明细:`skill_output/stat-cache-hitrate//details/per_window_data.md` + +### 主报告 + +```markdown +# Cache Hit Rate Detailed Report + +**Generated**: +**Source**: + +## 1. 
Per-Worker 完整统计 + +| Worker | Selected | Select % | Avg Hit (Selected) | Avg Hit (All) | Max Hit | +|--------|----------|----------|--------------------|----- ---------|---------| +| http://10.52.95.17:9263 | 1,234 | 15.2% | 82% | 68% | 100% | +| ... | ... | ... | ... | ... | ... | + +## 2. Fallback 明细 + +### 3.1 load imbalanced (N 次) +| Time | Loads | +|------|-------| +| 20:15:03 | map[...] | + +### 3.2 tokenize failed (N 次) +| Time | Error | +|------|-------| +| ... | ... | + +## 4. 非最优命中选择明细 + +| Time | Selected | Selected HR | Best Worker | Best HR | Load Diff | +|------|----------|-------------|-------------|---------|-----------| +| 20:15:10 | w1:9263 | 60% | w2:9867 | 85% | w1=5, w2=18 | +| ... | ... | ... | ... | ... | ... | +``` + +--- + +## --tail 快速查看模板 + +`--tail` 模式下只输出核心指标: + +``` +## Cache Hit Rate (Recent) +**File**: | **tail lines** | **Span**: ~ + + Prefix Hit Ratio: XX.X% (avg) | Cold start: X.X% + Session Hit Rate: XX.X% (hits=N/total=N) | Coverage: X.X% + Strategy: scoring N (X%) | fallback N (X%) + + Recent trend (1min buckets): + 100%| ····· + 80%| ····· + 60%|····· + +---+---+---+---+---→ + -5m -4m -3m -2m -1m + +💡 持续跟踪: /loop 30s /analyze-cache-hitrate --tail +``` + +## --watch 持续监控模板 + +`--watch` 模式先输出完整报告(同终端概览报告模板),末尾额外提示: + +``` +💡 全量分析完成。持续跟踪后续变化: + /loop 30s /analyze-cache-hitrate --tail +``` diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py new file mode 100644 index 00000000000..cc5534a757d --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +Chart — 终端可视化渲染工具 + +提供 sparkline 折线图、Unicode 柱状图、Markdown 表格的渲染函数。 +所有函数返回字符串(不直接打印),方便组装到报告中。 + +Python 3 stdlib only,零依赖。 +""" + + +# ════════════════════════════════════════════════════════════════ +# Sparkline 折线图 +# ════════════════════════════════════════════════════════════════ + 
# 8-level block characters used for sparkline rendering: index 0 = blank,
# index 8 = full block. Values are mapped linearly onto these 9 glyphs.
BLOCK_CHARS = " ▁▂▃▄▅▆▇█"


def render_sparkline(
    records, value_field="value", bucket_field="bucket", title=None, y_label=None, y_range=None, width=60
):
    """Render an 8-level Unicode sparkline chart.

    Args:
        records: list of dicts, each containing bucket_field and value_field
        value_field: name of the numeric field
        bucket_field: name of the time-bucket field
        title: chart title
        y_label: Y-axis label (e.g. '%')
        y_range: (min, max) tuple for the Y axis; None = auto-scale from data
        width: chart width in characters

    Returns:
        str: the rendered chart text (multi-line)
    """
    if not records:
        return " (no data)"

    # Collect all non-None values to establish the scale and min/max header.
    all_values = []
    for r in records:
        v = r.get(value_field)
        if v is not None:
            all_values.append(float(v))

    if not all_values:
        return " (no numeric data)"

    # Y-axis range: explicit y_range wins; otherwise auto-scale, widening a
    # degenerate (flat) range so division below is well-defined.
    if y_range:
        y_min, y_max = y_range
    else:
        y_min = min(all_values)
        y_max = max(all_values)
        if y_max == y_min:
            y_min = 0 if y_max > 0 else y_max - 1
            y_max = max(y_max, 1)

    y_span = y_max - y_min if y_max != y_min else 1

    # Downsample: when there are more records than columns, average each
    # chunk into one synthetic record carrying the chunk's first bucket label.
    n = len(records)
    if n > width:
        step = n / width
        sampled = []
        for i in range(width):
            start_idx = int(i * step)
            end_idx = int((i + 1) * step)
            chunk = records[start_idx:end_idx]
            vals = [float(r.get(value_field, 0)) for r in chunk if r.get(value_field) is not None]
            avg_record = {
                bucket_field: chunk[0].get(bucket_field, ""),
                value_field: sum(vals) / len(vals) if vals else 0,
            }
            sampled.append(avg_record)
        records = sampled

    lines = []

    # Header line: title plus min/max of the *original* (pre-downsampling) data.
    def fmt_val(v):
        # Adaptive precision: large values lose decimals, small values keep two.
        if abs(v) >= 1000:
            return f"{v:.0f}"
        elif abs(v) >= 10:
            return f"{v:.1f}"
        return f"{v:.2f}"

    header_parts = []
    if title:
        header_parts.append(title)
    header_parts.append(f"min={fmt_val(min(all_values))}")
    header_parts.append(f"max={fmt_val(max(all_values))}")
    if y_label:
        header_parts.append(f"({y_label})")
    lines.append(" " + " ".join(header_parts))

    # Sparkline characters: normalize each value into [0, 1] then map onto the
    # 9-glyph scale (0..8). None values render as a gap.
    spark_chars = []
    for r in records:
        v = r.get(value_field)
        if v is None:
            spark_chars.append(" ")
            continue
        v = float(v)
        normalized = (v - y_min) / y_span
        level = max(0, min(8, round(normalized * 8)))
        spark_chars.append(BLOCK_CHARS[level])
    lines.append(" " + "".join(spark_chars))

    # X-axis labels: place a handful of shortened bucket labels under the
    # sparkline, skipping any label that would overlap the previous one.
    data_width = len(records)
    if data_width > 0:

        def short_bucket(r):
            # Keep only the time-of-day part and truncate to 5 chars (HH:MM).
            b = str(r.get(bucket_field, ""))
            if " " in b:
                b = b.split(" ")[-1]
            return b[:5] if len(b) >= 5 else b

        lbl_width = 6
        max_labels = max(1, data_width // lbl_width)
        n_records = len(records)

        if n_records <= 2:
            indices = list(range(n_records))
        elif n_records <= max_labels:
            indices = [0, n_records - 1]
        else:
            # Up to 5 labels, spread evenly across the record range.
            n_labels = min(5, max(2, max_labels))
            indices = [int(i * (n_records - 1) / (n_labels - 1)) for i in range(n_labels)]

        label_line = [" "] * (data_width + lbl_width + 2)
        last_end = -1
        for idx in indices:
            lbl = short_bucket(records[idx])
            pos = idx
            if pos < last_end:
                continue
            for ci, c in enumerate(lbl):
                p = pos + ci
                if p < len(label_line):
                    label_line[p] = c
            last_end = pos + len(lbl) + 1
        lines.append(" " + "".join(label_line).rstrip())

    return "\n".join(lines)


# ════════════════════════════════════════════════════════════════
# Unicode bar chart
# ════════════════════════════════════════════════════════════════


def render_bar(data, bar_width=20, show_count=False):
    """Render a Unicode horizontal bar chart.

    Args:
        data: list of dicts, each with label, value (percentage 0-100) and
            optionally count
        bar_width: bar width in characters
        show_count: whether to append the absolute count after the bar

    Returns:
        str: the rendered chart text (one line per entry)
    """
    if not data:
        return " (no data)"

    max_label_len = max(len(str(d.get("label", ""))) for d in data)
    max_label_len = max(max_label_len, 4)

    lines = []
    for d in data:
        label = str(d.get("label", ""))
        value = float(d.get("value", 0))
        count = d.get("count")

        # A non-zero value always shows at least one filled cell so small
        # percentages remain visible; clamp to the bar width.
        filled = round(value / 100 * bar_width) if value > 0 else 0
        filled = max(1, filled) if value > 0 else 0
        filled = min(bar_width, filled)
        empty = bar_width - filled
        bar = "█" * filled + "░" * empty

        line = f" {label:<{max_label_len}} {bar} {value:>5.1f}%"
        if show_count and count is not None:
            line += f" (N={count})"
        lines.append(line)

    return "\n".join(lines)


# ════════════════════════════════════════════════════════════════
# Markdown table
# ════════════════════════════════════════════════════════════════


def render_table(data, columns=None, right_align=None):
    """Render a Markdown table.

    Args:
        data: list of dicts (rows)
        columns: column names; None = use the keys of the first record
        right_align: set of column names to right-align

    Returns:
        str: the rendered table text
    """
    if not data:
        return " (no data)"

    if columns is None:
        columns = list(data[0].keys())
    if right_align is None:
        right_align = set()

    # Column widths: wide enough for the header and every cell value.
    col_widths = {}
    for col in columns:
        col_widths[col] = len(col)
        for row in data:
            val = str(row.get(col, ""))
            col_widths[col] = max(col_widths[col], len(val))

    # Header row and separator row.
    header_parts = []
    sep_parts = []
    for col in columns:
        w = col_widths[col]
        if col in right_align:
            header_parts.append(f" {col:>{w}} ")
        else:
            header_parts.append(f" {col:<{w}} ")
        sep_parts.append("-" * (w + 2))

    lines = []
    lines.append("|" + "|".join(header_parts) + "|")
    lines.append("|" + "|".join(sep_parts) + "|")

    # Data rows.
    for row in data:
        row_parts = []
        for col in columns:
            val = str(row.get(col, ""))
            w = col_widths[col]
            if col in right_align:
                row_parts.append(f" {val:>{w}} ")
            else:
                row_parts.append(f" {val:<{w}} ")
        lines.append("|" + "|".join(row_parts) + "|")

    return "\n".join(lines)
作为 CLI 工具:grep 'pattern' logfile | python3 log_parser.py parse-cache-strategy + +Python 3 stdlib only,零依赖。 +""" + +import argparse +import json +import re +import sys +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# 通用解析原语 +# ════════════════════════════════════════════════════════════════ + + +def parse_go_map(s): + """解析 Go fmt.Sprintf('%v', map) 输出:map[key1:val1 key2:val2 ...] + + 处理 URL 中冒号与 Go map key-value 分隔符的冲突(从最后一个冒号分割)。 + 空 map 'map[]' 返回空 dict。 + """ + inner_match = re.search(r"map\[(.*?)\]", s) + if not inner_match: + return {} + inner = inner_match.group(1).strip() + if not inner: + return {} + result = {} + for token in inner.split(): + idx = token.rfind(":") + if idx > 0: + key = token[:idx] + val_str = token[idx + 1 :] + try: + result[key] = int(val_str) if "." not in val_str else float(val_str) + except ValueError: + result[key] = val_str + return result + + +# 时间戳:YYYY/MM/DD HH:MM:SS +TS_RE = re.compile(r"(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})") + +# ts_ms:2025-01-15 18:25:33.123 +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + + +def extract_ts(line): + """从日志行提取 YYYY/MM/DD HH:MM:SS 时间戳。""" + m = TS_RE.search(line) + return m.group(1) if m else None + + +def parse_ts(ts_str): + """将 YYYY/MM/DD HH:MM:SS 时间戳解析为 datetime。""" + return datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + + +# ════════════════════════════════════════════════════════════════ +# 时间范围过滤 +# ════════════════════════════════════════════════════════════════ + +import os +import subprocess +import tempfile + +_FULL_DT_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})\s+(\d{1,2}):(\d{2})(?::(\d{2}))?$") +_DATE_ONLY_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$") +_SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$") +_TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$") + + +def _get_log_boundary_ts(log_file, which="first"): + 
"""从日志文件首行或末行提取时间戳。""" + cmd = "head" if which == "first" else "tail" + try: + r = subprocess.run([cmd, "-1", log_file], capture_output=True, text=True, timeout=5) + return extract_ts(r.stdout) if r.returncode == 0 else None + except (subprocess.TimeoutExpired, FileNotFoundError): + return None + + +def complete_time_arg(time_str, log_file, is_end=False): + """解析灵活时间输入,补全缺失部分。 + + 支持格式: + 'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 'YYYY/MM/DD', + 'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM' + + 补全规则: + - 缺年份:从日志首行取 + - 缺日期:从日志末行取 + - 缺时间:start→00:00:00, end→23:59:59 + + Returns: 'YYYY/MM/DD HH:MM:SS' 格式字符串 + """ + if time_str is None: + return None + time_str = time_str.strip() + + # Case 1: 完整日期时间 + m = _FULL_DT_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + h, mi = m.group(4).zfill(2), m.group(5) + s = (m.group(6) or "00").zfill(2) + return f"{y}/{mo}/{d} {h}:{mi}:{s}" + + # Case 2: 仅日期 YYYY/MM/DD + m = _DATE_ONLY_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + t = "23:59:59" if is_end else "00:00:00" + return f"{y}/{mo}/{d} {t}" + + # Case 3: 短日期 MM/DD 或 MM/DD HH:MM[:SS] + m = _SHORT_DATE_RE.match(time_str) + if m: + mo, d = m.group(1).zfill(2), m.group(2).zfill(2) + ts = _get_log_boundary_ts(log_file, "first") + year = ts[:4] if ts else "2026" + if m.group(3): # 有时间部分 + h, mi = m.group(3).zfill(2), m.group(4) + s = (m.group(5) or "00").zfill(2) + return f"{year}/{mo}/{d} {h}:{mi}:{s}" + t = "23:59:59" if is_end else "00:00:00" + return f"{year}/{mo}/{d} {t}" + + # Case 4: 仅时间 HH:MM[:SS] + m = _TIME_ONLY_RE.match(time_str) + if m: + h, mi = m.group(1).zfill(2), m.group(2) + s = (m.group(3) or "00").zfill(2) + ts = _get_log_boundary_ts(log_file, "last") + date_part = ts[:10] if ts else "2026/01/01" + return f"{date_part} {h}:{mi}:{s}" + + # Fallback: 原样返回 + return time_str + + +def filter_file_by_time_range(log_file, start_str=None, end_str=None): + """用 awk 
按时间范围预过滤日志文件。 + + 时间戳 YYYY/MM/DD HH:MM:SS 天然字典序可比,直接用 awk 字符串比较。 + 无时间戳的行(如 panic 堆栈续行)保留。 + + Args: + log_file: 原日志文件路径 + start_str: 起始时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + end_str: 结束时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if not start_str and not end_str: + return (log_file, False) + + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, prefix="router_filtered_") + tmp.close() + + awk_script = r"""{ + ts = "" + if (match($0, /[0-9]{4}\/[0-9]{2}\/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/)) { + ts = substr($0, RSTART, RLENGTH) + } + if (ts == "") { print; next } + if ((start == "" || ts >= start) && (end == "" || ts <= end)) print + }""" + + cmd = ["awk", "-v", f'start={start_str or ""}', "-v", f'end={end_str or ""}', awk_script, log_file] + + try: + with open(tmp.name, "w") as outf: + result = subprocess.run(cmd, stdout=outf, stderr=subprocess.PIPE, text=True, timeout=120) + if result.returncode != 0: + os.unlink(tmp.name) + return (log_file, False) + except (subprocess.TimeoutExpired, OSError): + if os.path.exists(tmp.name): + os.unlink(tmp.name) + return (log_file, False) + + return (tmp.name, True) + + +# Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] +TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") + + +def extract_tags(line): + """从日志行提取 context tag。""" + return {m.group(1): m.group(2) for m in TAG_RE.finditer(line)} + + +# ════════════════════════════════════════════════════════════════ +# Cache-Aware 策略行解析(类别 A) +# ════════════════════════════════════════════════════════════════ + +STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") +SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") + + +def parse_cache_strategy_line(line): + """解析 cache-aware prefill 策略行。 + + 输入示例: + [INFO] 2026/03/30 20:16:57 logger.go:79: ... 
cache-aware prefill: final strategy: + cache_aware_scoring, selected=http://10.52.95.17:9263, loads=map[...], hitRatios=map[...] + + 返回 dict 或 None(如果不是策略行)。 + """ + sm = STRATEGY_RE.search(line) + if not sm: + return None + + ts = extract_ts(line) + strategy = sm.group(1) + record = {"ts": ts or "", "strategy": strategy} + + # selected worker URL + sel_m = SELECTED_RE.search(line) + if sel_m: + record["selected"] = sel_m.group(1) + + # reason(仅 process_tokens fallback) + reason_m = REASON_RE.search(line) + if reason_m and strategy == "process_tokens": + record["reason"] = reason_m.group(1).strip() + + # hitRatios map + hr_match = re.search(r"hitRatios=(map\[.*?\])", line) + if hr_match: + hit_ratios = parse_go_map(hr_match.group(1)) + record["hitRatios"] = hit_ratios + if "selected" in record: + record["selected_hitRatio"] = hit_ratios.get(record["selected"], 0) + else: + record["hitRatios"] = {} + if "selected" in record: + record["selected_hitRatio"] = 0 + + # loads map + loads_match = re.search(r"loads=(map\[.*?\])", line) + if loads_match: + record["loads"] = parse_go_map(loads_match.group(1)) + + # ts_ms(精确到毫秒的调度时间戳) + ts_ms_m = TS_MS_RE.search(line) + if ts_ms_m: + record["ts_ms"] = ts_ms_m.group(1) + + # context tags + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# ════════════════════════════════════════════════════════════════ +# Stats 行解析(类别 B) +# ════════════════════════════════════════════════════════════════ + +TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") +WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") + + +def parse_stats_line(line): + """解析 [stats] 统计行。 + + 输入示例: + [INFO] 2026/03/30 20:14:38 logger.go:79: [stats] total_running=14, + workers: [...], cache_hit_rate=0.00% (hits=0/total=7) + + 注意:hits 和 total 是 per-interval 的(每 5s 重置),累计值必须 sum 所有行。 + + 返回 dict 或 None(如果不是 stats 行)。 + """ + if 
"[stats]" not in line: + return None + + ts = extract_ts(line) + record = {"ts": ts or ""} + + # total_running + tr_m = TOTAL_RUNNING_RE.search(line) + if tr_m: + record["total_running"] = int(tr_m.group(1)) + + # per-worker running + workers = {} + for wm in WORKER_RUNNING_RE.finditer(line): + workers[wm.group(1)] = int(wm.group(2)) + record["workers"] = workers + + # cache_hit_rate + hits/total + chr_m = CACHE_HR_RE.search(line) + if chr_m: + record["cache_hit_rate"] = float(chr_m.group(1)) + record["hits"] = int(chr_m.group(2)) + record["total"] = int(chr_m.group(3)) + + return record + + +# ════════════════════════════════════════════════════════════════ +# CLI 入口 +# ════════════════════════════════════════════════════════════════ + + +def _cli_parse_stream(parse_fn): + """通用 CLI 流式解析:从 stdin 读入日志行,输出 JSON Lines 到 stdout。""" + parsed = 0 + skipped = 0 + for line in sys.stdin: + line = line.rstrip("\n") + record = parse_fn(line) + if record: + print(json.dumps(record, ensure_ascii=False)) + parsed += 1 + else: + skipped += 1 + print(f"Parsed {parsed} lines, skipped {skipped}", file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser( + description="FastDeploy Go Router Log Parser", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + sub = parser.add_subparsers(dest="command") + + sub.add_parser("parse-cache-strategy", help="解析 cache-aware 策略行 → JSON Lines") + sub.add_parser("parse-stats", help="解析 [stats] 统计行 → JSON Lines") + + args = parser.parse_args() + + if args.command == "parse-cache-strategy": + _cli_parse_stream(parse_cache_strategy_line) + elif args.command == "parse-stats": + _cli_parse_stream(parse_stats_line) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py new file mode 100644 index 
#!/usr/bin/env python3
"""
stat_cache_hitrate — FastDeploy Go Router cache hit-rate statistics tool.

Computes the three-layer cache hit-rate metrics:
    1. Prefix Hit Ratio — KV-cache content reuse of the selected worker
    2. Session Hit Rate — request-level routing stickiness
    3. Per-Worker Stats — per-prefill-worker cache utilization ranking

Usage:
    python3 stat_cache_hitrate.py <logfile> [--tail N|Nm] [--watch] [--output DIR]
"""

import argparse
import json
import os
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timedelta

# Sibling modules live in the same scripts/ directory.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from chart import render_bar, render_sparkline, render_table
from log_parser import (
    complete_time_arg,
    extract_ts,
    filter_file_by_time_range,
    parse_cache_strategy_line,
    parse_stats_line,
    parse_ts,
)
from stats import compute_statistics, count_by, time_bucket

# ════════════════════════════════════════════════════════════════
# Phase 1: log reading
# ════════════════════════════════════════════════════════════════


def _is_time_tail(tail):
    """True when tail is a time-based spec like '30m' (vs. a line count)."""
    return isinstance(tail, str) and tail.endswith("m")


def count_lines(filepath):
    """Count file lines quickly via wc -l; 0 on failure."""
    result = subprocess.run(["wc", "-l", filepath], capture_output=True, text=True)
    if result.returncode == 0:
        return int(result.stdout.strip().split()[0])
    return 0


def read_lines(filepath, tail=None):
    """Read the log file, honoring both line-count and time-based tail modes."""
    if tail:
        if _is_time_tail(tail):
            # Time-based tail: read everything, then keep the last N minutes.
            minutes = int(tail[:-1])
            all_lines = _read_file_lines(filepath)
            return _filter_by_time(all_lines, minutes)
        else:
            # Line-count tail.
            n = int(tail)
            result = subprocess.run(["tail", "-n", str(n), filepath], capture_output=True, text=True)
            return result.stdout.splitlines() if result.returncode == 0 else []
    return _read_file_lines(filepath)


def _read_file_lines(filepath):
    """Read all lines; decoding errors are replaced rather than raised."""
    with open(filepath, "r", errors="replace") as f:
        return f.readlines()


def _filter_by_time(lines, minutes):
    """Keep only the log lines from the last N minutes.

    The reference point is the timestamp of the last timestamped line; lines
    without a timestamp (e.g. stack-trace continuations) are kept.
    """
    last_ts = None
    for line in reversed(lines):
        ts = extract_ts(line)
        if ts:
            last_ts = parse_ts(ts)
            break
    if not last_ts:
        return lines

    cutoff = last_ts - timedelta(minutes=minutes)
    result = []
    for line in lines:
        ts = extract_ts(line)
        if ts:
            try:
                if parse_ts(ts) >= cutoff:
                    result.append(line)
            except ValueError:
                result.append(line)
        else:
            result.append(line)
    return result


# ════════════════════════════════════════════════════════════════
# Phase 2: log extraction and parsing
# ════════════════════════════════════════════════════════════════

STRATEGY_PATTERN = "cache-aware prefill: final strategy:"
STATS_PATTERN = "[stats]"
INFERENCE_PATTERNS = ["] [POST] /v1/chat/completions ", "] [POST] /v1/completions "]
# grep -E equivalent of INFERENCE_PATTERNS, shared by both counting paths.
INFERENCE_GREP_PATTERN = r"\] \[POST\] /v1/chat/completions |\] \[POST\] /v1/completions "


def _shell_quote(s):
    """Single-quote s for the shell, safely handling spaces, parens and quotes."""
    return "'" + s.replace("'", "'\\''") + "'"


def grep_and_parse(filepath, grep_pattern, parse_cmd, tail=None):
    """Large-file mode: grep filter piped into the log_parser.py CLI.

    Only a numeric tail is applied here; time-based tails are handled by the
    caller (extract_data) via record-level post-filtering.
    """
    parser_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log_parser.py")

    if tail and not _is_time_tail(tail):
        grep_cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -F {_shell_quote(grep_pattern)} | python3 {_shell_quote(parser_path)} {parse_cmd}"
    else:
        grep_cmd = f"grep -F {_shell_quote(grep_pattern)} {_shell_quote(filepath)} | python3 {_shell_quote(parser_path)} {parse_cmd}"

    result = subprocess.run(grep_cmd, shell=True, capture_output=True, text=True)
    records = []
    for line in result.stdout.strip().splitlines():
        if line:
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                pass
    return records


def grep_count(filepath, grep_pattern, tail=None):
    """Large-file mode: count matching lines with grep -c.

    Only a numeric tail is applied here (see grep_and_parse).
    """
    if tail and not _is_time_tail(tail):
        cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -cE {_shell_quote(grep_pattern)}"
    else:
        cmd = f"grep -cE {_shell_quote(grep_pattern)} {_shell_quote(filepath)}"

    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    try:
        return int(result.stdout.strip())
    except ValueError:
        return 0


def _grep_lines(filepath, grep_pattern):
    """Large-file mode: return the matching lines themselves (grep -E)."""
    cmd = f"grep -E {_shell_quote(grep_pattern)} {_shell_quote(filepath)}"
    result = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    return result.stdout.splitlines()


def _file_last_ts(filepath):
    """Last parseable timestamp in the file, scanning the final lines via tail."""
    result = subprocess.run(["tail", "-n", "200", filepath], capture_output=True, text=True)
    if result.returncode != 0:
        return None
    for line in reversed(result.stdout.splitlines()):
        ts = extract_ts(line)
        if ts:
            try:
                return parse_ts(ts)
            except ValueError:
                continue
    return None


def _filter_records_since(records, cutoff):
    """Keep records whose 'ts' is >= cutoff; records without a ts are kept."""
    kept = []
    for rec in records:
        ts = rec.get("ts")
        if not ts:
            kept.append(rec)
            continue
        try:
            if parse_ts(ts) >= cutoff:
                kept.append(rec)
        except ValueError:
            kept.append(rec)
    return kept


def _line_in_window(line, cutoff):
    """True when the line's timestamp is >= cutoff (untimestamped lines pass)."""
    ts = extract_ts(line)
    if not ts:
        return True
    try:
        return parse_ts(ts) >= cutoff
    except ValueError:
        return True


def extract_data(filepath, tail=None):
    """Extract and parse log data, auto-selecting a strategy by file size.

    Small files (<5000 lines) are processed in memory; large files go through
    grep + the log_parser CLI pipeline.

    Bug fix: a time-based tail ('30m') used to be silently ignored on the
    large-file path, so the whole file was analyzed instead of the requested
    window. It is now honored by post-filtering parsed records (and inference
    lines) against a cutoff derived from the file's last timestamp.

    Returns:
        tuple: (strategy_records, stats_records, inference_count, line_count)
    """
    total = count_lines(filepath)

    if total < 5000:
        # Small file: process in memory (read_lines already applies any tail).
        lines = read_lines(filepath, tail)
        strategy_recs = [r for l in lines if (r := parse_cache_strategy_line(l)) is not None]
        stats_recs = [r for l in lines if (r := parse_stats_line(l)) is not None]
        inference_count = sum(1 for l in lines if any(p in l for p in INFERENCE_PATTERNS))
        return strategy_recs, stats_recs, inference_count, len(lines)

    if _is_time_tail(tail):
        # Large file + time-based tail: grep the full file, then post-filter
        # by timestamp against the last N minutes of the log.
        strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy")
        stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats")
        inference_lines = _grep_lines(filepath, INFERENCE_GREP_PATTERN)
        last_ts = _file_last_ts(filepath)
        if last_ts is not None:
            cutoff = last_ts - timedelta(minutes=int(tail[:-1]))
            strategy_recs = _filter_records_since(strategy_recs, cutoff)
            stats_recs = _filter_records_since(stats_recs, cutoff)
            inference_lines = [l for l in inference_lines if _line_in_window(l, cutoff)]
        return strategy_recs, stats_recs, len(inference_lines), total

    # Large file, numeric tail or no tail: grep + subprocess pipeline.
    strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy", tail)
    stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats", tail)
    inference_count = grep_count(filepath, INFERENCE_GREP_PATTERN, tail)
    line_count = int(tail) if tail else total
    return strategy_recs, stats_recs, inference_count, line_count


# ════════════════════════════════════════════════════════════════
# Phase 3: three-layer metric computation
# ════════════════════════════════════════════════════════════════


def compute_prefix_hitrate(strategies):
    """Layer 1: Prefix Hit Ratio (KV-cache reuse of the selected worker).

    Only cache_aware_scoring records contribute; a record whose hitRatios map
    is empty counts as a cold start.
    """
    scoring_recs = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"]
    if not scoring_recs:
        return {"mean": 0, "stats": None, "distribution": [], "cold_start_rate": 0, "trend": [], "count": 0}

    hit_ratios = [r.get("selected_hitRatio", 0) for r in scoring_recs]
    cold_starts = sum(1 for r in scoring_recs if not r.get("hitRatios"))

    stats = compute_statistics(hit_ratios, distribution_spec="0-20,20-40,40-60,60-80,80-100")
    trend = time_bucket(scoring_recs, "auto", [("selected_hitRatio", "mean")])

    return {
        "mean": stats["mean"],
        "stats": stats,
        "distribution": stats.get("distribution", []),
        "cold_start_rate": round(cold_starts / len(scoring_recs) * 100, 1) if scoring_recs else 0,
        "trend": trend,
        "count": len(scoring_recs),
    }


def compute_session_hitrate(stats_recs, inference_count):
    """Layer 2: Session Hit Rate (request-level routing stickiness).

    [stats] counters are per 5s interval, so cumulative hits/total are the
    sums over all stats records. Coverage is the share of inference requests
    that carried a session_id.
    """
    total_hits = sum(r.get("hits", 0) for r in stats_recs)
    total_total = sum(r.get("total", 0) for r in stats_recs)

    session_hr = round(total_hits / total_total * 100, 1) if total_total else 0
    coverage = round(total_total / inference_count * 100, 1) if inference_count else 0

    # Trend: hits/total per time bucket.
    trend = time_bucket(stats_recs, "auto", [("hits", "sum"), ("total", "sum")])
    for t in trend:
        h = t.get("hits_sum", 0)
        tot = t.get("total_sum", 0)
        t["value"] = round(h / tot * 100, 1) if tot else 0

    return {
        "rate": session_hr,
        "hits": total_hits,
        "total": total_total,
        "coverage": coverage,
        "inference_count": inference_count,
        "trend": trend,
    }


def compute_per_worker_stats(strategies):
    """Layer 3: Per-Worker Cache Stats (selection counts and average hit ratio).

    Returns rows sorted by selection count, descending.
    """
    scoring_recs = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"]
    if not scoring_recs:
        return []

    worker_data = defaultdict(lambda: {"selected_count": 0, "hit_ratios": []})
    total_scoring = len(scoring_recs)

    for r in scoring_recs:
        selected = r.get("selected", "")
        if selected:
            worker_data[selected]["selected_count"] += 1
            worker_data[selected]["hit_ratios"].append(r.get("selected_hitRatio", 0))

    result = []
    for worker, data in worker_data.items():
        avg_hr = round(sum(data["hit_ratios"]) / len(data["hit_ratios"]), 1) if data["hit_ratios"] else 0
        result.append(
            {
                "Worker": worker.replace("http://", ""),
                "Selected": data["selected_count"],
                "Select%": f"{round(data['selected_count'] / total_scoring * 100, 1)}%",
                "AvgHitRatio": f"{avg_hr}%",
            }
        )

    result.sort(key=lambda x: x["Selected"], reverse=True)
    return result


def compute_scheduling_stats(strategies):
    """Scheduling-strategy overview: scoring vs. fallback, fallback reasons,
    and the share of suboptimal selections (load balancing winning over the
    best hit ratio)."""
    if not strategies:
        return {"scoring_count": 0, "fallback_count": 0, "scoring_pct": 0, "fallback_reasons": [], "suboptimal_pct": 0}

    scoring = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"]
    fallback = [r for r in strategies if r.get("strategy") == "process_tokens"]

    # Fallback reason breakdown.
    fallback_reasons = count_by(fallback, "reason") if fallback else []

    # Suboptimal-hit selections: selected worker's ratio below the map maximum.
    suboptimal = 0
    for r in scoring:
        hit_ratios = r.get("hitRatios", {})
        if not hit_ratios:
            continue
        selected_hr = r.get("selected_hitRatio", 0)
        max_hr = max(hit_ratios.values()) if hit_ratios else 0
        if selected_hr < max_hr:
            suboptimal += 1

    total = len(strategies)
    return {
        "scoring_count": len(scoring),
        "fallback_count": len(fallback),
        "scoring_pct": round(len(scoring) / total * 100, 1) if total else 0,
        "fallback_reasons": fallback_reasons,
        "suboptimal_count": suboptimal,
        "suboptimal_pct": round(suboptimal / len(scoring) * 100, 1) if scoring else 0,
    }


def cross_diagnose(prefix_hr, session_hr):
    """Cross-diagnosis matrix over the two hit rates (threshold: 60%)."""
    p_high = prefix_hr["mean"] >= 60
    s_high = session_hr["rate"] >= 60

    if s_high and p_high:
        return {
            "icon": "\u2705",
            "summary": "cache-aware 策略运行良好",
            "detail": "Session 粘性好,KV cache 实际复用度高",
        }
    elif s_high and not p_high:
        return {
            "icon": "\u26a0\ufe0f",
            "summary": "Session 粘性好但 Prefix HR 低",
            "detail": "prompt 内容变化大,同 worker 的 KV cache 实际复用低",
        }
    elif not s_high and p_high:
        return {
            "icon": "\u26a0\ufe0f",
            "summary": "换 worker 频繁但 Prefix HR 尚可",
            "detail": "负载均衡分散了请求,但新 worker 也有类似前缀缓存",
        }
    else:
        return {
            "icon": "\u274c",
            "summary": "命中率全面偏低",
            "detail": "负载均衡强制分散或缓存未预热,建议检查 worker 数量和 session 分配策略",
        }


# ════════════════════════════════════════════════════════════════
# Phase 4: report formatting
# ════════════════════════════════════════════════════════════════


def _quartile_trend(trend, value_field):
    """Split trend data into 4 quartiles and average each segment."""
    if not trend:
        return
"" + n = len(trend) + if n < 4: + values = [t.get(value_field, 0) for t in trend] + avg = round(sum(values) / len(values), 1) if values else 0 + return f"{avg}%" + + q_size = n // 4 + quartiles = [] + for i in range(4): + start = i * q_size + end = start + q_size if i < 3 else n + vals = [t.get(value_field, 0) for t in trend[start:end]] + quartiles.append(round(sum(vals) / len(vals), 1) if vals else 0) + + arrow = ( + "\u2191" if quartiles[3] > quartiles[0] + 10 else "\u2193" if quartiles[3] < quartiles[0] - 10 else "\u2192" + ) + return f"Q1={quartiles[0]}% \u2192 Q2={quartiles[1]}% \u2192 Q3={quartiles[2]}% \u2192 Q4={quartiles[3]}% {arrow}" + + +def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None): + """格式化完整终端报告。""" + parts = [] + + # 标题 + span_str = time_span or "" + parts.append("## Cache Hit Rate Report") + parts.append(f"**File**: {filepath} | **Lines**: {line_count:,}") + if span_str: + parts.append(f"**Span**: {span_str}") + parts.append("") + + # 1. Prefix Hit Ratio + parts.append("### 1. 
Prefix Hit Ratio (KV Cache 内容复用度)") + if prefix_hr["stats"]: + _ = prefix_hr["stats"] + parts.append(f' 累计平均: {prefix_hr["mean"]}% (被选中 worker, N={prefix_hr["count"]})') + parts.append(" 分布:") + + dist_data = [ + {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] + ] + parts.append(render_bar(dist_data, show_count=True)) + + parts.append(f' 冷启动率: {prefix_hr["cold_start_rate"]}%') + + trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") + if trend_str: + parts.append(f" 趋势: {trend_str}") + + # Sparkline + if prefix_hr["trend"]: + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] + parts.append("") + parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) + else: + parts.append(" (无 cache_aware_scoring 数据)") + parts.append("") + + # 2. Session Hit Rate + parts.append("### 2. Session Hit Rate (请求级路由粘性)") + parts.append(f' 累计: {session_hr["rate"]}% (hits={session_hr["hits"]} / total={session_hr["total"]})') + parts.append(f' 覆盖率: {session_hr["coverage"]}% 的推理请求带 session_id') + + trend_str = _quartile_trend(session_hr["trend"], "value") + if trend_str: + parts.append(f" 趋势: {trend_str}") + + if session_hr["trend"]: + parts.append("") + parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) + parts.append("") + + # 3. Per-Worker + parts.append("### 3. Per-Worker Cache Stats") + if per_worker: + parts.append( + render_table( + per_worker, + columns=["Worker", "Selected", "Select%", "AvgHitRatio"], + right_align={"Selected", "Select%", "AvgHitRatio"}, + ) + ) + else: + parts.append(" (无数据)") + parts.append("") + + # 4. Scheduling Strategy + parts.append("### 4. 
Scheduling Strategy") + parts.append( + f' cache_aware_scoring: {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' + f' | fallback: {scheduling["fallback_count"]}' + ) + if scheduling["fallback_reasons"]: + reasons = ", ".join(f'{r["value"]}={r["count"]}' for r in scheduling["fallback_reasons"]) + parts.append(f" fallback reasons: {reasons}") + parts.append( + f' 非最优命中选择: {scheduling["suboptimal_pct"]}%' + f' ({scheduling.get("suboptimal_count", 0)} 次, 负载均衡优先于命中率)' + ) + parts.append("") + + # 5. Diagnosis + parts.append("### 5. Diagnosis") + parts.append(f' {diagnosis["icon"]} {diagnosis["summary"]}') + parts.append(f' {diagnosis["detail"]}') + + return "\n".join(parts) + + +def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): + """格式化 --tail 精简报告。""" + parts = [] + parts.append("## Cache Hit Rate (Recent)") + parts.append(f"**File**: {filepath} | **tail {line_count} lines**") + parts.append("") + parts.append(f' Prefix Hit Ratio: {prefix_hr["mean"]}% (avg) | Cold start: {prefix_hr["cold_start_rate"]}%') + parts.append( + f' Session Hit Rate: {session_hr["rate"]}% (hits={session_hr["hits"]}/total={session_hr["total"]})' + f' | Coverage: {session_hr["coverage"]}%' + ) + parts.append( + f' Strategy: scoring {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' + f' | fallback {scheduling["fallback_count"]}' + ) + + # Sparkline + if prefix_hr["trend"]: + parts.append("") + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] + parts.append(render_sparkline(sparkline_data, title="Recent Prefix HR", y_label="%", y_range=(0, 100))) + + return "\n".join(parts) + + +def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir): + """导出详细数据 Markdown 文件。 + + 主报告包含 Per-Worker 统计和 Fallback 明细。 + 每窗口明细数据拆分到 details/per_window_data.md。 + """ + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_path 
= os.path.join(output_dir, f"cache_hitrate_report_{timestamp}.md") + + parts = [] + parts.append("# Cache Hit Rate Detailed Report") + parts.append(f'**Generated**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') + parts.append(f"**Source**: {filepath}") + parts.append("") + + # Per-Worker 完整统计 + parts.append("## 1. Per-Worker 完整统计") + parts.append("") + if per_worker: + parts.append( + render_table( + per_worker, + columns=["Worker", "Selected", "Select%", "AvgHitRatio"], + right_align={"Selected", "Select%", "AvgHitRatio"}, + ) + ) + parts.append("") + + # Fallback 明细 + if scheduling["fallback_reasons"]: + parts.append("## 2. Fallback 明细") + for reason in scheduling["fallback_reasons"]: + parts.append(f'- **{reason["value"]}**: {reason["count"]} 次 ({reason["pct"]}%)') + parts.append("") + + # 每窗口明细 → 拆分到 details/ + time_data = defaultdict(lambda: {"prefix_hr": "-", "session_hr": "-", "scoring": 0, "fallback": 0, "running": "-"}) + for r in strategies: + ts = r.get("ts", "") + if r.get("strategy") == "cache_aware_scoring": + time_data[ts]["scoring"] += 1 + else: + time_data[ts]["fallback"] += 1 + + for r in stats_recs: + ts = r.get("ts", "") + h = r.get("hits", 0) + t = r.get("total", 0) + time_data[ts]["session_hr"] = f"{round(h / t * 100, 1)}% ({h}/{t})" if t else "0%" + time_data[ts]["running"] = str(r.get("total_running", "-")) + + if time_data: + # 主报告中添加引用 + parts.append( + f"> 每窗口明细数据 ({len(time_data)} 条): [details/per_window_data.md](details/per_window_data.md)" + ) + parts.append("") + + # 写入 details 子目录 + details_dir = os.path.join(output_dir, "details") + os.makedirs(details_dir, exist_ok=True) + detail_parts = ["# 每窗口明细数据", ""] + detail_parts.append("| Time | Prefix HR | Session HR | Scoring | Fallback | Total Running |") + detail_parts.append("|------|-----------|------------|---------|----------|---------------|") + for ts in sorted(time_data.keys()): + d = time_data[ts] + short_ts = ts.split(" ")[-1] if " " in ts else ts + detail_parts.append( + 
f'| {short_ts} | {d["prefix_hr"]} | {d["session_hr"]} ' + f'| {d["scoring"]} | {d["fallback"]} | {d["running"]} |' + ) + detail_parts.append("") + + detail_path = os.path.join(details_dir, "per_window_data.md") + with open(detail_path, "w") as f: + f.write("\n".join(detail_parts)) + + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + with open(output_path, "w") as f: + f.write("\n".join(parts)) + + return output_path + + +# ════════════════════════════════════════════════════════════════ +# 时间跨度计算 +# ════════════════════════════════════════════════════════════════ + + +def compute_time_span(strategies, stats_recs): + """从数据中计算时间跨度字符串。""" + all_ts = [] + for r in strategies + stats_recs: + ts = r.get("ts", "") + if ts: + try: + all_ts.append(parse_ts(ts)) + except ValueError: + pass + if len(all_ts) < 2: + return None + t_min = min(all_ts) + t_max = max(all_ts) + duration = t_max - t_min + hours = int(duration.total_seconds() // 3600) + minutes = int((duration.total_seconds() % 3600) // 60) + start = t_min.strftime("%H:%M:%S") + end = t_max.strftime("%H:%M:%S") + if hours > 0: + return f"{start} ~ {end} ({hours}h{minutes}m)" + return f"{start} ~ {end} ({minutes}m)" + + +# ════════════════════════════════════════════════════════════════ +# CLI 入口 +# ════════════════════════════════════════════════════════════════ + + +def parse_args(): + parser = argparse.ArgumentParser( + description="FastDeploy Go Router Cache 命中率统计", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("log_file", help="日志文件路径") + parser.add_argument("--tail", nargs="?", const="2000", help="只分析尾部数据(行数如 2000,或时间如 30m)") + parser.add_argument("--watch", action="store_true", help="全量分析后提示持续监控命令") + parser.add_argument( + "--output", default=None, help="详细报告输出目录(默认:skill_output/stat-cache-hitrate//)" + ) + parser.add_argument( + "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' + ) + 
parser.add_argument("--end", default=None, help='结束时间(如 "17:00:00"、"03/31 17:00"、"2026/03/31 17:00:00")') + return parser.parse_args() + + +def main(): + args = parse_args() + + # 验证文件存在 + if not os.path.isfile(args.log_file): + print(f"Error: 文件不存在: {args.log_file}", file=sys.stderr) + sys.exit(1) + + # --tail 与 --start/--end 不能混用(两者是不同的范围选择方式) + if args.tail and (args.start or args.end): + print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr) + sys.exit(1) + + # 时间范围预过滤(--start 和 --end 可单独或同时指定) + import atexit + + log_file = args.log_file + if args.start or args.end: + start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None + end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None + filtered_path, is_temp = filter_file_by_time_range(log_file, start_ts, end_ts) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + + # Phase 2: 提取 + 解析 + strategy_recs, stats_recs, inference_count, line_count = extract_data(log_file, args.tail) + + if not strategy_recs and not stats_recs: + print( + "Warning: 未找到 cache-aware 策略行或 [stats] 行。" "请确认日志文件包含 Go Router 日志。", file=sys.stderr + ) + sys.exit(0) + + # Phase 3: 计算三层指标 + prefix_hr = compute_prefix_hitrate(strategy_recs) + session_hr = compute_session_hitrate(stats_recs, inference_count) + per_worker = compute_per_worker_stats(strategy_recs) + scheduling = compute_scheduling_stats(strategy_recs) + diagnosis = cross_diagnose(prefix_hr, session_hr) + + # Phase 4: 输出 + if args.tail: + print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) + else: + time_span = compute_time_span(strategy_recs, stats_recs) + print( + format_full_report( + args.log_file, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span + ) + ) + + # 导出详细报告 + if args.output: + 
output_dir = args.output + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) + report_path = save_detailed_report( + args.log_file, strategy_recs, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir + ) + print(f"\n\U0001f4c4 详细数据见: {report_path}") + + if args.watch: + print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py new file mode 100644 index 00000000000..a197ee7aff0 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Stats — 通用统计计算工具 + +提供百分位数、分布、时间窗口聚合、分组计数等通用统计函数。 +不含任何业务逻辑或日志格式依赖。 + +Python 3 stdlib only,零依赖。 +""" + +import math +from collections import defaultdict +from datetime import datetime, timedelta + +# ════════════════════════════════════════════════════════════════ +# 百分位数与基础统计 +# ════════════════════════════════════════════════════════════════ + + +def percentile(sorted_vals, p): + """从已排序列表计算第 p 百分位数(线性插值)。""" + if not sorted_vals: + return 0.0 + n = len(sorted_vals) + k = (p / 100.0) * (n - 1) + f = math.floor(k) + c = math.ceil(k) + if f == c: + return sorted_vals[int(k)] + return sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f) + + +def compute_statistics(values, percentiles_list=None, distribution_spec=None): + """计算一组数值的统计量。 + + Args: + values: 数值列表 + percentiles_list: 要计算的百分位数列表,默认 [50, 90, 95, 99] + distribution_spec: 分布区间规格字符串,如 '0-20,20-40,40-60,60-80,80-100' + + Returns: + dict: {count, min, max, mean, sum, stddev, p50, p90, ..., distribution} + """ 
+ if percentiles_list is None: + percentiles_list = [50, 90, 95, 99] + + if not values: + result = {"count": 0, "min": 0, "max": 0, "mean": 0, "sum": 0, "stddev": 0} + for p in percentiles_list: + result[f"p{p}"] = 0 + if distribution_spec is not None: + result["distribution"] = [] + return result + + sorted_vals = sorted(values) + n = len(sorted_vals) + total = sum(sorted_vals) + mean = total / n + variance = sum((x - mean) ** 2 for x in sorted_vals) / n + stddev = math.sqrt(variance) + + result = { + "count": n, + "min": round(sorted_vals[0], 3), + "max": round(sorted_vals[-1], 3), + "mean": round(mean, 3), + "sum": round(total, 3), + "stddev": round(stddev, 3), + } + + for p in percentiles_list: + result[f"p{p}"] = round(percentile(sorted_vals, p), 3) + + if distribution_spec is not None: + result["distribution"] = compute_distribution(sorted_vals, distribution_spec) + + return result + + +def compute_distribution(sorted_vals, spec_str): + """根据区间规格计算分布直方图。 + + spec_str 示例:'0-20,20-40,40-60,60-80,80-100' + 每个区间是左闭右开 [lo, hi)。 + """ + buckets = _parse_distribution_spec(spec_str) + n = len(sorted_vals) + result = [] + for b in buckets: + if b[0] == "lt": + count = sum(1 for v in sorted_vals if v < b[1]) + label = b[2] + elif b[0] == "gt": + count = sum(1 for v in sorted_vals if v > b[1]) + label = b[2] + elif b[0] == "range": + count = sum(1 for v in sorted_vals if b[1] <= v < b[2]) + label = b[3] + else: + continue + result.append({"range": label, "count": count, "pct": round(count / n * 100, 1) if n else 0}) + return result + + +def _parse_distribution_spec(spec_str): + """解析分布区间规格:'<100,100-500,>1000' → bucket 定义列表。""" + buckets = [] + for part in spec_str.split(","): + part = part.strip() + if part.startswith("<"): + buckets.append(("lt", float(part[1:]), part)) + elif part.startswith(">"): + buckets.append(("gt", float(part[1:]), part)) + elif "-" in part: + lo, hi = part.split("-", 1) + buckets.append(("range", float(lo), float(hi), part)) + return buckets + 
+ +# ════════════════════════════════════════════════════════════════ +# 时间窗口聚合 +# ════════════════════════════════════════════════════════════════ + + +def time_bucket(records, window="auto", agg_specs=None, ts_field="ts"): + """按时间窗口聚合记录。 + + Args: + records: dict 列表,每个 dict 必须有 ts_field 字段 + window: 窗口大小 '5s'/'1m'/'5m'/'auto' + agg_specs: 聚合规格列表 [(field, func), ...],如 [('selected_hitRatio', 'mean')] + func 支持:count, sum, mean, min, max, pNN + ts_field: 时间戳字段名 + + Returns: + list[dict]: 每个窗口一条记录 {bucket, count, field_func, ...} + """ + if agg_specs is None: + agg_specs = [("_", "count")] + + if not records: + return [] + + window_td = _parse_window(window, records, ts_field) + + # 按窗口分组 + buckets = defaultdict(list) + for r in records: + ts_str = r.get(ts_field, "") + if not ts_str: + continue + try: + dt = datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + except ValueError: + continue + bucket_dt = _align_to_bucket(dt, window_td) + bucket_key = bucket_dt.strftime("%Y/%m/%d %H:%M:%S") + buckets[bucket_key].append(r) + + # 按时间排序并聚合 + result = [] + for bucket_key in sorted(buckets.keys()): + bucket_records = buckets[bucket_key] + entry = {"bucket": bucket_key, "count": len(bucket_records)} + + for field, func in agg_specs: + if field == "_": + if func == "count": + entry["count"] = len(bucket_records) + continue + + values = [] + for r in bucket_records: + v = r.get(field) + if v is not None: + try: + values.append(float(v)) + except (ValueError, TypeError): + pass + + out_key = f"{field}_{func}" + entry[out_key] = _aggregate_values(values, func) + + result.append(entry) + + return result + + +def _parse_window(window_str, records, ts_field): + """解析窗口字符串为 timedelta。'auto' 根据数据跨度自动选择。""" + if window_str == "auto": + timestamps = [] + for r in records: + ts_str = r.get(ts_field, "") + if ts_str: + try: + timestamps.append(datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S")) + except ValueError: + pass + if len(timestamps) < 2: + return timedelta(minutes=1) + span = 
max(timestamps) - min(timestamps) + if span < timedelta(minutes=30): + return timedelta(seconds=5) + elif span < timedelta(hours=3): + return timedelta(minutes=1) + else: + return timedelta(minutes=5) + elif window_str.endswith("s"): + return timedelta(seconds=int(window_str[:-1])) + elif window_str.endswith("m"): + return timedelta(minutes=int(window_str[:-1])) + elif window_str.endswith("h"): + return timedelta(hours=int(window_str[:-1])) + return timedelta(minutes=1) + + +def _align_to_bucket(dt, window_td): + """将 datetime 对齐到窗口边界。""" + secs = max(1, int(window_td.total_seconds())) + epoch = datetime(dt.year, dt.month, dt.day) + offset = int((dt - epoch).total_seconds()) + aligned = (offset // secs) * secs + return epoch + timedelta(seconds=aligned) + + +def _aggregate_values(values, func): + """用指定函数聚合一组数值。""" + if not values: + return 0 + if func == "count": + return len(values) + elif func == "sum": + return round(sum(values), 3) + elif func == "mean": + return round(sum(values) / len(values), 3) + elif func == "min": + return round(min(values), 3) + elif func == "max": + return round(max(values), 3) + elif func.startswith("p"): + p = int(func[1:]) + return round(percentile(sorted(values), p), 3) + return 0 + + +# ════════════════════════════════════════════════════════════════ +# 分组计数 +# ════════════════════════════════════════════════════════════════ + + +def count_by(records, field, top_n=None): + """按指定字段分组计数。 + + Args: + records: dict 列表 + field: 分组字段名 + top_n: 只返回前 N 个(按计数降序) + + Returns: + list[dict]: [{value, count, pct}],按计数降序排列 + """ + counts = defaultdict(int) + total = 0 + for r in records: + val = r.get(field) + if val is not None: + counts[str(val)] += 1 + total += 1 + + result = [] + for val, count in sorted(counts.items(), key=lambda x: -x[1]): + result.append({"value": val, "count": count, "pct": round(count / total * 100, 1) if total else 0}) + + if top_n: + result = result[:top_n] + + return result diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md new file mode 100644 index 00000000000..ab0c3ce7219 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -0,0 +1,148 @@ +--- +name: troubleshoot +description: > + FastDeploy Go Router 综合问题排查 skill。覆盖错误分类、延迟分析、请求追踪、Worker 健康时间线、 + Cache 调度诊断、负载与计数器分析六个维度。输出按三层问题来源分类:Router 自身、FastDeploy 后端、客户端。 + + 当用户要求以下操作时触发此 skill:排查 router 问题、分析 router 日志、router 排查、 + 查看 router 状态、综合排查、全量扫描、troubleshoot router、/troubleshoot、 + 分析错误日志、502/503 排查、延迟分析、Worker 健康、负载分析、cache 调度诊断、 + 请求追踪、trace 请求。 + 关键词:troubleshoot、排查、router 问题、全量扫描、综合分析、error、502、latency、 + health、load、cache、trace、/troubleshoot。 + +IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格式和提取规则。 +错误分类时参考 references/error_catalog.md。涉及后端问题时参考 references/fastdeploy_cross_reference.md。 +--- + +# Router Troubleshooting + +综合排查 FastDeploy Go Router 问题,输出完整诊断报告。 + +## 执行前交互 + +运行脚本前,Claude 必须按以下顺序向用户确认参数: + +### 1. 日志文件路径 +使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +- 选项 1: `logs/router.log`(默认) +- 选项 2: `fd-router.log`(golang_router 根目录) +- 选项 3: 用户通过 Other 输入自定义路径 + +**重要规则**: +- 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 +- 用户指定路径后不要质疑、推荐替代文件、或以任何理由尝试切换到其他文件 +- 支持绝对路径(如 `/home/user/logs/xxx.log`)和相对路径(如 `logs/fd-router (2).log`) + +如果用户直接确认或未指定路径,使用脚本的自动发现逻辑。 + +### 2. 分析范围 +向用户询问分析范围: +> "请选择分析范围: +> 1. **全量分析**(默认)— 分析整个日志文件 +> 2. **尾部分析** — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) +> 3. **指定时间段** — 分析特定时间范围内的日志" + +如果用户未选择,默认使用全量分析。 + +#### 指定时间段的处理 + +脚本原生支持 `--start` 和 `--end` 参数,无需手动预过滤。两者可单独或同时指定。 + +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 +缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 +`--start/--end` 与 `--tail` 互斥。 + +### 3. 分析模式 +向用户询问分析模式: +> "请选择分析模式: +> 1. **完整分析**(默认)— 运行所有维度(errors + latency + health + cache + load) +> 2. **单维度/多维度分析** — 选择特定维度(errors / latency / health / cache / load),可选多个 +> 3. 
**请求追踪** — 追踪特定请求 ID(需提供 ID)" + +如果用户未选择,默认使用完整分析。 + +### 4. 输出目录 +诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 +用户可通过 `--output` 指定自定义目录。 + +## 用法 + +脚本路径(相对于 `fastdeploy/golang_router/`):`.claude/skills/troubleshoot/scripts/` + +```bash +SCRIPTS=.claude/skills/troubleshoot/scripts + +# 全量扫描(errors + latency + health + cache + load) +python3 $SCRIPTS/troubleshoot.py + +# 单维度分析 +python3 $SCRIPTS/troubleshoot.py --errors +python3 $SCRIPTS/troubleshoot.py --latency +python3 $SCRIPTS/troubleshoot.py --health +python3 $SCRIPTS/troubleshoot.py --cache +python3 $SCRIPTS/troubleshoot.py --load + +# 请求追踪(需指定 ID,支持逗号分隔多 ID) +python3 $SCRIPTS/troubleshoot.py --trace +python3 $SCRIPTS/troubleshoot.py --trace "id1,id2" + +# 尾部分析 +python3 $SCRIPTS/troubleshoot.py --tail 5000 +python3 $SCRIPTS/troubleshoot.py --tail 30m + +# 指定时间段(--start 和 --end 可单独或同时使用) +python3 $SCRIPTS/troubleshoot.py --start "16:00:00" --end "17:00:00" +python3 $SCRIPTS/troubleshoot.py --start "2026/03/31 16:00:00" +python3 $SCRIPTS/troubleshoot.py --start "03/31" --end "03/31 18:00" + +# 组合模式 +python3 $SCRIPTS/troubleshoot.py --errors --latency +python3 $SCRIPTS/troubleshoot.py --errors --tail 5000 +python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --errors --latency +``` + +默认日志路径:`logs/router.log` → `fd-router.log` + +## 输出 + +- **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 +- **文件**:详细报告导出到 `skill_output/troubleshoot//troubleshoot_report_.md` + - 逐分钟事件详情拆分到 `details/health_events.md` + - 请求追踪事件链拆分到 `details/trace_.md` +- **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` + +## 三层诊断框架 + +| 层 | 典型问题 | 日志特征 | +|----|---------|---------| +| Router | Panic、500、Counter 异常、调度瓶颈、Cache 策略不优 | `Panic recovered`、`Failed to encode`、`double-release` | +| FD 后端 | 502、Worker 下线、高推理延迟、请求卡住 | `Failed to connect`、`Removed unhealthy`、p99 高 | +| 客户端 | 断连、请求格式错误 | `context canceled`、400 | + +## 脚本架构 + +``` +scripts/ + log_parser.py — 日志解析原语(HTTP/Cache/Stats/错误归一化/事件匹配) + stats.py — 
通用统计计算(百分位数/时间窗口/分组) + chart.py — 终端可视化(sparkline/柱状图/表格/时间线) + troubleshoot.py — 主编排器 + analyzers/ + errors.py — 错误分类分析 + latency.py — 延迟分析 + health.py — Worker 健康时间线 + cache.py — Cache 调度诊断 + load.py — 负载与计数器分析 + trace.py — 请求追踪 +``` + +## 重要规则 + +1. 大文件 (>5000 行) 用 grep 分类提取,不一次性读取 +2. 每个问题标注来源层(Router / FD 后端 / 客户端) +3. Cache 命中率数值分析用 `/stat-cache-hitrate`,本 skill 做策略诊断 +4. 分析前读取 `references/log_patterns.md` +5. 错误查询参考 `references/error_catalog.md` +6. 后端问题排查参考 `references/fastdeploy_cross_reference.md` +7. 输出格式参考 `references/report_templates.md` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json b/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json new file mode 100644 index 00000000000..4b961e85b36 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json @@ -0,0 +1,18 @@ +[ + {"query": "router 最近频繁 502 和 503,帮我全面排查一下问题", "should_trigger": true}, + {"query": "帮我 troubleshoot 一下 Go Router,感觉有些请求延迟特别高", "should_trigger": true}, + {"query": "分析 logs/fd-router.log 里面的错误日志,看看哪些错误最多", "should_trigger": true}, + {"query": "有几个 Worker 好像不太健康,帮我看看 Worker 健康时间线", "should_trigger": true}, + {"query": "cache 调度策略最近好像有问题,fallback 比例太高了,诊断一下", "should_trigger": true}, + {"query": "帮我追踪请求 trace-id-12345,看看这个请求在 router 里经历了什么", "should_trigger": true}, + {"query": "/troubleshoot 全量扫描 router 日志,给我一份完整的诊断报告", "should_trigger": true}, + {"query": "router 负载分析一下,有没有 counter 异常或者 double-release 的情况", "should_trigger": true}, + {"query": "统计一下 cache 命中率是多少,prefix hit ratio 和 session hit rate 各是多少", "should_trigger": false}, + {"query": "帮我看看 hitRatio 数据,想了解 KV cache 的复用度", "should_trigger": false}, + {"query": "帮我写一个 Go 的 reverse proxy,要支持负载均衡", "should_trigger": false}, + {"query": "分析 Kubernetes pod 的日志,看看为什么 OOMKilled", "should_trigger": false}, + {"query": "FastDeploy 模型部署失败了,帮我看看怎么回事", "should_trigger": false}, + {"query": "帮我优化一下 Python 代码的性能,跑得太慢了", "should_trigger": 
false}, + {"query": "nginx 返回 504 Gateway Timeout,帮我排查原因", "should_trigger": false}, + {"query": "帮我监控 cache 命中率的实时变化趋势", "should_trigger": false} +] diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md new file mode 100644 index 00000000000..ba48297d9c9 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -0,0 +1,122 @@ +# Router 错误目录 + +按 HTTP 状态码和日志级别分类的 Router 错误快速索引。每条含严重程度、根因、影响、排查命令、问题来源层。 + +--- + +## 按 HTTP 状态码索引 + +注意:HTTP 响应体中的错误消息与 logger 输出的 ERROR 消息**可能不同**。 +例如:HTTP 502 响应 `Failed to select worker: {err}` 对应的日志 ERROR 是 `Failed to select mixed worker: {err}`。 +分析时需将两者关联而非简单去重。 + +### 400 Bad Request + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Invalid request body: {err}` | 请求体读取失败 | 客户端 | 检查客户端请求格式 | +| `Invalid JSON format: {err}` | JSON 解析失败 | 客户端 | 检查 JSON 格式 | +| `DefaultManager is nil` | Manager 未初始化 | Router | 检查 Router 启动日志 | + +### 500 Internal Server Error + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Failed to build disaggregate_info: {err}` | PD 模式配置错误 | Router | 检查 register.yaml 参数 | +| `Failed to encode modified request: {err}` | 请求编码失败 | Router | 检查请求参数特殊字符 | +| `Internal server error` (Panic) | Router 代码 bug | Router | 检查 Panic recovered 日志 | + +### 502 Bad Gateway + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Failed to select worker: {err}` | 无可用 Mixed Worker | FD 后端 | `curl /health` 检查后端 | +| `Failed to select worker pair: {err}` | 无可用 PD Worker | FD 后端 | 检查 prefill/decode 注册状态 | +| `Failed to connect to backend service: {err}` | 后端不可达 | FD 后端 | `curl {worker_url}/health` | + +### 503 Service Unavailable + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `No available prefill/decode workers` | 全部 Worker 不健康 | FD 后端 | 检查部署状态 | + +--- + +## 按日志级别索引 + +### ERROR 级别 + +| 消息模板 | 严重程度 | 来源层 
| 影响 | +|---------|---------|-------|------| +| `Failed to select mixed worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to select prefill worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to read register request body: {err}` | MEDIUM | Router | 注册失败 | +| `Failed to unmarshal register request JSON: {err}` | MEDIUM | Router | 注册失败 | +| `Failed to create decode request for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Failed to create prefill request for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Decode request failed for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Prefill request failed for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Failed to read request body: {err}` | LOW | 客户端 | 单请求失败 | +| `Failed to unmarshal request JSON: {err}` | LOW | 客户端 | 单请求失败 | +| `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | +| `Failed to create backend request for {url}: {err}` | HIGH | FD 后端 | 请求失败 | +| `Backend request failed for {url}: {err}` | HIGH | FD 后端 | 请求失败 | +| `scanner error: {err}` | MEDIUM | FD 后端/客户端 | 流式响应中断(gateway redirect 函数) | +| `[prefill] scanner error: {err}, message={msg}` | MEDIUM | FD 后端/客户端 | PD 模式 prefill 流式错误 | +| `copy error: {err}` | MEDIUM | FD 后端/客户端 | 非流式响应中断 | +| `[prefill] copy error: {err}, message={msg}` | MEDIUM | FD 后端/客户端 | PD 模式 prefill 非流式错误 | +| `Removed unhealthy prefill/decode/mixed instance: {url}` | HIGH | FD 后端 | Worker 被移除(注意:这是 ERROR 级别) | + +### WARN 级别 + +| 消息模板 | 严重程度 | 来源层 | 影响 | +|---------|---------|-------|------| +| `GetRemoteMetrics failed for {url}, falling back to local counter` | LOW | FD 后端 | 调度精度降低 | +| `release worker: {url} 
skipped, counter already cleaned up` | LOW | Router | 计数器异常 | +| `release worker: {url} skipped, counter already zero (possible double-release)` | MEDIUM | Router | 计数器逻辑 bug | +| `cache-aware prefill: tokenizer failed, fallback to char tokens: {err}` | LOW | Router | cache-aware 精度降低 | +| `Instance {url} role is unknown` | LOW | Router | 注册角色不识别 | + +### INFO 级别(异常相关) + +| 消息模板 | 含义 | 关注场景 | +|---------|------|---------| +| `unhealthy worker counter preserved (inflight requests): {url}, count: {N}` | 不健康 Worker 仍有 inflight 请求 | 频繁出现说明 Worker 不稳定 | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {N}` | 不健康 Worker 仍有 token 计数 | 同上 | +| `cleanup unhealthy worker counter: {url}` | 清理不健康 Worker 的请求计数 | 正常清理 | +| `cleanup unhealthy worker token counter: {url}` | 清理不健康 Worker 的 token 计数 | 正常清理 | +| `preserved counters for {N} workers with inflight requests: [...]` | 保留了 N 个 Worker 的计数器 | N 大说明多 Worker 不稳定 | +| `removed counters for {N} unhealthy workers: [...]` | 移除了 N 个 Worker 的计数器 | 正常清理 | +| `Server {url} is healthy` | 健康检查恢复 | Worker 恢复(来自 HealthGenerate 端点) | + +注意:以下事件是 **ERROR 级别**,不是 INFO: +- `Removed unhealthy prefill/decode/mixed instance: {url}` — Worker 被移除 + +注意:以下内容是 **HTTP 响应体**,不是 logger 输出(不会出现在日志行中): +- `Register success` — 注册成功的 HTTP 200 响应体 +- Worker 注册检测应通过 H1 行的 `POST /register 200` 判断 + +--- + +## 注册参数校验错误 + +| 错误消息 | 根因 | 排查 | +|---------|------|------| +| `invalid connector_port: {value}` | connector_port 非数字或范围错误 | 检查 register.yaml | +| `invalid engine_worker_queue_port: {value}` | engine_worker_queue_port 非数字或范围错误 | 检查 register.yaml | +| `invalid metrics_port: {value}` | metrics_port 非数字或范围错误 | 检查 register.yaml | +| `rdma_ports[{i}] invalid port: {value}` | RDMA 端口配置错误 | 检查 register.yaml | + +--- + +## scanner error / copy error 区分 + +| error 内容 | 来源层 | 含义 | +|-----------|-------|------| +| `context canceled` | 客户端 | 客户端主动断连(超时或取消) | +| 其他 | FD 后端 | 后端流式响应异常 | diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md new file mode 100644 index 00000000000..f35cbcb303a --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md @@ -0,0 +1,102 @@ +# FastDeploy 后端交叉引用 + +从 Router 日志推断 FastDeploy 后端问题时的排查指引。 + +--- + +## 症状 → 后端排查 + +### 1. 后端不可达 (502) + +**Router 日志特征**: +``` +[ERROR] Failed to connect to backend service: dial tcp {ip}:{port}: connect: connection refused +``` + +**排查步骤**: +1. `curl http://{worker_url}/health` — 确认后端是否存活 +2. `curl http://{worker_url}/v1/models` — 确认模型是否加载完成 +3. 检查后端日志 `logs/workerlog.0` +4. `netstat -tlnp | grep {port}` — 确认端口监听 +5. 检查网络连通性(防火墙、安全组) + +### 2. 后端 OOM / 频繁重启 + +**Router 日志特征**: +- Worker 频繁 REMOVED → RE-REGISTERED(短周期内多次) +- 健康检查间歇性失败 + +**排查步骤**: +1. `dmesg | grep -i oom` — 检查 OOM killer +2. `nvidia-smi` — 检查 GPU 内存 +3. 后端日志搜索 `CUDA out of memory` +4. 检查 `max_num_seqs`、`max_model_len` 配置 + +### 3. 高推理延迟 + +**Router 日志特征**: +- 请求 p99 高(>10s)但调度耗时仅 ms 级 +- 确认延迟不在 Router 层(调度耗时 << 总延迟) + +**排查步骤**: +1. 检查后端 Prometheus metrics:`http://{worker_url}:{metrics_port}/metrics` + - `fastdeploy_llm_running_queue_size` — 推理队列 + - `fastdeploy_llm_waiting_queue_size` — 等待队列 + - `fastdeploy_llm_generation_tokens_per_second` — 吞吐量 +2. 确认 GPU 利用率:`nvidia-smi --query-gpu=utilization.gpu --format=csv` +3. 检查是否有长 prompt 请求拖慢整体 + +### 4. 流式响应异常 + +**Router 日志特征**: +``` +[ERROR] scanner error: {err} (非 context canceled) +[ERROR] copy error: {err} (非 context canceled) +``` + +**排查步骤**: +1. 后端日志搜索对应 request_id +2. 检查后端是否产生格式错误的 SSE +3. 检查网络是否有中间代理超时切断 + +### 5. 请求超时/卡住 + +**Router 日志特征**: +- 有 select worker 但长时间无 release/completed +- [stats] 中 running 持续不降 + +**根因**:Router 的 `http.Client{}` 没有设置超时,后端不响应则阻塞到客户端断连或 TCP 超时。 + +**排查步骤**: +1. 检查后端是否还在处理请求 +2. 检查后端是否出现死锁 +3. 
`ss -tnp | grep {port}` — 检查 TCP 连接状态 + +--- + +## 通用 FastDeploy 排查工具 + +### collect-env + +收集环境信息: +```bash +python -m fastdeploy.utils.collect_env +``` + +### 后端日志位置 + +- 默认:`logs/workerlog.0` +- 多 Worker:`logs/workerlog.{N}` + +### Prometheus Metrics + +后端 metrics 端口(从注册信息获取 `metrics_port`): +``` +http://{worker_ip}:{metrics_port}/metrics +``` + +关键指标: +- `fastdeploy_llm_running_queue_size` — 当前推理中的请求数 +- `fastdeploy_llm_waiting_queue_size` — 等待队列长度 +- `fastdeploy_llm_generation_tokens_per_second` — 生成吞吐 +- `fastdeploy_llm_request_total` — 总请求数 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md new file mode 100644 index 00000000000..cf33b41f723 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -0,0 +1,282 @@ +# 日志格式与提取规则 + +本文档定义 Router 日志的所有类别、Grep 匹配模式、精确正则,供各子 skill 参考。 + +--- + +## 日志基本格式 + +``` +[LEVEL] YYYY/MM/DD HH:MM:SS logger.go:NN: [context_tags] message +``` + +### Context Tags(可选,顺序固定) + +- `[trace_id:{id}]` +- `[req_id:{id}]` +- `[session_id:{id}]` +- `[request_id:{id}]` + +所有 tag 可能同时出现,也可能只有部分或没有。顺序固定为:`trace_id → req_id → session_id → request_id`。 + +### ID 匹配正则 + +搜索某个 ID 时,同时匹配四种 tag: +``` +session_id:|trace_id:|request_id:|req_id: +``` + +--- + +## 日志分类提取 + +| 类别 | Grep 模式 | 用途 | 典型内容 | +|------|----------|------|---------| +| E1 — ERROR | `\[ERROR\]` | 错误分类 | 各类 Failed to ... 
错误 | +| E2 — WARN | `\[WARN\]` | 警告分类 | counter 异常、tokenizer 退化 | +| H1 — HTTP 请求 | `\] \[(POST\|GET)\] /` | 延迟/状态码/吞吐量 | HTTP middleware 日志行 | +| H2 — 健康事件 | `Removed unhealthy\|is not healthy\|is healthy` | Worker 健康时间线 | 上下线事件 | +| H2b — 注册事件 | `\] \[POST\] /register.*200` | Worker 注册 | 从 H1 HTTP 行中匹配 POST /register 返回 200 | +| H3 — 调度事件 | `select worker\|release worker\|Failed to select\|SelectWorkerPair` | 调度/计数器分析 | Worker 选择和释放 | +| H4 — 后端问题 | `Failed to connect\|request failed\|scanner error\|copy error\|Panic recovered` | 后端问题 | 连接/流式/Panic(注意:`scanner error`/`copy error` 与 H9 有重叠,带 `[prefill]` 前缀的行同时属于 H9) | +| H5 — Counter | `counter preserved\|cleanup unhealthy\|removed counters\|counter already\|double-release\|preserved counters` | 计数器异常 | 计数器生命周期 | +| H6 — Cache-aware | `cache-aware prefill: final strategy:` | Cache 调度诊断 | 策略选择 + hitRatios | +| H7 — Stats | `\[stats\]` | 负载/命中率 | 周期性统计行 | +| H8 — ts_ms | `ts_ms=` | 调度耗时 | 调度开始结束时间戳 | +| H9 — Prefill 生命周期 | `\[prefill\]` | PD 模式 prefill 追踪 | 首包/释放/错误 | +| H10 — 请求标记 | `Parsing completed\|Request completed successfully` | 请求生命周期 | 调度开始/请求结束标记 | +| H11 — Token 释放 | `release prefill tokens` | Token 计数器生命周期 | Token 释放事件 | + +--- + +## H1 — HTTP 请求行格式 + +``` +[INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 1.234567s 10.0.0.1 +``` + +字段:`[METHOD] /path HTTP/1.1 STATUS LATENCY CLIENT_IP` + +### 延迟单位归一化 + +Go `time.Duration.String()` 输出格式不固定,需归一化为毫秒: + +| 原始格式 | 含义 | 转换为 ms | +|---------|------|----------| +| `1.5s` | 秒 | × 1000 | +| `150ms` | 毫秒 | 直接使用 | +| `150.5ms` | 毫秒 | 直接使用 | +| `500µs` | 微秒 | ÷ 1000 | +| `500us` | 微秒(ASCII) | ÷ 1000 | +| `500ns` | 纳秒 | ÷ 1000000 | +| `1m30s` | 分+秒 | 分×60000 + 秒×1000 | +| `1h2m3s` | 时+分+秒 | 时×3600000 + 分×60000 + 秒×1000 | + +正则提取延迟值:`(\d+(?:\.\d+)?(?:h|m(?!s)|s|ms|µs|us|ns))+` + +### 仅推理请求 + +延迟分析只统计推理请求路径: +- `/v1/chat/completions` +- `/v1/completions` + +排除健康检查 `/health`、注册 `/register` 等管理路径。 + +--- + +## H6 — Cache-aware 策略行格式 + +``` 
+[INFO] 2025/01/15 18:25:33 logger.go:87: [trace_id:xxx] [session_id:xxx] cache-aware prefill: final strategy: cache_aware_scoring, selected=http://10.0.0.1:9965, loads=map[http://10.0.0.1:9965:2 http://10.0.0.2:9965:5], hitRatios=map[http://10.0.0.1:9965:0.85 http://10.0.0.2:9965:0.42]. ts_ms=2025-01-15 18:25:33.123 +``` + +``` +[INFO] ... cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads=map[...]. ts_ms=2025-01-15 18:25:33.123 +``` + +注意:日志中**没有** `scores=map[...]` 字段。scores 仅在 DEBUG 级别的 `chooseByScore` 中逐条打印。 +如需分析非最优选择,需从 hitRatios + loads 使用公式重新计算: +`score = (100-hitRatio)/100 * hitRatioWeight + loadRatio * loadBalanceWeight` + +### Go map 解析 + +`hitRatios=map[key1:val1 key2:val2]` + +- 空 map:`hitRatios=map[]` — 表示冷启动 +- 正则提取 map 内容:`map\[(.*?)\]` +- 每对 key:value 用空格分隔:`(\S+):(\S+)` +- key 是 worker URL,value 是 float64 + +### selected worker 的 hitRatio + +从 hitRatios map 中查找 selected URL 的值: +- 在 map 中找到 → 使用该值 +- 不在 map 中 → hitRatio = 0 +- map 为空 → 冷启动,hitRatio = 0 + +### ts_ms 格式 + +`ts_ms=2025-01-15 18:25:33.123` + +格式:`2006-01-02 15:04:05.000`(Go reference time) + +用于计算调度耗时(两个 ts_ms 之间的差值)。 + +--- + +## H7 — Stats 行格式 + +``` +[INFO] 2025/01/15 18:25:33 logger.go:87: [stats] total_running=5, workers: [http://10.0.0.1:9965: running=2, http://10.0.0.2:9965: running=3], cache_hit_rate=85.71% (hits=6/total=7) +``` + +注意:由于 Go `log.Lshortfile` 打印的是 `Printf` 调用处,stats 行的源文件始终为 `logger.go:NN:`(行号随编译变化),而非 `handler.go`。 + +注意:stats 行**不包含**任何 context tag(trace_id 等),因为由后台 goroutine 周期输出。 + +### 关键:per-interval 计数器 + +`hits` 和 `total` 是 **per-interval** 的值(每 5s 通过 `atomic.Swap(0)` 重置为 0)。 + +计算累计值必须 **sum 所有行**: +- 累计 Session Hit Rate = `sum(hits) / sum(total) * 100` + +### Worker 负载提取 + +`workers: [url1: running=N, url2: running=N]` + +- 注意格式:`workers:` 带冒号+空格,每个 worker 格式为 `url: running=N`,逗号+空格分隔 +- **不包含 token 数据**(reportStats 只读取 running 计数) + +正则:`(http://[^:]+:\d+): running=(\d+)` + +### cache_hit_rate 提取 + 
+`cache_hit_rate=85.71% (hits=6/total=7)` + +正则:`cache_hit_rate=([\d.]+)% \(hits=(\d+)/total=(\d+)\)` + +--- + +## 模板归一化 + +ERROR/WARN 消息分组时,需将变量替换为占位符: + +| 变量类型 | 正则 | 替换为 | +|---------|------|-------| +| URL | `https?://[\w.:]+` | `{url}` | +| UUID | `[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}` | `{uuid}` | +| 数字 | `\d+` (仅在特定位置) | `{N}` | +| IP:Port | `\d+\.\d+\.\d+\.\d+:\d+` | `{ip:port}` | + +--- + +## Fallback 策略行识别 + +| final strategy | reason 关键词 | 含义 | +|---------------|--------------|------| +| `cache_aware_scoring` | (无 reason) | 正常 cache-aware 调度 | +| `process_tokens` | `tokenize failed` | 退化 B:字符级 tokenize 也失败 | +| `process_tokens` | `load imbalanced` | 退化 C:负载不均衡 | +| `process_tokens` | (其他) | 退化 D:策略未初始化等 | + +退化 A(Tokenizer 服务→字符级)在 WARN 行识别: +``` +[WARN] ... cache-aware prefill: tokenizer failed, fallback to char tokens: {err} +``` +注意完整前缀 `cache-aware prefill: tokenizer failed`。 +退化 A 后仍可走 cache_aware_scoring(精度降低),与 B/C/D 不互斥。 + +--- + +## H4 — 后端问题匹配说明 + +H4 的 `request failed` 模式会匹配多个消息模板: +- `Request failed (attempt {n}/{max}): {err}` — 重试日志 +- `Decode request failed for {url}: {err}` — PD 模式 decode 失败 +- `Prefill request failed for {url}: {err}` — PD 模式 prefill 失败 +- `Backend request failed for {url}: {err}` — 后端请求失败 + +分析时需通过模板归一化去重。 + +--- + +## H9 — Prefill 生命周期事件 + +PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` 前缀日志: + +| 消息模板 | 含义 | +|---------|------| +| `[prefill] first chunk received, release counter url=%s` | Prefill 首包到达,释放计数器 | +| `[prefill] non-stream prefill response done, release counter url=%s` | 非流式 prefill 完成 | +| `[prefill] release in defer (fallback) url=%s, isStream=%v` | defer 兜底释放 | +| `[prefill] release in CommonCompletions defer (error path) url=%s` | 错误路径释放 | +| `[prefill] backendResp is nil or backendResp.Body is nil, url=%s` | 后端响应异常 | +| `[prefill] scanner error: %v, message=%s` | 流式读取错误(ERROR 级别) | +| `[prefill] copy error: %v, message=%s` | 非流式复制错误(ERROR 级别) | + +--- + +## H10 — 
请求生命周期标记 + +| 消息 | 含义 | 级别 | +|------|------|------| +| `Parsing completed; starting worker selection.` | 请求解析完成,开始调度 | INFO | +| `Request completed successfully.` | 请求成功完成 | INFO | + +--- + +## H11 — Token 释放 + +`release prefill tokens: %s, tokens: %d` — 释放 prefill token 计数。 +数据源:`handler.go:333`。用于 troubleshoot-load 的 token 计数器分析。 + +--- + +## 使用脚本工具 + +各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 + +### 快速参考 + +| 任务 | 脚本 | +|------|------| +| 解析 H1 HTTP 行 | `log_parser.py parse-http [--inference-only]` | +| 解析 H6 cache 策略行 | `log_parser.py parse-cache-strategy` | +| 解析 H7 stats 行 | `log_parser.py parse-stats` | +| 检测非支持请求 | `log_parser.py unsupported-requests [--summary-only]` | +| ASCII 折线图 | `chart.py` | +| Unicode 柱状图 | `chart.py` | +| Markdown 表格 | `chart.py` | +| Worker 时间线 | `chart.py` | + +所有工具从 stdin 读取,输出到 stdout。中间数据使用 JSON Lines 格式。 + +--- + +## 已知路由列表 + +Router 支持的全部路由(来自 `internal/router/router.go`): + +| Method | Path | 类型 | +|--------|------|------| +| POST | `/v1/chat/completions` | 推理 | +| POST | `/v1/completions` | 推理 | +| POST | `/register` | 实例注册 | +| GET | `/registered_number` | 注册数量查询 | +| GET | `/registered` | 注册列表查询 | +| GET | `/health_generate` | 健康检查 | +| GET | `/metrics` | Prometheus 指标 | + +### 非支持请求排查 + +客户端可能发送不属于已知路由的请求(如 `/v1/models`),会收到 404 但仍记录在 H1 HTTP 日志中。 + +使用 `log_parser.py unsupported-requests` 子命令检测: +```bash +# 完整输出(详细列表 + 汇总) +grep -E '\] \[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\] /' logfile | python3 log_parser.py unsupported-requests + +# 仅汇总 +grep -E '\] \[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\] /' logfile | python3 log_parser.py unsupported-requests --summary-only +``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md new file mode 100644 index 00000000000..ba9e40e9869 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ 
-0,0 +1,120 @@ +# 报告输出规范 + +所有 troubleshoot 分析维度共享的可视化和格式规范。 + +--- + +## 通用可视化组件 + +### Unicode 柱状图 +- 填充块:`█`(U+2588),空块:`░`(U+2591) +- 总宽度:20 字符,右侧标注百分比和计数 +- 块数 = round(percentage / 100 * 20),最小 1 块(>0% 时) + +### Sparkline 折线图 +- 字符集:`▁▂▃▄▅▆▇█`(8 级高度) +- 图表宽度:60 字符,自动降采样 +- X 轴标注时间(首/尾 + 中间 2-3 个刻度) +- Y 轴自适应:百分比类 0-100%,计数类 0-max + +### Markdown 表格 +- 标准 Markdown 表格格式 +- 数值列右对齐 + +### Worker 可用性时间线 +- `█` = 在线,`░` = 下线 +- 右侧标注在线率百分比 + +--- + +## 严重程度标记 + +| 标记 | 含义 | 使用场景 | +|------|------|---------| +| CRITICAL | 服务不可用 | Panic、全部 Worker 不健康、错误率 >20% | +| HIGH | 部分请求失败 | 502/503、Worker 频繁下线 | +| MEDIUM | 性能下降 | 高延迟、cache 命中率低 | +| LOW | 需关注 | 计数器异常、tokenizer 退化 | +| INFO | 正常 | 统计信息 | + +--- + +## 报告格式 + +### 简洁版(终端输出) + +- 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 按三层分类(Router / FD 后端 / 客户端) +- 每个问题一行摘要 + 关键指标 +- 末尾提示详细版文件路径 + +### 详细版(文件导出) + +- 路径:`skill_output/troubleshoot/{timestamp}/troubleshoot_report_{timestamp}.md` +- 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) +- 详情拆分到 `details/` 子目录: + - `details/health_events.md` — Worker 逐分钟健康事件 + - `details/trace_{trace_id}.md` — 请求追踪事件链 + +--- + +## 状态判定规则 + +- **CRITICAL**:存在 Panic、全部 Worker 不健康、或错误率 >20% +- **DEGRADED**:存在 502/503、Worker 不稳定、或错误率 >5% +- **HEALTHY**:无严重问题 + +--- + +## 各维度报告结构 + +### Errors(错误分析) + +``` +HTTP 状态码分布(柱状图) +错误率趋势(折线图) +ERROR/WARN Top N(柱状图 + 表格,标注来源层) +Panic 列表 +``` + +### Latency(延迟分析)— 待实现 + +``` +延迟百分位数 (p50/p90/p95/p99) +延迟分布(柱状图) +吞吐量趋势(折线图) +慢请求 Top 10 +``` + +### Health(Worker 健康)— 待实现 + +``` +Worker 可用性时间线 +健康事件汇总表 +可用性统计 +``` + +### Cache(调度诊断)— 待实现 + +``` +调度策略分布 +Session 粘性分析 +非最优选择分析 +Fallback 原因分类 +``` + +### Load(负载分析)— 待实现 + +``` +Worker 负载分布 +计数器异常检测 +Token 计数器统计 +``` + +### Trace(请求追踪)— 待实现 + +``` +单请求事件链 +生命周期完整性检查 +Session 多请求汇总 +``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py new file mode 100644 index 00000000000..e7bb50660a8 --- /dev/null +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py @@ -0,0 +1 @@ +# Analyzers package diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py new file mode 100644 index 00000000000..3a18b668a41 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -0,0 +1,458 @@ +#!/usr/bin/env python3 +""" +Cache Analyzer — Cache 调度诊断 + +分析 cache-aware 调度策略:session 粘性、非最优选择评分、驱逐影响、 +fallback 原因、冷启动识别、交叉诊断。 +注意:cache 命中率数值分析由 stat-cache-hitrate skill 负责,本模块做策略诊断。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_table +from log_parser import parse_cache_strategy_line, parse_ts +from stats import compute_statistics, count_by + +# ════════════════════════════════════════════════════════════════ +# Fallback 分类 +# ════════════════════════════════════════════════════════════════ + +TOKENIZER_WARN_RE = re.compile(r"tokenizer failed, fallback to char tokens") + + +def classify_fallback(record, tokenizer_degraded_ts=None): + """对 process_tokens 策略行分类 fallback 原因。 + + Returns: 'A-Tokenizer退化' | 'B-char tokenize失败' | 'C-负载不均衡' | 'D-其他' + """ + reason = record.get("reason", "") + if "load imbalanced" in reason: + return "C-负载不均衡" + if "tokenize failed" in reason: + return "B-char tokenize失败" + return "D-其他" + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weight=1.0, load_balance_weight=1.0): + """分析 cache-aware 调度策略。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制 + eviction_duration_mins: 驱逐时间(分钟,默认 30) + hit_ratio_weight: hitRatio 权重(默认 1.0) + 
load_balance_weight: loadBalance 权重(默认 1.0) + + Returns: + dict: {strategy_dist, fallback_reasons, session_stickiness, suboptimal_selections, + eviction_impact, cold_starts, hitratio_stats, diagnoses, summary} + """ + h6_lines = _grep_lines(log_file, r"cache-aware prefill: final strategy:", tail) + tokenizer_warn_lines = _grep_lines(log_file, r"tokenizer failed, fallback to char tokens", tail) + + # 解析策略行 + strategy_records = [r for line in h6_lines for r in [parse_cache_strategy_line(line)] if r] + + if not strategy_records: + return { + "strategy_dist": [], + "fallback_reasons": [], + "session_stickiness": {}, + "suboptimal_selections": [], + "eviction_impact": [], + "cold_starts": 0, + "hitratio_stats": {}, + "diagnoses": [], + "summary": "未检测到 cache-aware 策略日志", + } + + # Tokenizer 退化次数 + tokenizer_degraded_count = len(tokenizer_warn_lines) + + # 策略分布 + strategy_dist = count_by(strategy_records, "strategy") + + # Fallback 原因 + fallback_records = [r for r in strategy_records if r.get("strategy") == "process_tokens"] + fallback_reasons = [] + if fallback_records: + for r in fallback_records: + r["fallback_type"] = classify_fallback(r) + fallback_reasons = count_by(fallback_records, "fallback_type") + + # hitRatio 统计 + hr_vals = [r.get("selected_hitRatio", 0) for r in strategy_records if "selected_hitRatio" in r] + hitratio_stats = compute_statistics(hr_vals) if hr_vals else {} + + # Session 粘性分析 + session_stickiness = _analyze_session_stickiness(strategy_records) + + # 非最优选择分析 + suboptimal = _analyze_suboptimal(strategy_records, hit_ratio_weight, load_balance_weight) + + # 驱逐影响 + eviction_impact = _analyze_eviction(strategy_records, eviction_duration_mins) + + # 冷启动 + cold_starts = sum(1 for r in strategy_records if r.get("hitRatios") == {}) + + total = len(strategy_records) + cache_aware_count = sum(1 for r in strategy_records if r["strategy"] == "cache_aware_scoring") + fallback_count = len(fallback_records) + + diagnoses = _diagnose( + strategy_dist, + 
fallback_reasons, + session_stickiness, + suboptimal, + eviction_impact, + cold_starts, + total, + tokenizer_degraded_count, + hitratio_stats, + ) + + return { + "strategy_dist": strategy_dist, + "fallback_reasons": fallback_reasons, + "session_stickiness": session_stickiness, + "suboptimal_selections": suboptimal, + "eviction_impact": eviction_impact, + "cold_starts": cold_starts, + "hitratio_stats": hitratio_stats, + "tokenizer_degraded_count": tokenizer_degraded_count, + "diagnoses": diagnoses, + "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " + f"冷启动 {cold_starts}", + } + + +def _analyze_session_stickiness(records): + """Session 粘性分析。""" + sessions = defaultdict(list) + for r in records: + sid = (r.get("tags") or {}).get("session_id") + if sid and "selected" in r: + sessions[sid].append(r["selected"]) + + result = {} + for sid, workers in sessions.items(): + if len(workers) < 2: + continue + same_count = sum(1 for i in range(1, len(workers)) if workers[i] == workers[i - 1]) + stickiness = round(same_count / (len(workers) - 1) * 100, 1) + switches = [(i, workers[i - 1], workers[i]) for i in range(1, len(workers)) if workers[i] != workers[i - 1]] + result[sid] = { + "total_requests": len(workers), + "stickiness_pct": stickiness, + "switches": len(switches), + } + + return result + + +def _analyze_suboptimal(records, hr_weight, lb_weight): + """非最优选择分析:selected 的 hitRatio 不是最高时,重新计算 score 对比。""" + suboptimal = [] + for r in records: + if r.get("strategy") != "cache_aware_scoring": + continue + hit_ratios = r.get("hitRatios", {}) + loads = r.get("loads", {}) + selected = r.get("selected") + if not hit_ratios or not selected or selected not in hit_ratios: + continue + + max_hr = max(hit_ratios.values()) if hit_ratios else 0 + sel_hr = hit_ratios.get(selected, 0) + + if sel_hr >= max_hr: + continue + + # 计算 scores: score = (100-hitRatio)/100 * hrWeight + loadRatio * lbWeight + # Go 源码使用 maxLoad 做归一化: loadRatio = load / maxLoad 
+ max_load = max(loads.values()) if loads else 1 + max_load = max(max_load, 1) + scores = {} + for w_url in hit_ratios: + hr = hit_ratios.get(w_url, 0) + load = loads.get(w_url, 0) + load_ratio = load / max_load + score = (100 - hr) / 100 * hr_weight + load_ratio * lb_weight + scores[w_url] = round(score, 4) + + best_by_hr = min(hit_ratios, key=lambda w: -hit_ratios[w]) + sel_score = scores.get(selected, 0) + best_hr_score = scores.get(best_by_hr, 0) + + # 分类原因 + load_diff = abs(loads.get(selected, 0) - loads.get(best_by_hr, 0)) + if load_diff > 5: + reason = "负载主导" + elif max_hr < 10: + reason = "区分度不够" + elif abs(sel_score - best_hr_score) < 0.05: + reason = "正常竞争" + else: + reason = "综合权衡" + + suboptimal.append( + { + "ts": r.get("ts", ""), + "selected": selected.replace("http://", ""), + "selected_hr": sel_hr, + "best_hr_worker": best_by_hr.replace("http://", ""), + "best_hr": max_hr, + "reason": reason, + } + ) + + return suboptimal + + +def _analyze_eviction(records, eviction_mins): + """驱逐影响分析:同 session 连续请求间隔 > eviction_duration。""" + sessions = defaultdict(list) + for r in records: + sid = (r.get("tags") or {}).get("session_id") + ts = r.get("ts") + if sid and ts: + sessions[sid].append(r) + + impacts = [] + for sid, reqs in sessions.items(): + reqs.sort(key=lambda x: x.get("ts", "")) + for i in range(1, len(reqs)): + try: + prev_dt = parse_ts(reqs[i - 1]["ts"]) + curr_dt = parse_ts(reqs[i]["ts"]) + interval_mins = (curr_dt - prev_dt).total_seconds() / 60 + if interval_mins > eviction_mins: + curr_hr = reqs[i].get("selected_hitRatio", -1) + impacts.append( + { + "session_id": sid, + "interval_mins": round(interval_mins, 1), + "hitRatio_after": curr_hr, + "evicted": curr_hr == 0, + } + ) + except (ValueError, KeyError): + pass + + return impacts + + +def _diagnose( + strategy_dist, + fallback_reasons, + session_stickiness, + suboptimal, + eviction_impact, + cold_starts, + total, + tokenizer_degraded_count, + hitratio_stats, +): + """生成 cache 调度诊断。""" + 
diagnoses = [] + + # Tokenizer 退化 + if tokenizer_degraded_count > 0: + pct = round(tokenizer_degraded_count / max(total, 1) * 100, 1) + sev = "HIGH" if pct > 10 else "MEDIUM" + diagnoses.append( + { + "severity": sev, + "message": f"Tokenizer 退化 {tokenizer_degraded_count} 次 ({pct}%),精度降低", + "source_layer": "Router", + } + ) + + # Fallback 比例 + for s in strategy_dist: + if s["value"] == "process_tokens" and s["pct"] > 20: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'Fallback 到 process_tokens {s["pct"]}%,cache-aware 策略未生效', + "source_layer": "Router", + } + ) + + # 非最优选择 + if suboptimal and total > 0: + pct = round(len(suboptimal) / total * 100, 1) + if pct > 20: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"非最优选择 {pct}%({len(suboptimal)}/{total})", + "source_layer": "Router", + } + ) + + # 冷启动 + if cold_starts > 0 and total > 0: + pct = round(cold_starts / total * 100, 1) + if pct > 10: + diagnoses.append( + {"severity": "LOW", "message": f"冷启动 {pct}%(hitRatios=map[])", "source_layer": "Router"} + ) + + # 驱逐影响 + evicted = [e for e in eviction_impact if e["evicted"]] + if evicted: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"{len(evicted)} 次驱逐后 hitRatio=0,考虑增大 eviction-duration-mins", + "source_layer": "Router", + } + ) + + # hitRatio 整体偏低 + if hitratio_stats.get("mean", 100) < 20: + diagnoses.append( + { + "severity": "LOW", + "message": f'平均 hitRatio {hitratio_stats["mean"]}%,缓存效果较差', + "source_layer": "Router", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_cache_report(result): + """将分析结果格式化为终端报告。""" + sections = ["## Cache 调度诊断", ""] + sections.append(f' {result["summary"]}') + sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] 
{d["message"]}') + sections.append("") + + # 策略分布 + if result["strategy_dist"]: + sections.append("### 策略分布") + sections.append("") + bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # hitRatio 统计 + hs = result.get("hitratio_stats", {}) + if hs: + sections.append("### hitRatio 统计") + sections.append("") + sections.append( + f' mean={hs.get("mean",0)}% p50={hs.get("p50",0)}% p90={hs.get("p90",0)}% ' + f'p99={hs.get("p99",0)}% max={hs.get("max",0)}%' + ) + sections.append("") + + # Fallback 原因 + if result["fallback_reasons"]: + sections.append("### Fallback 原因分布") + sections.append("") + bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # Tokenizer 退化 + if result.get("tokenizer_degraded_count", 0) > 0: + sections.append(f' Tokenizer 退化: {result["tokenizer_degraded_count"]} 次') + sections.append("") + + # Session 粘性 + stickiness = result.get("session_stickiness", {}) + if stickiness: + sections.append("### Session 粘性") + sections.append("") + table_data = [ + { + "Session": sid[:16], + "请求数": str(s["total_requests"]), + "粘性率": f'{s["stickiness_pct"]}%', + "切换次数": str(s["switches"]), + } + for sid, s in sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) + ] + sections.append( + render_table( + table_data[:10], + columns=["Session", "请求数", "粘性率", "切换次数"], + right_align={"请求数", "粘性率", "切换次数"}, + ) + ) + sections.append("") + + # 非最优选择 + if result.get("suboptimal_selections"): + subs = result["suboptimal_selections"] + sections.append(f"### 非最优选择 ({len(subs)} 次)") + sections.append("") + reason_counts = defaultdict(int) + for s in subs: + reason_counts[s["reason"]] += 1 + for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): + sections.append(f" {reason}: 
{count} 次") + sections.append("") + + # 驱逐影响 + if result.get("eviction_impact"): + evictions = result["eviction_impact"] + evicted = [e for e in evictions if e["evicted"]] + sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") + sections.append("") + + # 冷启动 + if result.get("cold_starts", 0) > 0: + sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') + sections.append("") + + return "\n".join(sections) + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py new file mode 100644 index 00000000000..0817e280aa5 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -0,0 +1,314 @@ +#!/usr/bin/env python3 +""" +Errors Analyzer — 错误分类分析 + +分析 Router 日志中的 ERROR/WARN 日志、HTTP 状态码分布、Panic 事件。 +按问题来源层(Router / FastDeploy 后端 / 客户端)标注每类错误。 +""" + +import os +import subprocess +import sys + +# 让 analyzers 能 import 同级 scripts 下的模块 +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_sparkline, render_table +from log_parser import extract_ts, parse_error_line, parse_http_line +from stats import count_by, 
time_bucket + +# ════════════════════════════════════════════════════════════════ +# 错误来源层映射(从 error_catalog.md 提取的核心规则) +# ════════════════════════════════════════════════════════════════ + +# 模板 → 来源层 映射(归一化后的模板匹配) +SOURCE_LAYER_RULES = [ + # Router 自身 + ("Failed to build disaggregate_info", "Router"), + ("Failed to encode modified request", "Router"), + ("Panic recovered", "Router"), + ("DefaultManager is nil", "Router"), + ("double-release", "Router"), + ("counter already cleaned up", "Router"), + ("counter already zero", "Router"), + ("tokenizer failed", "Router"), + ("Instance {url} role is unknown", "Router"), + # 客户端 + ("Invalid request body", "客户端"), + ("Invalid JSON format", "客户端"), + ("Failed to read request body", "客户端"), + ("Failed to unmarshal request JSON", "客户端"), + # FD 后端(默认多数 ERROR 来自后端) + ("Failed to select", "FD 后端"), + ("Failed to connect to backend", "FD 后端"), + ("No available", "FD 后端"), + ("request failed", "FD 后端"), + ("Removed unhealthy", "FD 后端"), + ("Backend request failed", "FD 后端"), + ("Decode request failed", "FD 后端"), + ("Prefill request failed", "FD 后端"), + ("Failed to create decode request", "FD 后端"), + ("Failed to create prefill request", "FD 后端"), + ("Failed to create backend request", "FD 后端"), + ("GetRemoteMetrics failed", "FD 后端"), +] + +# scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 +SCANNER_COPY_PATTERNS = ("scanner error", "copy error") + + +def classify_source_layer(template, original=""): + """根据错误模板判断来源层。""" + # scanner error / copy error 特殊判断 + for pat in SCANNER_COPY_PATTERNS: + if pat in template or pat in original: + if "context canceled" in original: + return "客户端" + return "FD 后端" + + for pattern, layer in SOURCE_LAYER_RULES: + if pattern in template: + return layer + + return "未知" + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_errors(log_file, tail=None, top_n=20): + 
"""分析日志中的错误。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制(None 则全量) + top_n: 错误 Top N + + Returns: + dict: { + error_top_n: [{template, count, pct, source_layer, level, urls}], + status_code_dist: [{value, count, pct}], + panic_list: [{ts, context}], + error_rate: float, + error_trend: [{bucket, count}], + total_errors: int, + total_warns: int, + total_requests: int, + summary: str, + } + """ + # Phase 1: Grep 提取各类日志 + error_lines = _grep_lines(log_file, r"\[ERROR\]", tail) + warn_lines = _grep_lines(log_file, r"\[WARN\]", tail) + http_lines = _grep_lines(log_file, r"\[(POST|GET)\] /", tail) + panic_lines = _grep_lines(log_file, "Panic recovered", tail) + + # Phase 2: 解析 + # 2.1 ERROR + WARN 归一化 + error_records = [parse_error_line(line) for line in error_lines] + warn_records = [parse_error_line(line) for line in warn_lines] + all_error_records = error_records + warn_records + + # 2.2 HTTP 请求解析 + http_records = [] + for line in http_lines: + r = parse_http_line(line) + if r: + http_records.append(r) + + # 2.3 Panic 提取 + panic_list = [] + for line in panic_lines: + ts = extract_ts(line) + panic_list.append({"ts": ts or "", "context": line.strip()}) + + # Phase 3: 分析 + # 3.1 按模板分组 Top N + error_top = _compute_error_top_n(all_error_records, top_n) + + # 3.2 HTTP 状态码分布 + status_dist = count_by(http_records, "status") + + # 3.3 错误率 + total_requests = len(http_records) + non_200 = sum(1 for r in http_records if r["status"] != 200) + error_rate = round(non_200 / total_requests * 100, 2) if total_requests else 0 + + # 3.4 错误趋势(按时间窗口统计非 200 请求数) + non_200_records = [r for r in http_records if r["status"] != 200] + error_trend = time_bucket(non_200_records, window="auto") + + return { + "error_top_n": error_top, + "status_code_dist": status_dist, + "panic_list": panic_list, + "error_rate": error_rate, + "error_trend": error_trend, + "total_errors": len(error_records), + "total_warns": len(warn_records), + "total_requests": total_requests, + } + + +def 
_compute_error_top_n(records, top_n): + """按模板分组并标注来源层。""" + # 分组 + groups = {} + for r in records: + tpl = r["template"] + if tpl not in groups: + groups[tpl] = { + "template": tpl, + "count": 0, + "level": r["level"], + "originals": [], + } + groups[tpl]["count"] += 1 + # 保留最多 5 个原始消息用于详细报告中提取 URL + if len(groups[tpl]["originals"]) < 5: + groups[tpl]["originals"].append(r["original"]) + + total = len(records) + result = [] + for g in sorted(groups.values(), key=lambda x: -x["count"]): + source_layer = classify_source_layer(g["template"], g["originals"][0] if g["originals"] else "") + result.append( + { + "template": g["template"], + "count": g["count"], + "pct": round(g["count"] / total * 100, 1) if total else 0, + "source_layer": source_layer, + "level": g["level"], + "sample_originals": g["originals"], + } + ) + if len(result) >= top_n: + break + + return result + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + # 先 tail 再 grep + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + """简单 shell 引号转义。""" + return "'" + s.replace("'", "'\\''") + "'" + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_errors_report(result): + """将分析结果格式化为终端报告。 + + Args: + result: analyze_errors 返回的 dict + + Returns: + str: 格式化后的报告文本 + """ + sections = [] + + # 标题 + sections.append("## 错误分析") + sections.append("") + + # 概览 + sections.append( + f' ERROR: {result["total_errors"]} | ' + f'WARN: {result["total_warns"]} | ' + f'请求总数: 
{result["total_requests"]} | ' + f'错误率: {result["error_rate"]}%' + ) + sections.append("") + + # Panic + if result["panic_list"]: + sections.append(f' ⚠ Panic 事件: {len(result["panic_list"])} 次') + for p in result["panic_list"][:5]: + sections.append(f' [{p["ts"]}] {p["context"][:100]}') + sections.append("") + + # 错误 Top N + if result["error_top_n"]: + sections.append("### ERROR/WARN Top 分类") + sections.append("") + bar_data = [] + for e in result["error_top_n"][:10]: + label = e["template"][:50] + bar_data.append( + { + "label": label, + "value": e["pct"], + "count": e["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 来源层表格 + table_data = [] + for e in result["error_top_n"][:10]: + table_data.append( + { + "模板": e["template"][:60], + "数量": e["count"], + "占比": f'{e["pct"]}%', + "级别": e["level"], + "来源层": e["source_layer"], + } + ) + sections.append( + render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) + ) + sections.append("") + + # 状态码分布 + if result["status_code_dist"]: + sections.append("### HTTP 状态码分布") + sections.append("") + bar_data = [] + for s in result["status_code_dist"]: + bar_data.append( + { + "label": str(s["value"]), + "value": s["pct"], + "count": s["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 错误趋势 + if result["error_trend"] and len(result["error_trend"]) > 1: + sections.append("### 非 200 请求趋势") + sections.append("") + sections.append( + render_sparkline( + result["error_trend"], + value_field="count", + title="Error Count", + y_label="req", + ) + ) + sections.append("") + + return "\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py new file mode 100644 index 00000000000..d2d7ca77acb --- /dev/null +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -0,0 +1,421 @@ +#!/usr/bin/env python3 +""" +Health Analyzer — Worker 健康时间线分析 + +追踪 Worker 上下线事件、恢复检测、可用性统计。 +按 Worker URL 聚合事件,构建状态时间线。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_table, render_timeline +from log_parser import extract_ts, parse_http_line, parse_ts + +# ════════════════════════════════════════════════════════════════ +# 健康事件解析 +# ════════════════════════════════════════════════════════════════ + +NOT_HEALTHY_RE = re.compile(r"(http://\S+)\s+is not healthy") +REMOVED_RE = re.compile(r"Removed unhealthy \w+ instance:\s*(http://\S+)") +IS_HEALTHY_RE = re.compile(r"(http://\S+)\s+is healthy") +COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") +CLEANUP_UNHEALTHY_RE = re.compile(r"cleanup unhealthy.*?(http://\S+)") + + +def parse_health_event(line): + """解析 H2 健康事件行。返回 {ts, worker, event_type} 或 None。""" + ts = extract_ts(line) + m = REMOVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "REMOVED"} + m = NOT_HEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "NOT_HEALTHY"} + m = IS_HEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "HEALTHY"} + return None + + +def parse_counter_preserved(line): + """解析 H5 counter preserved / cleanup 事件。""" + ts = extract_ts(line) + m = COUNTER_PRESERVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "COUNTER_PRESERVED"} + m = CLEANUP_UNHEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "CLEANUP_UNHEALTHY"} + return None + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def 
analyze_health(log_file, tail=None): + """分析 Worker 健康状态。 + + Returns: + dict: {workers, diagnoses, time_range, summary} + """ + h2_lines = _grep_lines(log_file, r"Removed unhealthy|is not healthy|is healthy", tail) + h5_lines = _grep_lines(log_file, r"counter preserved|cleanup unhealthy", tail) + register_lines = _grep_lines(log_file, r"\[POST\] /register", tail) + + health_events = [e for line in h2_lines for e in [parse_health_event(line)] if e] + counter_events = [e for line in h5_lines for e in [parse_counter_preserved(line)] if e] + + register_events = [] + for line in register_lines: + r = parse_http_line(line) + if r and r["method"] == "POST" and r["path"] == "/register" and r["status"] == 200: + register_events.append({"ts": r["ts"], "client_ip": r["client_ip"]}) + + if not health_events and not register_events: + return { + "workers": {}, + "diagnoses": [], + "time_range": {"start": "", "end": ""}, + "summary": "未检测到 Worker 健康事件", + } + + workers = _build_worker_timelines(health_events, counter_events, register_events) + + all_ts = sorted([e["ts"] for e in health_events + register_events if e.get("ts")]) + time_range = {"start": all_ts[0] if all_ts else "", "end": all_ts[-1] if all_ts else ""} + + diagnoses = _diagnose(workers) + down_workers = sum(1 for w in workers.values() if w["down_count"] > 0) + + return { + "workers": workers, + "diagnoses": diagnoses, + "time_range": time_range, + "summary": f"{len(workers)} Worker(s), {down_workers} 有下线事件", + } + + +def _build_worker_timelines(health_events, counter_events, register_events): + """构建每个 Worker 的状态时间线。""" + worker_urls = {evt["worker"] for evt in health_events} + + # IP → worker URL 映射 + ip_to_urls = defaultdict(set) + for url in worker_urls: + ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + if ip_m: + ip_to_urls[ip_m.group(1)].add(url) + + worker_events = defaultdict(list) + for evt in health_events: + worker_events[evt["worker"]].append(evt) + + counter_counts = defaultdict(int) + for evt 
in counter_events: + if evt["event_type"] == "COUNTER_PRESERVED": + counter_counts[evt["worker"]] += 1 + + register_by_ip = defaultdict(list) + for evt in register_events: + register_by_ip[evt["client_ip"]].append(evt) + + workers = {} + for url in sorted(worker_urls): + events = sorted(worker_events[url], key=lambda e: e["ts"] or "") + ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + worker_ip = ip_m.group(1) if ip_m else "" + + # 恢复检测:REMOVED 后有 register + recovered = False + recovery_events = [] + for evt in events: + if evt["event_type"] == "REMOVED" and worker_ip: + for reg in register_by_ip.get(worker_ip, []): + if reg["ts"] and evt["ts"] and reg["ts"] > evt["ts"]: + recovered = True + recovery_events.append({"ts": reg["ts"], "type": "RE-REGISTERED"}) + break + + all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + all_events.extend(recovery_events) + all_events.sort(key=lambda e: e["ts"] or "") + + down_periods = _compute_down_periods(all_events) + down_count = len(down_periods) + avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + + workers[url] = { + "events": all_events, + "uptime_pct": _compute_uptime_pct(all_events), + "down_count": down_count, + "avg_down_duration_s": round(avg_down_s, 1), + "recovered": recovered, + "inflight_preserved": counter_counts.get(url, 0), + "down_periods": down_periods, + } + + return workers + + +def _compute_down_periods(events): + """从事件列表计算下线时段。""" + down_periods = [] + down_start = None + for evt in events: + if evt["type"] in ("NOT_HEALTHY", "REMOVED"): + if down_start is None and evt["ts"]: + down_start = evt["ts"] + elif evt["type"] in ("HEALTHY", "RE-REGISTERED"): + if down_start is not None and evt["ts"]: + try: + duration_s = (parse_ts(evt["ts"]) - parse_ts(down_start)).total_seconds() + down_periods.append({"start": down_start, "end": evt["ts"], "duration_s": max(0, duration_s)}) + except ValueError: + pass + down_start = None + if 
down_start is not None: + down_periods.append({"start": down_start, "end": None, "duration_s": 0}) + return down_periods + + +def _compute_uptime_pct(events): + """计算 Worker 可用性百分比。""" + if not events: + return 100.0 + ts_list = [e["ts"] for e in events if e["ts"]] + if len(ts_list) < 2: + return 0.0 if events[0]["type"] in ("NOT_HEALTHY", "REMOVED") else 100.0 + try: + first_dt, last_dt = parse_ts(ts_list[0]), parse_ts(ts_list[-1]) + total_s = (last_dt - first_dt).total_seconds() + if total_s <= 0: + return 100.0 + except ValueError: + return 100.0 + + down_s, down_start = 0.0, None + for evt in events: + if evt["type"] in ("NOT_HEALTHY", "REMOVED") and down_start is None and evt["ts"]: + try: + down_start = parse_ts(evt["ts"]) + except ValueError: + pass + elif evt["type"] in ("HEALTHY", "RE-REGISTERED") and down_start is not None and evt["ts"]: + try: + down_s += (parse_ts(evt["ts"]) - down_start).total_seconds() + except ValueError: + pass + down_start = None + if down_start is not None: + down_s += (last_dt - down_start).total_seconds() + + return round(max(0, total_s - down_s) / total_s * 100, 1) + + +def _diagnose(workers): + """根据 Worker 健康数据生成诊断。""" + diagnoses = [] + if not workers: + return diagnoses + + all_down = all(w["events"] and w["events"][-1]["type"] in ("NOT_HEALTHY", "REMOVED") for w in workers.values()) + if all_down: + diagnoses.append( + { + "severity": "CRITICAL", + "message": f"所有 Worker ({len(workers)}) 当前均不可用", + "source_layer": "FD 后端", + } + ) + + for url, w in workers.items(): + s = url.replace("http://", "") + if w["down_count"] > 3: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{s} 下线 {w["down_count"]} 次,Worker 不稳定', + "source_layer": "FD 后端", + } + ) + for p in w.get("down_periods", []): + if p["duration_s"] > 300: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{s} 下线 {p["duration_s"]/60:.1f}min({p["start"]} ~ {p["end"] or "未恢复"})', + "source_layer": "FD 后端", + } + ) + if len(w["events"]) >= 3: + 
ts_list = [e["ts"] for e in w["events"] if e["ts"]] + if len(ts_list) >= 2: + try: + hours = (parse_ts(ts_list[-1]) - parse_ts(ts_list[0])).total_seconds() / 3600 + if hours > 0 and len(w["events"]) / hours > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{s} 状态变更频繁 ({len(w["events"])/hours:.1f} 次/小时)', + "source_layer": "FD 后端", + } + ) + except ValueError: + pass + if w["inflight_preserved"] > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{s} counter preserved {w["inflight_preserved"]} 次(下线时仍有 inflight 请求)', + "source_layer": "FD 后端", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_health_report(result): + """将分析结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_text) + summary_text: 总结部分(诊断 + 可用性表格 + 时间线) + detail_text: 事件详情(逐条事件记录,可能很长) + """ + sections = ["## Worker 健康分析", ""] + if not result["workers"]: + sections.append(" 未检测到 Worker 健康事件(所有 Worker 状态正常或无健康日志)") + return "\n".join(sections), "" + + sections.append(f' {result["summary"]}') + if result["time_range"]["start"]: + sections.append(f' 时间范围: {result["time_range"]["start"]} ~ {result["time_range"]["end"]}') + sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append("") + + # Worker 可用性表格 + sections.append("### Worker 可用性") + sections.append("") + table_data = [] + for url, w in sorted(result["workers"].items()): + avg_down = "" + if w["avg_down_duration_s"] > 0: + avg_down = ( + f'{w["avg_down_duration_s"]/60:.1f}min' + if w["avg_down_duration_s"] >= 60 + else f'{w["avg_down_duration_s"]:.0f}s' + ) + table_data.append( + { + "Worker": url.replace("http://", ""), + "在线率": f'{w["uptime_pct"]}%', + "下线次数": str(w["down_count"]), + "平均下线时长": avg_down or 
"-", + "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), + "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], + right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + ) + ) + sections.append("") + + # 时间线 + if result["time_range"]["start"] and result["time_range"]["end"]: + sections.append("### Worker 时间线") + sections.append("") + timeline_data = _build_timeline_data(result) + if timeline_data: + sections.append(render_timeline(timeline_data, width=40)) + sections.append("") + + # 事件详情 → 拆分到 detail_text + detail_parts = ["# Worker 健康事件详情", ""] + has_events = False + for url, w in sorted(result["workers"].items()): + if w["events"]: + has_events = True + detail_parts.append(f'## {url.replace("http://", "")}') + detail_parts.append("") + for evt in w["events"]: + detail_parts.append(f' [{evt["ts"]}] {evt["type"]}') + detail_parts.append("") + + detail_text = "\n".join(detail_parts) if has_events else "" + + # 主报告中添加引用 + if has_events: + sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("") + + return "\n".join(sections), detail_text + + +def _build_timeline_data(result): + """构建 render_timeline 需要的数据格式。""" + tr = result["time_range"] + if not tr["start"] or not tr["end"]: + return None + workers_data = {} + for url, w in result["workers"].items(): + periods = [] + status, start = "up", tr["start"] + for evt in w["events"]: + if not evt["ts"]: + continue + if evt["type"] in ("NOT_HEALTHY", "REMOVED") and status == "up": + periods.append({"from": start, "to": evt["ts"], "status": "up"}) + status, start = "down", evt["ts"] + elif evt["type"] in ("HEALTHY", "RE-REGISTERED") and status == "down": + periods.append({"from": start, "to": evt["ts"], "status": "down"}) + status, start = "up", evt["ts"] + periods.append({"from": start, "to": tr["end"], 
"status": status}) + workers_data[url] = periods + return {"start": tr["start"], "end": tr["end"], "workers": workers_data} + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py new file mode 100644 index 00000000000..eec862910e8 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -0,0 +1,355 @@ +#!/usr/bin/env python3 +""" +Latency Analyzer — 延迟分析 + +分析 Router 日志中的请求延迟百分位数、延迟分布、吞吐量趋势、调度耗时、慢请求。 +仅统计推理请求路径(/v1/chat/completions, /v1/completions)。 +""" + +import os +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_sparkline, render_table +from log_parser import TS_MS_RE, extract_tags, parse_http_line +from stats import compute_statistics, time_bucket + +# ════════════════════════════════════════════════════════════════ +# 调度耗时解析 +# ════════════════════════════════════════════════════════════════ + + +def _parse_scheduling_ms(ts_ms_lines): + """从 ts_ms 行计算调度耗时(同一请求两个 ts_ms 之间的差值)。 + + 同一 request_id 的两条 ts_ms 行之间的时间差即为调度耗时。 + 
返回 ms 列表。 + """ + from datetime import datetime + + # 按 request_id 分组 + by_reqid = defaultdict(list) + for line in ts_ms_lines: + m = TS_MS_RE.search(line) + if not m: + continue + ts_ms_str = m.group(1) + tags = extract_tags(line) + rid = tags.get("request_id", "") + if rid: + try: + dt = datetime.strptime(ts_ms_str, "%Y-%m-%d %H:%M:%S.%f") + by_reqid[rid].append(dt) + except ValueError: + pass + + # 计算每个 request_id 的 max - min 差值 + durations = [] + for rid, timestamps in by_reqid.items(): + if len(timestamps) >= 2: + timestamps.sort() + delta_ms = (timestamps[-1] - timestamps[0]).total_seconds() * 1000 + durations.append(round(delta_ms, 3)) + + return durations + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + +LATENCY_DIST_SPEC = "<100,100-500,500-1000,1000-5000,5000-10000,>10000" + + +def analyze_latency(log_file, tail=None): + """分析日志中的请求延迟。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制 + + Returns: + dict: { + stats: {count, p50, p90, p95, p99, max, mean, stddev, distribution}, + latency_trend: [{bucket, latency_ms_p50}], + throughput_trend: [{bucket, count}], + slow_top10: [{ts, path, status, latency_ms, client_ip}], + scheduling_stats: {p50, p90, p99} | None, + diagnoses: [{message, severity}], + } + """ + # Phase 1: Grep 提取 + http_lines = _grep_lines(log_file, r"\[(POST|GET)\] /", tail) + ts_ms_lines = _grep_lines(log_file, "ts_ms=", tail) + + # Phase 2: 解析 HTTP 行(仅推理路径) + http_records = [] + for line in http_lines: + r = parse_http_line(line, inference_only=True) + if r: + http_records.append(r) + + # Phase 3: 分析 + + # 3.1 延迟统计 + latency_values = [r["latency_ms"] for r in http_records] + stats = compute_statistics( + latency_values, + percentiles_list=[50, 90, 95, 99], + distribution_spec=LATENCY_DIST_SPEC, + ) + + # 3.2 延迟趋势 (p50) + latency_trend = time_bucket( + http_records, + window="auto", + agg_specs=[("latency_ms", "p50")], + ) + + # 3.3 吞吐量趋势 + 
throughput_trend = time_bucket(http_records, window="auto") + + # 3.4 慢请求 Top 10 + sorted_by_latency = sorted(http_records, key=lambda r: -r["latency_ms"]) + slow_top10 = [] + for r in sorted_by_latency[:10]: + slow_top10.append( + { + "ts": r["ts"], + "path": r["path"], + "status": r["status"], + "latency_ms": r["latency_ms"], + "client_ip": r["client_ip"], + } + ) + + # 3.5 调度耗时 + scheduling_stats = None + if ts_ms_lines: + sched_durations = _parse_scheduling_ms(ts_ms_lines) + if sched_durations: + sched_raw = compute_statistics(sched_durations, percentiles_list=[50, 90, 99]) + scheduling_stats = { + "p50": sched_raw["p50"], + "p90": sched_raw["p90"], + "p99": sched_raw["p99"], + "count": sched_raw["count"], + } + + # 3.6 诊断规则 + diagnoses = _run_diagnostics(stats, scheduling_stats) + + return { + "stats": stats, + "latency_trend": latency_trend, + "throughput_trend": throughput_trend, + "slow_top10": slow_top10, + "scheduling_stats": scheduling_stats, + "diagnoses": diagnoses, + } + + +def _run_diagnostics(stats, scheduling_stats): + """应用诊断规则。""" + diagnoses = [] + + if stats["count"] == 0: + diagnoses.append({"message": "未找到推理请求", "severity": "INFO"}) + return diagnoses + + p99 = stats.get("p99", 0) + p50 = stats.get("p50", 0) + + # p99 > 10s + if p99 > 10000: + if scheduling_stats and scheduling_stats["p99"] < 100: + diagnoses.append( + { + "message": f'p99={p99:.0f}ms 但调度仅 {scheduling_stats["p99"]:.0f}ms → 延迟在后端推理层', + "severity": "HIGH", + } + ) + elif scheduling_stats and scheduling_stats["p99"] >= 100: + diagnoses.append( + { + "message": f'p99={p99:.0f}ms 且调度 p99={scheduling_stats["p99"]:.0f}ms → 调度层瓶颈', + "severity": "CRITICAL", + } + ) + else: + diagnoses.append( + { + "message": f"p99={p99:.0f}ms (>10s),后端推理延迟高", + "severity": "HIGH", + } + ) + + # 尾延迟 + if p50 > 0 and p99 / p50 > 10: + diagnoses.append( + { + "message": f"p99/p50={p99/p50:.1f}x → 尾延迟严重", + "severity": "MEDIUM", + } + ) + + if not diagnoses: + diagnoses.append( + { + "message": f"延迟正常 
(p50={p50:.0f}ms, p99={p99:.0f}ms)", + "severity": "INFO", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_latency_report(result): + """将分析结果格式化为终端报告。""" + sections = [] + stats = result["stats"] + + sections.append("## 延迟分析") + sections.append("") + + if stats["count"] == 0: + sections.append(" 未找到推理请求 (/v1/chat/completions, /v1/completions)") + return "\n".join(sections) + + # 百分位数概览 + sections.append( + f' 推理请求: {stats["count"]} | ' + f'p50={_fmt_ms(stats["p50"])} p90={_fmt_ms(stats["p90"])} ' + f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' + f'max={_fmt_ms(stats["max"])}' + ) + sections.append("") + + # 延迟分布 + if stats.get("distribution"): + sections.append("### 延迟分布") + sections.append("") + bar_data = [] + for d in stats["distribution"]: + bar_data.append( + { + "label": d["range"], + "value": d["pct"], + "count": d["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 延迟趋势 + if result["latency_trend"] and len(result["latency_trend"]) > 1: + sections.append("### 延迟趋势 (p50)") + sections.append("") + sections.append( + 
render_sparkline( + result["latency_trend"], + value_field="latency_ms_p50", + title="p50 Latency", + y_label="ms", + ) + ) + sections.append("") + + # 吞吐量趋势 + if result["throughput_trend"] and len(result["throughput_trend"]) > 1: + sections.append("### 吞吐量趋势") + sections.append("") + sections.append( + render_sparkline( + result["throughput_trend"], + value_field="count", + title="Throughput", + y_label="req", + ) + ) + sections.append("") + + # 调度耗时 + if result["scheduling_stats"]: + ss = result["scheduling_stats"] + sections.append(f'### 调度耗时 ({ss["count"]} samples)') + sections.append(f' p50={_fmt_ms(ss["p50"])} p90={_fmt_ms(ss["p90"])} p99={_fmt_ms(ss["p99"])}') + sections.append("") + + # 慢请求 Top 10 + if result["slow_top10"]: + sections.append("### 慢请求 Top 10") + sections.append("") + table_data = [] + for r in result["slow_top10"]: + table_data.append( + { + "时间": r["ts"][-8:] if len(r["ts"]) > 8 else r["ts"], + "延迟": _fmt_ms(r["latency_ms"]), + "状态": str(r["status"]), + "路径": r["path"], + "Client": r["client_ip"], + } + ) + sections.append( + render_table( + table_data, + columns=["时间", "延迟", "状态", "路径", "Client"], + ) + ) + sections.append("") + + # 诊断 + if result["diagnoses"]: + sections.append("### 诊断") + for d in result["diagnoses"]: + severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} + mark = severity_mark.get(d["severity"], " ") + sections.append(f' [{mark}] {d["message"]}') + sections.append("") + + return "\n".join(sections) + + +def _fmt_ms(ms): + """格式化毫秒值为人类可读字符串。""" + if ms >= 60000: + return f"{ms/60000:.1f}min" + elif ms >= 1000: + return f"{ms/1000:.2f}s" + elif ms >= 1: + return f"{ms:.1f}ms" + else: + return f"{ms*1000:.0f}µs" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py new file mode 100644 index 00000000000..e712011d932 --- /dev/null +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +""" +Load Analyzer — 负载与计数器分析 + +分析 Worker 负载分布、计数器异常、请求堆积检测、token 计数器。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_sparkline, render_table +from log_parser import extract_ts, match_select_release, parse_stats_line +from stats import compute_statistics, time_bucket + +# ════════════════════════════════════════════════════════════════ +# Counter 异常检测正则 +# ════════════════════════════════════════════════════════════════ + +DOUBLE_RELEASE_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?double-release") +COUNTER_CLEANED_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?counter already cleaned up") +COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") +TOKEN_PRESERVED_RE = re.compile(r"token counter preserved.*?(http://\S+)") + +# Token 事件 +SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://\S+),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") + + +def parse_counter_anomaly(line): + """解析 H5 counter 异常行。""" + ts = extract_ts(line) + m = DOUBLE_RELEASE_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "double-release"} + m = COUNTER_CLEANED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "counter-cleaned-up"} + m = COUNTER_PRESERVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "counter-preserved"} + m = TOKEN_PRESERVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "anomaly_type": "token-preserved"} + return None + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# 
════════════════════════════════════════════════════════════════ + + +def analyze_load(log_file, tail=None): + """分析负载与计数器。 + + Returns: + dict: {load_stats, worker_load, load_trend, counter_anomalies, + select_release, token_stats, diagnoses, summary} + """ + h7_lines = _grep_lines(log_file, r"\[stats\]", tail) + h3_lines = _grep_lines(log_file, r"select worker|release worker|Failed to select", tail) + h5_lines = _grep_lines( + log_file, + r"counter preserved|cleanup unhealthy|removed counters|counter already|double-release|preserved counters", + tail, + ) + h11_lines = _grep_lines(log_file, r"release prefill tokens", tail) + + # 解析 stats 行 + stats_records = [r for line in h7_lines for r in [parse_stats_line(line)] if r] + + # 负载统计 + total_running_vals = [r["total_running"] for r in stats_records if "total_running" in r] + load_stats = compute_statistics(total_running_vals) if total_running_vals else {} + + # Per-Worker 负载分布 + worker_running = defaultdict(list) + for r in stats_records: + for w_url, running in r.get("workers", {}).items(): + worker_running[w_url].append(running) + + worker_load = [] + for w_url in sorted(worker_running.keys()): + vals = worker_running[w_url] + avg = sum(vals) / len(vals) if vals else 0 + worker_load.append( + { + "worker": w_url.replace("http://", ""), + "avg_running": round(avg, 1), + "max_running": max(vals) if vals else 0, + "samples": len(vals), + } + ) + + # 负载趋势 + load_trend = ( + time_bucket(stats_records, window="auto", agg_specs=[("total_running", "mean")]) if stats_records else [] + ) + + # Counter 异常 + counter_anomalies = defaultdict(lambda: defaultdict(int)) + for line in h5_lines: + evt = parse_counter_anomaly(line) + if evt: + counter_anomalies[evt["anomaly_type"]][evt["worker"]] += 1 + + anomaly_summary = [] + for atype, workers in counter_anomalies.items(): + total = sum(workers.values()) + anomaly_summary.append( + { + "type": atype, + "total": total, + "workers": dict(workers), + } + ) + + # Select/Release 匹配 + 
sr_result = ( + match_select_release(h3_lines) + if h3_lines + else {"matched": [], "unmatched_selects": [], "failed_selects": [], "per_worker": {}} + ) + + # Token 统计 + token_stats = _analyze_tokens(h3_lines, h11_lines) + + # 请求堆积检测 + pileup = _detect_pileup(stats_records) + + # 诊断 + diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup) + + return { + "load_stats": load_stats, + "worker_load": worker_load, + "load_trend": load_trend, + "counter_anomalies": anomaly_summary, + "select_release": sr_result, + "token_stats": token_stats, + "pileup_detected": pileup, + "diagnoses": diagnoses, + "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", + } + + +def _analyze_tokens(h3_lines, h11_lines): + """分析 token 分配与释放。""" + token_alloc = defaultdict(list) + token_release = defaultdict(list) + + for line in h3_lines: + m = SELECT_TOKENS_RE.search(line) + if m: + token_alloc[m.group(1)].append(int(m.group(2))) + + for line in h11_lines: + m = RELEASE_TOKENS_RE.search(line) + if m: + token_release[m.group(1)].append(int(m.group(2))) + + result = [] + all_workers = set(token_alloc.keys()) | set(token_release.keys()) + for w in sorted(all_workers): + allocs = token_alloc.get(w, []) + releases = token_release.get(w, []) + result.append( + { + "worker": w.replace("http://", ""), + "alloc_count": len(allocs), + "alloc_avg": round(sum(allocs) / len(allocs), 0) if allocs else 0, + "release_count": len(releases), + } + ) + return result + + +def _detect_pileup(stats_records): + """检测请求堆积:total_running 连续上升 >5 个采样点。""" + if len(stats_records) < 5: + return False + vals = [r.get("total_running", 0) for r in stats_records] + max_consecutive = 0 + current = 0 + for i in range(1, len(vals)): + if vals[i] > vals[i - 1]: + current += 1 + max_consecutive = max(max_consecutive, current) + else: + current = 0 + return max_consecutive >= 5 + + +def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): + """生成负载诊断。""" + 
diagnoses = [] + + if pileup: + diagnoses.append( + {"severity": "HIGH", "message": "total_running 持续上升,疑似请求堆积", "source_layer": "FD 后端"} + ) + + # 空闲 Worker + for w in worker_load: + if w["avg_running"] == 0 and w["samples"] > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{w["worker"]} running 持续 =0(空闲或故障未移除)', + "source_layer": "Router", + } + ) + + # 负载严重不均 + if load_stats.get("stddev", 0) > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'负载标准差 {load_stats["stddev"]},分布不均衡', + "source_layer": "Router", + } + ) + + # Counter 异常 + for a in anomaly_summary: + if a["type"] == "double-release" and a["total"] > 0: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'double-release {a["total"]} 次(计数器逻辑 bug)', + "source_layer": "Router", + } + ) + + # Select/Release 不一致 + for w_url, pw in sr_result.get("per_worker", {}).items(): + if pw.get("delta", 0) > 0: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{w_url.replace("http://","")} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "source_layer": "FD 后端", + } + ) + + # 卡住的请求 + if sr_result.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'{len(sr_result["unmatched_selects"])} 个 select 无对应 release(疑似卡住)', + "source_layer": "FD 后端", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_load_report(result): + """将分析结果格式化为终端报告。""" + sections = ["## 负载与计数器分析", ""] + sections.append(f' {result["summary"]}') + sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append("") + + # 负载概览 + ls = result.get("load_stats", {}) + if ls: + sections.append("### 负载概览 (total_running)") + sections.append("") + sections.append( + f' 
mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' + f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' + ) + sections.append("") + + # Per-Worker 负载 + if result["worker_load"]: + sections.append("### Per-Worker 负载") + sections.append("") + bar_data = [ + {"label": w["worker"][:25], "value": min(100, w["avg_running"] * 5), "count": w["avg_running"]} + for w in result["worker_load"] + ] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 负载趋势 + if result["load_trend"] and len(result["load_trend"]) > 1: + sections.append("### 负载趋势") + sections.append("") + sections.append( + render_sparkline( + result["load_trend"], value_field="total_running_mean", title="Total Running", y_label="req" + ) + ) + sections.append("") + + # Counter 异常 + if result["counter_anomalies"]: + sections.append("### 计数器异常") + sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{w.replace("http://","")}({c})' for w, c in a["workers"].items()) + sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') + sections.append("") + + # Select/Release 匹配 + sr = result.get("select_release", {}) + if sr.get("per_worker"): + sections.append("### Select/Release 匹配") + sections.append("") + table_data = [] + for w_url, pw in sorted(sr["per_worker"].items()): + table_data.append( + { + "Worker": w_url.replace("http://", ""), + "Select": str(pw["selects"]), + "Release": str(pw["releases"]), + "Delta": str(pw["delta"]), + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "Select", "Release", "Delta"], + right_align={"Select", "Release", "Delta"}, + ) + ) + sections.append("") + + if sr.get("unmatched_selects"): + sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') + for u in sr["unmatched_selects"][:5]: + sections.append(f' [{u.get("select_ts","")}] {u["worker"].replace("http://","")} ({u["type"]})') + sections.append("") + + # Token 统计 + if 
result.get("token_stats"): + sections.append("### Token 计数器") + sections.append("") + sections.append( + render_table( + result["token_stats"], + columns=["worker", "alloc_count", "alloc_avg", "release_count"], + right_align={"alloc_count", "alloc_avg", "release_count"}, + ) + ) + sections.append("") + + return "\n".join(sections) + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py new file mode 100644 index 00000000000..45a5056616e --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +""" +Trace Analyzer — 请求追踪 + +通过 session_id / trace_id / request_id / req_id 追踪单个或多个请求的 +完整生命周期,重建事件链,检测不完整生命周期。 +""" + +import os +import re +import subprocess +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from log_parser import ( + extract_tags, + extract_ts, + parse_cache_strategy_line, + parse_http_line, +) + +# ════════════════════════════════════════════════════════════════ +# 事件识别正则 +# ════════════════════════════════════════════════════════════════ + +PARSING_COMPLETE_RE = re.compile(r"Parsing completed.*worker 
selection") +SELECT_WORKER_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://\S+)") +RELEASE_WORKER_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://\S+)") +RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +REQUEST_COMPLETE_RE = re.compile(r"Request completed successfully") +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + +# Prefill 事件 +PREFILL_FIRST_CHUNK_RE = re.compile(r"\[prefill\] first chunk received.*?(http://\S+)") +PREFILL_DONE_RE = re.compile(r"\[prefill\] non-stream prefill response done.*?(http://\S+)") +PREFILL_ERROR_RE = re.compile(r"\[prefill\] (scanner error|copy error).*?(http://\S+)") +PREFILL_DEFER_RE = re.compile(r"\[prefill\] release in defer.*?(http://\S+)") +PREFILL_ERR_PATH_RE = re.compile(r"\[prefill\] release in CommonCompletions defer \(error path\).*?(http://\S+)") +FAILED_SELECT_RE = re.compile(r"Failed to select") + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_trace(log_file, trace_ids, tail=None): + """追踪指定 ID 的请求生命周期。 + + Args: + log_file: 日志文件路径 + trace_ids: ID 列表(逗号分隔的字符串或列表) + tail: 尾部行数限制 + + Returns: + dict: {traces: {id: {events, lifecycle_complete, diagnoses}}, summary} + """ + if isinstance(trace_ids, str): + trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] + + if not trace_ids: + return {"traces": {}, "summary": "未指定追踪 ID"} + + traces = {} + for tid in trace_ids: + # Grep 搜索四种 context tag + pattern = f"session_id:{tid}|trace_id:{tid}|request_id:{tid}|req_id:{tid}" + matching_lines = _grep_lines(log_file, pattern, tail) + + if not matching_lines: + traces[tid] = { + "events": [], + "lifecycle_complete": False, + "diagnoses": [{"severity": "INFO", "message": f"未找到 ID={tid} 的匹配行"}], + "matched_tag": None, + "related_ids": {}, + } + continue + + # 识别匹配到的 tag 类型,并展开 session 下所有 request_id + 
first_tags = extract_tags(matching_lines[0]) + is_session = tid in [first_tags.get("session_id", "")] + + # 如果是 session_id,收集所有关联的 request_id + related_request_ids = set() + if is_session: + for line in matching_lines: + tags = extract_tags(line) + rid = tags.get("request_id", "") + if rid: + related_request_ids.add(rid) + + # 为每个 request_id 额外搜索行 + extra_lines = [] + for rid in related_request_ids: + rid_lines = _grep_lines(log_file, f"request_id:{rid}", tail) + extra_lines.extend(rid_lines) + all_lines = list(set(matching_lines + extra_lines)) + else: + all_lines = matching_lines + + # 解析事件链 + events = _parse_event_chain(all_lines) + lifecycle_complete = _check_lifecycle_complete(events) + diagnoses = _diagnose_trace(events, lifecycle_complete) + + traces[tid] = { + "events": events, + "lifecycle_complete": lifecycle_complete, + "diagnoses": diagnoses, + "matched_tag": "session_id" if is_session else "request_id/trace_id", + "related_ids": { + "request_ids": sorted(related_request_ids) if is_session else [], + }, + } + + total_traced = len(traces) + complete = sum(1 for t in traces.values() if t["lifecycle_complete"]) + + return { + "traces": traces, + "summary": f"{total_traced} ID(s) 追踪, {complete} 生命周期完整", + } + + +def _parse_event_chain(lines): + """从匹配行重建事件链,按时间排序。""" + events = [] + + for line in lines: + ts = extract_ts(line) + tags = extract_tags(line) + + # HTTP 请求进入/完成 + http = parse_http_line(line) + if http: + events.append( + { + "ts": ts, + "type": "HTTP", + "tags": tags, + "method": http["method"], + "path": http["path"], + "status": http["status"], + "latency_ms": http["latency_ms"], + } + ) + continue + + # Parsing completed + if PARSING_COMPLETE_RE.search(line): + events.append({"ts": ts, "type": "PARSING_COMPLETE", "tags": tags}) + continue + + # Cache-aware strategy + strategy = parse_cache_strategy_line(line) + if strategy: + events.append( + { + "ts": ts, + "type": "CACHE_STRATEGY", + "tags": tags, + "strategy": strategy.get("strategy"), + 
"selected": strategy.get("selected", ""), + "selected_hitRatio": strategy.get("selected_hitRatio", 0), + } + ) + continue + + # Select worker + m = SELECT_WORKER_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "SELECT_WORKER", + "tags": tags, + "worker_type": m.group(1) or "unknown", + "worker": m.group(2), + } + ) + continue + + # Release worker + m = RELEASE_WORKER_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "RELEASE_WORKER", + "tags": tags, + "worker_type": m.group(1) or "unknown", + "worker": m.group(2), + } + ) + continue + + # Release tokens + m = RELEASE_TOKENS_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "RELEASE_TOKENS", + "tags": tags, + "worker": m.group(1), + "tokens": int(m.group(2)), + } + ) + continue + + # Prefill events + m = PREFILL_FIRST_CHUNK_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1)}) + continue + m = PREFILL_DONE_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_DONE", "tags": tags, "worker": m.group(1)}) + continue + m = PREFILL_ERROR_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2)}) + continue + m = PREFILL_DEFER_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_DEFER_RELEASE", "tags": tags, "worker": m.group(1)}) + continue + m = PREFILL_ERR_PATH_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1)}) + continue + + # Request completed + if REQUEST_COMPLETE_RE.search(line): + events.append({"ts": ts, "type": "REQUEST_COMPLETE", "tags": tags}) + continue + + # ts_ms + m = TS_MS_RE.search(line) + if m: + events.append({"ts": ts, "type": "TS_MS", "tags": tags, "ts_ms": m.group(1)}) + continue + + # Failed to select + if FAILED_SELECT_RE.search(line): + events.append({"ts": ts, "type": "FAILED_SELECT", "tags": tags}) + 
continue + + # 按时间排序 + events.sort(key=lambda e: e.get("ts") or "") + return events + + +def _check_lifecycle_complete(events): + """检查生命周期是否完整。""" + types = {e["type"] for e in events} + has_entry = "HTTP" in types or "PARSING_COMPLETE" in types + has_exit = "REQUEST_COMPLETE" in types or ( + "HTTP" in types and any(e["type"] == "HTTP" and e.get("status") for e in events) + ) + has_select = "SELECT_WORKER" in types + has_release = "RELEASE_WORKER" in types + + return has_entry and has_exit and (not has_select or has_release) + + +def _diagnose_trace(events, lifecycle_complete): + """生成追踪诊断。""" + diagnoses = [] + types = [e["type"] for e in events] + + if not lifecycle_complete: + if "SELECT_WORKER" in types and "RELEASE_WORKER" not in types: + diagnoses.append({"severity": "HIGH", "message": "有 select 但无 release — 疑似请求卡住"}) + elif "HTTP" not in types and "PARSING_COMPLETE" not in types: + diagnoses.append({"severity": "MEDIUM", "message": "未找到 HTTP 入口事件"}) + elif "REQUEST_COMPLETE" not in types: + diagnoses.append({"severity": "MEDIUM", "message": "未检测到请求完成事件 — 疑似异常退出"}) + + if "PREFILL_ERROR" in types: + for e in events: + if e["type"] == "PREFILL_ERROR": + diagnoses.append( + {"severity": "HIGH", "message": f'Prefill 错误: {e.get("error","")} @ {e.get("worker","")}'} + ) + + if "FAILED_SELECT" in types: + diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_trace_report(result): + """将追踪结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_dict) + summary_text: 总结部分(概览 + 诊断 + 生命周期状态) + detail_dict: {trace_id: event_chain_text} 各 ID 的完整事件链 + """ + sections = ["## 请求追踪", ""] + sections.append(f' {result["summary"]}') + sections.append("") + + detail_dict = {} + + for tid, trace in result["traces"].items(): + sections.append(f"### ID: {tid}") + 
if trace.get("matched_tag"): + sections.append(f' 匹配类型: {trace["matched_tag"]}') + if trace.get("related_ids", {}).get("request_ids"): + sections.append(f' 关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') + + status = "完整" if trace["lifecycle_complete"] else "不完整" + sections.append(f" 生命周期: {status}") + sections.append("") + + # 诊断 + if trace["diagnoses"]: + for d in trace["diagnoses"]: + sections.append(f' [{d["severity"]}] {d["message"]}') + sections.append("") + + # 事件链 → 拆分到 detail_dict + if trace["events"]: + detail_lines = [f"# 请求追踪事件链: {tid}", ""] + if trace.get("matched_tag"): + detail_lines.append(f'匹配类型: {trace["matched_tag"]}') + if trace.get("related_ids", {}).get("request_ids"): + detail_lines.append(f'关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') + detail_lines.append(f"生命周期: {status}") + detail_lines.append("") + detail_lines.append("## 事件链") + detail_lines.append("") + for evt in trace["events"]: + line = f' [{evt.get("ts","")}] {evt["type"]}' + if evt.get("worker"): + line += f' → {evt["worker"].replace("http://","")}' + if evt.get("status"): + line += f' [{evt["status"]}]' + if evt.get("latency_ms"): + line += f' {evt["latency_ms"]}ms' + if evt.get("strategy"): + line += f' strategy={evt["strategy"]}' + if evt.get("selected_hitRatio"): + line += f' hitRatio={evt["selected_hitRatio"]}' + if evt.get("tokens"): + line += f' tokens={evt["tokens"]}' + if evt.get("error"): + line += f' error={evt["error"]}' + detail_lines.append(line) + detail_lines.append("") + detail_dict[tid] = "\n".join(detail_lines) + + # 主报告中添加引用和摘要 + safe_tid = tid.replace("/", "_") + sections.append(f' 事件数: {len(trace["events"])}') + sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append("") + + return "\n".join(sections), detail_dict + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + 
+def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py new file mode 100644 index 00000000000..83bb0203432 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python3 +""" +Chart — 终端可视化渲染工具 + +提供 sparkline 折线图、Unicode 柱状图、Markdown 表格、Worker 时间线的渲染函数。 +所有函数返回字符串(不直接打印),方便组装到报告中。 + +Python 3 stdlib only,零依赖。 +""" + +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# Sparkline 折线图 +# ════════════════════════════════════════════════════════════════ + +BLOCK_CHARS = " ▁▂▃▄▅▆▇█" + + +def render_sparkline( + records, value_field="value", bucket_field="bucket", title=None, y_label=None, y_range=None, width=60 +): + """渲染 8 级 Unicode sparkline 折线图。 + + Args: + records: dict 列表,每个 dict 包含 bucket_field 和 value_field + value_field: 数值字段名 + bucket_field: 时间桶字段名 + title: 图表标题 + y_label: Y 轴标签(如 '%') + y_range: Y 轴范围 (min, max) 元组,None 则自动 + width: 图表宽度(字符数) + + Returns: + str: 渲染后的图表文本 + """ + if not records: + return " (no data)" + + all_values = [] + for r in records: + v = r.get(value_field) + if v is not None: + all_values.append(float(v)) + + if not all_values: + return " (no numeric data)" + + # Y 轴范围 + if y_range: + y_min, y_max = y_range + else: + y_min = min(all_values) + y_max = max(all_values) 
+ if y_max == y_min: + y_min = 0 if y_max > 0 else y_max - 1 + y_max = max(y_max, 1) + + y_span = y_max - y_min if y_max != y_min else 1 + + # 降采样 + n = len(records) + if n > width: + step = n / width + sampled = [] + for i in range(width): + start_idx = int(i * step) + end_idx = int((i + 1) * step) + chunk = records[start_idx:end_idx] + vals = [float(r.get(value_field, 0)) for r in chunk if r.get(value_field) is not None] + avg_record = { + bucket_field: chunk[0].get(bucket_field, ""), + value_field: sum(vals) / len(vals) if vals else 0, + } + sampled.append(avg_record) + records = sampled + + lines = [] + + # 标题行 + def fmt_val(v): + if abs(v) >= 1000: + return f"{v:.0f}" + elif abs(v) >= 10: + return f"{v:.1f}" + return f"{v:.2f}" + + header_parts = [] + if title: + header_parts.append(title) + header_parts.append(f"min={fmt_val(min(all_values))}") + header_parts.append(f"max={fmt_val(max(all_values))}") + if y_label: + header_parts.append(f"({y_label})") + lines.append(" " + " ".join(header_parts)) + + # Sparkline 字符 + spark_chars = [] + for r in records: + v = r.get(value_field) + if v is None: + spark_chars.append(" ") + continue + v = float(v) + normalized = (v - y_min) / y_span + level = max(0, min(8, round(normalized * 8))) + spark_chars.append(BLOCK_CHARS[level]) + lines.append(" " + "".join(spark_chars)) + + # X 轴标签 + data_width = len(records) + if data_width > 0: + + def short_bucket(r): + b = str(r.get(bucket_field, "")) + if " " in b: + b = b.split(" ")[-1] + return b[:5] if len(b) >= 5 else b + + lbl_width = 6 + max_labels = max(1, data_width // lbl_width) + n_records = len(records) + + if n_records <= 2: + indices = list(range(n_records)) + elif n_records <= max_labels: + indices = [0, n_records - 1] + else: + n_labels = min(5, max(2, max_labels)) + indices = [int(i * (n_records - 1) / (n_labels - 1)) for i in range(n_labels)] + + label_line = [" "] * (data_width + lbl_width + 2) + last_end = -1 + for idx in indices: + lbl = 
short_bucket(records[idx]) + pos = idx + if pos < last_end: + continue + for ci, c in enumerate(lbl): + p = pos + ci + if p < len(label_line): + label_line[p] = c + last_end = pos + len(lbl) + 1 + lines.append(" " + "".join(label_line).rstrip()) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Unicode 柱状图 +# ════════════════════════════════════════════════════════════════ + + +def render_bar(data, bar_width=20, show_count=False): + """渲染 Unicode 柱状图。 + + Args: + data: dict 列表,每个 dict 包含 label, value(百分比 0-100), 可选 count + bar_width: 柱状图宽度(字符数) + show_count: 是否显示绝对数量 + + Returns: + str: 渲染后的图表文本 + """ + if not data: + return " (no data)" + + max_label_len = max(len(str(d.get("label", ""))) for d in data) + max_label_len = max(max_label_len, 4) + + lines = [] + for d in data: + label = str(d.get("label", "")) + value = float(d.get("value", 0)) + count = d.get("count") + + filled = round(value / 100 * bar_width) if value > 0 else 0 + filled = max(1, filled) if value > 0 else 0 + filled = min(bar_width, filled) + empty = bar_width - filled + bar = "█" * filled + "░" * empty + + line = f" {label:<{max_label_len}} {bar} {value:>5.1f}%" + if show_count and count is not None: + line += f" (N={count})" + lines.append(line) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Markdown 表格 +# ════════════════════════════════════════════════════════════════ + + +def render_table(data, columns=None, right_align=None): + """渲染 Markdown 表格。 + + Args: + data: dict 列表 + columns: 列名列表,None 则用第一条记录的所有 key + right_align: 右对齐的列名集合 + + Returns: + str: 渲染后的表格文本 + """ + if not data: + return " (no data)" + + if columns is None: + columns = list(data[0].keys()) + if right_align is None: + right_align = set() + + # 计算列宽 + col_widths = {} + for col in columns: + col_widths[col] = len(col) + for row in data: + val = str(row.get(col, "")) + col_widths[col] = max(col_widths[col], len(val)) + + 
# 表头 + header_parts = [] + sep_parts = [] + for col in columns: + w = col_widths[col] + if col in right_align: + header_parts.append(f" {col:>{w}} ") + else: + header_parts.append(f" {col:<{w}} ") + sep_parts.append("-" * (w + 2)) + + lines = [] + lines.append("|" + "|".join(header_parts) + "|") + lines.append("|" + "|".join(sep_parts) + "|") + + # 数据行 + for row in data: + row_parts = [] + for col in columns: + val = str(row.get(col, "")) + w = col_widths[col] + if col in right_align: + row_parts.append(f" {val:>{w}} ") + else: + row_parts.append(f" {val:<{w}} ") + lines.append("|" + "|".join(row_parts) + "|") + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Worker 可用性时间线 +# ════════════════════════════════════════════════════════════════ + + +def render_timeline(data, width=40): + """渲染 Worker 可用性时间线。 + + Args: + data: dict,结构为: + { + 'start': 'YYYY/MM/DD HH:MM:SS', + 'end': 'YYYY/MM/DD HH:MM:SS', + 'workers': { + 'http://ip:port': [ + {'from': 'ts', 'to': 'ts', 'status': 'up'|'down'}, + ... + ], + ... 
+ } + } + width: 时间线宽度(字符数) + + Returns: + str: 渲染后的时间线文本 + """ + if not data: + return " (no data)" + + start_str = data.get("start", "") + end_str = data.get("end", "") + workers = data.get("workers", {}) + + if not workers or not start_str or not end_str: + return " (insufficient data)" + + # Parse time range + try: + if "/" in start_str: + fmt = "%Y/%m/%d %H:%M:%S" + else: + fmt = "%H:%M:%S" + t_start = datetime.strptime(start_str, fmt) + t_end = datetime.strptime(end_str, fmt) + except ValueError: + return f" (cannot parse time range: {start_str} ~ {end_str})" + + total_seconds = (t_end - t_start).total_seconds() + if total_seconds <= 0: + total_seconds = 1 + + lines = [] + + for worker_url, periods in workers.items(): + # Short label: just IP:PORT + label = worker_url.replace("http://", "") + bar = ["░"] * width + + for period in periods: + p_start_str = period.get("from", start_str) + p_end_str = period.get("to", end_str) + status = period.get("status", "up") + + try: + if "/" in p_start_str: + p_start = datetime.strptime(p_start_str, "%Y/%m/%d %H:%M:%S") + p_end = datetime.strptime(p_end_str, "%Y/%m/%d %H:%M:%S") + else: + p_start = datetime.strptime(p_start_str, "%H:%M:%S") + p_end = datetime.strptime(p_end_str, "%H:%M:%S") + except ValueError: + continue + + start_pos = int((p_start - t_start).total_seconds() / total_seconds * width) + end_pos = int((p_end - t_start).total_seconds() / total_seconds * width) + start_pos = max(0, min(width - 1, start_pos)) + end_pos = max(0, min(width, end_pos)) + + char = "█" if status == "up" else "░" + for i in range(start_pos, end_pos): + bar[i] = char + + up_count = bar.count("█") + uptime_pct = round(up_count / width * 100, 1) + + max_label_len = max(len(w.replace("http://", "")) for w in workers) + lines.append(f' {label:<{max_label_len}} {"".join(bar)} {uptime_pct}%') + + # Time axis + if lines: + max_label_len = max(len(w.replace("http://", "")) for w in workers) + padding = " " * (max_label_len + 4) + start_lbl = 
start_str.split(" ")[-1] if " " in start_str else start_str + end_lbl = end_str.split(" ")[-1] if " " in end_str else end_str + gap = width - len(start_lbl) - len(end_lbl) + lines.append(f'{padding}{start_lbl}{" " * max(1, gap)}{end_lbl}') + lines.append(f"{padding}█ = online ░ = offline") + + return "\n".join(lines) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py new file mode 100644 index 00000000000..2a90d39b632 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -0,0 +1,832 @@ +#!/usr/bin/env python3 +""" +Router Log Parser — FastDeploy Go Router 日志解析原语 + +支持两种调用方式: +1. 作为模块导入:from log_parser import parse_http_line, parse_cache_strategy_line, ... +2. 作为 CLI 工具:grep 'pattern' logfile | python3 log_parser.py parse-http + +Python 3 stdlib only,零依赖。 +""" + +import argparse +import json +import re +import sys +from collections import defaultdict +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# 通用解析原语 +# ════════════════════════════════════════════════════════════════ + +# Go time.Duration.String() parser: handles 1h2m3.456s, 500µs, 150.5ms, etc. +DURATION_RE = re.compile(r"(\d+(?:\.\d+)?)(h|m(?!s)|s|ms|[µu]s|ns)") + + +def parse_go_duration_ms(s): + """解析 Go time.Duration.String() 输出为毫秒。 + + Examples: '1.5s' -> 1500.0, '500µs' -> 0.5, '1m30s' -> 90000.0 + """ + total = 0.0 + for m in DURATION_RE.finditer(s): + val, unit = float(m.group(1)), m.group(2) + if unit == "h": + total += val * 3600000 + elif unit == "m": + total += val * 60000 + elif unit == "s": + total += val * 1000 + elif unit == "ms": + total += val + elif unit in ("µs", "us"): + total += val / 1000 + elif unit == "ns": + total += val / 1000000 + return total + + +def parse_go_map(s): + """解析 Go fmt.Sprintf('%v', map) 输出:map[key1:val1 key2:val2 ...] 
+ + 处理 URL 中冒号与 Go map key-value 分隔符的冲突(从最后一个冒号分割)。 + 空 map 'map[]' 返回空 dict。 + """ + inner_match = re.search(r"map\[(.*?)\]", s) + if not inner_match: + return {} + inner = inner_match.group(1).strip() + if not inner: + return {} + result = {} + for token in inner.split(): + idx = token.rfind(":") + if idx > 0: + key = token[:idx] + val_str = token[idx + 1 :] + try: + result[key] = int(val_str) if "." not in val_str else float(val_str) + except ValueError: + result[key] = val_str + return result + + +# 时间戳:YYYY/MM/DD HH:MM:SS +TS_RE = re.compile(r"(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})") + +# ts_ms:2025-01-15 18:25:33.123 +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + + +def extract_ts(line): + """从日志行提取 YYYY/MM/DD HH:MM:SS 时间戳。""" + m = TS_RE.search(line) + return m.group(1) if m else None + + +def parse_ts(ts_str): + """将 YYYY/MM/DD HH:MM:SS 时间戳解析为 datetime。""" + return datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + + +# ════════════════════════════════════════════════════════════════ +# 时间范围过滤 +# ════════════════════════════════════════════════════════════════ + +import os +import subprocess +import tempfile + +_FULL_DT_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})\s+(\d{1,2}):(\d{2})(?::(\d{2}))?$") +_DATE_ONLY_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$") +_SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$") +_TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$") + + +def _get_log_boundary_ts(log_file, which="first"): + """从日志文件首行或末行提取时间戳。""" + cmd = "head" if which == "first" else "tail" + try: + r = subprocess.run([cmd, "-1", log_file], capture_output=True, text=True, timeout=5) + return extract_ts(r.stdout) if r.returncode == 0 else None + except (subprocess.TimeoutExpired, FileNotFoundError): + return None + + +def complete_time_arg(time_str, log_file, is_end=False): + """解析灵活时间输入,补全缺失部分。 + + 支持格式: + 'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 
'YYYY/MM/DD', + 'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM' + + 补全规则: + - 缺年份:从日志首行取 + - 缺日期:从日志末行取 + - 缺时间:start→00:00:00, end→23:59:59 + + Returns: 'YYYY/MM/DD HH:MM:SS' 格式字符串 + """ + if time_str is None: + return None + time_str = time_str.strip() + + # Case 1: 完整日期时间 + m = _FULL_DT_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + h, mi = m.group(4).zfill(2), m.group(5) + s = (m.group(6) or "00").zfill(2) + return f"{y}/{mo}/{d} {h}:{mi}:{s}" + + # Case 2: 仅日期 YYYY/MM/DD + m = _DATE_ONLY_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + t = "23:59:59" if is_end else "00:00:00" + return f"{y}/{mo}/{d} {t}" + + # Case 3: 短日期 MM/DD 或 MM/DD HH:MM[:SS] + m = _SHORT_DATE_RE.match(time_str) + if m: + mo, d = m.group(1).zfill(2), m.group(2).zfill(2) + ts = _get_log_boundary_ts(log_file, "first") + year = ts[:4] if ts else "2026" + if m.group(3): # 有时间部分 + h, mi = m.group(3).zfill(2), m.group(4) + s = (m.group(5) or "00").zfill(2) + return f"{year}/{mo}/{d} {h}:{mi}:{s}" + t = "23:59:59" if is_end else "00:00:00" + return f"{year}/{mo}/{d} {t}" + + # Case 4: 仅时间 HH:MM[:SS] + m = _TIME_ONLY_RE.match(time_str) + if m: + h, mi = m.group(1).zfill(2), m.group(2) + s = (m.group(3) or "00").zfill(2) + ts = _get_log_boundary_ts(log_file, "last") + date_part = ts[:10] if ts else "2026/01/01" + return f"{date_part} {h}:{mi}:{s}" + + # Fallback: 原样返回 + return time_str + + +def filter_file_by_time_range(log_file, start_str=None, end_str=None): + """用 awk 按时间范围预过滤日志文件。 + + 时间戳 YYYY/MM/DD HH:MM:SS 天然字典序可比,直接用 awk 字符串比较。 + 无时间戳的行(如 panic 堆栈续行)保留。 + + Args: + log_file: 原日志文件路径 + start_str: 起始时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + end_str: 结束时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if not start_str and not end_str: + return (log_file, False) + + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, 
prefix="router_filtered_") + tmp.close() + + awk_script = r"""{ + ts = "" + if (match($0, /[0-9]{4}\/[0-9]{2}\/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/)) { + ts = substr($0, RSTART, RLENGTH) + } + if (ts == "") { print; next } + if ((start == "" || ts >= start) && (end == "" || ts <= end)) print + }""" + + cmd = ["awk", "-v", f'start={start_str or ""}', "-v", f'end={end_str or ""}', awk_script, log_file] + + try: + with open(tmp.name, "w") as outf: + result = subprocess.run(cmd, stdout=outf, stderr=subprocess.PIPE, text=True, timeout=120) + if result.returncode != 0: + os.unlink(tmp.name) + return (log_file, False) + except (subprocess.TimeoutExpired, OSError): + if os.path.exists(tmp.name): + os.unlink(tmp.name) + return (log_file, False) + + return (tmp.name, True) + + +# Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] +TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") + + +def extract_tags(line): + """从日志行提取 context tag。""" + return {m.group(1): m.group(2) for m in TAG_RE.finditer(line)} + + +# Log level +LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN)\]") + + +def extract_level(line): + """从日志行提取日志级别。""" + m = LEVEL_RE.search(line) + return m.group(1) if m else None + + +# ════════════════════════════════════════════════════════════════ +# HTTP 请求行解析(类别 H1) +# ════════════════════════════════════════════════════════════════ + +# H1 pattern: [METHOD] /path HTTP/1.1 STATUS LATENCY CLIENT_IP +HTTP_RE = re.compile( + r"\[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\]\s+" + r"(/\S*)\s+HTTP/\d\.\d\s+" + r"(\d{3})\s+" + r"(\S+)\s+" # latency (Go duration) + r"(\d+\.\d+\.\d+\.\d+)" # client IP +) + + +def parse_http_line(line, inference_only=False): + """解析 H1 HTTP 请求行。 + + 输入示例: + [INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 1.234567s 10.0.0.1 + + Args: + line: 日志行 + inference_only: True 则仅保留推理路径 (/v1/chat/completions, /v1/completions) + + 返回 dict 或 None。 + """ + ts = extract_ts(line) + m 
= HTTP_RE.search(line) + if not m: + return None + + method, path, status, latency_raw, client_ip = m.groups() + latency_ms = parse_go_duration_ms(latency_raw) + + if inference_only and path not in ("/v1/chat/completions", "/v1/completions"): + return None + + record = { + "ts": ts or "", + "method": method, + "path": path, + "status": int(status), + "latency_ms": round(latency_ms, 3), + "client_ip": client_ip, + } + + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# ════════════════════════════════════════════════════════════════ +# Cache-Aware 策略行解析(类别 H6) +# ════════════════════════════════════════════════════════════════ + +STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") +SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") + + +def parse_cache_strategy_line(line): + """解析 cache-aware prefill 策略行。 + + 返回 dict 或 None(如果不是策略行)。 + """ + sm = STRATEGY_RE.search(line) + if not sm: + return None + + ts = extract_ts(line) + strategy = sm.group(1) + record = {"ts": ts or "", "strategy": strategy} + + sel_m = SELECTED_RE.search(line) + if sel_m: + record["selected"] = sel_m.group(1) + + reason_m = REASON_RE.search(line) + if reason_m and strategy == "process_tokens": + record["reason"] = reason_m.group(1).strip() + + hr_match = re.search(r"hitRatios=(map\[.*?\])", line) + if hr_match: + hit_ratios = parse_go_map(hr_match.group(1)) + record["hitRatios"] = hit_ratios + if "selected" in record: + record["selected_hitRatio"] = hit_ratios.get(record["selected"], 0) + else: + record["hitRatios"] = {} + if "selected" in record: + record["selected_hitRatio"] = 0 + + loads_match = re.search(r"loads=(map\[.*?\])", line) + if loads_match: + record["loads"] = parse_go_map(loads_match.group(1)) + + ts_ms_m = TS_MS_RE.search(line) + if ts_ms_m: + record["ts_ms"] = ts_ms_m.group(1) + + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# 
════════════════════════════════════════════════════════════════ +# Stats 行解析(类别 H7) +# ════════════════════════════════════════════════════════════════ + +TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") +WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") + + +def parse_stats_line(line): + """解析 [stats] 统计行。 + + 注意:hits 和 total 是 per-interval 的(每 5s 重置),累计值必须 sum 所有行。 + + 返回 dict 或 None(如果不是 stats 行)。 + """ + if "[stats]" not in line: + return None + + ts = extract_ts(line) + record = {"ts": ts or ""} + + tr_m = TOTAL_RUNNING_RE.search(line) + if tr_m: + record["total_running"] = int(tr_m.group(1)) + + workers = {} + for wm in WORKER_RUNNING_RE.finditer(line): + workers[wm.group(1)] = int(wm.group(2)) + record["workers"] = workers + + chr_m = CACHE_HR_RE.search(line) + if chr_m: + record["cache_hit_rate"] = float(chr_m.group(1)) + record["hits"] = int(chr_m.group(2)) + record["total"] = int(chr_m.group(3)) + + return record + + +# ════════════════════════════════════════════════════════════════ +# 错误消息模板归一化 +# ════════════════════════════════════════════════════════════════ + +NORMALIZE_PATTERNS = [ + (re.compile(r"https?://[\w.:]+"), "{url}"), + (re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I), "{uuid}"), + (re.compile(r"\d+\.\d+\.\d+\.\d+:\d+"), "{ip:port}"), + (re.compile(r"\b\d+\b"), "{N}"), +] + +# Message extraction: everything after "logger.go:NN: " (and optional context tags) +MSG_RE = re.compile(r"logger\.go:\d+:\s*(?:\[[^\]]*\]\s*)*(.+)") + + +def normalize_message(msg): + """将错误消息中的变量替换为占位符模板。""" + for pat, repl in NORMALIZE_PATTERNS: + msg = pat.sub(repl, msg) + return msg + + +def parse_error_line(line): + """解析 ERROR/WARN 行并进行模板归一化。 + + 返回 dict: {ts, level, original, template, tags} + """ + ts = extract_ts(line) + level = extract_level(line) + tags = extract_tags(line) + + mm = MSG_RE.search(line) + original = 
# ════════════════════════════════════════════════════════════════
# Select/Release event matching
# ════════════════════════════════════════════════════════════════

SELECT_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)")
RELEASE_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)")
FAILED_SELECT_RE = re.compile(r"Failed to select")
SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://[^,\s]+),\s*tokens:\s*(\d+)")
RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://[^,\s]+),\s*tokens:\s*(\d+)")


def match_select_release(lines):
    """Pair "select worker" events with their "release worker" counterparts.

    Args:
        lines: list of raw log lines

    Returns:
        dict: {matched, unmatched_selects, failed_selects, per_worker}
    """
    selects, releases, failed_selects = [], [], []

    for idx, text in enumerate(lines, 1):
        when = extract_ts(text)
        ctx = extract_tags(text)

        # Token-bearing variants are checked first — they are strict
        # supersets of the generic patterns.
        token_sel = SELECT_TOKENS_RE.search(text)
        if token_sel:
            selects.append(
                {
                    "ts": when,
                    "worker": token_sel.group(1),
                    "type": "prefill",
                    "tags": ctx,
                    "tokens": int(token_sel.group(2)),
                    "line": idx,
                }
            )
            continue

        token_rel = RELEASE_TOKENS_RE.search(text)
        if token_rel:
            releases.append(
                {
                    "ts": when,
                    "worker": token_rel.group(1),
                    "type": "prefill_tokens",
                    "tags": ctx,
                    "tokens": int(token_rel.group(2)),
                    "line": idx,
                }
            )
            continue

        plain_sel = SELECT_RE.search(text)
        if plain_sel:
            selects.append(
                {
                    "ts": when,
                    "worker": plain_sel.group(2),
                    "type": plain_sel.group(1) or "unknown",
                    "tags": ctx,
                    "tokens": None,
                    "line": idx,
                }
            )
            continue

        plain_rel = RELEASE_RE.search(text)
        if plain_rel:
            releases.append(
                {
                    "ts": when,
                    "worker": plain_rel.group(2),
                    "type": plain_rel.group(1) or "unknown",
                    "tags": ctx,
                    "tokens": None,
                    "line": idx,
                }
            )
            continue

        if FAILED_SELECT_RE.search(text):
            failed_selects.append({"ts": when, "tags": ctx, "line": idx})

    # Pair by request_id, consuming each release at most once (in log order).
    matched, unmatched_selects = [], []
    consumed = set()
    by_reqid = defaultdict(list)
    for rel_idx, rel in enumerate(releases):
        rid = rel["tags"].get("request_id", "")
        if rid:
            by_reqid[rid].append(rel_idx)

    for sel in selects:
        rid = sel["tags"].get("request_id", "")
        partner = None
        for rel_idx in (by_reqid.get(rid, []) if rid else []):
            if rel_idx not in consumed:
                partner = rel_idx
                break
        if partner is not None:
            consumed.add(partner)
            matched.append(
                {
                    "request_id": rid,
                    "worker": sel["worker"],
                    "select_ts": sel["ts"],
                    "release_ts": releases[partner]["ts"],
                    "type": sel["type"],
                }
            )
        else:
            unmatched_selects.append(
                {
                    "worker": sel["worker"],
                    "select_ts": sel["ts"],
                    "type": sel["type"],
                    "tags": sel["tags"],
                    "note": "no matching release found",
                }
            )

    # Per-worker select/release tallies with a leak-indicating delta.
    tallies = defaultdict(lambda: [0, 0])
    for sel in selects:
        tallies[sel["worker"]][0] += 1
    for rel in releases:
        tallies[rel["worker"]][1] += 1

    per_worker = {
        worker: {"selects": s, "releases": r, "delta": s - r}
        for worker, (s, r) in tallies.items()
    }

    return {
        "matched": matched,
        "unmatched_selects": unmatched_selects,
        "failed_selects": failed_selects,
        "per_worker": per_worker,
    }


# ════════════════════════════════════════════════════════════════
# Unsupported-request detection
# ════════════════════════════════════════════════════════════════

# Whitelist of (method, path) routes the router knows about.
KNOWN_ROUTES = {
    ("POST", "/v1/chat/completions"),
    ("POST", "/v1/completions"),
    ("POST", "/register"),
    ("GET", "/registered_number"),
    ("GET", "/registered"),
    ("GET", "/health_generate"),
    ("GET", "/metrics"),
}
def find_unsupported_requests(lines):
    """Filter HTTP log lines whose (method, path) matches no known route.

    Returns:
        dict: {details: [...], summary: {total, unique_paths: {path: count}}}
    """
    details = []
    path_counts = defaultdict(int)

    for raw in lines:
        rec = parse_http_line(raw)
        if rec is None:
            continue
        if (rec["method"], rec["path"]) in KNOWN_ROUTES:
            continue
        details.append(
            {
                "ts": rec["ts"],
                "method": rec["method"],
                "path": rec["path"],
                "status": rec["status"],
                "client_ip": rec["client_ip"],
            }
        )
        path_counts[f"{rec['method']} {rec['path']}"] += 1

    return {
        "details": details,
        "summary": {"total": len(details), "unique_paths": dict(path_counts)},
    }


def _cli_unsupported_requests(args):
    """CLI: detect requests that match no known route."""
    lines = [ln.rstrip("\n") for ln in sys.stdin]
    outcome = find_unsupported_requests(lines)
    payload = outcome["summary"] if args.summary_only else outcome
    print(json.dumps(payload, ensure_ascii=False))


# ════════════════════════════════════════════════════════════════
# CLI entry points
# ════════════════════════════════════════════════════════════════


def _cli_parse_stream(parse_fn):
    """Generic streaming CLI: read log lines from stdin, emit JSON Lines on stdout."""
    ok = bad = 0
    for ln in sys.stdin:
        rec = parse_fn(ln.rstrip("\n"))
        if rec:
            print(json.dumps(rec, ensure_ascii=False))
            ok += 1
        else:
            bad += 1
    print(f"Parsed {ok} lines, skipped {bad}", file=sys.stderr)


def _cli_parse_http(args):
    """CLI: parse HTTP request lines."""
    ok = bad = 0
    for ln in sys.stdin:
        rec = parse_http_line(ln.rstrip("\n"), inference_only=args.inference_only)
        if rec:
            print(json.dumps(rec, ensure_ascii=False))
            ok += 1
        else:
            bad += 1
    print(f"Parsed {ok} lines, skipped {bad}", file=sys.stderr)


def _cli_normalize_errors(args):
    """CLI: normalize error messages into templates."""
    count = 0
    for ln in sys.stdin:
        print(json.dumps(parse_error_line(ln.rstrip("\n")), ensure_ascii=False))
        count += 1
    print(f"Normalized {count} lines", file=sys.stderr)


def _cli_match_select_release(args):
    """CLI: pair select/release worker events."""
    print(json.dumps(match_select_release([ln.rstrip("\n") for ln in sys.stdin]), ensure_ascii=False))


def _cli_self_test(args):
    """Run the built-in self tests."""
    n_pass = 0
    n_fail = 0

    def expect(name, got, want):
        nonlocal n_pass, n_fail
        if got == want:
            print(f" PASS: {name}")
            n_pass += 1
        else:
            print(f" FAIL: {name}")
            print(f" expected: {want}")
            print(f" got: {got}")
            n_fail += 1

    # Table-driven: Go duration parsing.
    duration_cases = [
        ("simple seconds", "1.5s", 1500.0),
        ("milliseconds", "150ms", 150.0),
        ("fractional ms", "150.5ms", 150.5),
        ("microseconds µs", "500µs", 0.5),
        ("microseconds us", "500us", 0.5),
        ("nanoseconds", "500ns", 0.0005),
        ("composite m+s", "1m30s", 90000.0),
        ("composite h+m+s", "1h2m3s", 3723000.0),
        ("composite h+m+fractional_s", "1h2m3.456s", 3723456.0),
        ("pure minutes", "2m", 120000.0),
        ("zero", "0s", 0.0),
        ("sub-ms decimal", "2.798235ms", 2.798235),
    ]
    print("=== Testing parse_go_duration_ms ===")
    for label, raw, want in duration_cases:
        expect(label, parse_go_duration_ms(raw), want)

    # Table-driven: Go map literal parsing.
    map_cases = [
        ("single entry", "map[http://10.0.0.1:9263:100]", {"http://10.0.0.1:9263": 100}),
        (
            "multi entry",
            "map[http://10.0.0.1:9263:100 http://10.0.0.2:9867:50]",
            {"http://10.0.0.1:9263": 100, "http://10.0.0.2:9867": 50},
        ),
        ("empty map", "map[]", {}),
        ("float values", "map[http://10.0.0.1:9263:0.85]", {"http://10.0.0.1:9263": 0.85}),
    ]
    print("\n=== Testing parse_go_map ===")
    for label, raw, want in map_cases:
        expect(label, parse_go_map(raw), want)

    print("\n=== Testing extract_ts ===")
    expect("standard", extract_ts("[INFO] 2025/01/15 18:25:33 logger.go:45: msg"), "2025/01/15 18:25:33")
    expect("no timestamp", extract_ts("no timestamp here"), None)

    print("\n=== Testing extract_tags ===")
    expect(
        "session+request",
        extract_tags("[session_id:abc] [request_id:def]"),
        {"session_id": "abc", "request_id": "def"},
    )
    expect(
        "all four",
        extract_tags("[trace_id:t1] [req_id:r1] [session_id:s1] [request_id:rq1]"),
        {"trace_id": "t1", "req_id": "r1", "session_id": "s1", "request_id": "rq1"},
    )
    expect("no tags", extract_tags("no tags here"), {})

    print("\n=== Testing parse_http_line ===")
    http_line = "[INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 2.798235ms 10.0.0.1"
    rec = parse_http_line(http_line)
    expect("http method", rec["method"], "POST")
    expect("http path", rec["path"], "/v1/chat/completions")
    expect("http status", rec["status"], 200)
    expect("http latency", rec["latency_ms"], 2.798)
    expect("http client_ip", rec["client_ip"], "10.0.0.1")

    filtered = parse_http_line(
        "[INFO] 2025/01/15 18:25:33 logger.go:45: [GET] /health HTTP/1.1 200 1ms 10.0.0.1", inference_only=True
    )
    expect("inference_only filters health", filtered, None)

    print("\n=== Testing normalize_message ===")
    expect("url", normalize_message("Failed to connect to http://10.0.0.1:9965"), "Failed to connect to {url}")
    expect("uuid", normalize_message("request abc12345-1234-5678-9012-abcdef123456 failed"), "request {uuid} failed")
    expect(
        "ip:port",
        normalize_message("dial tcp 10.0.0.1:9965: connection refused"),
        "dial tcp {ip:port}: connection refused",
    )

    print(f'\n{"=" * 40}')
    print(f"Results: {n_pass} passed, {n_fail} failed")
    if n_fail:
        sys.exit(1)
= sub.add_parser("parse-http", help="解析 HTTP 请求行 (H1) → JSON Lines") + p.add_argument("--inference-only", action="store_true", help="仅保留推理路径") + + sub.add_parser("parse-cache-strategy", help="解析 cache-aware 策略行 (H6) → JSON Lines") + sub.add_parser("parse-stats", help="解析 [stats] 统计行 (H7) → JSON Lines") + sub.add_parser("normalize-errors", help="ERROR/WARN 行模板归一化 → JSON Lines") + sub.add_parser("match-select-release", help="匹配 select/release worker 事件") + p = sub.add_parser("unsupported-requests", help="检测不匹配已知路由的请求") + p.add_argument("--summary-only", action="store_true", help="仅输出汇总(不含详细列表)") + sub.add_parser("self-test", help="运行内置测试") + + args = parser.parse_args() + + if args.command == "parse-http": + _cli_parse_http(args) + elif args.command == "parse-cache-strategy": + _cli_parse_stream(parse_cache_strategy_line) + elif args.command == "parse-stats": + _cli_parse_stream(parse_stats_line) + elif args.command == "normalize-errors": + _cli_normalize_errors(args) + elif args.command == "match-select-release": + _cli_match_select_release(args) + elif args.command == "unsupported-requests": + _cli_unsupported_requests(args) + elif args.command == "self-test": + _cli_self_test(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py new file mode 100644 index 00000000000..a197ee7aff0 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Stats — 通用统计计算工具 + +提供百分位数、分布、时间窗口聚合、分组计数等通用统计函数。 +不含任何业务逻辑或日志格式依赖。 + +Python 3 stdlib only,零依赖。 +""" + +import math +from collections import defaultdict +from datetime import datetime, timedelta + +# ════════════════════════════════════════════════════════════════ +# 百分位数与基础统计 +# ════════════════════════════════════════════════════════════════ + + +def 
percentile(sorted_vals, p): + """从已排序列表计算第 p 百分位数(线性插值)。""" + if not sorted_vals: + return 0.0 + n = len(sorted_vals) + k = (p / 100.0) * (n - 1) + f = math.floor(k) + c = math.ceil(k) + if f == c: + return sorted_vals[int(k)] + return sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f) + + +def compute_statistics(values, percentiles_list=None, distribution_spec=None): + """计算一组数值的统计量。 + + Args: + values: 数值列表 + percentiles_list: 要计算的百分位数列表,默认 [50, 90, 95, 99] + distribution_spec: 分布区间规格字符串,如 '0-20,20-40,40-60,60-80,80-100' + + Returns: + dict: {count, min, max, mean, sum, stddev, p50, p90, ..., distribution} + """ + if percentiles_list is None: + percentiles_list = [50, 90, 95, 99] + + if not values: + result = {"count": 0, "min": 0, "max": 0, "mean": 0, "sum": 0, "stddev": 0} + for p in percentiles_list: + result[f"p{p}"] = 0 + if distribution_spec is not None: + result["distribution"] = [] + return result + + sorted_vals = sorted(values) + n = len(sorted_vals) + total = sum(sorted_vals) + mean = total / n + variance = sum((x - mean) ** 2 for x in sorted_vals) / n + stddev = math.sqrt(variance) + + result = { + "count": n, + "min": round(sorted_vals[0], 3), + "max": round(sorted_vals[-1], 3), + "mean": round(mean, 3), + "sum": round(total, 3), + "stddev": round(stddev, 3), + } + + for p in percentiles_list: + result[f"p{p}"] = round(percentile(sorted_vals, p), 3) + + if distribution_spec is not None: + result["distribution"] = compute_distribution(sorted_vals, distribution_spec) + + return result + + +def compute_distribution(sorted_vals, spec_str): + """根据区间规格计算分布直方图。 + + spec_str 示例:'0-20,20-40,40-60,60-80,80-100' + 每个区间是左闭右开 [lo, hi)。 + """ + buckets = _parse_distribution_spec(spec_str) + n = len(sorted_vals) + result = [] + for b in buckets: + if b[0] == "lt": + count = sum(1 for v in sorted_vals if v < b[1]) + label = b[2] + elif b[0] == "gt": + count = sum(1 for v in sorted_vals if v > b[1]) + label = b[2] + elif b[0] == "range": + count = sum(1 for v in 
sorted_vals if b[1] <= v < b[2]) + label = b[3] + else: + continue + result.append({"range": label, "count": count, "pct": round(count / n * 100, 1) if n else 0}) + return result + + +def _parse_distribution_spec(spec_str): + """解析分布区间规格:'<100,100-500,>1000' → bucket 定义列表。""" + buckets = [] + for part in spec_str.split(","): + part = part.strip() + if part.startswith("<"): + buckets.append(("lt", float(part[1:]), part)) + elif part.startswith(">"): + buckets.append(("gt", float(part[1:]), part)) + elif "-" in part: + lo, hi = part.split("-", 1) + buckets.append(("range", float(lo), float(hi), part)) + return buckets + + +# ════════════════════════════════════════════════════════════════ +# 时间窗口聚合 +# ════════════════════════════════════════════════════════════════ + + +def time_bucket(records, window="auto", agg_specs=None, ts_field="ts"): + """按时间窗口聚合记录。 + + Args: + records: dict 列表,每个 dict 必须有 ts_field 字段 + window: 窗口大小 '5s'/'1m'/'5m'/'auto' + agg_specs: 聚合规格列表 [(field, func), ...],如 [('selected_hitRatio', 'mean')] + func 支持:count, sum, mean, min, max, pNN + ts_field: 时间戳字段名 + + Returns: + list[dict]: 每个窗口一条记录 {bucket, count, field_func, ...} + """ + if agg_specs is None: + agg_specs = [("_", "count")] + + if not records: + return [] + + window_td = _parse_window(window, records, ts_field) + + # 按窗口分组 + buckets = defaultdict(list) + for r in records: + ts_str = r.get(ts_field, "") + if not ts_str: + continue + try: + dt = datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + except ValueError: + continue + bucket_dt = _align_to_bucket(dt, window_td) + bucket_key = bucket_dt.strftime("%Y/%m/%d %H:%M:%S") + buckets[bucket_key].append(r) + + # 按时间排序并聚合 + result = [] + for bucket_key in sorted(buckets.keys()): + bucket_records = buckets[bucket_key] + entry = {"bucket": bucket_key, "count": len(bucket_records)} + + for field, func in agg_specs: + if field == "_": + if func == "count": + entry["count"] = len(bucket_records) + continue + + values = [] + for r in 
bucket_records: + v = r.get(field) + if v is not None: + try: + values.append(float(v)) + except (ValueError, TypeError): + pass + + out_key = f"{field}_{func}" + entry[out_key] = _aggregate_values(values, func) + + result.append(entry) + + return result + + +def _parse_window(window_str, records, ts_field): + """解析窗口字符串为 timedelta。'auto' 根据数据跨度自动选择。""" + if window_str == "auto": + timestamps = [] + for r in records: + ts_str = r.get(ts_field, "") + if ts_str: + try: + timestamps.append(datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S")) + except ValueError: + pass + if len(timestamps) < 2: + return timedelta(minutes=1) + span = max(timestamps) - min(timestamps) + if span < timedelta(minutes=30): + return timedelta(seconds=5) + elif span < timedelta(hours=3): + return timedelta(minutes=1) + else: + return timedelta(minutes=5) + elif window_str.endswith("s"): + return timedelta(seconds=int(window_str[:-1])) + elif window_str.endswith("m"): + return timedelta(minutes=int(window_str[:-1])) + elif window_str.endswith("h"): + return timedelta(hours=int(window_str[:-1])) + return timedelta(minutes=1) + + +def _align_to_bucket(dt, window_td): + """将 datetime 对齐到窗口边界。""" + secs = max(1, int(window_td.total_seconds())) + epoch = datetime(dt.year, dt.month, dt.day) + offset = int((dt - epoch).total_seconds()) + aligned = (offset // secs) * secs + return epoch + timedelta(seconds=aligned) + + +def _aggregate_values(values, func): + """用指定函数聚合一组数值。""" + if not values: + return 0 + if func == "count": + return len(values) + elif func == "sum": + return round(sum(values), 3) + elif func == "mean": + return round(sum(values) / len(values), 3) + elif func == "min": + return round(min(values), 3) + elif func == "max": + return round(max(values), 3) + elif func.startswith("p"): + p = int(func[1:]) + return round(percentile(sorted(values), p), 3) + return 0 + + +# ════════════════════════════════════════════════════════════════ +# 分组计数 +# 
════════════════════════════════════════════════════════════════ + + +def count_by(records, field, top_n=None): + """按指定字段分组计数。 + + Args: + records: dict 列表 + field: 分组字段名 + top_n: 只返回前 N 个(按计数降序) + + Returns: + list[dict]: [{value, count, pct}],按计数降序排列 + """ + counts = defaultdict(int) + total = 0 + for r in records: + val = r.get(field) + if val is not None: + counts[str(val)] += 1 + total += 1 + + result = [] + for val, count in sorted(counts.items(), key=lambda x: -x[1]): + result.append({"value": val, "count": count, "pct": round(count / total * 100, 1) if total else 0}) + + if top_n: + result = result[:top_n] + + return result diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py new file mode 100644 index 00000000000..4e64a2092b3 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -0,0 +1,334 @@ +#!/usr/bin/env python3 +""" +Troubleshoot — FastDeploy Go Router 综合问题排查主编排器 + +Usage: + python3 troubleshoot.py [options] + +Options: + --errors 仅分析错误日志 + --latency 仅分析延迟 + --health 仅分析 Worker 健康 + --cache 仅分析 Cache 调度 + --load 仅分析负载与计数器 + --trace ID 追踪指定请求(支持逗号分隔多 ID) + --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) + --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") + --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") + --output DIR 详细报告导出目录(默认: skill_output/troubleshoot//) + +支持维度:errors, latency, health, cache, load, trace +""" + +import argparse +import os +import sys +from datetime import datetime + +# 确保能 import 同级模块 +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from analyzers.cache import analyze_cache, format_cache_report +from analyzers.errors import analyze_errors, format_errors_report +from analyzers.health import analyze_health, format_health_report +from analyzers.latency import analyze_latency, format_latency_report +from analyzers.load import analyze_load, format_load_report +from 
def determine_log_file(user_path=None):
    """Resolve the router log file path.

    Search order:
      1. the user-supplied path (used verbatim when given)
      2. logs/router.log
      3. fd-router.log (golang_router root)

    Exits with an error message on stderr when nothing is found.
    """
    if user_path:
        if os.path.isfile(user_path):
            return user_path
        print(f"ERROR: 文件不存在: {user_path}", file=sys.stderr)
        sys.exit(1)

    # Candidate locations relative to the two CWDs people run this from.
    for candidate in (
        "logs/router.log",  # CWD = golang_router/
        "fd-router.log",  # CWD = golang_router/
        "fastdeploy/golang_router/logs/router.log",  # CWD = repo root
        "fastdeploy/golang_router/fd-router.log",  # CWD = repo root
    ):
        if os.path.isfile(candidate):
            return candidate

    print("ERROR: 未找到日志文件。请指定路径或检查 logs/ 目录。", file=sys.stderr)
    sys.exit(1)


def parse_tail_arg(tail_str):
    """Interpret --tail: a bare number is a line count, 'Nm' means N minutes.

    Minutes are converted to an approximate line count assuming ~20 lines/s.
    """
    if tail_str is None:
        return None
    if tail_str.endswith("m"):
        return int(tail_str[:-1]) * 60 * 20
    return int(tail_str)


def determine_status(results):
    """Derive the global status (CRITICAL/DEGRADED/HEALTHY) from all analyses."""
    reasons = []

    # Errors dimension: panics and extreme error rates dominate everything.
    err = results.get("errors")
    if err:
        if err["panic_list"]:
            return "CRITICAL", f'{len(err["panic_list"])} Panic 事件'
        if err["error_rate"] > 20:
            return "CRITICAL", f'错误率 {err["error_rate"]}%'
        if err["error_rate"] > 5:
            reasons.append(f'错误率 {err["error_rate"]}%')
        for entry in err["status_code_dist"]:
            code = str(entry["value"])
            if code in ("502", "503") and entry["count"] > 0:
                reasons.append(f'{code}: {entry["count"]}')

    # Latency / health / load share one severity rule:
    # any CRITICAL diagnosis short-circuits, HIGH ones accumulate.
    for dim in ("latency", "health", "load"):
        payload = results.get(dim)
        if not payload:
            continue
        for diag in payload.get("diagnoses", []):
            if diag["severity"] == "CRITICAL":
                return "CRITICAL", diag["message"]
            if diag["severity"] == "HIGH":
                reasons.append(diag["message"])

    # Cache dimension never escalates to CRITICAL on its own.
    cache = results.get("cache")
    if cache:
        for diag in cache.get("diagnoses", []):
            if diag["severity"] == "HIGH":
                reasons.append(diag["message"])

    if reasons:
        return "DEGRADED", ", ".join(reasons)
    if not results:
        return "HEALTHY", "无分析数据"
    return "HEALTHY", "无严重问题"


def format_full_report(results, status, status_reason):
    """Assemble the final report.

    Returns:
        tuple: (report_text, details)
        report_text: main report (summary + visualizations)
        details: data to split into standalone files:
            - 'health_events': str or None
            - 'trace_files': {trace_id: text} (possibly empty)
    """
    sections = [f"STATUS: {status} — {status_reason}", "=" * 60, ""]
    details = {"health_events": None, "trace_files": {}}

    if "errors" in results:
        sections.append(format_errors_report(results["errors"]))

    if "latency" in results:
        sections.append(format_latency_report(results["latency"]))

    if "health" in results:
        summary, extra = format_health_report(results["health"])
        sections.append(summary)
        if extra:
            details["health_events"] = extra

    if "load" in results:
        sections.append(format_load_report(results["load"]))

    if "cache" in results:
        sections.append(format_cache_report(results["cache"]))

    if "trace" in results:
        summary, per_trace = format_trace_report(results["trace"])
        sections.append(summary)
        if per_trace:
            details["trace_files"] = per_trace

    return "\n".join(sections), details
def save_detailed_report(report_text, output_dir, details=None):
    """Persist the report (and optional detail files) under output_dir.

    Args:
        report_text: main report body
        output_dir: destination directory (created if missing)
        details: detail payload produced by format_full_report

    Returns:
        path of the main report file.
    """
    os.makedirs(output_dir, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(output_dir, f"troubleshoot_report_{stamp}.md")

    with open(filepath, "w", encoding="utf-8") as f:
        f.write("# Router Troubleshooting Report\n")
        f.write(f'> Generated at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n')
        f.write(report_text)

    # Bulky per-event payloads are split into a details/ subdirectory.
    if details:
        details_dir = os.path.join(output_dir, "details")

        if details.get("health_events"):
            os.makedirs(details_dir, exist_ok=True)
            with open(os.path.join(details_dir, "health_events.md"), "w", encoding="utf-8") as f:
                f.write(details["health_events"])

        for trace_id, trace_text in details.get("trace_files", {}).items():
            os.makedirs(details_dir, exist_ok=True)
            target = os.path.join(details_dir, f'trace_{trace_id.replace("/", "_")}.md')
            with open(target, "w", encoding="utf-8") as f:
                f.write(trace_text)

    return filepath


def main():
    """CLI entry: parse args, select analysis dimensions, run them, report."""
    parser = argparse.ArgumentParser(
        description="FastDeploy Go Router Troubleshooting",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument("log_file", nargs="?", help="日志文件路径")
    parser.add_argument("--errors", action="store_true", help="仅分析错误日志")
    parser.add_argument("--latency", action="store_true", help="仅分析延迟")
    parser.add_argument("--health", action="store_true", help="仅分析 Worker 健康")
    parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度")
    parser.add_argument("--load", action="store_true", help="仅分析负载与计数器")
    parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID)")
    parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)")
    parser.add_argument(
        "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")'
    )
    parser.add_argument("--end", default=None, help='结束时间(如 "17:00:00"、"03/31 17:00"、"2026/03/31 17:00:00")')
    parser.add_argument("--output", help="详细报告导出目录(默认:skill_output/troubleshoot//)")

    args = parser.parse_args()

    log_file = determine_log_file(args.log_file)
    print(f"日志文件: {log_file}", file=sys.stderr)

    # --tail and --start/--end are two different range selectors; refuse both.
    if args.tail and (args.start or args.end):
        print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr)
        sys.exit(1)

    import atexit

    # Optional time-range pre-filter (--start and --end may appear alone or together).
    if args.start or args.end:
        start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None
        end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None
        filtered_path, is_temp = filter_file_by_time_range(log_file, start_ts, end_ts)
        if is_temp:
            atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None)
        log_file = filtered_path
        print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr)

    # With no explicit dimension flags, run everything except trace
    # (trace requires an explicit ID and is never part of a full scan).
    any_mode = args.errors or args.latency or args.health or args.cache or args.load or args.trace
    tail = parse_tail_arg(args.tail)

    plan = []
    if args.errors or not any_mode:
        plan.append(("errors", "分析错误日志...", lambda: analyze_errors(log_file, tail=tail)))
    if args.latency or not any_mode:
        plan.append(("latency", "分析请求延迟...", lambda: analyze_latency(log_file, tail=tail)))
    if args.health or not any_mode:
        plan.append(("health", "分析 Worker 健康...", lambda: analyze_health(log_file, tail=tail)))
    if args.cache or not any_mode:
        plan.append(("cache", "分析 Cache 调度...", lambda: analyze_cache(log_file, tail=tail)))
    if args.load or not any_mode:
        plan.append(("load", "分析负载与计数器...", lambda: analyze_load(log_file, tail=tail)))
    if args.trace:
        plan.append(("trace", "追踪请求...", lambda: analyze_trace(log_file, args.trace, tail=tail)))

    results = {}
    for step, (key, message, runner) in enumerate(plan, 1):
        print(f"[{step}/{len(plan)}] {message}", file=sys.stderr)
        results[key] = runner()

    status, status_reason = determine_status(results)

    report, details = format_full_report(results, status, status_reason)
    print(report)

    # Default output directory lives under the golang_router root.
    if args.output:
        output_dir = args.output
    else:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", ".."))
        run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = os.path.join(golang_router_root, "skill_output", "troubleshoot", run_timestamp)
    filepath = save_detailed_report(report, output_dir, details=details)
    print(f"\n详细报告已保存到: {filepath}", file=sys.stderr)


if __name__ == "__main__":
    main()
.../troubleshoot/scripts/analyzers/trace.py | 23 ++- .../skills/troubleshoot/scripts/log_parser.py | 168 +++++++++++++++--- .../troubleshoot/scripts/troubleshoot.py | 22 ++- 9 files changed, 285 insertions(+), 83 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py index 0b7377b4865..d43d6909c64 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py @@ -125,7 +125,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -139,7 +139,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -204,9 +204,10 @@ def extract_tags(line): # Cache-Aware 策略行解析(类别 A) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" 
STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -271,7 +272,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index c193e99d47c..5487bc2cc96 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -14,6 +14,7 @@ import argparse import json import os +import re import subprocess import sys from collections import defaultdict @@ -32,6 +33,10 @@ ) from stats import compute_statistics, count_by, time_bucket + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -235,7 +240,7 @@ def compute_per_worker_stats(strategies): avg_hr = round(sum(data["hit_ratios"]) / len(data["hit_ratios"]), 1) if data["hit_ratios"] else 0 result.append( { - "Worker": worker.replace("http://", ""), + "Worker": _strip_scheme(worker), "Selected": data["selected_count"], "Select%": f"{round(data['selected_count'] / total_scoring * 100, 1)}%", "AvgHitRatio": f"{avg_hr}%", diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3a18b668a41..3fca296f4d6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -26,6 +26,10 @@ TOKENIZER_WARN_RE = re.compile(r"tokenizer failed, fallback to char tokens") +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + def classify_fallback(record, tokenizer_degraded_ts=None): """对 process_tokens 策略行分类 fallback 原因。 @@ -210,9 +214,9 @@ def _analyze_suboptimal(records, hr_weight, lb_weight): suboptimal.append( { "ts": r.get("ts", ""), - "selected": selected.replace("http://", ""), + "selected": _strip_scheme(selected), "selected_hr": sel_hr, - "best_hr_worker": best_by_hr.replace("http://", ""), + "best_hr_worker": _strip_scheme(best_by_hr), "best_hr": max_hr, "reason": reason, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index 0817e280aa5..b8217a5ffa4 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -44,6 +44,8 @@ ("No available", "FD 后端"), ("request failed", "FD 后端"), ("Removed unhealthy", "FD 后端"), + ("is not healthy", "FD 后端"), + ("is healthy", "FD 后端"), ("Backend request failed", "FD 后端"), ("Decode request failed", "FD 后端"), ("Prefill request failed", "FD 后端"), diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index d2d7ca77acb..ca01d718dbc 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -21,11 +21,16 @@ # 健康事件解析 # ════════════════════════════════════════════════════════════════ -NOT_HEALTHY_RE = re.compile(r"(http://\S+)\s+is not healthy") -REMOVED_RE = re.compile(r"Removed unhealthy \w+ instance:\s*(http://\S+)") -IS_HEALTHY_RE = re.compile(r"(http://\S+)\s+is healthy") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -CLEANUP_UNHEALTHY_RE = re.compile(r"cleanup unhealthy.*?(http://\S+)") +WORKER_URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +NOT_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is not healthy") +REMOVED_RE = re.compile(rf"Removed unhealthy \w+ instance:\s*{WORKER_URL_RE}") +IS_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is healthy") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{WORKER_URL_RE}") +CLEANUP_UNHEALTHY_RE = re.compile(rf"cleanup unhealthy.*?{WORKER_URL_RE}") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_health_event(line): @@ -110,7 +115,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): # IP → worker URL 映射 ip_to_urls = defaultdict(set) for url in worker_urls: - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) if ip_m: ip_to_urls[ip_m.group(1)].add(url) @@ -130,7 +135,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): workers = {} for url in sorted(worker_urls): events = sorted(worker_events[url], key=lambda e: e["ts"] or "") - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) worker_ip = ip_m.group(1) if ip_m else "" # 恢复检测:REMOVED 后有 register @@ -237,7 +242,7 @@ def _diagnose(workers): ) for url, w in workers.items(): - s = url.replace("http://", "") + s = _strip_scheme(url) if w["down_count"] > 3: diagnoses.append( { @@ -326,7 +331,7 @@ def 
format_health_report(result): ) table_data.append( { - "Worker": url.replace("http://", ""), + "Worker": _strip_scheme(url), "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", @@ -358,7 +363,7 @@ def format_health_report(result): for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True - detail_parts.append(f'## {url.replace("http://", "")}') + detail_parts.append(f"## {_strip_scheme(url)}") detail_parts.append("") for evt in w["events"]: detail_parts.append(f' [{evt["ts"]}] {evt["type"]}') diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index e712011d932..9be82357494 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -21,14 +21,19 @@ # Counter 异常检测正则 # ════════════════════════════════════════════════════════════════ -DOUBLE_RELEASE_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?double-release") -COUNTER_CLEANED_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?counter already cleaned up") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -TOKEN_PRESERVED_RE = re.compile(r"token counter preserved.*?(http://\S+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +DOUBLE_RELEASE_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?double-release") +COUNTER_CLEANED_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?counter already cleaned up") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{URL_RE}") +TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}") # Token 事件 -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://\S+),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") 
+SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*{URL_RE},\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_counter_anomaly(line): @@ -89,7 +94,7 @@ def analyze_load(log_file, tail=None): avg = sum(vals) / len(vals) if vals else 0 worker_load.append( { - "worker": w_url.replace("http://", ""), + "worker": _strip_scheme(w_url), "avg_running": round(avg, 1), "max_running": max(vals) if vals else 0, "samples": len(vals), @@ -121,9 +126,9 @@ def analyze_load(log_file, tail=None): # Select/Release 匹配 sr_result = ( - match_select_release(h3_lines) + match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "failed_selects": [], "per_worker": {}} + else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} ) # Token 统计 @@ -133,7 +138,7 @@ def analyze_load(log_file, tail=None): pileup = _detect_pileup(stats_records) # 诊断 - diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup) + diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup) return { "load_stats": load_stats, @@ -170,7 +175,7 @@ def _analyze_tokens(h3_lines, h11_lines): releases = token_release.get(w, []) result.append( { - "worker": w.replace("http://", ""), + "worker": _strip_scheme(w), "alloc_count": len(allocs), "alloc_avg": round(sum(allocs) / len(allocs), 0) if allocs else 0, "release_count": len(releases), @@ -195,7 +200,7 @@ def _detect_pileup(stats_records): return max_consecutive >= 5 -def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): +def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup): """生成负载诊断。""" diagnoses = [] @@ -236,16 +241,20 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) - # 
Select/Release 不一致 - for w_url, pw in sr_result.get("per_worker", {}).items(): - if pw.get("delta", 0) > 0: - diagnoses.append( - { - "severity": "HIGH", - "message": f'{w_url.replace("http://","")} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', - "source_layer": "FD 后端", - } - ) + id_cov = sr_result.get("id_coverage", {}) + has_correlatable_ids = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) > 0 + + # Select/Release 不一致(仅在存在可关联 ID 时启用,避免无 ID 场景误报) + if has_correlatable_ids: + for w_url, pw in sr_result.get("per_worker", {}).items(): + if pw.get("delta", 0) > 0: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{_strip_scheme(w_url)} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "source_layer": "FD 后端", + } + ) # 卡住的请求 if sr_result.get("unmatched_selects"): @@ -257,6 +266,17 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) + # Token 计数器潜在泄漏 + for t in token_stats: + if t.get("alloc_count", 0) > t.get("release_count", 0): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{t["worker"]} token alloc/release 不平衡 ({t["alloc_count"]}/{t["release_count"]})', + "source_layer": "Router", + } + ) + return diagnoses @@ -316,23 +336,44 @@ def format_load_report(result): sections.append("### 计数器异常") sections.append("") for a in result["counter_anomalies"]: - workers_str = ", ".join(f'{w.replace("http://","")}({c})' for w, c in a["workers"].items()) + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') sections.append("") + id_cov = result.get("select_release", {}).get("id_coverage", {}) + if id_cov: + sections.append("### 请求标识覆盖(基于 select 近似请求数)") + sections.append("") + sections.append( + " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " + "with_alt_id={with_alt} | without_any_id={without_any}".format( + total=id_cov.get("total_requests_estimated", 0), + 
with_rid=id_cov.get("with_request_id", 0), + without_rid=id_cov.get("without_request_id", 0), + with_alt=id_cov.get("with_alt_id", 0), + without_any=id_cov.get("without_any_id", 0), + ) + ) + if id_cov.get("without_any_id", 0) > 0: + sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") + sections.append("") + # Select/Release 匹配 sr = result.get("select_release", {}) if sr.get("per_worker"): sections.append("### Select/Release 匹配") sections.append("") + id_cov = sr.get("id_coverage", {}) + no_correlatable_id = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) == 0 table_data = [] for w_url, pw in sorted(sr["per_worker"].items()): + delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) table_data.append( { - "Worker": w_url.replace("http://", ""), + "Worker": _strip_scheme(w_url), "Select": str(pw["selects"]), "Release": str(pw["releases"]), - "Delta": str(pw["delta"]), + "Delta": delta_display, } ) sections.append( @@ -343,11 +384,20 @@ def format_load_report(result): ) ) sections.append("") + if no_correlatable_id: + sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") + sections.append("") if sr.get("unmatched_selects"): sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') for u in sr["unmatched_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {u["worker"].replace("http://","")} ({u["type"]})') + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append("") + + if sr.get("untracked_selects"): + sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') + for u in sr["untracked_selects"][:5]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') sections.append("") # Token 统计 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 
45a5056616e..6c9a0323724 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -25,21 +25,26 @@ # ════════════════════════════════════════════════════════════════ PARSING_COMPLETE_RE = re.compile(r"Parsing completed.*worker selection") -SELECT_WORKER_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_WORKER_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +SELECT_WORKER_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_WORKER_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") REQUEST_COMPLETE_RE = re.compile(r"Request completed successfully") TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") # Prefill 事件 -PREFILL_FIRST_CHUNK_RE = re.compile(r"\[prefill\] first chunk received.*?(http://\S+)") -PREFILL_DONE_RE = re.compile(r"\[prefill\] non-stream prefill response done.*?(http://\S+)") -PREFILL_ERROR_RE = re.compile(r"\[prefill\] (scanner error|copy error).*?(http://\S+)") -PREFILL_DEFER_RE = re.compile(r"\[prefill\] release in defer.*?(http://\S+)") -PREFILL_ERR_PATH_RE = re.compile(r"\[prefill\] release in CommonCompletions defer \(error path\).*?(http://\S+)") +PREFILL_FIRST_CHUNK_RE = re.compile(rf"\[prefill\] first chunk received.*?{URL_RE}") +PREFILL_DONE_RE = re.compile(rf"\[prefill\] non-stream prefill response done.*?{URL_RE}") +PREFILL_ERROR_RE = re.compile(rf"\[prefill\] (scanner error|copy error).*?{URL_RE}") +PREFILL_DEFER_RE = re.compile(rf"\[prefill\] release in defer.*?{URL_RE}") +PREFILL_ERR_PATH_RE = re.compile(rf"\[prefill\] release in CommonCompletions defer \(error 
path\).*?{URL_RE}") FAILED_SELECT_RE = re.compile(r"Failed to select") +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -342,7 +347,7 @@ def format_trace_report(result): for evt in trace["events"]: line = f' [{evt.get("ts","")}] {evt["type"]}' if evt.get("worker"): - line += f' → {evt["worker"].replace("http://","")}' + line += f' → {_strip_scheme(evt["worker"])}' if evt.get("status"): line += f' [{evt["status"]}]' if evt.get("latency_ms"): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 2a90d39b632..44f5cdebd94 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -14,7 +14,7 @@ import re import sys from collections import defaultdict -from datetime import datetime +from datetime import datetime, timedelta # ════════════════════════════════════════════════════════════════ # 通用解析原语 @@ -152,7 +152,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -166,7 +166,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -218,6 +218,30 @@ def filter_file_by_time_range(log_file, start_str=None, 
end_str=None): return (tmp.name, True) +def filter_file_by_recent_minutes(log_file, minutes): + """按日志末时间戳向前过滤最近 N 分钟日志。 + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if minutes is None or minutes <= 0: + return (log_file, False) + + last_ts = _get_log_boundary_ts(log_file, "last") + if not last_ts: + return (log_file, False) + + try: + end_dt = parse_ts(last_ts) + except ValueError: + return (log_file, False) + + start_dt = end_dt - timedelta(minutes=minutes) + start_str = start_dt.strftime("%Y/%m/%d %H:%M:%S") + end_str = end_dt.strftime("%Y/%m/%d %H:%M:%S") + return filter_file_by_time_range(log_file, start_str=start_str, end_str=end_str) + + # Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") @@ -228,7 +252,7 @@ def extract_tags(line): # Log level -LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN)\]") +LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN|DEBUG)\]") def extract_level(line): @@ -294,9 +318,10 @@ def parse_http_line(line, inference_only=False): # Cache-Aware 策略行解析(类别 H6) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" 
STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -351,7 +376,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") @@ -438,14 +463,37 @@ def parse_error_line(line): # Select/Release 事件匹配 # ════════════════════════════════════════════════════════════════ -SELECT_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") -RELEASE_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") +SELECT_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*({URL_RE})") +RELEASE_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*({URL_RE})") FAILED_SELECT_RE = re.compile(r"Failed to select") -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*({URL_RE}),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") + + +def _parse_ts_safe(ts): + if not ts: + return None + try: + return parse_ts(ts) + except ValueError: + return None + + +def _select_match_key(tags): + """构建请求关联 key,优先 request_id,其次 req_id/trace_id/session_id。""" + if not tags: + return (None, None) + rid = tags.get("request_id") + if rid: + return ("request_id", f"request_id:{rid}") + for k in ("req_id", 
"trace_id", "session_id"): + v = tags.get(k) + if v: + return ("alt_id", f"{k}:{v}") + return (None, None) -def match_select_release(lines): +def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 Args: @@ -523,31 +571,60 @@ def match_select_release(lines): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id + # Match by request_id / alt_id matched = [] unmatched_selects = [] release_used = set() - release_by_reqid = defaultdict(list) + release_by_key = defaultdict(list) for i, r in enumerate(releases): - rid = r["tags"].get("request_id", "") - if rid: - release_by_reqid[rid].append(i) - + _, key = _select_match_key(r.get("tags", {})) + if key: + release_by_key[key].append(i) + + # 请求 ID 覆盖(按 select 事件近似请求数) + total_req_est = len(selects) + with_request_id = 0 + with_alt_id = 0 + without_any_id = 0 + + pending_selects = [] + untracked_selects = [] for s in selects: - rid = s["tags"].get("request_id", "") + key_type, key = _select_match_key(s.get("tags", {})) + if key_type == "request_id": + with_request_id += 1 + elif key_type == "alt_id": + with_alt_id += 1 + else: + without_any_id += 1 + found = False - if rid and rid in release_by_reqid: - for ri in release_by_reqid[rid]: + if not key: + # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) + untracked_selects.append( + { + "worker": s["worker"], + "select_ts": s["ts"], + "type": s["type"], + "tags": s["tags"], + "note": "no correlatable id (request_id/req_id/trace_id/session_id)", + } + ) + continue + + if key and key in release_by_key: + for ri in release_by_key[key]: if ri not in release_used: r = releases[ri] matched.append( { - "request_id": rid, + "request_id": s["tags"].get("request_id", ""), "worker": s["worker"], "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], + "match_method": key_type or "id", } ) release_used.add(ri) @@ -555,13 +632,50 @@ def match_select_release(lines): break if not found: + 
pending_selects.append(s) + + # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 + for s in pending_selects: + sdt = _parse_ts_safe(s["ts"]) + best_idx = None + best_delta = None + for ri, r in enumerate(releases): + if ri in release_used: + continue + if r.get("worker") != s.get("worker"): + continue + rdt = _parse_ts_safe(r.get("ts")) + if sdt and rdt: + delta = (rdt - sdt).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + else: + delta = 0 + if best_delta is None or delta < best_delta: + best_delta = delta + best_idx = ri + + if best_idx is not None: + r = releases[best_idx] + matched.append( + { + "request_id": s["tags"].get("request_id", ""), + "worker": s["worker"], + "select_ts": s["ts"], + "release_ts": r["ts"], + "type": s["type"], + "match_method": "worker_time_fallback", + } + ) + release_used.add(best_idx) + else: unmatched_selects.append( { "worker": s["worker"], "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found", + "note": "no matching release found (request_id/worker-time)", } ) @@ -583,8 +697,16 @@ def match_select_release(lines): return { "matched": matched, "unmatched_selects": unmatched_selects, + "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, + "id_coverage": { + "total_requests_estimated": total_req_est, + "with_request_id": with_request_id, + "without_request_id": total_req_est - with_request_id, + "with_alt_id": with_alt_id, + "without_any_id": without_any_id, + }, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 4e64a2092b3..5096c5b294a 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -34,7 +34,7 @@ from analyzers.latency import analyze_latency, format_latency_report from 
analyzers.load import analyze_load, format_load_report from analyzers.trace import analyze_trace, format_trace_report -from log_parser import complete_time_arg, filter_file_by_time_range +from log_parser import complete_time_arg, filter_file_by_recent_minutes, filter_file_by_time_range def determine_log_file(user_path=None): @@ -71,10 +71,8 @@ def parse_tail_arg(tail_str): if tail_str is None: return None if tail_str.endswith("m"): - # 分钟模式:转换为大致行数(假设 ~20 行/秒) - minutes = int(tail_str[:-1]) - return minutes * 60 * 20 - return int(tail_str) + return {"type": "minutes", "value": int(tail_str[:-1])} + return {"type": "lines", "value": int(tail_str)} def determine_status(results): @@ -265,6 +263,18 @@ def main(): log_file = filtered_path print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + tail_arg = parse_tail_arg(args.tail) + tail = None + # --tail Nm 采用真实时间窗口过滤,再全量分析过滤后的临时文件 + if tail_arg and tail_arg["type"] == "minutes": + filtered_path, is_temp = filter_file_by_recent_minutes(log_file, tail_arg["value"]) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f"--tail {tail_arg['value']}m: 使用日志时间戳过滤最近窗口", file=sys.stderr) + elif tail_arg and tail_arg["type"] == "lines": + tail = tail_arg["value"] + # 确定分析模式 any_mode = args.errors or args.latency or args.health or args.cache or args.load or args.trace run_errors = args.errors or (not any_mode) @@ -274,8 +284,6 @@ def main(): run_cache = args.cache or (not any_mode) run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 - tail = parse_tail_arg(args.tail) - results = {} step = 0 total_steps = sum([run_errors, run_latency, run_health, run_cache, run_load, run_trace]) From 4ced999b66bd760e9fa89e7a6b5b70113e6cdc4b Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 20:09:29 +0800 Subject: [PATCH 03/40] fix(stat-cache-hitrate): include dated span and markdown summary output --- 
.../stat-cache-hitrate/scripts/log_parser.py | 11 +- .../scripts/stat_cache_hitrate.py | 218 +++++++++++++++--- .../troubleshoot/scripts/analyzers/cache.py | 8 +- .../troubleshoot/scripts/analyzers/errors.py | 2 + .../troubleshoot/scripts/analyzers/health.py | 25 +- .../troubleshoot/scripts/analyzers/load.py | 102 +++++--- .../troubleshoot/scripts/analyzers/trace.py | 23 +- .../skills/troubleshoot/scripts/log_parser.py | 168 ++++++++++++-- .../troubleshoot/scripts/troubleshoot.py | 22 +- 9 files changed, 461 insertions(+), 118 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py index 0b7377b4865..d43d6909c64 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py @@ -125,7 +125,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -139,7 +139,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -204,9 +204,10 @@ def extract_tags(line): # Cache-Aware 策略行解析(类别 A) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" 
STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -271,7 +272,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index c193e99d47c..6d63a565fe2 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -14,6 +14,7 @@ import argparse import json import os +import re import subprocess import sys from collections import defaultdict @@ -32,6 +33,10 @@ ) from stats import compute_statistics, count_by, time_bucket + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -235,7 +240,7 @@ def compute_per_worker_stats(strategies): avg_hr = round(sum(data["hit_ratios"]) / len(data["hit_ratios"]), 1) if data["hit_ratios"] else 0 result.append( { - "Worker": worker.replace("http://", ""), + "Worker": _strip_scheme(worker), "Selected": data["selected_count"], "Select%": f"{round(data['selected_count'] / total_scoring * 100, 1)}%", "AvgHitRatio": f"{avg_hr}%", @@ -339,7 
+344,7 @@ def _quartile_trend(trend, value_field): return f"Q1={quartiles[0]}% \u2192 Q2={quartiles[1]}% \u2192 Q3={quartiles[2]}% \u2192 Q4={quartiles[3]}% {arrow}" -def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None): +def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None, window_rows=None): """格式化完整终端报告。""" parts = [] @@ -361,6 +366,7 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, dist_data = [ {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] ] + parts.append(" Unicode 柱状图(Prefix HR 分布):") parts.append(render_bar(dist_data, show_count=True)) parts.append(f' 冷启动率: {prefix_hr["cold_start_rate"]}%') @@ -375,6 +381,7 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] ] parts.append("") + parts.append(" ASCII 折线图(Prefix HR 趋势):") parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) else: parts.append(" (无 cache_aware_scoring 数据)") @@ -391,6 +398,7 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, if session_hr["trend"]: parts.append("") + parts.append(" ASCII 折线图(Session HR 趋势):") parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) parts.append("") @@ -428,6 +436,18 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, parts.append(f' {diagnosis["icon"]} {diagnosis["summary"]}') parts.append(f' {diagnosis["detail"]}') + # 6. 每窗口明细预览 + if window_rows: + parts.append("") + parts.append("### 6. 
每5s窗口明细预览(前10行)") + parts.append( + render_table( + window_rows[:10], + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], + right_align={"Scoring", "Fallback", "Total Running"}, + ) + ) + return "\n".join(parts) @@ -458,7 +478,77 @@ def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): return "\n".join(parts) -def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir): +def build_per_window_rows(strategies, stats_recs): + """构建每窗口明细行,用于终端预览和 details 导出。""" + time_data = defaultdict( + lambda: { + "prefix_vals": [], + "hits": 0, + "total": 0, + "scoring": 0, + "fallback": 0, + "running": 0, + "has_running": False, + } + ) + for r in strategies: + ts = r.get("ts", "") + if r.get("strategy") == "cache_aware_scoring": + time_data[ts]["scoring"] += 1 + time_data[ts]["prefix_vals"].append(r.get("selected_hitRatio", 0)) + else: + time_data[ts]["fallback"] += 1 + + for r in stats_recs: + ts = r.get("ts", "") + time_data[ts]["hits"] += r.get("hits", 0) + time_data[ts]["total"] += r.get("total", 0) + if "total_running" in r: + time_data[ts]["running"] += r.get("total_running", 0) + time_data[ts]["has_running"] = True + + rows = [] + for ts in sorted(time_data.keys()): + d = time_data[ts] + short_ts = ts.split(" ")[-1] if " " in ts else ts + if d["prefix_vals"]: + prefix_mean = round(sum(d["prefix_vals"]) / len(d["prefix_vals"]), 1) + prefix_hr = f"{prefix_mean}%" + else: + prefix_hr = "-" + + if d["total"] > 0: + session_val = round(d["hits"] / d["total"] * 100, 1) + session_hr = f'{session_val}% ({d["hits"]}/{d["total"]})' + else: + session_hr = "-" + + running = str(d["running"]) if d["has_running"] else "-" + rows.append( + { + "Time": short_ts, + "Prefix HR": prefix_hr, + "Session HR": session_hr, + "Scoring": str(d["scoring"]), + "Fallback": str(d["fallback"]), + "Total Running": running, + } + ) + return rows + + +def save_detailed_report( + 
filepath, + strategies, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=None, +): """导出详细数据 Markdown 文件。 主报告包含 Per-Worker 统计和 Fallback 明细。 @@ -471,10 +561,63 @@ def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr parts.append("# Cache Hit Rate Detailed Report") parts.append(f'**Generated**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') parts.append(f"**Source**: {filepath}") + if time_span: + parts.append(f"**Span**: {time_span}") + parts.append("") + + # 1) 主指标摘要(与终端一致,避免“只在终端可见”) + parts.append("## 1. Key Metrics Summary") + parts.append("") + parts.append("### Prefix Hit Ratio") + if prefix_hr["stats"]: + parts.append(f'- 累计平均: **{prefix_hr["mean"]}%** (N={prefix_hr["count"]})') + parts.append(f'- 冷启动率: **{prefix_hr["cold_start_rate"]}%**') + trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") + if trend_str: + parts.append(f"- 趋势: {trend_str}") + dist_data = [{"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"]] + parts.append("") + parts.append("```text") + parts.append("Unicode 柱状图(Prefix HR 分布)") + parts.append(render_bar(dist_data, show_count=True)) + if prefix_hr["trend"]: + sparkline_data = [{"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"]] + parts.append("") + parts.append("ASCII 折线图(Prefix HR 趋势)") + parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) + parts.append("```") + else: + parts.append("- (无 cache_aware_scoring 数据)") + parts.append("") + + parts.append("### Session Hit Rate") + parts.append(f'- 累计: **{session_hr["rate"]}%** (hits={session_hr["hits"]}/total={session_hr["total"]})') + parts.append(f'- 覆盖率: **{session_hr["coverage"]}%**') + trend_str = _quartile_trend(session_hr["trend"], "value") + if trend_str: + parts.append(f"- 趋势: {trend_str}") + if session_hr["trend"]: + 
parts.append("") + parts.append("```text") + parts.append("ASCII 折线图(Session HR 趋势)") + parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) + parts.append("```") + parts.append("") + + parts.append("### Scheduling Strategy") + parts.append( + f'- cache_aware_scoring: **{scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)**' + f' | fallback: **{scheduling["fallback_count"]}**' + ) + parts.append( + f'- 非最优命中选择: **{scheduling["suboptimal_pct"]}%**' + f' ({scheduling.get("suboptimal_count", 0)} 次, 负载均衡优先于命中率)' + ) + parts.append(f'- Diagnosis: {diagnosis["icon"]} {diagnosis["summary"]};{diagnosis["detail"]}') parts.append("") - # Per-Worker 完整统计 - parts.append("## 1. Per-Worker 完整统计") + # 2) Per-Worker 完整统计 + parts.append("## 2. Per-Worker 完整统计") parts.append("") if per_worker: parts.append( @@ -486,49 +629,34 @@ def save_detailed_report(filepath, strategies, stats_recs, prefix_hr, session_hr ) parts.append("") - # Fallback 明细 + # 3) Fallback 明细 if scheduling["fallback_reasons"]: - parts.append("## 2. Fallback 明细") + parts.append("## 3. 
Fallback 明细") for reason in scheduling["fallback_reasons"]: parts.append(f'- **{reason["value"]}**: {reason["count"]} 次 ({reason["pct"]}%)') parts.append("") # 每窗口明细 → 拆分到 details/ - time_data = defaultdict(lambda: {"prefix_hr": "-", "session_hr": "-", "scoring": 0, "fallback": 0, "running": "-"}) - for r in strategies: - ts = r.get("ts", "") - if r.get("strategy") == "cache_aware_scoring": - time_data[ts]["scoring"] += 1 - else: - time_data[ts]["fallback"] += 1 - - for r in stats_recs: - ts = r.get("ts", "") - h = r.get("hits", 0) - t = r.get("total", 0) - time_data[ts]["session_hr"] = f"{round(h / t * 100, 1)}% ({h}/{t})" if t else "0%" - time_data[ts]["running"] = str(r.get("total_running", "-")) + window_rows = build_per_window_rows(strategies, stats_recs) - if time_data: + if window_rows: # 主报告中添加引用 parts.append( - f"> 每窗口明细数据 ({len(time_data)} 条): [details/per_window_data.md](details/per_window_data.md)" + f"> 每5s窗口明细数据 ({len(window_rows)} 条): [details/per_window_data.md](details/per_window_data.md)" ) parts.append("") # 写入 details 子目录 details_dir = os.path.join(output_dir, "details") os.makedirs(details_dir, exist_ok=True) - detail_parts = ["# 每窗口明细数据", ""] - detail_parts.append("| Time | Prefix HR | Session HR | Scoring | Fallback | Total Running |") - detail_parts.append("|------|-----------|------------|---------|----------|---------------|") - for ts in sorted(time_data.keys()): - d = time_data[ts] - short_ts = ts.split(" ")[-1] if " " in ts else ts - detail_parts.append( - f'| {short_ts} | {d["prefix_hr"]} | {d["session_hr"]} ' - f'| {d["scoring"]} | {d["fallback"]} | {d["running"]} |' + detail_parts = ["# 每5s窗口明细数据", ""] + detail_parts.append( + render_table( + window_rows, + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], + right_align={"Scoring", "Fallback", "Total Running"}, ) + ) detail_parts.append("") detail_path = os.path.join(details_dir, "per_window_data.md") @@ -564,8 +692,8 @@ def 
compute_time_span(strategies, stats_recs): duration = t_max - t_min hours = int(duration.total_seconds() // 3600) minutes = int((duration.total_seconds() % 3600) // 60) - start = t_min.strftime("%H:%M:%S") - end = t_max.strftime("%H:%M:%S") + start = t_min.strftime("%Y-%m-%d %H:%M:%S") + end = t_max.strftime("%Y-%m-%d %H:%M:%S") if hours > 0: return f"{start} ~ {end} ({hours}h{minutes}m)" return f"{start} ~ {end} ({minutes}m)" @@ -642,9 +770,18 @@ def main(): print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) else: time_span = compute_time_span(strategy_recs, stats_recs) + window_rows = build_per_window_rows(strategy_recs, stats_recs) print( format_full_report( - args.log_file, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span + args.log_file, + line_count, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + time_span, + window_rows=window_rows, ) ) @@ -657,7 +794,16 @@ def main(): run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) report_path = save_detailed_report( - args.log_file, strategy_recs, stats_recs, prefix_hr, session_hr, per_worker, scheduling, output_dir + args.log_file, + strategy_recs, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=time_span, ) print(f"\n\U0001f4c4 详细数据见: {report_path}") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3a18b668a41..3fca296f4d6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -26,6 +26,10 @@ TOKENIZER_WARN_RE = re.compile(r"tokenizer failed, fallback to char tokens") +def _strip_scheme(url): + return 
re.sub(r"^https?://", "", url) + + def classify_fallback(record, tokenizer_degraded_ts=None): """对 process_tokens 策略行分类 fallback 原因。 @@ -210,9 +214,9 @@ def _analyze_suboptimal(records, hr_weight, lb_weight): suboptimal.append( { "ts": r.get("ts", ""), - "selected": selected.replace("http://", ""), + "selected": _strip_scheme(selected), "selected_hr": sel_hr, - "best_hr_worker": best_by_hr.replace("http://", ""), + "best_hr_worker": _strip_scheme(best_by_hr), "best_hr": max_hr, "reason": reason, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index 0817e280aa5..b8217a5ffa4 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -44,6 +44,8 @@ ("No available", "FD 后端"), ("request failed", "FD 后端"), ("Removed unhealthy", "FD 后端"), + ("is not healthy", "FD 后端"), + ("is healthy", "FD 后端"), ("Backend request failed", "FD 后端"), ("Decode request failed", "FD 后端"), ("Prefill request failed", "FD 后端"), diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index d2d7ca77acb..ca01d718dbc 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -21,11 +21,16 @@ # 健康事件解析 # ════════════════════════════════════════════════════════════════ -NOT_HEALTHY_RE = re.compile(r"(http://\S+)\s+is not healthy") -REMOVED_RE = re.compile(r"Removed unhealthy \w+ instance:\s*(http://\S+)") -IS_HEALTHY_RE = re.compile(r"(http://\S+)\s+is healthy") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -CLEANUP_UNHEALTHY_RE = re.compile(r"cleanup unhealthy.*?(http://\S+)") 
+WORKER_URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +NOT_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is not healthy") +REMOVED_RE = re.compile(rf"Removed unhealthy \w+ instance:\s*{WORKER_URL_RE}") +IS_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is healthy") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{WORKER_URL_RE}") +CLEANUP_UNHEALTHY_RE = re.compile(rf"cleanup unhealthy.*?{WORKER_URL_RE}") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_health_event(line): @@ -110,7 +115,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): # IP → worker URL 映射 ip_to_urls = defaultdict(set) for url in worker_urls: - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) if ip_m: ip_to_urls[ip_m.group(1)].add(url) @@ -130,7 +135,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): workers = {} for url in sorted(worker_urls): events = sorted(worker_events[url], key=lambda e: e["ts"] or "") - ip_m = re.search(r"http://(\d+\.\d+\.\d+\.\d+)", url) + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) worker_ip = ip_m.group(1) if ip_m else "" # 恢复检测:REMOVED 后有 register @@ -237,7 +242,7 @@ def _diagnose(workers): ) for url, w in workers.items(): - s = url.replace("http://", "") + s = _strip_scheme(url) if w["down_count"] > 3: diagnoses.append( { @@ -326,7 +331,7 @@ def format_health_report(result): ) table_data.append( { - "Worker": url.replace("http://", ""), + "Worker": _strip_scheme(url), "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", @@ -358,7 +363,7 @@ def format_health_report(result): for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True - detail_parts.append(f'## {url.replace("http://", "")}') + detail_parts.append(f"## {_strip_scheme(url)}") detail_parts.append("") for evt in w["events"]: detail_parts.append(f' [{evt["ts"]}] 
{evt["type"]}') diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index e712011d932..9be82357494 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -21,14 +21,19 @@ # Counter 异常检测正则 # ════════════════════════════════════════════════════════════════ -DOUBLE_RELEASE_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?double-release") -COUNTER_CLEANED_RE = re.compile(r"release worker:\s*(http://\S+)\s+skipped.*?counter already cleaned up") -COUNTER_PRESERVED_RE = re.compile(r"counter preserved.*?(http://\S+)") -TOKEN_PRESERVED_RE = re.compile(r"token counter preserved.*?(http://\S+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +DOUBLE_RELEASE_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?double-release") +COUNTER_CLEANED_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?counter already cleaned up") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{URL_RE}") +TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}") # Token 事件 -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://\S+),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*{URL_RE},\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) def parse_counter_anomaly(line): @@ -89,7 +94,7 @@ def analyze_load(log_file, tail=None): avg = sum(vals) / len(vals) if vals else 0 worker_load.append( { - "worker": w_url.replace("http://", ""), + "worker": _strip_scheme(w_url), "avg_running": round(avg, 1), "max_running": max(vals) if vals else 0, 
"samples": len(vals), @@ -121,9 +126,9 @@ def analyze_load(log_file, tail=None): # Select/Release 匹配 sr_result = ( - match_select_release(h3_lines) + match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "failed_selects": [], "per_worker": {}} + else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} ) # Token 统计 @@ -133,7 +138,7 @@ def analyze_load(log_file, tail=None): pileup = _detect_pileup(stats_records) # 诊断 - diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup) + diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup) return { "load_stats": load_stats, @@ -170,7 +175,7 @@ def _analyze_tokens(h3_lines, h11_lines): releases = token_release.get(w, []) result.append( { - "worker": w.replace("http://", ""), + "worker": _strip_scheme(w), "alloc_count": len(allocs), "alloc_avg": round(sum(allocs) / len(allocs), 0) if allocs else 0, "release_count": len(releases), @@ -195,7 +200,7 @@ def _detect_pileup(stats_records): return max_consecutive >= 5 -def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): +def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup): """生成负载诊断。""" diagnoses = [] @@ -236,16 +241,20 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) - # Select/Release 不一致 - for w_url, pw in sr_result.get("per_worker", {}).items(): - if pw.get("delta", 0) > 0: - diagnoses.append( - { - "severity": "HIGH", - "message": f'{w_url.replace("http://","")} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', - "source_layer": "FD 后端", - } - ) + id_cov = sr_result.get("id_coverage", {}) + has_correlatable_ids = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) > 0 + + # Select/Release 不一致(仅在存在可关联 ID 时启用,避免无 ID 场景误报) + if has_correlatable_ids: + for w_url, pw in sr_result.get("per_worker", {}).items(): + if 
pw.get("delta", 0) > 0: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{_strip_scheme(w_url)} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "source_layer": "FD 后端", + } + ) # 卡住的请求 if sr_result.get("unmatched_selects"): @@ -257,6 +266,17 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, pileup): } ) + # Token 计数器潜在泄漏 + for t in token_stats: + if t.get("alloc_count", 0) > t.get("release_count", 0): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{t["worker"]} token alloc/release 不平衡 ({t["alloc_count"]}/{t["release_count"]})', + "source_layer": "Router", + } + ) + return diagnoses @@ -316,23 +336,44 @@ def format_load_report(result): sections.append("### 计数器异常") sections.append("") for a in result["counter_anomalies"]: - workers_str = ", ".join(f'{w.replace("http://","")}({c})' for w, c in a["workers"].items()) + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') sections.append("") + id_cov = result.get("select_release", {}).get("id_coverage", {}) + if id_cov: + sections.append("### 请求标识覆盖(基于 select 近似请求数)") + sections.append("") + sections.append( + " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " + "with_alt_id={with_alt} | without_any_id={without_any}".format( + total=id_cov.get("total_requests_estimated", 0), + with_rid=id_cov.get("with_request_id", 0), + without_rid=id_cov.get("without_request_id", 0), + with_alt=id_cov.get("with_alt_id", 0), + without_any=id_cov.get("without_any_id", 0), + ) + ) + if id_cov.get("without_any_id", 0) > 0: + sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") + sections.append("") + # Select/Release 匹配 sr = result.get("select_release", {}) if sr.get("per_worker"): sections.append("### Select/Release 匹配") sections.append("") + id_cov = sr.get("id_coverage", {}) + no_correlatable_id = (id_cov.get("with_request_id", 0) + 
id_cov.get("with_alt_id", 0)) == 0 table_data = [] for w_url, pw in sorted(sr["per_worker"].items()): + delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) table_data.append( { - "Worker": w_url.replace("http://", ""), + "Worker": _strip_scheme(w_url), "Select": str(pw["selects"]), "Release": str(pw["releases"]), - "Delta": str(pw["delta"]), + "Delta": delta_display, } ) sections.append( @@ -343,11 +384,20 @@ def format_load_report(result): ) ) sections.append("") + if no_correlatable_id: + sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") + sections.append("") if sr.get("unmatched_selects"): sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') for u in sr["unmatched_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {u["worker"].replace("http://","")} ({u["type"]})') + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append("") + + if sr.get("untracked_selects"): + sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') + for u in sr["untracked_selects"][:5]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') sections.append("") # Token 统计 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 45a5056616e..6c9a0323724 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -25,21 +25,26 @@ # ════════════════════════════════════════════════════════════════ PARSING_COMPLETE_RE = re.compile(r"Parsing completed.*worker selection") -SELECT_WORKER_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_WORKER_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://\S+)") -RELEASE_TOKENS_RE = re.compile(r"release prefill 
tokens:\s*(http://\S+),\s*tokens:\s*(\d+)") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +SELECT_WORKER_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_WORKER_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") REQUEST_COMPLETE_RE = re.compile(r"Request completed successfully") TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") # Prefill 事件 -PREFILL_FIRST_CHUNK_RE = re.compile(r"\[prefill\] first chunk received.*?(http://\S+)") -PREFILL_DONE_RE = re.compile(r"\[prefill\] non-stream prefill response done.*?(http://\S+)") -PREFILL_ERROR_RE = re.compile(r"\[prefill\] (scanner error|copy error).*?(http://\S+)") -PREFILL_DEFER_RE = re.compile(r"\[prefill\] release in defer.*?(http://\S+)") -PREFILL_ERR_PATH_RE = re.compile(r"\[prefill\] release in CommonCompletions defer \(error path\).*?(http://\S+)") +PREFILL_FIRST_CHUNK_RE = re.compile(rf"\[prefill\] first chunk received.*?{URL_RE}") +PREFILL_DONE_RE = re.compile(rf"\[prefill\] non-stream prefill response done.*?{URL_RE}") +PREFILL_ERROR_RE = re.compile(rf"\[prefill\] (scanner error|copy error).*?{URL_RE}") +PREFILL_DEFER_RE = re.compile(rf"\[prefill\] release in defer.*?{URL_RE}") +PREFILL_ERR_PATH_RE = re.compile(rf"\[prefill\] release in CommonCompletions defer \(error path\).*?{URL_RE}") FAILED_SELECT_RE = re.compile(r"Failed to select") +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -342,7 +347,7 @@ def format_trace_report(result): for evt in trace["events"]: line = f' [{evt.get("ts","")}] {evt["type"]}' if evt.get("worker"): - line += f' → {evt["worker"].replace("http://","")}' + line += f' → {_strip_scheme(evt["worker"])}' if evt.get("status"): line += f' [{evt["status"]}]' if 
evt.get("latency_ms"): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 2a90d39b632..44f5cdebd94 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -14,7 +14,7 @@ import re import sys from collections import defaultdict -from datetime import datetime +from datetime import datetime, timedelta # ════════════════════════════════════════════════════════════════ # 通用解析原语 @@ -152,7 +152,7 @@ def complete_time_arg(time_str, log_file, is_end=False): if m: mo, d = m.group(1).zfill(2), m.group(2).zfill(2) ts = _get_log_boundary_ts(log_file, "first") - year = ts[:4] if ts else "2026" + year = ts[:4] if ts else str(datetime.now().year) if m.group(3): # 有时间部分 h, mi = m.group(3).zfill(2), m.group(4) s = (m.group(5) or "00").zfill(2) @@ -166,7 +166,7 @@ def complete_time_arg(time_str, log_file, is_end=False): h, mi = m.group(1).zfill(2), m.group(2) s = (m.group(3) or "00").zfill(2) ts = _get_log_boundary_ts(log_file, "last") - date_part = ts[:10] if ts else "2026/01/01" + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" return f"{date_part} {h}:{mi}:{s}" # Fallback: 原样返回 @@ -218,6 +218,30 @@ def filter_file_by_time_range(log_file, start_str=None, end_str=None): return (tmp.name, True) +def filter_file_by_recent_minutes(log_file, minutes): + """按日志末时间戳向前过滤最近 N 分钟日志。 + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if minutes is None or minutes <= 0: + return (log_file, False) + + last_ts = _get_log_boundary_ts(log_file, "last") + if not last_ts: + return (log_file, False) + + try: + end_dt = parse_ts(last_ts) + except ValueError: + return (log_file, False) + + start_dt = end_dt - timedelta(minutes=minutes) + start_str = start_dt.strftime("%Y/%m/%d %H:%M:%S") + end_str = end_dt.strftime("%Y/%m/%d 
%H:%M:%S") + return filter_file_by_time_range(log_file, start_str=start_str, end_str=end_str) + + # Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") @@ -228,7 +252,7 @@ def extract_tags(line): # Log level -LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN)\]") +LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN|DEBUG)\]") def extract_level(line): @@ -294,9 +318,10 @@ def parse_http_line(line, inference_only=False): # Cache-Aware 策略行解析(类别 H6) # ════════════════════════════════════════════════════════════════ +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") -SELECTED_RE = re.compile(r"selected=(http://\S+?)(?:,|\s|$)") -REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|$)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") def parse_cache_strategy_line(line): @@ -351,7 +376,7 @@ def parse_cache_strategy_line(line): # ════════════════════════════════════════════════════════════════ TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") -WORKER_RUNNING_RE = re.compile(r"(http://[^:]+:\d+): running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") @@ -438,14 +463,37 @@ def parse_error_line(line): # Select/Release 事件匹配 # ════════════════════════════════════════════════════════════════ -SELECT_RE = re.compile(r"select worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") -RELEASE_RE = re.compile(r"release worker\s*(?:\((\w+)\))?:\s*(http://[^,\s]+)") +SELECT_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*({URL_RE})") +RELEASE_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*({URL_RE})") FAILED_SELECT_RE = re.compile(r"Failed to select") -SELECT_TOKENS_RE = re.compile(r"select worker \(prefill\):\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") 
-RELEASE_TOKENS_RE = re.compile(r"release prefill tokens:\s*(http://[^,\s]+),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*({URL_RE}),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") + + +def _parse_ts_safe(ts): + if not ts: + return None + try: + return parse_ts(ts) + except ValueError: + return None + + +def _select_match_key(tags): + """构建请求关联 key,优先 request_id,其次 req_id/trace_id/session_id。""" + if not tags: + return (None, None) + rid = tags.get("request_id") + if rid: + return ("request_id", f"request_id:{rid}") + for k in ("req_id", "trace_id", "session_id"): + v = tags.get(k) + if v: + return ("alt_id", f"{k}:{v}") + return (None, None) -def match_select_release(lines): +def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 Args: @@ -523,31 +571,60 @@ def match_select_release(lines): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id + # Match by request_id / alt_id matched = [] unmatched_selects = [] release_used = set() - release_by_reqid = defaultdict(list) + release_by_key = defaultdict(list) for i, r in enumerate(releases): - rid = r["tags"].get("request_id", "") - if rid: - release_by_reqid[rid].append(i) - + _, key = _select_match_key(r.get("tags", {})) + if key: + release_by_key[key].append(i) + + # 请求 ID 覆盖(按 select 事件近似请求数) + total_req_est = len(selects) + with_request_id = 0 + with_alt_id = 0 + without_any_id = 0 + + pending_selects = [] + untracked_selects = [] for s in selects: - rid = s["tags"].get("request_id", "") + key_type, key = _select_match_key(s.get("tags", {})) + if key_type == "request_id": + with_request_id += 1 + elif key_type == "alt_id": + with_alt_id += 1 + else: + without_any_id += 1 + found = False - if rid and rid in release_by_reqid: - for ri in release_by_reqid[rid]: + if not key: + # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) + 
untracked_selects.append( + { + "worker": s["worker"], + "select_ts": s["ts"], + "type": s["type"], + "tags": s["tags"], + "note": "no correlatable id (request_id/req_id/trace_id/session_id)", + } + ) + continue + + if key and key in release_by_key: + for ri in release_by_key[key]: if ri not in release_used: r = releases[ri] matched.append( { - "request_id": rid, + "request_id": s["tags"].get("request_id", ""), "worker": s["worker"], "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], + "match_method": key_type or "id", } ) release_used.add(ri) @@ -555,13 +632,50 @@ def match_select_release(lines): break if not found: + pending_selects.append(s) + + # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 + for s in pending_selects: + sdt = _parse_ts_safe(s["ts"]) + best_idx = None + best_delta = None + for ri, r in enumerate(releases): + if ri in release_used: + continue + if r.get("worker") != s.get("worker"): + continue + rdt = _parse_ts_safe(r.get("ts")) + if sdt and rdt: + delta = (rdt - sdt).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + else: + delta = 0 + if best_delta is None or delta < best_delta: + best_delta = delta + best_idx = ri + + if best_idx is not None: + r = releases[best_idx] + matched.append( + { + "request_id": s["tags"].get("request_id", ""), + "worker": s["worker"], + "select_ts": s["ts"], + "release_ts": r["ts"], + "type": s["type"], + "match_method": "worker_time_fallback", + } + ) + release_used.add(best_idx) + else: unmatched_selects.append( { "worker": s["worker"], "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found", + "note": "no matching release found (request_id/worker-time)", } ) @@ -583,8 +697,16 @@ def match_select_release(lines): return { "matched": matched, "unmatched_selects": unmatched_selects, + "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, + "id_coverage": { + "total_requests_estimated": total_req_est, + 
"with_request_id": with_request_id, + "without_request_id": total_req_est - with_request_id, + "with_alt_id": with_alt_id, + "without_any_id": without_any_id, + }, } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 4e64a2092b3..5096c5b294a 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -34,7 +34,7 @@ from analyzers.latency import analyze_latency, format_latency_report from analyzers.load import analyze_load, format_load_report from analyzers.trace import analyze_trace, format_trace_report -from log_parser import complete_time_arg, filter_file_by_time_range +from log_parser import complete_time_arg, filter_file_by_recent_minutes, filter_file_by_time_range def determine_log_file(user_path=None): @@ -71,10 +71,8 @@ def parse_tail_arg(tail_str): if tail_str is None: return None if tail_str.endswith("m"): - # 分钟模式:转换为大致行数(假设 ~20 行/秒) - minutes = int(tail_str[:-1]) - return minutes * 60 * 20 - return int(tail_str) + return {"type": "minutes", "value": int(tail_str[:-1])} + return {"type": "lines", "value": int(tail_str)} def determine_status(results): @@ -265,6 +263,18 @@ def main(): log_file = filtered_path print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + tail_arg = parse_tail_arg(args.tail) + tail = None + # --tail Nm 采用真实时间窗口过滤,再全量分析过滤后的临时文件 + if tail_arg and tail_arg["type"] == "minutes": + filtered_path, is_temp = filter_file_by_recent_minutes(log_file, tail_arg["value"]) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f"--tail {tail_arg['value']}m: 使用日志时间戳过滤最近窗口", file=sys.stderr) + elif tail_arg and tail_arg["type"] == "lines": + tail = tail_arg["value"] + # 确定分析模式 any_mode = args.errors or 
args.latency or args.health or args.cache or args.load or args.trace run_errors = args.errors or (not any_mode) @@ -274,8 +284,6 @@ def main(): run_cache = args.cache or (not any_mode) run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 - tail = parse_tail_arg(args.tail) - results = {} step = 0 total_steps = sum([run_errors, run_latency, run_health, run_cache, run_load, run_trace]) From be5c4f5fa08cd15ecb7454d6cb10f44e9d26078a Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 20:32:40 +0800 Subject: [PATCH 04/40] Fix stat-cache-hitrate path links for terminal output --- .../references/report_templates.md | 11 ++++++- .../scripts/stat_cache_hitrate.py | 33 ++++++++++++++++++- 2 files changed, 42 insertions(+), 2 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index dcef9c47498..7f060cacb6a 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -62,7 +62,16 @@ ### 5. 
Diagnosis ✅/⚠/❌ <综合诊断> -📄 详细数据见: skill_output/stat-cache-hitrate//cache_hitrate_report_.md +### 图表说明(Legend) + - Unicode 柱状图:每个区间的请求占比,条越长占比越高 + - ASCII 折线图:横轴是时间窗口,纵轴是命中率(0-100%) + - Q1→Q4 趋势:按时间四等分后的均值变化(↑/↓/→) + +📄 详细数据见: + - 报告文件: /abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md + - 窗口明细: /abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md ``` --- diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 6d63a565fe2..6d09bc1915d 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -17,6 +17,8 @@ import re import subprocess import sys +from pathlib import Path +from urllib.parse import quote from collections import defaultdict from datetime import datetime @@ -37,6 +39,13 @@ def _strip_scheme(url): return re.sub(r"^https?://", "", url) + +def _build_path_links(path): + """返回绝对路径与 file URI,兼容空格/中文路径。""" + abs_path = str(Path(path).resolve()) + file_uri = "file://" + quote(abs_path, safe="/:-._~") + return abs_path, file_uri + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -356,6 +365,13 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, parts.append(f"**Span**: {span_str}") parts.append("") + # 图表说明 + parts.append("### 图表说明(如何解读)") + parts.append(" - Unicode 柱状图:每行代表一个 Prefix HR 区间(如 60-80%),条越长表示该区间请求占比越高。") + parts.append(" - ASCII 折线图:横轴是时间窗口,纵轴是命中率(0-100%);越靠上表示命中率越高。") + parts.append(" - 趋势 Q1→Q4:把时间均分为四段,比较首尾;↑ 上升,↓ 下降,→ 基本稳定。") 
+ parts.append("") + # 1. Prefix Hit Ratio parts.append("### 1. Prefix Hit Ratio (KV Cache 内容复用度)") if prefix_hr["stats"]: @@ -474,6 +490,7 @@ def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] ] parts.append(render_sparkline(sparkline_data, title="Recent Prefix HR", y_label="%", y_range=(0, 100))) + parts.append(" 说明: 折线越靠上表示对应时间窗口 Prefix HR 越高。") return "\n".join(parts) @@ -565,6 +582,12 @@ def save_detailed_report( parts.append(f"**Span**: {time_span}") parts.append("") + parts.append("## 图表说明(Legend)") + parts.append("- **Unicode 柱状图**: 展示 Prefix HR 分布,`█` 越多说明该命中率区间占比越高。") + parts.append("- **ASCII 折线图**: 展示命中率随时间变化,横轴为时间窗口,纵轴为命中率(0-100%)。") + parts.append("- **Q1~Q4 趋势**: 将观察区间均分四段,反映整体走向(↑/↓/→)。") + parts.append("") + # 1) 主指标摘要(与终端一致,避免“只在终端可见”) parts.append("## 1. Key Metrics Summary") parts.append("") @@ -805,7 +828,15 @@ def main(): output_dir, time_span=time_span, ) - print(f"\n\U0001f4c4 详细数据见: {report_path}") + print("\n\U0001f4c4 详细数据见:") + report_abs, report_uri = _build_path_links(report_path) + print(f" - 报告文件: {report_abs}") + print(f" URI: {report_uri}") + details_path = os.path.join(os.path.dirname(report_path), "details", "per_window_data.md") + if os.path.exists(details_path): + details_abs, details_uri = _build_path_links(details_path) + print(f" - 窗口明细: {details_abs}") + print(f" URI: {details_uri}") if args.watch: print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") From 5d2984999fb485d3cd4ca3cad42ca1cef7f4f6c7 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:01:20 +0800 Subject: [PATCH 05/40] split session and window logic out of stat_cache_hitrate --- .../skills/stat-cache-hitrate/SKILL.md | 11 +- .../references/report_templates.md | 3 + .../scripts/session_analysis.py | 116 ++++++++++++++++++ .../scripts/stat_cache_hitrate.py | 65 +++++++++- 
.../scripts/window_utils.py | 80 ++++++++++++ 5 files changed, 268 insertions(+), 7 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 6534fb332f2..f9c5156ca69 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -23,10 +23,10 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 运行脚本前,Claude 必须先向用户确认以下参数: ### 1. 日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项 + Other 自定义输入(支持绝对路径和相对路径): - 选项 1: `logs/router.log`(默认) -- 选项 2: `fd-router.log`(golang_router 根目录) -- 选项 3: 用户通过 Other 输入自定义路径 +- 选项 2: `fd-router.log`(golang_router 根目录常用文件名) +- 选项 3: Other(用户直接输入任意路径,例如 `logs/fd-router.log`、`/home/user/logs/router.log`) **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -75,7 +75,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" ``` -默认日志路径:`logs/router.log` 或 `fd-router.log`(相对于 `fastdeploy/golang_router/`)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 +默认日志路径:`logs/router.log`(相对于 `fastdeploy/golang_router/`)。常用备选:`fd-router.log`(根目录)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 脚本会自动根据文件大小选择解析策略:小文件(<5000 行)在内存中处理,大文件用 grep + 管道流式处理。 @@ -94,7 +94,8 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 - 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 
每5s窗口的完整明细数据(Prefix HR / Session HR / Scoring / Fallback / Running) +- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `details/session_hit_details.md` — 每个 session 的命中明细(`session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits`),并附带 `prefill_urls`、prefill URL 切换前后 request_id(或 req_id/trace_id)以及命中率突降 request_id ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index 7f060cacb6a..f5a0def5f55 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -72,6 +72,9 @@ URI: file:///abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md - 窗口明细: /abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + - Session 命中详情: /abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + (含 prefill_urls、worker 切换前后 request_id,以及命中率突降 request_id) ``` --- diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py new file mode 100644 index 00000000000..355ba8fc947 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Session 维度分析:聚合每个 session 的命中率、worker 切换与突降请求。 +""" + +from collections import defaultdict + + +def compute_session_details(strategies, strip_scheme): + """按 session 统计命中详情。""" + + def _req_id_from_tags(tags, fallback): + return tags.get("request_id") or tags.get("req_id") or 
tags.get("trace_id") or fallback + + session_records = defaultdict(list) + for idx, rec in enumerate(strategies): + if rec.get("strategy") != "cache_aware_scoring": + continue + tags = rec.get("tags", {}) or {} + session_id = tags.get("session_id") + if not session_id: + continue + session_records[session_id].append((idx, rec)) + + rows = [] + for session_id, items in session_records.items(): + items.sort(key=lambda x: (x[1].get("ts_ms", ""), x[1].get("ts", ""), x[0])) + recs = [r for _, r in items] + hits = [int(r.get("selected_hitRatio", 0)) for r in recs] + if not hits: + continue + + non_first = hits[1:] + avg_excl_first = round(sum(non_first) / len(non_first), 1) if non_first else "-" + workers = {r.get("selected", "") for r in recs if r.get("selected")} + + prefill_urls = [] + for r in recs: + u = r.get("selected", "") + if u and u not in prefill_urls: + prefill_urls.append(u) + + switch_events = [] + sharp_drop_req_ids = [] + for i in range(1, len(recs)): + prev_r = recs[i - 1] + curr_r = recs[i] + prev_url = prev_r.get("selected", "") + curr_url = curr_r.get("selected", "") + prev_tags = prev_r.get("tags", {}) or {} + curr_tags = curr_r.get("tags", {}) or {} + prev_req = _req_id_from_tags(prev_tags, f"idx#{i}") + curr_req = _req_id_from_tags(curr_tags, f"idx#{i+1}") + + if prev_url and curr_url and prev_url != curr_url: + switch_events.append(f"{prev_req}->{curr_req} ({strip_scheme(prev_url)}→{strip_scheme(curr_url)})") + + prev_hit = int(prev_r.get("selected_hitRatio", 0)) + curr_hit = int(curr_r.get("selected_hitRatio", 0)) + if curr_hit - prev_hit <= -30: + sharp_drop_req_ids.append(f"{curr_req} ({prev_hit}%→{curr_hit}%)") + + rows.append( + { + "session": session_id, + "req_count": len(hits), + "first_hit": f"{hits[0]}%", + "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", + "max_hit": f"{max(hits)}%", + "min_hit": f"{min(hits)}%", + "all_hits": ", ".join(f"{h}%" for h in hits), + "sticky": "yes" if len(workers) <= 1 else 
"no", + "unique_workers": len(workers), + "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), + "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", + "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", + } + ) + + rows.sort(key=lambda r: (r["req_count"], r["session"]), reverse=True) + return rows + + +def summarize_session_details(rows): + """生成 session 级摘要指标。""" + if not rows: + return { + "total_sessions": 0, + "multi_req": 0, + "single_req": 0, + "sticky_multi": 0, + "non_sticky_multi": 0, + "non_first_avg": 0, + "non_first_total": 0, + } + + multi_req_rows = [r for r in rows if r["req_count"] > 1] + sticky_multi = [r for r in multi_req_rows if r["sticky"] == "yes"] + non_sticky_multi = [r for r in multi_req_rows if r["sticky"] == "no"] + + non_first_vals = [] + for r in rows: + hit_tokens = [h.strip().rstrip("%") for h in r["all_hits"].split(",") if h.strip()] + nums = [int(x) for x in hit_tokens if x.isdigit()] + if len(nums) > 1: + non_first_vals.extend(nums[1:]) + + return { + "total_sessions": len(rows), + "multi_req": len(multi_req_rows), + "single_req": len(rows) - len(multi_req_rows), + "sticky_multi": len(sticky_multi), + "non_sticky_multi": len(non_sticky_multi), + "non_first_avg": round(sum(non_first_vals) / len(non_first_vals), 2) if non_first_vals else 0, + "non_first_total": len(non_first_vals), + } diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 6d09bc1915d..066c15330d7 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -33,7 +33,9 @@ parse_stats_line, parse_ts, ) +from session_analysis import compute_session_details, summarize_session_details from stats import compute_statistics, 
count_by, time_bucket +from window_utils import merge_blank_window_rows def _strip_scheme(url): @@ -661,11 +663,15 @@ def save_detailed_report( # 每窗口明细 → 拆分到 details/ window_rows = build_per_window_rows(strategies, stats_recs) + window_rows_merged = merge_blank_window_rows(window_rows) + session_rows = compute_session_details(strategies, _strip_scheme) + session_summary = summarize_session_details(session_rows) if window_rows: # 主报告中添加引用 parts.append( - f"> 每5s窗口明细数据 ({len(window_rows)} 条): [details/per_window_data.md](details/per_window_data.md)" + f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" + " [details/per_window_data.md](details/per_window_data.md)" ) parts.append("") @@ -673,9 +679,13 @@ def save_detailed_report( details_dir = os.path.join(output_dir, "details") os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] + detail_parts.append( + "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" + ) + detail_parts.append("") detail_parts.append( render_table( - window_rows, + window_rows_merged, columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], right_align={"Scoring", "Fallback", "Total Running"}, ) @@ -686,6 +696,57 @@ def save_detailed_report( with open(detail_path, "w") as f: f.write("\n".join(detail_parts)) + if session_rows: + parts.append( + f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" + ) + parts.append("") + + session_parts = ["# Session 命中详情", ""] + session_parts.append("## 概览") + session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') + session_parts.append( + f'- Sessions with >1 request: **{session_summary["multi_req"]}**' + f' | single request: **{session_summary["single_req"]}**' + ) + if session_summary["multi_req"] > 0: + sticky_pct = round(session_summary["sticky_multi"] / session_summary["multi_req"] * 100, 1) + session_parts.append( + f'- 
Sticky (multi-request): **{session_summary["sticky_multi"]} ({sticky_pct}%)**' + f' | non-sticky: **{session_summary["non_sticky_multi"]}**' + ) + session_parts.append( + f'- Non-first request avg hit: **{session_summary["non_first_avg"]}%**' + f' (N={session_summary["non_first_total"]})' + ) + session_parts.append("") + session_parts.append("## 明细表") + session_parts.append( + render_table( + session_rows, + columns=[ + "session", + "req_count", + "first_hit", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "all_hits", + "prefill_urls", + "switch_req_pairs", + "sharp_drop_request_ids", + "sticky", + "unique_workers", + ], + right_align={"req_count", "first_hit", "avg_hit(excl_first)", "max_hit", "min_hit", "unique_workers"}, + ) + ) + session_parts.append("") + + session_path = os.path.join(details_dir, "session_hit_details.md") + with open(session_path, "w") as f: + f.write("\n".join(session_parts)) + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py new file mode 100644 index 00000000000..526fe2382ce --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +窗口明细压缩工具:合并连续空窗口,降低 per_window_data.md 噪声。 +""" + + +def _is_blank_window_row(row): + """判断是否为空窗口(无 Prefix/Session 明细值)。""" + return ( + row.get("Prefix HR") == "-" + and row.get("Session HR") == "-" + and row.get("Scoring") in {"0", 0} + and row.get("Fallback") in {"0", 0} + ) + + +def merge_blank_window_rows(rows, min_merge_len=5): + """合并连续空窗口,避免明细表被大量 '-' 行淹没。 + + 对于连续空窗口段(长度 >= min_merge_len),压缩成 3 行: + 1) 起始时间行 + 2) 合并说明行(含窗口数量) + 3) 结束时间行 + """ + if not rows: + return rows + + merged = [] + i = 0 + while i < len(rows): + if not _is_blank_window_row(rows[i]): 
+ merged.append(rows[i]) + i += 1 + continue + + j = i + while j < len(rows) and _is_blank_window_row(rows[j]): + j += 1 + + seg_len = j - i + if seg_len < min_merge_len: + merged.extend(rows[i:j]) + i = j + continue + + start_t = rows[i]["Time"] + end_t = rows[j - 1]["Time"] + merged.append( + { + "Time": start_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[i].get("Total Running", "-"), + } + ) + merged.append( + { + "Time": f"... {start_t} ~ {end_t} merged ({seg_len} windows) ...", + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": "-", + } + ) + merged.append( + { + "Time": end_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[j - 1].get("Total Running", "-"), + } + ) + i = j + + return merged From c32897d235520bbc713be8f4105bb0631e8feaab Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:11:17 +0800 Subject: [PATCH 06/40] refine merged-window format and print session detail link --- .../skills/stat-cache-hitrate/SKILL.md | 11 +- .../references/report_templates.md | 3 + .../scripts/session_analysis.py | 116 ++++++++++++++++++ .../scripts/stat_cache_hitrate.py | 70 ++++++++++- .../scripts/window_utils.py | 80 ++++++++++++ 5 files changed, 273 insertions(+), 7 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py create mode 100644 fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 6534fb332f2..f9c5156ca69 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -23,10 +23,10 @@ IMPORTANT: 执行前阅读 references/log_formats.md 
了解日志格式和解析 运行脚本前,Claude 必须先向用户确认以下参数: ### 1. 日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项 + Other 自定义输入(支持绝对路径和相对路径): - 选项 1: `logs/router.log`(默认) -- 选项 2: `fd-router.log`(golang_router 根目录) -- 选项 3: 用户通过 Other 输入自定义路径 +- 选项 2: `fd-router.log`(golang_router 根目录常用文件名) +- 选项 3: Other(用户直接输入任意路径,例如 `logs/fd-router.log`、`/home/user/logs/router.log`) **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -75,7 +75,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" ``` -默认日志路径:`logs/router.log` 或 `fd-router.log`(相对于 `fastdeploy/golang_router/`)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 +默认日志路径:`logs/router.log`(相对于 `fastdeploy/golang_router/`)。常用备选:`fd-router.log`(根目录)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate//`。 脚本会自动根据文件大小选择解析策略:小文件(<5000 行)在内存中处理,大文件用 grep + 管道流式处理。 @@ -94,7 +94,8 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 - 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 每5s窗口的完整明细数据(Prefix HR / Session HR / Scoring / Fallback / Running) +- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `details/session_hit_details.md` — 每个 session 的命中明细(`session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits`),并附带 `prefill_urls`、prefill URL 切换前后 request_id(或 req_id/trace_id)以及命中率突降 request_id ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index 7f060cacb6a..f5a0def5f55 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -72,6 +72,9 @@ URI: file:///abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md - 窗口明细: /abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + - Session 命中详情: /abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + (含 prefill_urls、worker 切换前后 request_id,以及命中率突降 request_id) ``` --- diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py new file mode 100644 index 00000000000..355ba8fc947 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +""" +Session 维度分析:聚合每个 session 的命中率、worker 切换与突降请求。 +""" + +from collections import defaultdict + + +def compute_session_details(strategies, strip_scheme): + """按 session 统计命中详情。""" + + def _req_id_from_tags(tags, fallback): + return tags.get("request_id") or tags.get("req_id") or tags.get("trace_id") or fallback + + session_records = defaultdict(list) + for idx, rec in enumerate(strategies): + if rec.get("strategy") != "cache_aware_scoring": + continue + tags = rec.get("tags", {}) or {} + session_id = tags.get("session_id") + if not session_id: + continue + session_records[session_id].append((idx, rec)) + + rows = [] + for session_id, items in session_records.items(): + items.sort(key=lambda x: (x[1].get("ts_ms", ""), x[1].get("ts", ""), x[0])) + recs = [r for _, r in items] + hits = [int(r.get("selected_hitRatio", 0)) for r in recs] + if not hits: + continue + + non_first = hits[1:] + avg_excl_first = round(sum(non_first) / len(non_first), 1) if non_first else "-" + workers = 
{r.get("selected", "") for r in recs if r.get("selected")} + + prefill_urls = [] + for r in recs: + u = r.get("selected", "") + if u and u not in prefill_urls: + prefill_urls.append(u) + + switch_events = [] + sharp_drop_req_ids = [] + for i in range(1, len(recs)): + prev_r = recs[i - 1] + curr_r = recs[i] + prev_url = prev_r.get("selected", "") + curr_url = curr_r.get("selected", "") + prev_tags = prev_r.get("tags", {}) or {} + curr_tags = curr_r.get("tags", {}) or {} + prev_req = _req_id_from_tags(prev_tags, f"idx#{i}") + curr_req = _req_id_from_tags(curr_tags, f"idx#{i+1}") + + if prev_url and curr_url and prev_url != curr_url: + switch_events.append(f"{prev_req}->{curr_req} ({strip_scheme(prev_url)}→{strip_scheme(curr_url)})") + + prev_hit = int(prev_r.get("selected_hitRatio", 0)) + curr_hit = int(curr_r.get("selected_hitRatio", 0)) + if curr_hit - prev_hit <= -30: + sharp_drop_req_ids.append(f"{curr_req} ({prev_hit}%→{curr_hit}%)") + + rows.append( + { + "session": session_id, + "req_count": len(hits), + "first_hit": f"{hits[0]}%", + "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", + "max_hit": f"{max(hits)}%", + "min_hit": f"{min(hits)}%", + "all_hits": ", ".join(f"{h}%" for h in hits), + "sticky": "yes" if len(workers) <= 1 else "no", + "unique_workers": len(workers), + "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), + "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", + "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", + } + ) + + rows.sort(key=lambda r: (r["req_count"], r["session"]), reverse=True) + return rows + + +def summarize_session_details(rows): + """生成 session 级摘要指标。""" + if not rows: + return { + "total_sessions": 0, + "multi_req": 0, + "single_req": 0, + "sticky_multi": 0, + "non_sticky_multi": 0, + "non_first_avg": 0, + "non_first_total": 0, + } + + multi_req_rows = [r for r in rows if r["req_count"] > 1] + sticky_multi = [r for r in 
multi_req_rows if r["sticky"] == "yes"] + non_sticky_multi = [r for r in multi_req_rows if r["sticky"] == "no"] + + non_first_vals = [] + for r in rows: + hit_tokens = [h.strip().rstrip("%") for h in r["all_hits"].split(",") if h.strip()] + nums = [int(x) for x in hit_tokens if x.isdigit()] + if len(nums) > 1: + non_first_vals.extend(nums[1:]) + + return { + "total_sessions": len(rows), + "multi_req": len(multi_req_rows), + "single_req": len(rows) - len(multi_req_rows), + "sticky_multi": len(sticky_multi), + "non_sticky_multi": len(non_sticky_multi), + "non_first_avg": round(sum(non_first_vals) / len(non_first_vals), 2) if non_first_vals else 0, + "non_first_total": len(non_first_vals), + } diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 6d09bc1915d..7adc0b97d02 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -33,7 +33,9 @@ parse_stats_line, parse_ts, ) +from session_analysis import compute_session_details, summarize_session_details from stats import compute_statistics, count_by, time_bucket +from window_utils import merge_blank_window_rows def _strip_scheme(url): @@ -661,11 +663,15 @@ def save_detailed_report( # 每窗口明细 → 拆分到 details/ window_rows = build_per_window_rows(strategies, stats_recs) + window_rows_merged = merge_blank_window_rows(window_rows) + session_rows = compute_session_details(strategies, _strip_scheme) + session_summary = summarize_session_details(session_rows) if window_rows: # 主报告中添加引用 parts.append( - f"> 每5s窗口明细数据 ({len(window_rows)} 条): [details/per_window_data.md](details/per_window_data.md)" + f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" + " [details/per_window_data.md](details/per_window_data.md)" ) parts.append("") 
@@ -673,9 +679,13 @@ def save_detailed_report( details_dir = os.path.join(output_dir, "details") os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] + detail_parts.append( + "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" + ) + detail_parts.append("") detail_parts.append( render_table( - window_rows, + window_rows_merged, columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], right_align={"Scoring", "Fallback", "Total Running"}, ) @@ -686,6 +696,57 @@ def save_detailed_report( with open(detail_path, "w") as f: f.write("\n".join(detail_parts)) + if session_rows: + parts.append( + f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" + ) + parts.append("") + + session_parts = ["# Session 命中详情", ""] + session_parts.append("## 概览") + session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') + session_parts.append( + f'- Sessions with >1 request: **{session_summary["multi_req"]}**' + f' | single request: **{session_summary["single_req"]}**' + ) + if session_summary["multi_req"] > 0: + sticky_pct = round(session_summary["sticky_multi"] / session_summary["multi_req"] * 100, 1) + session_parts.append( + f'- Sticky (multi-request): **{session_summary["sticky_multi"]} ({sticky_pct}%)**' + f' | non-sticky: **{session_summary["non_sticky_multi"]}**' + ) + session_parts.append( + f'- Non-first request avg hit: **{session_summary["non_first_avg"]}%**' + f' (N={session_summary["non_first_total"]})' + ) + session_parts.append("") + session_parts.append("## 明细表") + session_parts.append( + render_table( + session_rows, + columns=[ + "session", + "req_count", + "first_hit", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "all_hits", + "prefill_urls", + "switch_req_pairs", + "sharp_drop_request_ids", + "sticky", + "unique_workers", + ], + right_align={"req_count", "first_hit", "avg_hit(excl_first)", "max_hit", 
"min_hit", "unique_workers"}, + ) + ) + session_parts.append("") + + session_path = os.path.join(details_dir, "session_hit_details.md") + with open(session_path, "w") as f: + f.write("\n".join(session_parts)) + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) @@ -837,6 +898,11 @@ def main(): details_abs, details_uri = _build_path_links(details_path) print(f" - 窗口明细: {details_abs}") print(f" URI: {details_uri}") + session_detail_path = os.path.join(os.path.dirname(report_path), "details", "session_hit_details.md") + if os.path.exists(session_detail_path): + session_abs, session_uri = _build_path_links(session_detail_path) + print(f" - Session 明细: {session_abs}") + print(f" URI: {session_uri}") if args.watch: print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py new file mode 100644 index 00000000000..4ff6aa666d5 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +窗口明细压缩工具:合并连续空窗口,降低 per_window_data.md 噪声。 +""" + + +def _is_blank_window_row(row): + """判断是否为空窗口(无 Prefix/Session 明细值)。""" + return ( + row.get("Prefix HR") == "-" + and row.get("Session HR") == "-" + and row.get("Scoring") in {"0", 0} + and row.get("Fallback") in {"0", 0} + ) + + +def merge_blank_window_rows(rows, min_merge_len=5): + """合并连续空窗口,避免明细表被大量 '-' 行淹没。 + + 对于连续空窗口段(长度 >= min_merge_len),压缩成 3 行: + 1) 起始时间行 + 2) 合并说明行(含窗口数量) + 3) 结束时间行 + """ + if not rows: + return rows + + merged = [] + i = 0 + while i < len(rows): + if not _is_blank_window_row(rows[i]): + merged.append(rows[i]) + i += 1 + continue + + j = i + while j < len(rows) and _is_blank_window_row(rows[j]): + j += 1 + + seg_len = j - i + if seg_len < min_merge_len: + 
merged.extend(rows[i:j]) + i = j + continue + + start_t = rows[i]["Time"] + end_t = rows[j - 1]["Time"] + merged.append( + { + "Time": start_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[i].get("Total Running", "-"), + } + ) + merged.append( + { + "Time": "|", + "Prefix HR": "-", + "Session HR": f"merged {seg_len} windows", + "Scoring": "0", + "Fallback": "0", + "Total Running": "-", + } + ) + merged.append( + { + "Time": end_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + "Total Running": rows[j - 1].get("Total Running", "-"), + } + ) + i = j + + return merged From 37dbd7886b11f526ad0289e379d05f04000dfcc3 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:22:25 +0800 Subject: [PATCH 07/40] Improve stat-cache-hitrate UX and running metric normalization --- .../skills/stat-cache-hitrate/SKILL.md | 5 +- .../scripts/stat_cache_hitrate.py | 71 ++++++++++++------- .../scripts/window_utils.py | 8 ++- 3 files changed, 51 insertions(+), 33 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index f9c5156ca69..e7925127dec 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -23,10 +23,9 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 运行脚本前,Claude 必须先向用户确认以下参数: ### 1. 
日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项 + Other 自定义输入(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项(客户端会自动提供 Other 自定义输入): - 选项 1: `logs/router.log`(默认) - 选项 2: `fd-router.log`(golang_router 根目录常用文件名) -- 选项 3: Other(用户直接输入任意路径,例如 `logs/fd-router.log`、`/home/user/logs/router.log`) **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -95,7 +94,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 - 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 - `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `details/session_hit_details.md` — 每个 session 的命中明细(`session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits`),并附带 `prefill_urls`、prefill URL 切换前后 request_id(或 req_id/trace_id)以及命中率突降 request_id +- `details/session_hit_details.md` — 每个 session 的命中明细(TSV 单行格式,便于横向滚动查看),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 7adc0b97d02..1476e61d724 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -48,6 +48,28 @@ def _build_path_links(path): file_uri = "file://" + quote(abs_path, safe="/:-._~") return abs_path, file_uri + +def _format_half_running(total_running): + """将 stats.total_running 归一化为 prefill 口径(decode+prefill 合计 / 2)。""" + normalized = total_running / 2 + if float(normalized).is_integer(): + return str(int(normalized)) + return f"{normalized:.1f}" + + +def _render_scrollable_tsv(data, columns): + """渲染单行 TSV 文本,适合在 Markdown 查看器里横向滚动。""" + if not data: + return "```tsv\n(no data)\n```" + + def _escape(v): + 
return str(v).replace("\t", " ").replace("\n", "\\n") + + lines = ["\t".join(columns)] + for row in data: + lines.append("\t".join(_escape(row.get(col, "")) for col in columns)) + return "```tsv\n" + "\n".join(lines) + "\n```" + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -461,8 +483,8 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, parts.append( render_table( window_rows[:10], - columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], - right_align={"Scoring", "Fallback", "Total Running"}, + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running (prefill≈stats/2)"], + right_align={"Scoring", "Fallback", "Total Running (prefill≈stats/2)"}, ) ) @@ -542,7 +564,7 @@ def build_per_window_rows(strategies, stats_recs): else: session_hr = "-" - running = str(d["running"]) if d["has_running"] else "-" + running = _format_half_running(d["running"]) if d["has_running"] else "-" rows.append( { "Time": short_ts, @@ -550,7 +572,7 @@ def build_per_window_rows(strategies, stats_recs): "Session HR": session_hr, "Scoring": str(d["scoring"]), "Fallback": str(d["fallback"]), - "Total Running": running, + "Total Running (prefill≈stats/2)": running, } ) return rows @@ -686,8 +708,8 @@ def save_detailed_report( detail_parts.append( render_table( window_rows_merged, - columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running"], - right_align={"Scoring", "Fallback", "Total Running"}, + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running (prefill≈stats/2)"], + right_align={"Scoring", "Fallback", "Total Running (prefill≈stats/2)"}, ) ) detail_parts.append("") @@ -720,27 +742,22 @@ def save_detailed_report( f' (N={session_summary["non_first_total"]})' ) session_parts.append("") - session_parts.append("## 明细表") - session_parts.append( - 
render_table( - session_rows, - columns=[ - "session", - "req_count", - "first_hit", - "avg_hit(excl_first)", - "max_hit", - "min_hit", - "all_hits", - "prefill_urls", - "switch_req_pairs", - "sharp_drop_request_ids", - "sticky", - "unique_workers", - ], - right_align={"req_count", "first_hit", "avg_hit(excl_first)", "max_hit", "min_hit", "unique_workers"}, - ) - ) + session_columns = [ + "session", + "req_count", + "first_hit", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "all_hits", + "prefill_urls", + "switch_req_pairs", + "sharp_drop_request_ids", + "sticky", + "unique_workers", + ] + session_parts.append("## 明细(单行 TSV,可横向滚动)") + session_parts.append(_render_scrollable_tsv(session_rows, session_columns)) session_parts.append("") session_path = os.path.join(details_dir, "session_hit_details.md") diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py index 4ff6aa666d5..4e09710f6f9 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py @@ -3,6 +3,8 @@ 窗口明细压缩工具:合并连续空窗口,降低 per_window_data.md 噪声。 """ +RUNNING_COL = "Total Running (prefill≈stats/2)" + def _is_blank_window_row(row): """判断是否为空窗口(无 Prefix/Session 明细值)。""" @@ -52,7 +54,7 @@ def merge_blank_window_rows(rows, min_merge_len=5): "Session HR": "-", "Scoring": "0", "Fallback": "0", - "Total Running": rows[i].get("Total Running", "-"), + RUNNING_COL: rows[i].get(RUNNING_COL, "-"), } ) merged.append( @@ -62,7 +64,7 @@ def merge_blank_window_rows(rows, min_merge_len=5): "Session HR": f"merged {seg_len} windows", "Scoring": "0", "Fallback": "0", - "Total Running": "-", + RUNNING_COL: "-", } ) merged.append( @@ -72,7 +74,7 @@ def merge_blank_window_rows(rows, min_merge_len=5): "Session HR": "-", "Scoring": "0", "Fallback": "0", - "Total Running": rows[j - 
1].get("Total Running", "-"), + RUNNING_COL: rows[j - 1].get(RUNNING_COL, "-"), } ) i = j From 5791a801135df0ff05a497b4f5824b34c78bbb20 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:34:37 +0800 Subject: [PATCH 08/40] Improve skill reports: markdown session tables and timestamped output layout --- .../skills/stat-cache-hitrate/SKILL.md | 8 +- .../scripts/stat_cache_hitrate.py | 96 +++++++++++++++---- .../.claude/skills/troubleshoot/SKILL.md | 8 +- .../troubleshoot/scripts/troubleshoot.py | 26 +++-- 4 files changed, 100 insertions(+), 38 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e7925127dec..150a15a4dd7 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -50,7 +50,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 3. 
输出目录 分析结果默认保存到 `skill_output/stat-cache-hitrate//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 使用方式 @@ -92,9 +92,9 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 -- 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `details/session_hit_details.md` — 每个 session 的命中明细(TSV 单行格式,便于横向滚动查看),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` +- `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 +- `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `detail/session_hit_details.md` — 每个 session 的命中明细(Markdown 表格),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 1476e61d724..c338e46030b 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -70,6 +70,29 @@ def _escape(v): lines.append("\t".join(_escape(row.get(col, "")) for col in columns)) return "```tsv\n" + "\n".join(lines) + "\n```" + +def _render_markdown_table(data, columns, align_right=None): + """渲染 Markdown 表格,便于在终端/文档中直接阅读。""" + if not data: + return "_(no data)_" + + align_right = align_right or set() + + def _escape_md(v): + return str(v).replace("\n", "
").replace("|", "\\|") + + header = "| " + " | ".join(columns) + " |" + align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" + rows = [] + for row in data: + rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + return "\n".join([header, align] + rows) + + +def _truncate_text(v, limit=72): + s = str(v) + return s if len(s) <= limit else s[: limit - 1] + "…" + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -595,8 +618,11 @@ def save_detailed_report( 主报告包含 Per-Worker 统计和 Fallback 明细。 每窗口明细数据拆分到 details/per_window_data.md。 """ - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = os.path.join(output_dir, f"cache_hitrate_report_{timestamp}.md") + summary_dir = os.path.join(output_dir, "summary") + details_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(details_dir, exist_ok=True) + output_path = os.path.join(summary_dir, "cache_hitrate_report.md") parts = [] parts.append("# Cache Hit Rate Detailed Report") @@ -693,13 +719,11 @@ def save_detailed_report( # 主报告中添加引用 parts.append( f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" - " [details/per_window_data.md](details/per_window_data.md)" + " [../detail/per_window_data.md](../detail/per_window_data.md)" ) parts.append("") # 写入 details 子目录 - details_dir = os.path.join(output_dir, "details") - os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] detail_parts.append( "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" @@ -719,9 +743,7 @@ def save_detailed_report( f.write("\n".join(detail_parts)) if session_rows: - parts.append( - f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" - ) + parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): 
[../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") session_parts = ["# Session 命中详情", ""] @@ -742,6 +764,42 @@ def save_detailed_report( f' (N={session_summary["non_first_total"]})' ) session_parts.append("") + focus_columns = [ + "session", + "req_count", + "sticky", + "unique_workers", + "avg_hit(excl_first)", + "min_hit", + "switch_req_pairs", + "sharp_drop_request_ids", + ] + session_parts.append("## 优先排查 Session(Top 20)") + prioritized_rows = sorted( + session_rows, + key=lambda r: ( + 0 if r.get("sticky") == "no" else 1, + int(str(r.get("min_hit", "0")).rstrip("%") or 0), + -int(r.get("req_count", 0)), + ), + )[:20] + compact_rows = [] + for r in prioritized_rows: + compact_rows.append( + { + "session": r["session"], + "req_count": r["req_count"], + "sticky": r["sticky"], + "unique_workers": r["unique_workers"], + "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "min_hit": r["min_hit"], + "switch_req_pairs": _truncate_text(r["switch_req_pairs"]), + "sharp_drop_request_ids": _truncate_text(r["sharp_drop_request_ids"]), + } + ) + session_parts.append(_render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"})) + session_parts.append("") + session_columns = [ "session", "req_count", @@ -756,15 +814,20 @@ def save_detailed_report( "sticky", "unique_workers", ] - session_parts.append("## 明细(单行 TSV,可横向滚动)") - session_parts.append(_render_scrollable_tsv(session_rows, session_columns)) + session_parts.append("## 全量明细(Markdown 表格)") + session_parts.append( + _render_markdown_table( + session_rows, + session_columns, + align_right={"req_count", "unique_workers"}, + ) + ) session_parts.append("") session_path = os.path.join(details_dir, "session_hit_details.md") with open(session_path, "w") as f: f.write("\n".join(session_parts)) - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) @@ -887,13 +950,14 @@ def main(): ) # 
导出详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") + output_dir = os.path.join(output_base, run_timestamp) report_path = save_detailed_report( args.log_file, strategy_recs, @@ -910,12 +974,12 @@ def main(): report_abs, report_uri = _build_path_links(report_path) print(f" - 报告文件: {report_abs}") print(f" URI: {report_uri}") - details_path = os.path.join(os.path.dirname(report_path), "details", "per_window_data.md") + details_path = os.path.join(output_dir, "detail", "per_window_data.md") if os.path.exists(details_path): details_abs, details_uri = _build_path_links(details_path) print(f" - 窗口明细: {details_abs}") print(f" URI: {details_uri}") - session_detail_path = os.path.join(os.path.dirname(report_path), "details", "session_hit_details.md") + session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") if os.path.exists(session_detail_path): session_abs, session_uri = _build_path_links(session_detail_path) print(f" - Session 明细: {session_abs}") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index ab0c3ce7219..43ee91a46b1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -64,7 +64,7 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 ### 4. 
输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 用法 @@ -107,9 +107,9 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro ## 输出 - **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 -- **文件**:详细报告导出到 `skill_output/troubleshoot//troubleshoot_report_.md` - - 逐分钟事件详情拆分到 `details/health_events.md` - - 请求追踪事件链拆分到 `details/trace_.md` +- **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` + - 逐分钟事件详情拆分到 `detail/health_events.md` + - 请求追踪事件链拆分到 `detail/trace_.md` - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 5096c5b294a..30b9df0f443 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -190,30 +190,27 @@ def save_detailed_report(report_text, output_dir, details=None): output_dir: 输出目录 details: 详情数据 dict(来自 format_full_report) """ - os.makedirs(output_dir, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"troubleshoot_report_{timestamp}.md" - filepath = os.path.join(output_dir, filename) + summary_dir = os.path.join(output_dir, "summary") + detail_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(detail_dir, exist_ok=True) + filepath = os.path.join(summary_dir, "troubleshoot_report.md") with open(filepath, "w", encoding="utf-8") as f: f.write("# Router Troubleshooting Report\n") f.write(f'> Generated at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n') f.write(report_text) - # 保存详情到 details/ 子目录 + # 保存详情到 detail/ 子目录 if details: - details_dir = os.path.join(output_dir, "details") - if details.get("health_events"): - 
os.makedirs(details_dir, exist_ok=True) - health_path = os.path.join(details_dir, "health_events.md") + health_path = os.path.join(detail_dir, "health_events.md") with open(health_path, "w", encoding="utf-8") as f: f.write(details["health_events"]) for trace_id, trace_text in details.get("trace_files", {}).items(): - os.makedirs(details_dir, exist_ok=True) safe_id = trace_id.replace("/", "_") - trace_path = os.path.join(details_dir, f"trace_{safe_id}.md") + trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") with open(trace_path, "w", encoding="utf-8") as f: f.write(trace_text) @@ -327,13 +324,14 @@ def main(): print(report) # 保存详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "troubleshoot", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "troubleshoot") + output_dir = os.path.join(output_base, run_timestamp) filepath = save_detailed_report(report, output_dir, details=details) print(f"\n详细报告已保存到: {filepath}", file=sys.stderr) From fe6403165257a1c7443bb19aedfa863ffd1902de Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 21:50:01 +0800 Subject: [PATCH 09/40] Refine session detail output: indexed IDs, trace fallback, and switch links --- .../skills/stat-cache-hitrate/SKILL.md | 8 +- .../scripts/session_analysis.py | 13 +- .../scripts/stat_cache_hitrate.py | 147 ++++++++++++++---- .../.claude/skills/troubleshoot/SKILL.md | 8 +- .../troubleshoot/scripts/troubleshoot.py | 26 ++-- 5 files changed, 147 insertions(+), 55 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e7925127dec..e07281576a6 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -50,7 +50,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 3. 输出目录 分析结果默认保存到 `skill_output/stat-cache-hitrate//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 使用方式 @@ -92,9 +92,9 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 -- 主报告 `cache_hitrate_report_*.md` — Per-Worker 统计 + Fallback 明细 -- `details/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `details/session_hit_details.md` — 每个 session 的命中明细(TSV 单行格式,便于横向滚动查看),包含 `session / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls / switch_req_pairs / sharp_drop_request_ids` +- `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 +- `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(可跳转)」。 ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py index 355ba8fc947..f7b4caed542 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -7,7 +7,7 @@ def compute_session_details(strategies, strip_scheme): - """按 session 统计命中详情。""" + """按 session_id(优先)或 trace_id(兜底)统计命中详情。""" def _req_id_from_tags(tags, 
fallback): return tags.get("request_id") or tags.get("req_id") or tags.get("trace_id") or fallback @@ -18,12 +18,14 @@ def _req_id_from_tags(tags, fallback): continue tags = rec.get("tags", {}) or {} session_id = tags.get("session_id") - if not session_id: + trace_id = tags.get("trace_id") + identity = session_id or trace_id + if not identity: continue - session_records[session_id].append((idx, rec)) + session_records[identity].append((idx, rec)) rows = [] - for session_id, items in session_records.items(): + for identity, items in session_records.items(): items.sort(key=lambda x: (x[1].get("ts_ms", ""), x[1].get("ts", ""), x[0])) recs = [r for _, r in items] hits = [int(r.get("selected_hitRatio", 0)) for r in recs] @@ -62,7 +64,8 @@ def _req_id_from_tags(tags, fallback): rows.append( { - "session": session_id, + "session": identity, + "id_type": "session_id" if recs[0].get("tags", {}).get("session_id") else "trace_id", "req_count": len(hits), "first_hit": f"{hits[0]}%", "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 1476e61d724..b5adcb9bd5f 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -70,6 +70,33 @@ def _escape(v): lines.append("\t".join(_escape(row.get(col, "")) for col in columns)) return "```tsv\n" + "\n".join(lines) + "\n```" + +def _render_markdown_table(data, columns, align_right=None): + """渲染 Markdown 表格,便于在终端/文档中直接阅读。""" + if not data: + return "_(no data)_" + + align_right = align_right or set() + + def _escape_md(v): + return str(v).replace("\n", "
").replace("|", "\\|") + + header = "| " + " | ".join(columns) + " |" + align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" + rows = [] + for row in data: + rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + return "\n".join([header, align] + rows) + + +def _truncate_text(v, limit=72): + s = str(v) + return s if len(s) <= limit else s[: limit - 1] + "…" + + +def _seq_label(n): + return f"S{n:03d}" + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -234,7 +261,6 @@ def compute_session_hitrate(stats_recs, inference_count): total_total = sum(r.get("total", 0) for r in stats_recs) session_hr = round(total_hits / total_total * 100, 1) if total_total else 0 - coverage = round(total_total / inference_count * 100, 1) if inference_count else 0 # 趋势:每个窗口的 hits/total trend = time_bucket(stats_recs, "auto", [("hits", "sum"), ("total", "sum")]) @@ -247,7 +273,6 @@ def compute_session_hitrate(stats_recs, inference_count): "rate": session_hr, "hits": total_hits, "total": total_total, - "coverage": coverage, "inference_count": inference_count, "trend": trend, } @@ -430,8 +455,6 @@ def format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, # 2. Session Hit Rate parts.append("### 2. 
Session Hit Rate (请求级路由粘性)") parts.append(f' 累计: {session_hr["rate"]}% (hits={session_hr["hits"]} / total={session_hr["total"]})') - parts.append(f' 覆盖率: {session_hr["coverage"]}% 的推理请求带 session_id') - trend_str = _quartile_trend(session_hr["trend"], "value") if trend_str: parts.append(f" 趋势: {trend_str}") @@ -498,10 +521,7 @@ def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): parts.append(f"**File**: {filepath} | **tail {line_count} lines**") parts.append("") parts.append(f' Prefix Hit Ratio: {prefix_hr["mean"]}% (avg) | Cold start: {prefix_hr["cold_start_rate"]}%') - parts.append( - f' Session Hit Rate: {session_hr["rate"]}% (hits={session_hr["hits"]}/total={session_hr["total"]})' - f' | Coverage: {session_hr["coverage"]}%' - ) + parts.append(f' Session Hit Rate: {session_hr["rate"]}% (hits={session_hr["hits"]}/total={session_hr["total"]})') parts.append( f' Strategy: scoring {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' f' | fallback {scheduling["fallback_count"]}' @@ -595,8 +615,11 @@ def save_detailed_report( 主报告包含 Per-Worker 统计和 Fallback 明细。 每窗口明细数据拆分到 details/per_window_data.md。 """ - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_path = os.path.join(output_dir, f"cache_hitrate_report_{timestamp}.md") + summary_dir = os.path.join(output_dir, "summary") + details_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(details_dir, exist_ok=True) + output_path = os.path.join(summary_dir, "cache_hitrate_report.md") parts = [] parts.append("# Cache Hit Rate Detailed Report") @@ -639,7 +662,6 @@ def save_detailed_report( parts.append("### Session Hit Rate") parts.append(f'- 累计: **{session_hr["rate"]}%** (hits={session_hr["hits"]}/total={session_hr["total"]})') - parts.append(f'- 覆盖率: **{session_hr["coverage"]}%**') trend_str = _quartile_trend(session_hr["trend"], "value") if trend_str: parts.append(f"- 趋势: {trend_str}") @@ -693,13 +715,11 @@ def 
save_detailed_report( # 主报告中添加引用 parts.append( f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" - " [details/per_window_data.md](details/per_window_data.md)" + " [../detail/per_window_data.md](../detail/per_window_data.md)" ) parts.append("") # 写入 details 子目录 - details_dir = os.path.join(output_dir, "details") - os.makedirs(details_dir, exist_ok=True) detail_parts = ["# 每5s窗口明细数据", ""] detail_parts.append( "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" @@ -719,9 +739,7 @@ def save_detailed_report( f.write("\n".join(detail_parts)) if session_rows: - parts.append( - f"> Session 命中详情 ({len(session_rows)} sessions): [details/session_hit_details.md](details/session_hit_details.md)" - ) + parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") session_parts = ["# Session 命中详情", ""] @@ -742,29 +760,101 @@ def save_detailed_report( f' (N={session_summary["non_first_total"]})' ) session_parts.append("") + focus_columns = [ + "id", + "req_count", + "id_type", + "sticky", + "unique_workers", + "avg_hit(excl_first)", + "max_hit", + "min_hit", + "switch_reqids", + ] + session_parts.append("## 优先排查 Session(Top 20)") + prioritized_rows = sorted( + session_rows, + key=lambda r: ( + 0 if r.get("sticky") == "no" else 1, + int(str(r.get("min_hit", "0")).rstrip("%") or 0), + -int(r.get("req_count", 0)), + ), + )[:20] + compact_rows = [] + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + + for r in prioritized_rows: + sid = seq_map.get(r["session"], "-") + compact_rows.append( + { + "id": sid, + "req_count": r["req_count"], + "id_type": r.get("id_type", "session_id"), + "sticky": r["sticky"], + "unique_workers": r["unique_workers"], + "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "max_hit": 
r["max_hit"], + "min_hit": r["min_hit"], + "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", + } + ) + session_parts.append( + _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"}) + ) + session_parts.append("") + session_columns = [ - "session", + "id", "req_count", + "id_type", "first_hit", "avg_hit(excl_first)", "max_hit", "min_hit", "all_hits", "prefill_urls", - "switch_req_pairs", - "sharp_drop_request_ids", "sticky", "unique_workers", ] - session_parts.append("## 明细(单行 TSV,可横向滚动)") - session_parts.append(_render_scrollable_tsv(session_rows, session_columns)) + session_parts.append("## 全量明细(Markdown 表格)") + session_parts.append( + _render_markdown_table( + all_rows_with_seq, + session_columns, + align_right={"req_count", "unique_workers"}, + ) + ) + session_parts.append("") + + session_parts.append("## 序号与会话ID映射") + map_rows = [ + { + "id": r["id"], + "id_type": r.get("id_type", "session_id"), + "session_or_trace_id": r["session"], + } + for r in all_rows_with_seq + ] + session_parts.append(_render_markdown_table(map_rows, ["id", "id_type", "session_or_trace_id"])) session_parts.append("") + session_parts.append("## 切换 reqid 明细(可跳转)") + for r in all_rows_with_seq: + session_parts.append(f'### switch-{r["id"].lower()}') + session_parts.append(f'- ID: **{r["id"]}**') + session_parts.append(f'- 会话标识: `{r["session"]}` ({r.get("id_type", "session_id")})') + session_parts.append(f'- switch_req_pairs: {r["switch_req_pairs"]}') + session_parts.append(f'- sharp_drop_request_ids: {r["sharp_drop_request_ids"]}') + session_parts.append("") + session_path = os.path.join(details_dir, "session_hit_details.md") with open(session_path, "w") as f: f.write("\n".join(session_parts)) - os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) with open(output_path, "w") as f: f.write("\n".join(parts)) @@ -887,13 +977,14 @@ def main(): ) # 导出详细报告 + run_timestamp = 
datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") + output_dir = os.path.join(output_base, run_timestamp) report_path = save_detailed_report( args.log_file, strategy_recs, @@ -910,12 +1001,12 @@ def main(): report_abs, report_uri = _build_path_links(report_path) print(f" - 报告文件: {report_abs}") print(f" URI: {report_uri}") - details_path = os.path.join(os.path.dirname(report_path), "details", "per_window_data.md") + details_path = os.path.join(output_dir, "detail", "per_window_data.md") if os.path.exists(details_path): details_abs, details_uri = _build_path_links(details_path) print(f" - 窗口明细: {details_abs}") print(f" URI: {details_uri}") - session_detail_path = os.path.join(os.path.dirname(report_path), "details", "session_hit_details.md") + session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") if os.path.exists(session_detail_path): session_abs, session_uri = _build_path_links(session_detail_path) print(f" - Session 明细: {session_abs}") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index ab0c3ce7219..43ee91a46b1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -64,7 +64,7 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 ### 4. 
输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 -用户可通过 `--output` 指定自定义目录。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 ## 用法 @@ -107,9 +107,9 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro ## 输出 - **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 -- **文件**:详细报告导出到 `skill_output/troubleshoot//troubleshoot_report_.md` - - 逐分钟事件详情拆分到 `details/health_events.md` - - 请求追踪事件链拆分到 `details/trace_.md` +- **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` + - 逐分钟事件详情拆分到 `detail/health_events.md` + - 请求追踪事件链拆分到 `detail/trace_.md` - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 5096c5b294a..30b9df0f443 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -190,30 +190,27 @@ def save_detailed_report(report_text, output_dir, details=None): output_dir: 输出目录 details: 详情数据 dict(来自 format_full_report) """ - os.makedirs(output_dir, exist_ok=True) - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - filename = f"troubleshoot_report_{timestamp}.md" - filepath = os.path.join(output_dir, filename) + summary_dir = os.path.join(output_dir, "summary") + detail_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(detail_dir, exist_ok=True) + filepath = os.path.join(summary_dir, "troubleshoot_report.md") with open(filepath, "w", encoding="utf-8") as f: f.write("# Router Troubleshooting Report\n") f.write(f'> Generated at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n') f.write(report_text) - # 保存详情到 details/ 子目录 + # 保存详情到 detail/ 子目录 if details: - details_dir = os.path.join(output_dir, "details") - if details.get("health_events"): - 
os.makedirs(details_dir, exist_ok=True) - health_path = os.path.join(details_dir, "health_events.md") + health_path = os.path.join(detail_dir, "health_events.md") with open(health_path, "w", encoding="utf-8") as f: f.write(details["health_events"]) for trace_id, trace_text in details.get("trace_files", {}).items(): - os.makedirs(details_dir, exist_ok=True) safe_id = trace_id.replace("/", "_") - trace_path = os.path.join(details_dir, f"trace_{safe_id}.md") + trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") with open(trace_path, "w", encoding="utf-8") as f: f.write(trace_text) @@ -327,13 +324,14 @@ def main(): print(report) # 保存详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if args.output: - output_dir = args.output + output_base = args.output else: script_dir = os.path.dirname(os.path.abspath(__file__)) golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - output_dir = os.path.join(golang_router_root, "skill_output", "troubleshoot", run_timestamp) + output_base = os.path.join(golang_router_root, "skill_output", "troubleshoot") + output_dir = os.path.join(output_base, run_timestamp) filepath = save_detailed_report(report, output_dir, details=details) print(f"\n详细报告已保存到: {filepath}", file=sys.stderr) From 81be2a24d2db96470480475457f01ad60581df63 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 22:50:29 +0800 Subject: [PATCH 10/40] Improve session detail markdown id_type summary and table alignment --- .../scripts/stat_cache_hitrate.py | 86 +++++++++++++++++-- 1 file changed, 77 insertions(+), 9 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index b5adcb9bd5f..bd055393637 100644 --- 
a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -81,11 +81,34 @@ def _render_markdown_table(data, columns, align_right=None): def _escape_md(v): return str(v).replace("\n", "
").replace("|", "\\|") - header = "| " + " | ".join(columns) + " |" - align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" - rows = [] + matrix = [] for row in data: - rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + matrix.append([_escape_md(row.get(c, "")) for c in columns]) + + widths = [] + for i, col in enumerate(columns): + max_cell = max((len(r[i]) for r in matrix), default=0) + widths.append(max(len(col), max_cell)) + + def _format_cell(text, width, right=False): + return text.rjust(width) if right else text.ljust(width) + + header_cells = [_format_cell(c, widths[i]) for i, c in enumerate(columns)] + header = "| " + " | ".join(header_cells) + " |" + + align_cells = [] + for i, c in enumerate(columns): + w = max(widths[i], 3) + if c in align_right: + align_cells.append("-" * (w - 1) + ":") + else: + align_cells.append(":" + "-" * (w - 1)) + align = "| " + " | ".join(align_cells) + " |" + + rows = [] + for row_cells in matrix: + padded = [_format_cell(cell, widths[i], right=(columns[i] in align_right)) for i, cell in enumerate(row_cells)] + rows.append("| " + " | ".join(padded) + " |") return "\n".join([header, align] + rows) @@ -97,6 +120,36 @@ def _truncate_text(v, limit=72): def _seq_label(n): return f"S{n:03d}" + +def _extract_seq_num(seq_id): + return int(str(seq_id).lstrip("S") or 0) + + +def _summarize_id_type_ranges(rows_with_seq): + """基于序号连续区间汇总 id_type,便于在报告开头快速识别口径。""" + if not rows_with_seq: + return [] + + ranges = [] + current_type = rows_with_seq[0].get("id_type", "session_id") + start_id = rows_with_seq[0]["id"] + end_id = start_id + + for row in rows_with_seq[1:]: + row_type = row.get("id_type", "session_id") + row_id = row["id"] + if row_type == current_type and _extract_seq_num(row_id) == _extract_seq_num(end_id) + 1: + end_id = row_id + continue + + ranges.append((start_id, end_id, current_type)) + current_type = row_type + start_id = row_id + end_id = row_id + + 
ranges.append((start_id, end_id, current_type)) + return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -742,7 +795,27 @@ def save_detailed_report( parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + id_type_ranges = _summarize_id_type_ranges(all_rows_with_seq) + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + session_parts = ["# Session 命中详情", ""] + session_parts.append("## id_type 摘要") + if len(id_type_ranges) == 1: + start_id, end_id, id_type = id_type_ranges[0] + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}`") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}`") + else: + for start_id, end_id, id_type in id_type_ranges: + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}`") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}`") + session_parts.append("") session_parts.append("## 概览") session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') session_parts.append( @@ -781,11 +854,6 @@ def save_detailed_report( ), )[:20] compact_rows = [] - all_rows_with_seq = [] - for i, r in enumerate(session_rows, start=1): - all_rows_with_seq.append({**r, "id": _seq_label(i)}) - - seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} for r in prioritized_rows: sid = seq_map.get(r["session"], "-") From b8f12c42917150ba5ebb48f5b0e067534f42698e Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 23:13:06 +0800 Subject: [PATCH 11/40] Simplify time-range prompt flow in stat-cache-hitrate skill --- .../skills/stat-cache-hitrate/SKILL.md | 19 +-- .../scripts/session_analysis.py | 
3 + .../scripts/stat_cache_hitrate.py | 133 +++++++++++++++--- 3 files changed, 124 insertions(+), 31 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e07281576a6..251cbb04c2a 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -4,7 +4,7 @@ description: > 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 - 持续监控模式。 + 持续监控模式、指定时间段统计(--start/--end)。 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 @@ -35,12 +35,15 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 如果用户直接确认或未指定路径,使用默认值 `logs/router.log`。 ### 2. 分析模式 -向用户询问分析模式: -> "请选择分析模式: -> 1. **全量统计**(默认)— 扫描完整日志 -> 2. **快速查看尾部** — 只看最近的数据(可指定行数如 2000 或时间如 30m) -> 3. **持续监控** — 全量分析后提示监控命令 -> 4. 
**指定时间段** — 分析特定时间范围(如 `--start "16:00" --end "17:00"`)" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): +- 选项 1: `全量统计(默认)` — 扫描完整日志 +- 选项 2: `快速查看尾部` — 只看最近的数据(可指定行数如 2000 或时间如 30m) +- 选项 3: `持续监控` — 全量分析后提示监控命令 +- 选项 4: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) + +若用户选择“指定时间段”,直接让用户填写: +- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +- 然后映射为 `--start/--end` 参数执行。 如果用户未选择,默认使用全量统计。 @@ -94,7 +97,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 - `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 - `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(可跳转)」。 +- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg-hit(=去首请求平均命中率) / max_hit / min_hit / all_hits / purl_cnt / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(含 session 时间段,可跳转)」。 ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py index f7b4caed542..7de5b7f6042 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -66,6 +66,8 @@ def _req_id_from_tags(tags, fallback): { "session": identity, "id_type": "session_id" if recs[0].get("tags", {}).get("session_id") else "trace_id", + "first_ts": recs[0].get("ts", "-"), + "last_ts": recs[-1].get("ts", "-"), "req_count": len(hits), "first_hit": f"{hits[0]}%", "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", @@ -74,6 +76,7 @@ def _req_id_from_tags(tags, fallback): "all_hits": ", 
".join(f"{h}%" for h in hits), "sticky": "yes" if len(workers) <= 1 else "no", "unique_workers": len(workers), + "prefill_url_count": len(prefill_urls), "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index b5adcb9bd5f..fb6b45b56fa 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -81,11 +81,34 @@ def _render_markdown_table(data, columns, align_right=None): def _escape_md(v): return str(v).replace("\n", "
").replace("|", "\\|") - header = "| " + " | ".join(columns) + " |" - align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" - rows = [] + matrix = [] for row in data: - rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + matrix.append([_escape_md(row.get(c, "")) for c in columns]) + + widths = [] + for i, col in enumerate(columns): + max_cell = max((len(r[i]) for r in matrix), default=0) + widths.append(max(len(col), max_cell)) + + def _format_cell(text, width, right=False): + return text.rjust(width) if right else text.ljust(width) + + header_cells = [_format_cell(c, widths[i]) for i, c in enumerate(columns)] + header = "| " + " | ".join(header_cells) + " |" + + align_cells = [] + for i, c in enumerate(columns): + w = max(widths[i], 3) + if c in align_right: + align_cells.append("-" * (w - 1) + ":") + else: + align_cells.append(":" + "-" * (w - 1)) + align = "| " + " | ".join(align_cells) + " |" + + rows = [] + for row_cells in matrix: + padded = [_format_cell(cell, widths[i], right=(columns[i] in align_right)) for i, cell in enumerate(row_cells)] + rows.append("| " + " | ".join(padded) + " |") return "\n".join([header, align] + rows) @@ -97,6 +120,41 @@ def _truncate_text(v, limit=72): def _seq_label(n): return f"S{n:03d}" + +def _extract_seq_num(seq_id): + return int(str(seq_id).lstrip("S") or 0) + + +def _summarize_id_type_ranges(rows_with_seq): + """基于序号连续区间汇总 id_type,便于在报告开头快速识别口径。""" + if not rows_with_seq: + return [] + + ranges = [] + current_type = rows_with_seq[0].get("id_type", "session_id") + start_id = rows_with_seq[0]["id"] + end_id = start_id + start_ts = rows_with_seq[0].get("first_ts", "-") + end_ts = rows_with_seq[0].get("last_ts", "-") + + for row in rows_with_seq[1:]: + row_type = row.get("id_type", "session_id") + row_id = row["id"] + if row_type == current_type and _extract_seq_num(row_id) == _extract_seq_num(end_id) + 1: + end_id = row_id + end_ts = row.get("last_ts", 
end_ts) + continue + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + current_type = row_type + start_id = row_id + end_id = row_id + start_ts = row.get("first_ts", "-") + end_ts = row.get("last_ts", "-") + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -742,8 +800,36 @@ def save_detailed_report( parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + id_type_ranges = _summarize_id_type_ranges(all_rows_with_seq) + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + ts_starts = [r.get("first_ts", "-") for r in all_rows_with_seq if r.get("first_ts", "-") != "-"] + ts_ends = [r.get("last_ts", "-") for r in all_rows_with_seq if r.get("last_ts", "-") != "-"] + session_parts = ["# Session 命中详情", ""] + overall_start_ts = min(ts_starts) if ts_starts else "-" + overall_end_ts = max(ts_ends) if ts_ends else "-" + session_parts.append("## 时间范围") + session_parts.append(f"- 分析覆盖时间段: `{overall_start_ts} ~ {overall_end_ts}`") + session_parts.append("") + session_parts.append("## id_type 摘要") + if len(id_type_ranges) == 1: + start_id, end_id, id_type, range_start_ts, range_end_ts = id_type_ranges[0] + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + for start_id, end_id, id_type, range_start_ts, range_end_ts in id_type_ranges: + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + 
session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + session_parts.append("") session_parts.append("## 概览") + session_parts.append("- 字段说明:`avg-hit` = `avg_hit(excl_first)`(去除首请求后的平均命中率)") session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') session_parts.append( f'- Sessions with >1 request: **{session_summary["multi_req"]}**' @@ -763,10 +849,9 @@ def save_detailed_report( focus_columns = [ "id", "req_count", - "id_type", "sticky", - "unique_workers", - "avg_hit(excl_first)", + "purl_cnt", + "avg-hit", "max_hit", "min_hit", "switch_reqids", @@ -781,11 +866,6 @@ def save_detailed_report( ), )[:20] compact_rows = [] - all_rows_with_seq = [] - for i, r in enumerate(session_rows, start=1): - all_rows_with_seq.append({**r, "id": _seq_label(i)}) - - seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} for r in prioritized_rows: sid = seq_map.get(r["session"], "-") @@ -793,39 +873,46 @@ def save_detailed_report( { "id": sid, "req_count": r["req_count"], - "id_type": r.get("id_type", "session_id"), "sticky": r["sticky"], - "unique_workers": r["unique_workers"], - "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], "max_hit": r["max_hit"], "min_hit": r["min_hit"], "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", } ) session_parts.append( - _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"}) + _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "purl_cnt"}) ) session_parts.append("") session_columns = [ "id", "req_count", - "id_type", "first_hit", - "avg_hit(excl_first)", + "avg-hit", "max_hit", "min_hit", "all_hits", + "purl_cnt", "prefill_urls", "sticky", - "unique_workers", ] + all_rows_for_table = [] + for r in all_rows_with_seq: + all_rows_for_table.append( + { + **r, + "avg-hit": 
r["avg_hit(excl_first)"], + "purl_cnt": r.get("prefill_url_count", 0), + } + ) session_parts.append("## 全量明细(Markdown 表格)") session_parts.append( _render_markdown_table( - all_rows_with_seq, + all_rows_for_table, session_columns, - align_right={"req_count", "unique_workers"}, + align_right={"req_count", "purl_cnt"}, ) ) session_parts.append("") @@ -834,12 +921,11 @@ def save_detailed_report( map_rows = [ { "id": r["id"], - "id_type": r.get("id_type", "session_id"), "session_or_trace_id": r["session"], } for r in all_rows_with_seq ] - session_parts.append(_render_markdown_table(map_rows, ["id", "id_type", "session_or_trace_id"])) + session_parts.append(_render_markdown_table(map_rows, ["id", "session_or_trace_id"])) session_parts.append("") session_parts.append("## 切换 reqid 明细(可跳转)") @@ -847,6 +933,7 @@ def save_detailed_report( session_parts.append(f'### switch-{r["id"].lower()}') session_parts.append(f'- ID: **{r["id"]}**') session_parts.append(f'- 会话标识: `{r["session"]}` ({r.get("id_type", "session_id")})') + session_parts.append(f'- 时间段: `{r.get("first_ts", "-")} ~ {r.get("last_ts", "-")}`') session_parts.append(f'- switch_req_pairs: {r["switch_req_pairs"]}') session_parts.append(f'- sharp_drop_request_ids: {r["sharp_drop_request_ids"]}') session_parts.append("") From 8564f9cd62ed26f32559d115483a3ca1796d9049 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Sun, 12 Apr 2026 23:19:42 +0800 Subject: [PATCH 12/40] Unify full session detail table columns with Top20 --- .../skills/stat-cache-hitrate/SKILL.md | 19 ++- .../scripts/session_analysis.py | 3 + .../scripts/stat_cache_hitrate.py | 148 ++++++++++++++---- 3 files changed, 129 insertions(+), 41 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index e07281576a6..251cbb04c2a 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -4,7 +4,7 @@ description: > 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 - 持续监控模式。 + 持续监控模式、指定时间段统计(--start/--end)。 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 @@ -35,12 +35,15 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 如果用户直接确认或未指定路径,使用默认值 `logs/router.log`。 ### 2. 分析模式 -向用户询问分析模式: -> "请选择分析模式: -> 1. **全量统计**(默认)— 扫描完整日志 -> 2. **快速查看尾部** — 只看最近的数据(可指定行数如 2000 或时间如 30m) -> 3. **持续监控** — 全量分析后提示监控命令 -> 4. **指定时间段** — 分析特定时间范围(如 `--start "16:00" --end "17:00"`)" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): +- 选项 1: `全量统计(默认)` — 扫描完整日志 +- 选项 2: `快速查看尾部` — 只看最近的数据(可指定行数如 2000 或时间如 30m) +- 选项 3: `持续监控` — 全量分析后提示监控命令 +- 选项 4: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) + +若用户选择“指定时间段”,直接让用户填写: +- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +- 然后映射为 `--start/--end` 参数执行。 如果用户未选择,默认使用全量统计。 @@ -94,7 +97,7 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 - `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 - `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) -- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg_hit(excl_first) / max_hit / min_hit / all_hits / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(可跳转)」。 +- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg-hit(=去首请求平均命中率) / max_hit / min_hit / all_hits / purl_cnt / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(含 session 时间段,可跳转)」。 ### 交叉诊断矩阵 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py index f7b4caed542..7de5b7f6042 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -66,6 +66,8 @@ def _req_id_from_tags(tags, fallback): { "session": identity, "id_type": "session_id" if recs[0].get("tags", {}).get("session_id") else "trace_id", + "first_ts": recs[0].get("ts", "-"), + "last_ts": recs[-1].get("ts", "-"), "req_count": len(hits), "first_hit": f"{hits[0]}%", "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", @@ -74,6 +76,7 @@ def _req_id_from_tags(tags, fallback): "all_hits": ", ".join(f"{h}%" for h in hits), "sticky": "yes" if len(workers) <= 1 else "no", "unique_workers": len(workers), + "prefill_url_count": len(prefill_urls), "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index b5adcb9bd5f..bd85730b7d1 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -81,11 +81,34 @@ def _render_markdown_table(data, columns, align_right=None): def _escape_md(v): return str(v).replace("\n", "
").replace("|", "\\|") - header = "| " + " | ".join(columns) + " |" - align = "| " + " | ".join("---:" if c in align_right else "---" for c in columns) + " |" - rows = [] + matrix = [] for row in data: - rows.append("| " + " | ".join(_escape_md(row.get(c, "")) for c in columns) + " |") + matrix.append([_escape_md(row.get(c, "")) for c in columns]) + + widths = [] + for i, col in enumerate(columns): + max_cell = max((len(r[i]) for r in matrix), default=0) + widths.append(max(len(col), max_cell)) + + def _format_cell(text, width, right=False): + return text.rjust(width) if right else text.ljust(width) + + header_cells = [_format_cell(c, widths[i]) for i, c in enumerate(columns)] + header = "| " + " | ".join(header_cells) + " |" + + align_cells = [] + for i, c in enumerate(columns): + w = max(widths[i], 3) + if c in align_right: + align_cells.append("-" * (w - 1) + ":") + else: + align_cells.append(":" + "-" * (w - 1)) + align = "| " + " | ".join(align_cells) + " |" + + rows = [] + for row_cells in matrix: + padded = [_format_cell(cell, widths[i], right=(columns[i] in align_right)) for i, cell in enumerate(row_cells)] + rows.append("| " + " | ".join(padded) + " |") return "\n".join([header, align] + rows) @@ -97,6 +120,41 @@ def _truncate_text(v, limit=72): def _seq_label(n): return f"S{n:03d}" + +def _extract_seq_num(seq_id): + return int(str(seq_id).lstrip("S") or 0) + + +def _summarize_id_type_ranges(rows_with_seq): + """基于序号连续区间汇总 id_type,便于在报告开头快速识别口径。""" + if not rows_with_seq: + return [] + + ranges = [] + current_type = rows_with_seq[0].get("id_type", "session_id") + start_id = rows_with_seq[0]["id"] + end_id = start_id + start_ts = rows_with_seq[0].get("first_ts", "-") + end_ts = rows_with_seq[0].get("last_ts", "-") + + for row in rows_with_seq[1:]: + row_type = row.get("id_type", "session_id") + row_id = row["id"] + if row_type == current_type and _extract_seq_num(row_id) == _extract_seq_num(end_id) + 1: + end_id = row_id + end_ts = row.get("last_ts", 
end_ts) + continue + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + current_type = row_type + start_id = row_id + end_id = row_id + start_ts = row.get("first_ts", "-") + end_ts = row.get("last_ts", "-") + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -742,8 +800,36 @@ def save_detailed_report( parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") parts.append("") + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + id_type_ranges = _summarize_id_type_ranges(all_rows_with_seq) + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + ts_starts = [r.get("first_ts", "-") for r in all_rows_with_seq if r.get("first_ts", "-") != "-"] + ts_ends = [r.get("last_ts", "-") for r in all_rows_with_seq if r.get("last_ts", "-") != "-"] + session_parts = ["# Session 命中详情", ""] + overall_start_ts = min(ts_starts) if ts_starts else "-" + overall_end_ts = max(ts_ends) if ts_ends else "-" + session_parts.append("## 时间范围") + session_parts.append(f"- 分析覆盖时间段: `{overall_start_ts} ~ {overall_end_ts}`") + session_parts.append("") + session_parts.append("## id_type 摘要") + if len(id_type_ranges) == 1: + start_id, end_id, id_type, range_start_ts, range_end_ts = id_type_ranges[0] + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + for start_id, end_id, id_type, range_start_ts, range_end_ts in id_type_ranges: + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + 
session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + session_parts.append("") session_parts.append("## 概览") + session_parts.append("- 字段说明:`avg-hit` = `avg_hit(excl_first)`(去除首请求后的平均命中率)") session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') session_parts.append( f'- Sessions with >1 request: **{session_summary["multi_req"]}**' @@ -763,10 +849,9 @@ def save_detailed_report( focus_columns = [ "id", "req_count", - "id_type", "sticky", - "unique_workers", - "avg_hit(excl_first)", + "purl_cnt", + "avg-hit", "max_hit", "min_hit", "switch_reqids", @@ -781,11 +866,6 @@ def save_detailed_report( ), )[:20] compact_rows = [] - all_rows_with_seq = [] - for i, r in enumerate(session_rows, start=1): - all_rows_with_seq.append({**r, "id": _seq_label(i)}) - - seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} for r in prioritized_rows: sid = seq_map.get(r["session"], "-") @@ -793,39 +873,41 @@ def save_detailed_report( { "id": sid, "req_count": r["req_count"], - "id_type": r.get("id_type", "session_id"), "sticky": r["sticky"], - "unique_workers": r["unique_workers"], - "avg_hit(excl_first)": r["avg_hit(excl_first)"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], "max_hit": r["max_hit"], "min_hit": r["min_hit"], "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", } ) session_parts.append( - _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "unique_workers"}) + _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "purl_cnt"}) ) session_parts.append("") - session_columns = [ - "id", - "req_count", - "id_type", - "first_hit", - "avg_hit(excl_first)", - "max_hit", - "min_hit", - "all_hits", - "prefill_urls", - "sticky", - "unique_workers", - ] + session_columns = focus_columns + all_rows_for_table = [] + for r in all_rows_with_seq: + sid = r["id"] + 
all_rows_for_table.append( + { + "id": sid, + "req_count": r["req_count"], + "sticky": r["sticky"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], + "max_hit": r["max_hit"], + "min_hit": r["min_hit"], + "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", + } + ) session_parts.append("## 全量明细(Markdown 表格)") session_parts.append( _render_markdown_table( - all_rows_with_seq, + all_rows_for_table, session_columns, - align_right={"req_count", "unique_workers"}, + align_right={"req_count", "purl_cnt"}, ) ) session_parts.append("") @@ -834,12 +916,11 @@ def save_detailed_report( map_rows = [ { "id": r["id"], - "id_type": r.get("id_type", "session_id"), "session_or_trace_id": r["session"], } for r in all_rows_with_seq ] - session_parts.append(_render_markdown_table(map_rows, ["id", "id_type", "session_or_trace_id"])) + session_parts.append(_render_markdown_table(map_rows, ["id", "session_or_trace_id"])) session_parts.append("") session_parts.append("## 切换 reqid 明细(可跳转)") @@ -847,6 +928,7 @@ def save_detailed_report( session_parts.append(f'### switch-{r["id"].lower()}') session_parts.append(f'- ID: **{r["id"]}**') session_parts.append(f'- 会话标识: `{r["session"]}` ({r.get("id_type", "session_id")})') + session_parts.append(f'- 时间段: `{r.get("first_ts", "-")} ~ {r.get("last_ts", "-")}`') session_parts.append(f'- switch_req_pairs: {r["switch_req_pairs"]}') session_parts.append(f'- sharp_drop_request_ids: {r["sharp_drop_request_ids"]}') session_parts.append("") From 322b98b0abd910f63a260e89285889222bde6bb6 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 00:04:20 +0800 Subject: [PATCH 13/40] infer mixed token-select counts from router semantics --- .../.claude/skills/troubleshoot/SKILL.md | 21 +- .../troubleshoot/scripts/analyzers/load.py | 148 +---------- .../scripts/analyzers/load_report.py | 243 ++++++++++++++++++ .../skills/troubleshoot/scripts/log_parser.py | 80 
+++++- .../troubleshoot/scripts/troubleshoot.py | 13 +- 5 files changed, 342 insertions(+), 163 deletions(-) create mode 100644 fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 43ee91a46b1..7f7a5793e91 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -24,10 +24,9 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 运行脚本前,Claude 必须按以下顺序向用户确认参数: ### 1. 日志文件路径 -使用 AskUserQuestion 工具向用户询问日志文件路径。提供常见的默认选项,同时允许用户直接输入自定义路径(支持绝对路径和相对路径): +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项(客户端会自动提供 Other 自定义输入): - 选项 1: `logs/router.log`(默认) - 选项 2: `fd-router.log`(golang_router 根目录) -- 选项 3: 用户通过 Other 输入自定义路径 **重要规则**: - 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 @@ -37,11 +36,10 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 如果用户直接确认或未指定路径,使用脚本的自动发现逻辑。 ### 2. 分析范围 -向用户询问分析范围: -> "请选择分析范围: -> 1. **全量分析**(默认)— 分析整个日志文件 -> 2. **尾部分析** — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) -> 3. **指定时间段** — 分析特定时间范围内的日志" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): +- 选项 1: `全量分析(默认)` — 分析整个日志文件 +- 选项 2: `尾部分析` — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) +- 选项 3: `指定时间段` — 分析特定时间范围内的日志 如果用户未选择,默认使用全量分析。 @@ -54,11 +52,10 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 `--start/--end` 与 `--tail` 互斥。 ### 3. 分析模式 -向用户询问分析模式: -> "请选择分析模式: -> 1. **完整分析**(默认)— 运行所有维度(errors + latency + health + cache + load) -> 2. **单维度/多维度分析** — 选择特定维度(errors / latency / health / cache / load),可选多个 -> 3. 
**请求追踪** — 追踪特定请求 ID(需提供 ID)" +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): +- 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) +- 选项 2: `单维度/多维度分析` — 选择特定维度(errors / latency / health / cache / load),可选多个 +- 选项 3: `请求追踪` — 追踪特定请求 ID(需提供 ID) 如果用户未选择,默认使用完整分析。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index 9be82357494..c38b0b80953 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -13,9 +13,9 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from chart import render_bar, render_sparkline, render_table from log_parser import extract_ts, match_select_release, parse_stats_line from stats import compute_statistics, time_bucket +from analyzers.load_report import format_load_report # ════════════════════════════════════════════════════════════════ # Counter 异常检测正则 @@ -28,14 +28,21 @@ TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}") # Token 事件 -SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*{URL_RE},\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") def _strip_scheme(url): return re.sub(r"^https?://", "", url) +def _normalize_worker_type(worker_type): + t = (worker_type or "unknown").lower() + if t in ("prefill", "decode", "mixed"): + return t + return "unknown" + + def parse_counter_anomaly(line): """解析 H5 counter 异常行。""" ts = extract_ts(line) @@ -73,7 +80,7 @@ def analyze_load(log_file, tail=None): r"counter preserved|cleanup unhealthy|removed counters|counter 
already|double-release|preserved counters", tail, ) - h11_lines = _grep_lines(log_file, r"release prefill tokens", tail) + h11_lines = _grep_lines(log_file, r"release (?:[a-zA-Z_]+\s+)?tokens", tail) # 解析 stats 行 stats_records = [r for line in h7_lines for r in [parse_stats_line(line)] if r] @@ -161,12 +168,12 @@ def _analyze_tokens(h3_lines, h11_lines): for line in h3_lines: m = SELECT_TOKENS_RE.search(line) if m: - token_alloc[m.group(1)].append(int(m.group(2))) + token_alloc[m.group(2)].append(int(m.group(3))) for line in h11_lines: m = RELEASE_TOKENS_RE.search(line) if m: - token_release[m.group(1)].append(int(m.group(2))) + token_release[m.group(2)].append(int(m.group(3))) result = [] all_workers = set(token_alloc.keys()) | set(token_release.keys()) @@ -285,135 +292,6 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, # ════════════════════════════════════════════════════════════════ -def format_load_report(result): - """将分析结果格式化为终端报告。""" - sections = ["## 负载与计数器分析", ""] - sections.append(f' {result["summary"]}') - sections.append("") - - if result["diagnoses"]: - sections.append("### 诊断") - sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') - sections.append("") - - # 负载概览 - ls = result.get("load_stats", {}) - if ls: - sections.append("### 负载概览 (total_running)") - sections.append("") - sections.append( - f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' - f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' - ) - sections.append("") - - # Per-Worker 负载 - if result["worker_load"]: - sections.append("### Per-Worker 负载") - sections.append("") - bar_data = [ - {"label": w["worker"][:25], "value": min(100, w["avg_running"] * 5), "count": w["avg_running"]} - for w in result["worker_load"] - ] - sections.append(render_bar(bar_data, show_count=True)) - sections.append("") - - # 负载趋势 - if result["load_trend"] and 
len(result["load_trend"]) > 1: - sections.append("### 负载趋势") - sections.append("") - sections.append( - render_sparkline( - result["load_trend"], value_field="total_running_mean", title="Total Running", y_label="req" - ) - ) - sections.append("") - - # Counter 异常 - if result["counter_anomalies"]: - sections.append("### 计数器异常") - sections.append("") - for a in result["counter_anomalies"]: - workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) - sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') - sections.append("") - - id_cov = result.get("select_release", {}).get("id_coverage", {}) - if id_cov: - sections.append("### 请求标识覆盖(基于 select 近似请求数)") - sections.append("") - sections.append( - " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " - "with_alt_id={with_alt} | without_any_id={without_any}".format( - total=id_cov.get("total_requests_estimated", 0), - with_rid=id_cov.get("with_request_id", 0), - without_rid=id_cov.get("without_request_id", 0), - with_alt=id_cov.get("with_alt_id", 0), - without_any=id_cov.get("without_any_id", 0), - ) - ) - if id_cov.get("without_any_id", 0) > 0: - sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") - sections.append("") - - # Select/Release 匹配 - sr = result.get("select_release", {}) - if sr.get("per_worker"): - sections.append("### Select/Release 匹配") - sections.append("") - id_cov = sr.get("id_coverage", {}) - no_correlatable_id = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) == 0 - table_data = [] - for w_url, pw in sorted(sr["per_worker"].items()): - delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) - table_data.append( - { - "Worker": _strip_scheme(w_url), - "Select": str(pw["selects"]), - "Release": str(pw["releases"]), - "Delta": delta_display, - } - ) - sections.append( - render_table( - table_data, - columns=["Worker", "Select", "Release", "Delta"], - right_align={"Select", "Release", 
"Delta"}, - ) - ) - sections.append("") - if no_correlatable_id: - sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") - sections.append("") - - if sr.get("unmatched_selects"): - sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') - for u in sr["unmatched_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append("") - - if sr.get("untracked_selects"): - sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') - for u in sr["untracked_selects"][:5]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append("") - - # Token 统计 - if result.get("token_stats"): - sections.append("### Token 计数器") - sections.append("") - sections.append( - render_table( - result["token_stats"], - columns=["worker", "alloc_count", "alloc_avg", "release_count"], - right_align={"alloc_count", "alloc_avg", "release_count"}, - ) - ) - sections.append("") - - return "\n".join(sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py new file mode 100644 index 00000000000..e118c4e1af3 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +"""Load report formatter.""" + +from chart import render_bar, render_sparkline, render_table + + +def _strip_scheme(url): + import re + return re.sub(r"^https?://", "", url) + + +def format_load_report(result): + """将分析结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_text) + """ + sections = ["## 负载与计数器分析", ""] + sections.append(f' {result["summary"]}') + sections.append("") + detail_sections = ["# 负载与计数器详情", ""] + detail_sections.append(f'总结: {result["summary"]}') + detail_sections.append("") + + if 
result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + for d in result["diagnoses"]: + sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") + + # 负载概览 + ls = result.get("load_stats", {}) + if ls: + sections.append("### 负载概览 (total_running)") + sections.append("") + sections.append( + f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' + f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' + ) + sections.append("") + + # Per-Worker 负载 + if result["worker_load"]: + sections.append("### Per-Worker 负载") + sections.append("") + bar_data = [ + {"label": w["worker"][:25], "value": min(100, w["avg_running"] * 5), "count": w["avg_running"]} + for w in result["worker_load"] + ] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 负载趋势 + if result["load_trend"] and len(result["load_trend"]) > 1: + sections.append("### 负载趋势") + sections.append("") + sections.append( + render_sparkline( + result["load_trend"], value_field="total_running_mean", title="Total Running", y_label="req" + ) + ) + sections.append("") + + # Counter 异常 + if result["counter_anomalies"]: + sections.append("### 计数器异常") + sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) + sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') + sections.append("") + detail_sections.append("## 计数器异常") + detail_sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) + detail_sections.append(f'- {a["type"]}: {a["total"]} 次 [{workers_str}]') + detail_sections.append("") + + # 按 prefill / decode 
/ mixed 分类统计 + type_summary = result.get("select_release", {}).get("type_summary", {}) + if type_summary: + sections.append("### 按类型统计(prefill / decode / mixed)") + sections.append("") + type_rows = [] + for t in ("prefill", "decode", "mixed", "unknown"): + s = type_summary.get(t) + if not s: + continue + token_display = "-" + if t == "prefill": + token_display = f'{s.get("token_selects",0)}/{s.get("token_releases",0)}' + elif t == "mixed" and (s.get("token_selects", 0) > 0 or s.get("token_releases", 0) > 0): + token_display = f'{s.get("token_selects",0)}/{s.get("token_releases",0)}' + type_rows.append( + { + "type": t, + "counter(S/R)": f'{s.get("counter_selects",0)}/{s.get("counter_releases",0)}', + "token(S/R)": token_display, + } + ) + if type_rows: + sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) + sections.append("") + sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append("") + detail_sections.append("## 按类型统计") + detail_sections.append("") + detail_sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) + detail_sections.append("") + + id_cov = result.get("select_release", {}).get("id_coverage", {}) + if id_cov: + sections.append("### 请求标识覆盖(基于 select 近似请求数)") + sections.append("") + sections.append( + " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " + "with_alt_id={with_alt} | without_any_id={without_any}".format( + total=id_cov.get("total_requests_estimated", 0), + with_rid=id_cov.get("with_request_id", 0), + without_rid=id_cov.get("without_request_id", 0), + with_alt=id_cov.get("with_alt_id", 0), + without_any=id_cov.get("without_any_id", 0), + ) + ) + if id_cov.get("without_any_id", 0) > 0: + sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") + sections.append(" 字段说明: total=select 事件总数估算;with_request_id=含 request_id;without_request_id=不含 
request_id;with_alt_id=含 req_id/trace_id/session_id;without_any_id=四类 ID 都缺失。") + sections.append("") + detail_sections.append("## 请求标识覆盖字段说明") + detail_sections.append("") + detail_sections.append( + "- total: select 事件总数(近似请求数)\n" + "- with_request_id: 携带 request_id 的 select 数\n" + "- without_request_id: 未携带 request_id 的 select 数\n" + "- with_alt_id: 无 request_id 但携带 req_id/trace_id/session_id 的 select 数\n" + "- without_any_id: 四类 ID 都没有,无法做请求级关联" + ) + detail_sections.append("") + + # Select/Release 匹配 + sr = result.get("select_release", {}) + if sr.get("per_worker"): + sections.append("### Select/Release 匹配") + sections.append("") + id_cov = sr.get("id_coverage", {}) + no_correlatable_id = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) == 0 + table_data = [] + for w_url, pw in sorted(sr["per_worker"].items()): + delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) + table_data.append( + { + "Worker": _strip_scheme(w_url), + "ReqSelect": str(pw["selects"]), + "ReqRelease": str(pw["releases"]), + "ReqDelta": delta_display, + "TokenSelect": str(pw.get("token_selects", 0)), + "TokenSelInf": str(pw.get("token_selects_inferred", 0)), + "TokenRelease": str(pw.get("token_releases", 0)), + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + ) + ) + sections.append("") + if no_correlatable_id: + sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") + sections.append("") + sections.append(" 说明: prefill/mixed 在运行时都会同时增加 request 与 token 计数器;其中 mixed 的 TokenSelect 可能来自推断(TokenSelInf)。") + sections.append("") + detail_sections.append("## Select/Release Per-Worker") + detail_sections.append("") + detail_sections.append( + render_table( + table_data, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", 
"TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + ) + ) + detail_sections.append("") + + if sr.get("unmatched_selects"): + sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') + sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") + for u in sr["unmatched_selects"][:3]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 select(完整)") + detail_sections.append("") + for u in sr["unmatched_selects"]: + detail_sections.append( + f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' + ) + detail_sections.append("") + + if sr.get("untracked_selects"): + sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') + for u in sr["untracked_selects"][:3]: + sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') + sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append("") + detail_sections.append("## Untracked selects(缺少可关联 ID)") + detail_sections.append("") + for u in sr["untracked_selects"]: + detail_sections.append( + f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' + ) + detail_sections.append("") + + if sr.get("failed_selects"): + sections.append(f' ⚠ Failed to select: {len(sr["failed_selects"])} 次') + sections.append(" 解释: 路由在该时刻未能选出可用 worker,通常意味着可用池不足或健康状态异常。") + sections.append("") + detail_sections.append("## Failed to select") + detail_sections.append("") + for f in sr["failed_selects"]: + detail_sections.append(f'- [{f.get("ts","")}] line={f.get("line","")}') + detail_sections.append("") + + # Token 统计 + if 
result.get("token_stats"): + sections.append("### Token 计数器") + sections.append("") + sections.append( + render_table( + result["token_stats"], + columns=["worker", "alloc_count", "alloc_avg", "release_count"], + right_align={"alloc_count", "alloc_avg", "release_count"}, + ) + ) + sections.append("") + + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 44f5cdebd94..1bb11ddaa5e 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -466,8 +466,8 @@ def parse_error_line(line): SELECT_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*({URL_RE})") RELEASE_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*({URL_RE})") FAILED_SELECT_RE = re.compile(r"Failed to select") -SELECT_TOKENS_RE = re.compile(rf"select worker \(prefill\):\s*({URL_RE}),\s*tokens:\s*(\d+)") -RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") +SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*({URL_RE}),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") def _parse_ts_safe(ts): @@ -493,6 +493,14 @@ def _select_match_key(tags): return (None, None) +def _normalize_worker_type(worker_type): + """归一化 worker type。""" + t = (worker_type or "unknown").lower() + if t in ("prefill", "decode", "mixed"): + return t + return "unknown" + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -516,10 +524,10 @@ def match_select_release(lines, fallback_window_s=120): selects.append( { "ts": ts, - "worker": tm.group(1), - "type": "prefill", + "worker": tm.group(2), + "type": _normalize_worker_type(tm.group(1)), "tags": tags, - "tokens": int(tm.group(2)), + 
"tokens": int(tm.group(3)), "line": line_no, } ) @@ -528,13 +536,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: + token_type = trm.group(1) or "prefill" releases.append( { "ts": ts, - "worker": trm.group(1), - "type": "prefill_tokens", + "worker": trm.group(2), + "type": f'{_normalize_worker_type(token_type)}_tokens', "tags": tags, - "tokens": int(trm.group(2)), + "tokens": int(trm.group(3)), "line": line_no, } ) @@ -546,7 +555,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": sm.group(2), - "type": sm.group(1) or "unknown", + "type": _normalize_worker_type(sm.group(1)), "tags": tags, "tokens": None, "line": line_no, @@ -560,7 +569,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": rm.group(2), - "type": rm.group(1) or "unknown", + "type": _normalize_worker_type(rm.group(1)), "tags": tags, "tokens": None, "line": line_no, @@ -576,8 +585,11 @@ def match_select_release(lines, fallback_window_s=120): unmatched_selects = [] release_used = set() + # 请求生命周期匹配只使用 request counter release(排除 token release) + counter_release_indexes = [i for i, r in enumerate(releases) if not str(r.get("type", "")).endswith("_tokens")] release_by_key = defaultdict(list) - for i, r in enumerate(releases): + for i in counter_release_indexes: + r = releases[i] _, key = _select_match_key(r.get("tags", {})) if key: release_by_key[key].append(i) @@ -639,7 +651,8 @@ def match_select_release(lines, fallback_window_s=120): sdt = _parse_ts_safe(s["ts"]) best_idx = None best_delta = None - for ri, r in enumerate(releases): + for ri in counter_release_indexes: + r = releases[ri] if ri in release_used: continue if r.get("worker") != s.get("worker"): @@ -680,11 +693,25 @@ def match_select_release(lines, fallback_window_s=120): ) # Per-worker summary - per_worker = defaultdict(lambda: {"selects": 0, "releases": 0}) + # 对照 golang_router SelectWorker 语义: + # - 
prefill: request counter + token counter 同时增加(日志通常带 tokens) + # - mixed: request counter + token counter 同时增加(日志通常不带 tokens,需要推断) + per_worker = defaultdict( + lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_selects_inferred": 0, "token_releases": 0} + ) for s in selects: + s_type = _normalize_worker_type(s.get("type")) per_worker[s["worker"]]["selects"] += 1 + if s.get("tokens") is not None: + per_worker[s["worker"]]["token_selects"] += 1 + elif s_type == "mixed": + per_worker[s["worker"]]["token_selects"] += 1 + per_worker[s["worker"]]["token_selects_inferred"] += 1 for r in releases: - per_worker[r["worker"]]["releases"] += 1 + if str(r.get("type", "")).endswith("_tokens"): + per_worker[r["worker"]]["token_releases"] += 1 + else: + per_worker[r["worker"]]["releases"] += 1 pw_result = {} for w, counts in per_worker.items(): @@ -692,7 +719,31 @@ def match_select_release(lines, fallback_window_s=120): "selects": counts["selects"], "releases": counts["releases"], "delta": counts["selects"] - counts["releases"], + "token_selects": counts["token_selects"], + "token_selects_inferred": counts["token_selects_inferred"], + "token_releases": counts["token_releases"], + } + + # 按 worker type 分类统计(prefill/decode/mixed) + type_summary = defaultdict( + lambda: { + "counter_selects": 0, + "counter_releases": 0, + "token_selects": 0, + "token_releases": 0, } + ) + for s in selects: + s_type = _normalize_worker_type(s.get("type")) + type_summary[s_type]["counter_selects"] += 1 + if s.get("tokens") is not None or s_type == "mixed": + type_summary[s_type]["token_selects"] += 1 + for r in releases: + r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) + if str(r.get("type", "")).endswith("_tokens"): + type_summary[r_type]["token_releases"] += 1 + else: + type_summary[r_type]["counter_releases"] += 1 return { "matched": matched, @@ -707,6 +758,7 @@ def match_select_release(lines, fallback_window_s=120): "with_alt_id": with_alt_id, 
"without_any_id": without_any_id, }, + "type_summary": dict(type_summary), } diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 30b9df0f443..a818d31150f 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -144,10 +144,11 @@ def format_full_report(results, status, status_reason): report_text: 主报告文本(总结 + 可视化) details: dict 包含需要拆分到独立文件的详情数据 - 'health_events': str 或 None + - 'load_select_release': str 或 None - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "trace_files": {}} + details = {"health_events": None, "load_select_release": None, "trace_files": {}} # 状态行 parts.append(f"STATUS: {status} — {status_reason}") @@ -168,7 +169,10 @@ def format_full_report(results, status, status_reason): details["health_events"] = detail if "load" in results: - parts.append(format_load_report(results["load"])) + summary, detail = format_load_report(results["load"]) + parts.append(summary) + if detail: + details["load_select_release"] = detail if "cache" in results: parts.append(format_cache_report(results["cache"])) @@ -208,6 +212,11 @@ def save_detailed_report(report_text, output_dir, details=None): with open(health_path, "w", encoding="utf-8") as f: f.write(details["health_events"]) + if details.get("load_select_release"): + load_path = os.path.join(detail_dir, "load_select_release.md") + with open(load_path, "w", encoding="utf-8") as f: + f.write(details["load_select_release"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From 2df1eec9ecfa6785cdc3645a05d84c4b3c7956f2 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 00:07:57 +0800 Subject: 
[PATCH 14/40] count token-select by prefill/mixed worker type only --- .../scripts/analyzers/load_report.py | 11 +++++------ .../skills/troubleshoot/scripts/log_parser.py | 18 +++++------------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index e118c4e1af3..86ba1f0d94f 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -160,30 +160,29 @@ def format_load_report(result): "ReqRelease": str(pw["releases"]), "ReqDelta": delta_display, "TokenSelect": str(pw.get("token_selects", 0)), - "TokenSelInf": str(pw.get("token_selects_inferred", 0)), "TokenRelease": str(pw.get("token_releases", 0)), } ) sections.append( render_table( table_data, - columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"], - right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"}, ) ) sections.append("") if no_correlatable_id: sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") sections.append("") - sections.append(" 说明: prefill/mixed 在运行时都会同时增加 request 与 token 计数器;其中 mixed 的 TokenSelect 可能来自推断(TokenSelInf)。") + sections.append(" 说明: TokenSelect 按 worker type 统计(prefill + mixed 的 select 都计入),不依赖日志里是否出现 tokens 字段。") sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") detail_sections.append( render_table( table_data, - columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"], - right_align={"ReqSelect", 
"ReqRelease", "ReqDelta", "TokenSelect", "TokenSelInf", "TokenRelease"}, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"}, ) ) detail_sections.append("") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 1bb11ddaa5e..200f976f2ff 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -692,21 +692,14 @@ def match_select_release(lines, fallback_window_s=120): } ) - # Per-worker summary - # 对照 golang_router SelectWorker 语义: - # - prefill: request counter + token counter 同时增加(日志通常带 tokens) - # - mixed: request counter + token counter 同时增加(日志通常不带 tokens,需要推断) - per_worker = defaultdict( - lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_selects_inferred": 0, "token_releases": 0} - ) + # Per-worker summary(按 worker type 统计,不依赖日志中的 tokens 字段) + # 规则:prefill/mixed 的 select 均计入 token_selects。 + per_worker = defaultdict(lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_releases": 0}) for s in selects: s_type = _normalize_worker_type(s.get("type")) per_worker[s["worker"]]["selects"] += 1 - if s.get("tokens") is not None: - per_worker[s["worker"]]["token_selects"] += 1 - elif s_type == "mixed": + if s_type in ("prefill", "mixed"): per_worker[s["worker"]]["token_selects"] += 1 - per_worker[s["worker"]]["token_selects_inferred"] += 1 for r in releases: if str(r.get("type", "")).endswith("_tokens"): per_worker[r["worker"]]["token_releases"] += 1 @@ -720,7 +713,6 @@ def match_select_release(lines, fallback_window_s=120): "releases": counts["releases"], "delta": counts["selects"] - counts["releases"], "token_selects": counts["token_selects"], - "token_selects_inferred": 
counts["token_selects_inferred"], "token_releases": counts["token_releases"], } @@ -736,7 +728,7 @@ def match_select_release(lines, fallback_window_s=120): for s in selects: s_type = _normalize_worker_type(s.get("type")) type_summary[s_type]["counter_selects"] += 1 - if s.get("tokens") is not None or s_type == "mixed": + if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 for r in releases: r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) From 83279b7001264114508c64138766410ce25c1234 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 00:33:09 +0800 Subject: [PATCH 15/40] troubleshoot: clarify DEGRADED meaning in report header --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 ++ .../references/report_templates.md | 1 + .../troubleshoot/scripts/analyzers/errors.py | 9 ++ .../scripts/analyzers/load_report.py | 9 +- .../skills/troubleshoot/scripts/chart.py | 3 +- .../skills/troubleshoot/scripts/log_parser.py | 113 +++++++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 18 ++- 8 files changed, 156 insertions(+), 9 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed 
to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..5eec70d1514 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..1f3f63fcc45 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -282,6 +283,14 @@ def format_errors_report(result): render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) ) sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..b06d26883aa 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,11 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: + max_diag_in_summary = 8 + for d in result["diagnoses"][:max_diag_in_summary]: sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + if len(result["diagnoses"]) > max_diag_in_summary: + sections.append(f' ... 
其余 {len(result["diagnoses"]) - max_diag_in_summary} 项见 detail 报告') sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +42,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +112,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: token-release 由同 worker 邻近 select 推断到 prefill/mixed,不直接依赖 `release prefill tokens` 文本。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 
200f976f2ff..a5d646dc029 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,77 @@ def _normalize_worker_type(worker_type): return "unknown" +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is 
None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -536,12 +607,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + # 不直接信任日志里的 token type 文本("release prefill tokens" 也可能来自 mixed) + "type": "unknown_tokens", + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -716,7 +789,24 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + if base_t == "unknown": + # token release 的 worker type 由同 worker 邻近 select 推断(prefill/mixed) + base_t = _infer_token_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,9 +820,10 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): 
type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 @@ -949,6 +1040,16 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..3b80cd45c5e 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -128,7 +128,20 @@ def determine_status(results): reasons.append(d["message"]) if reasons: - return "DEGRADED", ", ".join(reasons) + # 去重并限制长度,避免状态行过长难读 + deduped = [] + seen = set() + for r in reasons: 
+ if r not in seen: + deduped.append(r) + seen.add(r) + max_reasons = 4 + shown = deduped[:max_reasons] + extra = len(deduped) - len(shown) + summary = ";".join(shown) + if extra > 0: + summary += f";另有 {extra} 项诊断见各维度 detail 报告" + return "DEGRADED", summary if not results: return "HEALTHY", "无分析数据" @@ -152,6 +165,9 @@ def format_full_report(results, status, status_reason): # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") From 55657a4405354be0e491ba9a71a4dba778c03beb Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 10:53:12 +0800 Subject: [PATCH 16/40] troubleshoot: revert trend windows to auto and split detail outputs by responsibility --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 ++ .../references/report_templates.md | 11 +- .../troubleshoot/scripts/analyzers/cache.py | 131 ++++++++++++++++- .../troubleshoot/scripts/analyzers/errors.py | 48 +++++- .../troubleshoot/scripts/analyzers/health.py | 38 ++++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 64 +++++++- .../scripts/analyzers/load_report.py | 42 +++++- .../troubleshoot/scripts/analyzers/trace.py | 25 +++- .../skills/troubleshoot/scripts/chart.py | 3 +- .../skills/troubleshoot/scripts/log_parser.py | 139 +++++++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 121 ++++++++++++++- 13 files changed, 605 insertions(+), 37 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ 
-61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..c02f55c2d65 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / 
CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,14 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..0d146c9b43c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, 
strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ + { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] 
[{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") 
reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: + detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", 
"fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](detail/cache_eviction.md) | " + "[detail/cache_fallback.md](detail/cache_fallback.md) | " + "[detail/cache_cross.md](detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..7f519806225 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 
+85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -276,12 +308,26 @@ def format_errors_report(result): "占比": f'{e["pct"]}%', "级别": e["level"], "来源层": e["source_layer"], + "影响": e.get("impact", "-"), + "URLs": ",".join(e.get("urls", [])[:2]) if e.get("urls") else "-", } ) sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) + render_table( + table_data, + columns=["模板", "数量", "占比", "级别", "来源层", "影响", "URLs"], + right_align={"数量", "占比"}, + ) ) sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file 
config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..8fc4e88cc72 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { "events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - 
parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + 
sections.append("> 完整事件详情: [detail/health_events.md](detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..57af094661e 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](detail/latency_diagnoses.md)") sections.append("") return "\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..0d92ae56a32 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker 
\((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,21 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "untracked_selects": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +166,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, "token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +204,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = defaultdict( + lambda: { + "req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + 
state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..8c375e25e57 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,10 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append( + f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](detail/load_diagnoses.md);' + '匹配明细见 [detail/load_select_release.md](detail/load_select_release.md)' + ) sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +41,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +111,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 
同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: token-release 由同 worker 邻近 select 推断到 prefill/mixed,不直接依赖 `release prefill tokens` 文本。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -192,7 +198,7 @@ def format_load_report(result): sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") for u in sr["unmatched_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](detail/load_select_release.md)") sections.append("") detail_sections.append("## 未匹配 select(完整)") detail_sections.append("") @@ -202,11 +208,23 @@ def format_load_report(result): ) detail_sections.append("") + if sr.get("unmatched_releases"): + sections.append(f' ⚠ {len(sr["unmatched_releases"])} 个未匹配 release(已区分 req/token)') + sections.append(" > 完整列表见: [detail/load_select_release.md](detail/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 release(按 release_kind 分类)") + detail_sections.append("") + for r in sr["unmatched_releases"]: + detail_sections.append( + f'- [{r.get("release_ts","")}] worker={_strip_scheme(r["worker"])} release_kind={r.get("release_kind","")} type={r.get("type","")}' + ) + detail_sections.append("") + if sr.get("untracked_selects"): sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') for u in sr["untracked_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: 
[detail/load_select_release.md](detail/load_select_release.md)") sections.append("") detail_sections.append("## Untracked selects(缺少可关联 ID)") detail_sections.append("") @@ -239,4 +257,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + sections.append(" 末状态详情见: [detail/load_counter_state.md](detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..a792da8002f 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses = _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": 
sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def _diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ -294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" 
+ "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..b99e75c37c5 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,77 @@ def _normalize_worker_type(worker_type): return "unknown" +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + 
+ if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -536,12 +607,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + # 不直接信任日志里的 token type 文本("release prefill tokens" 也可能来自 mixed) + "type": "unknown_tokens", + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -716,7 +789,24 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + if base_t == "unknown": + # token release 的 worker type 由同 worker 邻近 select 推断(prefill/mixed) + base_t = _infer_token_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 
分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,16 +820,43 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + unmatched_releases = [] + for i, r in enumerate(releases): + if str(r.get("type", "")).endswith("_tokens"): + # token release: 近邻存在 prefill/mixed select 则视为可解释,不计入 unmatched + inferred_token_type = _normalize_worker_type(str(inferred_release_types.get(i, "unknown_tokens")).replace("_tokens", "")) + if inferred_token_type == "unknown": + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": inferred_token_type, + "release_kind": "token_release", + } + ) + continue + if i not in release_used: + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": _normalize_worker_type(inferred_release_types.get(i, "unknown")), + "release_kind": "request_release", + } + ) + return { "matched": matched, "unmatched_selects": unmatched_selects, + "unmatched_releases": unmatched_releases, "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, @@ -949,6 +1066,16 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 
logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..7dd2e153a64 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -128,7 +128,20 @@ def determine_status(results): reasons.append(d["message"]) if reasons: - return "DEGRADED", ", ".join(reasons) + # 去重并限制长度,避免状态行过长难读 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + max_reasons = 4 + shown = deduped[:max_reasons] + extra = len(deduped) - len(shown) + summary = ";".join(shown) + if extra > 0: + summary += f";另有 {extra} 项诊断见各维度 detail 报告" + return "DEGRADED", summary if not results: return "HEALTHY", "无分析数据" @@ -148,10 +161,26 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": 
None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") @@ -161,6 +190,12 @@ def format_full_report(results, status, status_reason): if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +208,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + 
if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +301,37 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if 
details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From 5ac7cb3afd399b6372884dad17f5714b07ea96e4 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 11:21:35 +0800 Subject: [PATCH 17/40] troubleshoot: map token-release type by worker URL instead of time-neighbor inference --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 ++ .../references/report_templates.md | 12 +- .../troubleshoot/scripts/analyzers/cache.py | 131 +++++++++++++- 
.../troubleshoot/scripts/analyzers/errors.py | 56 ++++-- .../troubleshoot/scripts/analyzers/health.py | 38 +++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 65 ++++++- .../scripts/analyzers/load_report.py | 65 ++++++- .../troubleshoot/scripts/analyzers/trace.py | 25 ++- .../skills/troubleshoot/scripts/chart.py | 3 +- .../skills/troubleshoot/scripts/log_parser.py | 163 +++++++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 142 ++++++++++++++- 13 files changed, 669 insertions(+), 51 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 
日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..cd705d02816 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,15 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/errors_topn.md` — ERROR/WARN 模板明细(数量/级别/来源层/影响 + URLs) + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..3a5c19ad00b 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ 
+ { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 
if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: 
+ detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..f0e4c352b6c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 +85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, 
pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -266,22 +298,16 @@ def format_errors_report(result): sections.append(render_bar(bar_data, show_count=True)) sections.append("") - # 来源层表格 - table_data = [] - for e in result["error_top_n"][:10]: - table_data.append( - { - "模板": e["template"][:60], - "数量": e["count"], - "占比": f'{e["pct"]}%', - "级别": e["level"], - "来源层": e["source_layer"], - } - ) - sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) - ) + sections.append(" 具体模板表见: [../detail/errors_topn.md](../detail/errors_topn.md)") sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..5d1994d9405 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + 
all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { "events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 
else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("> 完整事件详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..508cf3824d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 
输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](../detail/latency_diagnoses.md)") sections.append("") return "\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..2e03ba1ce63 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,22 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "untracked_selects": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + "worker_type_profile": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +167,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, 
"token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +205,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = defaultdict( + lambda: { + "req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..9d4e9b51496 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 
+25,10 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append( + f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](../detail/load_diagnoses.md);' + '匹配明细见 [detail/load_select_release.md](../detail/load_select_release.md)' + ) sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +41,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +111,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: `release prefill tokens` 会被识别为 token-release;worker type 按该 worker URL 在 select 中的类型映射(prefill/decode/mixed)。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -178,6 +184,29 @@ def format_load_report(result): sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") + + if sr.get("worker_type_profile"): + sections.append("### Worker URL 类型画像(基于 select)") + sections.append("") + rows = [] + for w, p in sorted(sr["worker_type_profile"].items()): + rows.append( + { + "Worker": _strip_scheme(w), + "Dominant": p.get("dominant_type", "unknown"), + "Prefill": p.get("prefill", 0), + "Decode": p.get("decode", 0), + "Mixed": 
p.get("mixed", 0), + } + ) + sections.append( + render_table( + rows, + columns=["Worker", "Dominant", "Prefill", "Decode", "Mixed"], + right_align={"Prefill", "Decode", "Mixed"}, + ) + ) + sections.append("") detail_sections.append( render_table( table_data, @@ -192,7 +221,7 @@ def format_load_report(result): sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") for u in sr["unmatched_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## 未匹配 select(完整)") detail_sections.append("") @@ -202,11 +231,23 @@ def format_load_report(result): ) detail_sections.append("") + if sr.get("unmatched_releases"): + sections.append(f' ⚠ {len(sr["unmatched_releases"])} 个未匹配 release(已区分 req/token)') + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 release(按 release_kind 分类)") + detail_sections.append("") + for r in sr["unmatched_releases"]: + detail_sections.append( + f'- [{r.get("release_ts","")}] worker={_strip_scheme(r["worker"])} release_kind={r.get("release_kind","")} type={r.get("type","")}' + ) + detail_sections.append("") + if sr.get("untracked_selects"): sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') for u in sr["untracked_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## Untracked selects(缺少可关联 ID)") detail_sections.append("") @@ 
-239,4 +280,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + sections.append(" 末状态详情见: [detail/load_counter_state.md](../detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..24af9a23500 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses = _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def 
_diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ -294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..548c29ebc29 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,77 @@ def _normalize_worker_type(worker_type): return "unknown" +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - 
s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -536,12 +607,14 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + # 文本默认按 prefill 记,再结合同 worker 邻近 select 做纠偏(mixed 场景) + "type": f'{_normalize_worker_type(token_type or "prefill")}_tokens', + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -716,7 +789,33 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 基于 select 构建 worker URL -> dominant type 映射 + per_worker_type_counts = defaultdict(lambda: defaultdict(int)) + for s in selects: + per_worker_type_counts[s["worker"]][_normalize_worker_type(s.get("type"))] += 1 + worker_dominant_type = {} + for w, counts in per_worker_type_counts.items(): + worker_dominant_type[w] = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] if counts else "unknown" + + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 + mapped_t = worker_dominant_type.get(r.get("worker", ""), "unknown") + if mapped_t in ("prefill", "decode", "mixed"): + base_t = mapped_t + inferred_release_types[i] = 
f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,16 +829,57 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + # 每个 worker URL 的类型画像(基于 select) + worker_type_profile = {} + for w, counts in per_worker_type_counts.items(): + dominant = "unknown" + if counts: + dominant = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] + worker_type_profile[w] = { + "dominant_type": dominant, + "prefill": counts.get("prefill", 0), + "decode": counts.get("decode", 0), + "mixed": counts.get("mixed", 0), + "unknown": counts.get("unknown", 0), + } + + unmatched_releases = [] + for i, r in enumerate(releases): + if str(r.get("type", "")).endswith("_tokens"): + # token release: 近邻存在 prefill/mixed select 则视为可解释,不计入 unmatched + inferred_token_type = _normalize_worker_type(str(inferred_release_types.get(i, "unknown_tokens")).replace("_tokens", "")) + if inferred_token_type == "unknown": + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": inferred_token_type, + "release_kind": "token_release", + } 
+ ) + continue + if i not in release_used: + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": _normalize_worker_type(inferred_release_types.get(i, "unknown")), + "release_kind": "request_release", + } + ) + return { "matched": matched, "unmatched_selects": unmatched_selects, + "unmatched_releases": unmatched_releases, "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, @@ -751,6 +891,7 @@ def match_select_release(lines, fallback_window_s=120): "without_any_id": without_any_id, }, "type_summary": dict(type_summary), + "worker_type_profile": worker_type_profile, } @@ -949,6 +1090,16 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..641c5106bee 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -128,7 +128,14 @@ def determine_status(results): 
reasons.append(d["message"]) if reasons: - return "DEGRADED", ", ".join(reasons) + # 去重但保留完整信息 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + return "DEGRADED", ";".join(deduped) if not results: return "HEALTHY", "无分析数据" @@ -148,19 +155,65 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "errors_topn": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") # 各维度报告 if "errors" in results: parts.append(format_errors_report(results["errors"])) + if results["errors"].get("error_top_n"): + lines = [ + "# Errors TopN 详情", + "", + "| 模板 | 数量 | 级别 | 来源层 | 影响 |", + "|:--|--:|:--|:--|:--|", + ] + for e in results["errors"]["error_top_n"]: + lines.append( + f'| {e.get("template","")} | {e.get("count",0)} | {e.get("level","")} | {e.get("source_layer","")} | {e.get("impact","-")} |' + ) + lines.append("") + lines.append("## 涉及 URLs") + lines.append("") + for e in results["errors"]["error_top_n"]: + urls = e.get("urls") or [] + if not urls: + continue + lines.append(f'- 模板: {e.get("template","")}') + for u in urls: + lines.append(f' - {u}') + lines.append("") + details["errors_topn"] = "\n".join(lines) if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: 
+ lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +226,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} 
reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +319,40 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if 
details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + if details.get("errors_topn"): + with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: + f.write(details["errors_topn"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From c8a3fd75a649e8ba14b13fdc09f7441a2cc27093 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 12:28:52 +0800 Subject: [PATCH 18/40] fix(troubleshoot): add FIFO+ID consistency checks and quote-safe hint --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 + .../references/report_templates.md | 12 +- .../troubleshoot/scripts/analyzers/cache.py | 131 +++++++- .../troubleshoot/scripts/analyzers/errors.py | 56 +++- .../troubleshoot/scripts/analyzers/health.py | 38 ++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 75 ++++- .../scripts/analyzers/load_report.py | 102 +++++- .../troubleshoot/scripts/analyzers/trace.py | 25 +- .../skills/troubleshoot/scripts/chart.py | 3 +- 
.../skills/troubleshoot/scripts/log_parser.py | 303 ++++++++++++++---- .../troubleshoot/scripts/troubleshoot.py | 198 +++++++++++- 13 files changed, 850 insertions(+), 113 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 
统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..cd705d02816 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,15 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/errors_topn.md` — ERROR/WARN 模板明细(数量/级别/来源层/影响 + URLs) + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..3a5c19ad00b 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def 
analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ + { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ 
def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in 
sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: + detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + 
detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..f0e4c352b6c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role 
is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 +85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 
200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -266,22 +298,16 @@ def format_errors_report(result): sections.append(render_bar(bar_data, show_count=True)) sections.append("") - # 来源层表格 - table_data = [] - for e in result["error_top_n"][:10]: - table_data.append( - { - "模板": e["template"][:60], - "数量": e["count"], - "占比": f'{e["pct"]}%', - "级别": e["level"], - "来源层": e["source_layer"], - } - ) - sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) - ) + sections.append(" 具体模板表见: [../detail/errors_topn.md](../detail/errors_topn.md)") sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..5d1994d9405 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { 
"events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + 
right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("> 完整事件详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..508cf3824d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](../detail/latency_diagnoses.md)") sections.append("") return 
"\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..5b9e3271f07 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,22 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "untracked_selects": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + "worker_type_profile": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +167,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, "token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +205,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = 
defaultdict( + lambda: { + "req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: @@ -273,6 +336,16 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, } ) + id_mismatch_count = sr_result.get("id_consistency", {}).get("both_present_but_mismatch", 0) + if id_mismatch_count > 0: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"{id_mismatch_count} 个 select/release 在 FIFO 命中后 ID 不一致(疑似串流或日志错配)", + "source_layer": "FD 后端", + } + ) + # Token 计数器潜在泄漏 for t in token_stats: if t.get("alloc_count", 0) > t.get("release_count", 0): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..74358b6e72d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py 
+++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,7 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](../detail/load_diagnoses.md)') sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +38,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +108,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: `release prefill tokens` 会被识别为 token-release;worker type 按该 worker URL 在 select 中的类型映射(prefill/decode/mixed)。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -178,6 +181,56 @@ def format_load_report(result): sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") + + id_consistency = sr.get("id_consistency", {}) + if id_consistency: + sections.append("### FIFO × ID 一致性校验") + sections.append("") + sections.append( + " matched={ok}, mismatch={mismatch}, select_only={so}, release_only={ro}, both_missing={bm}".format( + ok=id_consistency.get("both_present_and_equal", 0), + 
mismatch=id_consistency.get("both_present_but_mismatch", 0), + so=id_consistency.get("only_select_has_id", 0), + ro=id_consistency.get("only_release_has_id", 0), + bm=id_consistency.get("both_missing", 0), + ) + ) + sections.append("") + sections.append(" 说明: 主匹配按 worker FIFO,随后检查 matched 对中的 ID 是否一致。") + sections.append("") + detail_sections.append("## FIFO × ID 一致性") + detail_sections.append("") + detail_sections.append( + "- both_present_and_equal: select/release 都有可关联 ID 且相等\n" + "- both_present_but_mismatch: select/release 都有 ID 但不一致(需要重点排查)\n" + "- only_select_has_id: 仅 select 有 ID\n" + "- only_release_has_id: 仅 release 有 ID\n" + "- both_missing: 两边都没有可关联 ID" + ) + detail_sections.append("") + + if sr.get("worker_type_profile"): + sections.append("### Worker URL 类型画像(基于 select)") + sections.append("") + rows = [] + for w, p in sorted(sr["worker_type_profile"].items()): + rows.append( + { + "Worker": _strip_scheme(w), + "Dominant": p.get("dominant_type", "unknown"), + "Prefill": p.get("prefill", 0), + "Decode": p.get("decode", 0), + "Mixed": p.get("mixed", 0), + } + ) + sections.append( + render_table( + rows, + columns=["Worker", "Dominant", "Prefill", "Decode", "Mixed"], + right_align={"Prefill", "Decode", "Mixed"}, + ) + ) + sections.append("") detail_sections.append( render_table( table_data, @@ -192,7 +245,7 @@ def format_load_report(result): sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") for u in sr["unmatched_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## 未匹配 select(完整)") detail_sections.append("") @@ -202,14 +255,39 @@ def format_load_report(result): ) detail_sections.append("") + if sr.get("unmatched_releases"): + 
sections.append(f' ⚠ {len(sr["unmatched_releases"])} 个未匹配 release(已区分 req/token)') + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") + sections.append("") + detail_sections.append("## 未匹配 release(按 release_kind 分类)") + detail_sections.append("") + for r in sr["unmatched_releases"]: + detail_sections.append( + f'- [{r.get("release_ts","")}] worker={_strip_scheme(r["worker"])} release_kind={r.get("release_kind","")} type={r.get("type","")}' + ) + detail_sections.append("") + if sr.get("untracked_selects"): sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') for u in sr["untracked_selects"][:3]: sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") sections.append("") detail_sections.append("## Untracked selects(缺少可关联 ID)") detail_sections.append("") + + if sr.get("id_mismatched_matches"): + sections.append(f' ⚠ {len(sr["id_mismatched_matches"])} 个 FIFO 匹配对存在 ID 不一致') + sections.append(" > 完整列表见: [detail/load_select_release.md](../detail/load_select_release.md)") + sections.append("") + detail_sections.append("## FIFO 匹配但 ID 不一致(完整)") + detail_sections.append("") + for m in sr["id_mismatched_matches"]: + detail_sections.append( + f'- [{m.get("select_ts","")}] worker={_strip_scheme(m.get("worker",""))} ' + f'select_id={m.get("select_id","")} release_id={m.get("release_id","")} note={m.get("note","")}' + ) + detail_sections.append("") for u in sr["untracked_selects"]: detail_sections.append( f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' @@ -239,4 +317,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + 
sections.append(" 末状态详情见: [detail/load_counter_state.md](../detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..24af9a23500 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses = _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def _diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ 
-294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..4dc5832c103 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,83 @@ def _normalize_worker_type(worker_type): return "unknown" +def _normalize_worker_url_key(url): + if not url: + return "" + return re.sub(r"^https?://", "", str(url).strip().rstrip("/")) + + +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - 
s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -525,6 +602,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": tm.group(2), + "worker_key": _normalize_worker_url_key(tm.group(2)), "type": _normalize_worker_type(tm.group(1)), "tags": tags, "tokens": int(tm.group(3)), @@ -536,12 +614,15 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + "worker_key": _normalize_worker_url_key(trm.group(2)), + # 文本默认按 prefill 记,再结合同 worker 邻近 select 做纠偏(mixed 场景) + "type": f'{_normalize_worker_type(token_type or "prefill")}_tokens', + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -555,6 +636,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": sm.group(2), + "worker_key": _normalize_worker_url_key(sm.group(2)), "type": _normalize_worker_type(sm.group(1)), "tags": tags, "tokens": None, @@ -569,6 +651,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": rm.group(2), + "worker_key": _normalize_worker_url_key(rm.group(2)), "type": _normalize_worker_type(rm.group(1)), "tags": tags, "tokens": None, @@ -580,28 +663,22 @@ def match_select_release(lines, fallback_window_s=120): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id / alt_id + # Match by worker FIFO(select -> 同 worker 下一条 release) 
matched = [] unmatched_selects = [] release_used = set() # 请求生命周期匹配只使用 request counter release(排除 token release) + # 说明:request_id 只用于覆盖率观测,不参与 select/release 配对条件。 counter_release_indexes = [i for i, r in enumerate(releases) if not str(r.get("type", "")).endswith("_tokens")] - release_by_key = defaultdict(list) - for i in counter_release_indexes: - r = releases[i] - _, key = _select_match_key(r.get("tags", {})) - if key: - release_by_key[key].append(i) - # 请求 ID 覆盖(按 select 事件近似请求数) total_req_est = len(selects) with_request_id = 0 with_alt_id = 0 without_any_id = 0 - pending_selects = [] untracked_selects = [] + pending_selects = [] for s in selects: key_type, key = _select_match_key(s.get("tags", {})) if key_type == "request_id": @@ -611,7 +688,6 @@ def match_select_release(lines, fallback_window_s=120): else: without_any_id += 1 - found = False if not key: # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) untracked_selects.append( @@ -623,53 +699,74 @@ def match_select_release(lines, fallback_window_s=120): "note": "no correlatable id (request_id/req_id/trace_id/session_id)", } ) - continue - - if key and key in release_by_key: - for ri in release_by_key[key]: - if ri not in release_used: - r = releases[ri] - matched.append( - { - "request_id": s["tags"].get("request_id", ""), - "worker": s["worker"], - "select_ts": s["ts"], - "release_ts": r["ts"], - "type": s["type"], - "match_method": key_type or "id", - } - ) - release_used.add(ri) - found = True - break - - if not found: - pending_selects.append(s) + pending_selects.append(s) + + # worker FIFO + ID 一致性联合校验: + # 1) 主匹配仍按 worker FIFO,保证在缺失 request_id 场景可工作 + # 2) 对已匹配对追加 ID 一致性检查(request_id/req_id/trace_id/session_id) + id_consistency = { + "both_present_and_equal": 0, + "both_present_but_mismatch": 0, + "only_select_has_id": 0, + "only_release_has_id": 0, + "both_missing": 0, + } + id_mismatched_matches = [] - # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 for s in pending_selects: - sdt = _parse_ts_safe(s["ts"]) + sdt = 
_parse_ts_safe(s.get("ts")) best_idx = None - best_delta = None + best_ts = None for ri in counter_release_indexes: - r = releases[ri] if ri in release_used: continue - if r.get("worker") != s.get("worker"): + r = releases[ri] + if r.get("worker_key") != s.get("worker_key"): continue rdt = _parse_ts_safe(r.get("ts")) - if sdt and rdt: - delta = (rdt - sdt).total_seconds() - if delta < 0 or delta > fallback_window_s: - continue - else: - delta = 0 - if best_delta is None or delta < best_delta: - best_delta = delta + # 优先选择时间不早于 select 的最早 release;解析失败则按出现顺序 + if sdt and rdt and rdt < sdt: + continue + if best_idx is None: + best_idx = ri + best_ts = rdt + elif rdt and best_ts and rdt < best_ts: best_idx = ri + best_ts = rdt if best_idx is not None: r = releases[best_idx] + s_key_type, s_key = _select_match_key(s.get("tags", {})) + r_key_type, r_key = _select_match_key(r.get("tags", {})) + if s_key and r_key: + if s_key == r_key: + id_check = "match" + id_consistency["both_present_and_equal"] += 1 + else: + id_check = "mismatch" + id_consistency["both_present_but_mismatch"] += 1 + id_mismatched_matches.append( + { + "worker": s["worker"], + "select_ts": s["ts"], + "release_ts": r["ts"], + "select_id_key": s_key_type, + "select_id": s_key, + "release_id_key": r_key_type, + "release_id": r_key, + "note": "worker FIFO matched, but ID mismatched", + } + ) + elif s_key and not r_key: + id_check = "select_only" + id_consistency["only_select_has_id"] += 1 + elif (not s_key) and r_key: + id_check = "release_only" + id_consistency["only_release_has_id"] += 1 + else: + id_check = "both_missing" + id_consistency["both_missing"] += 1 + matched.append( { "request_id": s["tags"].get("request_id", ""), @@ -677,7 +774,8 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], - "match_method": "worker_time_fallback", + "match_method": "worker_fifo", + "id_check": id_check, } ) release_used.add(best_idx) @@ -688,7 
+786,7 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found (request_id/worker-time)", + "note": "no matching release found (worker FIFO)", } ) @@ -697,14 +795,16 @@ def match_select_release(lines, fallback_window_s=120): per_worker = defaultdict(lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_releases": 0}) for s in selects: s_type = _normalize_worker_type(s.get("type")) - per_worker[s["worker"]]["selects"] += 1 + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker[wkey]["selects"] += 1 if s_type in ("prefill", "mixed"): - per_worker[s["worker"]]["token_selects"] += 1 + per_worker[wkey]["token_selects"] += 1 for r in releases: + wkey = r.get("worker_key") or _normalize_worker_url_key(r.get("worker")) if str(r.get("type", "")).endswith("_tokens"): - per_worker[r["worker"]]["token_releases"] += 1 + per_worker[wkey]["token_releases"] += 1 else: - per_worker[r["worker"]]["releases"] += 1 + per_worker[wkey]["releases"] += 1 pw_result = {} for w, counts in per_worker.items(): @@ -716,7 +816,34 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 基于 select 构建 worker URL -> dominant type 映射 + per_worker_type_counts = defaultdict(lambda: defaultdict(int)) + for s in selects: + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker_type_counts[wkey][_normalize_worker_type(s.get("type"))] += 1 + worker_dominant_type = {} + for w, counts in per_worker_type_counts.items(): + worker_dominant_type[w] = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] if counts else "unknown" + + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = 
_normalize_worker_type(r_type_raw.replace("_tokens", "")) + # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 + mapped_t = worker_dominant_type.get(r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown") + if mapped_t in ("prefill", "decode", "mixed"): + base_t = mapped_t + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,16 +857,57 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + # 每个 worker URL 的类型画像(基于 select) + worker_type_profile = {} + for w, counts in per_worker_type_counts.items(): + dominant = "unknown" + if counts: + dominant = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] + worker_type_profile[w] = { + "dominant_type": dominant, + "prefill": counts.get("prefill", 0), + "decode": counts.get("decode", 0), + "mixed": counts.get("mixed", 0), + "unknown": counts.get("unknown", 0), + } + + unmatched_releases = [] + for i, r in enumerate(releases): + if str(r.get("type", "")).endswith("_tokens"): + # token release: 近邻存在 prefill/mixed select 则视为可解释,不计入 unmatched + 
inferred_token_type = _normalize_worker_type(str(inferred_release_types.get(i, "unknown_tokens")).replace("_tokens", "")) + if inferred_token_type == "unknown": + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": inferred_token_type, + "release_kind": "token_release", + } + ) + continue + if i not in release_used: + unmatched_releases.append( + { + "worker": r.get("worker", ""), + "release_ts": r.get("ts", ""), + "type": _normalize_worker_type(inferred_release_types.get(i, "unknown")), + "release_kind": "request_release", + } + ) + return { "matched": matched, "unmatched_selects": unmatched_selects, + "unmatched_releases": unmatched_releases, "untracked_selects": untracked_selects, "failed_selects": failed_selects, "per_worker": pw_result, @@ -750,7 +918,10 @@ def match_select_release(lines, fallback_window_s=120): "with_alt_id": with_alt_id, "without_any_id": without_any_id, }, + "id_consistency": id_consistency, + "id_mismatched_matches": id_mismatched_matches, "type_summary": dict(type_summary), + "worker_type_profile": worker_type_profile, } @@ -949,6 +1120,24 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + check("id consistency exact match", 
msr["id_consistency"].get("both_present_and_equal", 0), 1) + + mismatch_lines = [ + "[INFO] 2026/04/12 10:01:00 logger.go:1: [request_id:r2] select worker (decode): http://10.0.0.2:9965, count: 1", + "[INFO] 2026/04/12 10:01:01 logger.go:1: [request_id:r3] release worker: http://10.0.0.2:9965, count: 0", + ] + mm = match_select_release(mismatch_lines) + check("id mismatch detected", mm["id_consistency"].get("both_present_but_mismatch", 0), 1) + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..803bf6fba43 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -24,6 +24,7 @@ import os import sys from datetime import datetime +from pathlib import Path # 确保能 import 同级模块 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -46,23 +47,56 @@ def determine_log_file(user_path=None): 3. 
fd-router.log(golang_router 根目录) """ if user_path: - if os.path.isfile(user_path): - return user_path + p = Path(user_path).expanduser() + if p.is_file(): + return str(p) print(f"ERROR: 文件不存在: {user_path}", file=sys.stderr) + print( + "提示: 若路径含空格/括号,请使用引号,例如: " + "python3 scripts/troubleshoot.py 'fastdeploy/golang_router/logs/fd-router (2).log' --load", + file=sys.stderr, + ) sys.exit(1) - # 尝试不同 CWD 下的候选路径 - candidates = [ - "logs/router.log", # CWD = golang_router/ - "fd-router.log", # CWD = golang_router/ - "fastdeploy/golang_router/logs/router.log", # CWD = 项目根 - "fastdeploy/golang_router/fd-router.log", # CWD = 项目根 + # 统一基于脚本位置与当前工作目录搜索,避免 CWD 差异导致找不到日志。 + script_dir = Path(__file__).resolve().parent + golang_router_dir = script_dir.parents[2] # .../fastdeploy/golang_router + cwd = Path.cwd() + + # 精确候选(优先常见命名) + exact_candidates = [ + golang_router_dir / "logs" / "router.log", + golang_router_dir / "fd-router.log", + cwd / "logs" / "router.log", + cwd / "fd-router.log", + cwd / "fastdeploy" / "golang_router" / "logs" / "router.log", + cwd / "fastdeploy" / "golang_router" / "fd-router.log", ] - for path in candidates: - if os.path.isfile(path): - return path + for p in exact_candidates: + if p.is_file(): + return str(p) + + # 模糊候选:支持 fd-router (2).log 等命名 + pattern_roots = [ + golang_router_dir / "logs", + golang_router_dir, + cwd / "logs", + cwd, + cwd / "fastdeploy" / "golang_router" / "logs", + cwd / "fastdeploy" / "golang_router", + ] + dynamic_candidates = [] + for root in pattern_roots: + if not root.is_dir(): + continue + dynamic_candidates.extend(sorted(root.glob("fd-router*.log"))) + dynamic_candidates.extend(sorted(root.glob("router*.log"))) + + if dynamic_candidates: + return str(dynamic_candidates[0]) print("ERROR: 未找到日志文件。请指定路径或检查 logs/ 目录。", file=sys.stderr) + print("已搜索: logs/router.log, fd-router.log, fd-router*.log, router*.log", file=sys.stderr) sys.exit(1) @@ -128,7 +162,14 @@ def determine_status(results): reasons.append(d["message"]) if 
reasons: - return "DEGRADED", ", ".join(reasons) + # 去重但保留完整信息 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + return "DEGRADED", ";".join(deduped) if not results: return "HEALTHY", "无分析数据" @@ -148,19 +189,65 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "errors_topn": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") # 各维度报告 if "errors" in results: parts.append(format_errors_report(results["errors"])) + if results["errors"].get("error_top_n"): + lines = [ + "# Errors TopN 详情", + "", + "| 模板 | 数量 | 级别 | 来源层 | 影响 |", + "|:--|--:|:--|:--|:--|", + ] + for e in results["errors"]["error_top_n"]: + lines.append( + f'| {e.get("template","")} | {e.get("count",0)} | {e.get("level","")} | {e.get("source_layer","")} | {e.get("impact","-")} |' + ) + lines.append("") + lines.append("## 涉及 URLs") + lines.append("") + for e in results["errors"]["error_top_n"]: + urls = e.get("urls") or [] + if not urls: + continue + lines.append(f'- 模板: {e.get("template","")}') + for u in urls: + lines.append(f' - {u}') + lines.append("") + details["errors_topn"] = "\n".join(lines) if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: + 
lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +260,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} 
reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +353,40 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if 
details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + if details.get("errors_topn"): + with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: + f.write(details["errors_topn"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From a317cecf8df73eaa751ff1cd8e4612e9818899d1 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 12:56:38 +0800 Subject: [PATCH 19/40] fix(load): treat positive req delta as possible in-flight requests --- .../troubleshoot/references/error_catalog.md | 1 + .../troubleshoot/references/log_patterns.md | 11 + .../references/report_templates.md | 12 +- .../troubleshoot/scripts/analyzers/cache.py | 131 +++++++++- .../troubleshoot/scripts/analyzers/errors.py | 56 +++-- .../troubleshoot/scripts/analyzers/health.py | 38 ++- .../troubleshoot/scripts/analyzers/latency.py | 8 +- .../troubleshoot/scripts/analyzers/load.py | 81 ++++-- .../scripts/analyzers/load_report.py | 77 +++--- .../troubleshoot/scripts/analyzers/trace.py | 25 +- .../skills/troubleshoot/scripts/chart.py | 3 +- 
.../skills/troubleshoot/scripts/log_parser.py | 234 ++++++++++++------ .../troubleshoot/scripts/troubleshoot.py | 198 +++++++++++++-- 13 files changed, 710 insertions(+), 165 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md index ba48297d9c9..60b4931b546 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -61,6 +61,7 @@ | `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | | `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | | `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | | `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md index cf33b41f723..4322909c01d 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -233,6 +233,17 @@ PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` --- +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 
统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + ## 使用脚本工具 各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index ba9e40e9869..cd705d02816 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -44,6 +44,7 @@ ### 简洁版(终端输出) - 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 - 按三层分类(Router / FD 后端 / 客户端) - 每个问题一行摘要 + 关键指标 - 末尾提示详细版文件路径 @@ -53,8 +54,15 @@ - 路径:`skill_output/troubleshoot//troubleshoot_report_.md` - 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) - 详情拆分到 `details/` 子目录: - - `details/health_events.md` — Worker 逐分钟健康事件 - - `details/trace_.md` — 请求追踪事件链 + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/errors_topn.md` — ERROR/WARN 模板明细(数量/级别/来源层/影响 + URLs) + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace_.md` — 请求追踪事件链 --- diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3fca296f4d6..3a5c19ad00b 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -136,6 +136,12 @@ def 
analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weig "cold_starts": cold_starts, "hitratio_stats": hitratio_stats, "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), "diagnoses": diagnoses, "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " f"冷启动 {cold_starts}", @@ -339,6 +345,45 @@ def _diagnose( return diagnoses +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ + { + "avg_stickiness_pct": round(avg_stickiness, 1), + "mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + # ════════════════════════════════════════════════════════════════ # 报告格式化 # ════════════════════════════════════════════════════════════════ @@ -349,13 +394,18 @@ 
def format_cache_report(result): sections = ["## Cache 调度诊断", ""] sections.append(f' {result["summary"]}') sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") # 策略分布 if result["strategy_dist"]: @@ -364,6 +414,10 @@ def format_cache_report(result): bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # hitRatio 统计 hs = result.get("hitratio_stats", {}) @@ -383,6 +437,10 @@ def format_cache_report(result): bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] sections.append(render_bar(bar_data, show_count=True)) sections.append("") + detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") # Tokenizer 退化 if result.get("tokenizer_degraded_count", 0) > 0: @@ -394,6 +452,8 @@ def format_cache_report(result): if stickiness: sections.append("### Session 粘性") sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") table_data = [ { "Session": sid[:16], @@ -403,26 +463,37 @@ def format_cache_report(result): } for sid, s in 
sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) ] - sections.append( + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( render_table( - table_data[:10], + table_data, columns=["Session", "请求数", "粘性率", "切换次数"], right_align={"请求数", "粘性率", "切换次数"}, ) ) - sections.append("") + detail_sections.append("") # 非最优选择 if result.get("suboptimal_selections"): subs = result["suboptimal_selections"] sections.append(f"### 非最优选择 ({len(subs)} 次)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): sections.append(f" {reason}: {count} 次") sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") # 驱逐影响 if result.get("eviction_impact"): @@ -430,13 +501,61 @@ def format_cache_report(result): evicted = [e for e in evictions if e["evicted"]] sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: + detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') sections.append("") + detail_sections.append("## 冷启动识别") + 
detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + if result.get("cross_diagnosis"): + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + + if any( + [ + result.get("session_stickiness"), + result.get("suboptimal_selections"), + result.get("eviction_impact"), + result.get("cross_diagnosis"), + result.get("diagnoses"), + ] + ): + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") - return "\n".join(sections) + return "\n".join(sections), "\n".join(detail_sections) # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py index b8217a5ffa4..f0e4c352b6c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -33,6 +33,7 @@ ("counter already zero", "Router"), ("tokenizer failed", "Router"), ("Instance {url} role 
is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), # 客户端 ("Invalid request body", "客户端"), ("Invalid JSON format", "客户端"), @@ -55,6 +56,15 @@ ("GetRemoteMetrics failed", "FD 后端"), ] +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + ("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + # scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 SCANNER_COPY_PATTERNS = ("scanner error", "copy error") @@ -75,6 +85,13 @@ def classify_source_layer(template, original=""): return "未知" +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + # ════════════════════════════════════════════════════════════════ # 主分析函数 # ════════════════════════════════════════════════════════════════ @@ -182,7 +199,9 @@ def _compute_error_top_n(records, top_n): "count": g["count"], "pct": round(g["count"] / total * 100, 1) if total else 0, "source_layer": source_layer, + "impact": classify_impact(g["template"]), "level": g["level"], + "urls": _extract_urls(g["originals"]), "sample_originals": g["originals"], } ) @@ -192,6 +211,16 @@ def _compute_error_top_n(records, top_n): return result +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + def _grep_lines(log_file, pattern, tail=None): """用 grep 从日志文件提取匹配行。""" try: @@ -240,6 +269,9 @@ def format_errors_report(result): f'请求总数: {result["total_requests"]} | ' f'错误率: {result["error_rate"]}%' ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 
200;并不代表没有 ERROR/WARN 日志。") sections.append("") # Panic @@ -266,22 +298,16 @@ def format_errors_report(result): sections.append(render_bar(bar_data, show_count=True)) sections.append("") - # 来源层表格 - table_data = [] - for e in result["error_top_n"][:10]: - table_data.append( - { - "模板": e["template"][:60], - "数量": e["count"], - "占比": f'{e["pct"]}%', - "级别": e["level"], - "来源层": e["source_layer"], - } - ) - sections.append( - render_table(table_data, columns=["模板", "数量", "占比", "级别", "来源层"], right_align={"数量", "占比"}) - ) + sections.append(" 具体模板表见: [../detail/errors_topn.md](../detail/errors_topn.md)") sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") # 状态码分布 if result["status_code_dist"]: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py index ca01d718dbc..5d1994d9405 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -150,12 +150,15 @@ def _build_worker_timelines(health_events, counter_events, register_events): break all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) all_events.extend(recovery_events) all_events.sort(key=lambda e: e["ts"] or "") down_periods = _compute_down_periods(all_events) down_count = len(down_periods) avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) workers[url] = { 
"events": all_events, @@ -165,6 +168,7 @@ def _build_worker_timelines(health_events, counter_events, register_events): "recovered": recovered, "inflight_preserved": counter_counts.get(url, 0), "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, } return workers @@ -191,6 +195,24 @@ def _compute_down_periods(events): return down_periods +def _compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + def _compute_uptime_pct(events): """计算 Worker 可用性百分比。""" if not events: @@ -313,8 +335,7 @@ def format_health_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(" 诊断见详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") # Worker 可用性表格 @@ -335,6 +356,7 @@ def format_health_report(result): "在线率": f'{w["uptime_pct"]}%', "下线次数": str(w["down_count"]), "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", } @@ -342,8 +364,8 @@ def format_health_report(result): sections.append( render_table( table_data, - columns=["Worker", "在线率", "下线次数", "平均下线时长", "恢复", "inflight保留"], - right_align={"在线率", "下线次数", "平均下线时长", "inflight保留"}, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + 
right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, ) ) sections.append("") @@ -360,6 +382,12 @@ def format_health_report(result): # 事件详情 → 拆分到 detail_text detail_parts = ["# Worker 健康事件详情", ""] has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") for url, w in sorted(result["workers"].items()): if w["events"]: has_events = True @@ -373,7 +401,7 @@ def format_health_report(result): # 主报告中添加引用 if has_events: - sections.append("> 完整事件详情: [details/health_events.md](details/health_events.md)") + sections.append("> 完整事件详情: [detail/health_events.md](../detail/health_events.md)") sections.append("") return "\n".join(sections), detail_text diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py index eec862910e8..508cf3824d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -255,6 +255,7 @@ def format_latency_report(result): f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' f'max={_fmt_ms(stats["max"])}' ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") sections.append("") # 延迟分布 @@ -331,13 +332,10 @@ def format_latency_report(result): ) sections.append("") - # 诊断 + # 诊断(仅在 detail 输出) if result["diagnoses"]: sections.append("### 诊断") - for d in result["diagnoses"]: - severity_mark = {"CRITICAL": "!!", "HIGH": "!", "MEDIUM": "~", "LOW": "-", "INFO": " "} - mark = severity_mark.get(d["severity"], " ") - sections.append(f' [{mark}] {d["message"]}') + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](../detail/latency_diagnoses.md)") sections.append("") return 
"\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index c38b0b80953..7b59b6c5f01 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -30,6 +30,8 @@ # Token 事件 SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)") +RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)") def _strip_scheme(url): @@ -135,11 +137,21 @@ def analyze_load(log_file, tail=None): sr_result = ( match_select_release(h3_lines + h11_lines) if h3_lines - else {"matched": [], "unmatched_selects": [], "untracked_selects": [], "failed_selects": [], "per_worker": {}} + else { + "matched": [], + "unmatched_selects": [], + "unmatched_releases": [], + "failed_selects": [], + "per_worker": {}, + "id_coverage": {}, + "type_summary": {}, + "worker_type_profile": {}, + } ) # Token 统计 token_stats = _analyze_tokens(h3_lines, h11_lines) + counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines) # 请求堆积检测 pileup = _detect_pileup(stats_records) @@ -154,6 +166,7 @@ def analyze_load(log_file, tail=None): "counter_anomalies": anomaly_summary, "select_release": sr_result, "token_stats": token_stats, + "counter_last_state": counter_last_state, "pileup_detected": pileup, "diagnoses": diagnoses, "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)", @@ -191,6 +204,55 @@ def _analyze_tokens(h3_lines, h11_lines): return result +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = defaultdict( + lambda: { + 
"req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + def _detect_pileup(stats_records): """检测请求堆积:total_running 连续上升 >5 个采样点。""" if len(stats_records) < 5: @@ -254,25 +316,16 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, # Select/Release 不一致(仅在存在可关联 ID 时启用,避免无 ID 场景误报) if has_correlatable_ids: for w_url, pw in sr_result.get("per_worker", {}).items(): - if pw.get("delta", 0) > 0: + delta = pw.get("delta", 0) + if delta >= 3: diagnoses.append( { - "severity": "HIGH", - "message": f'{_strip_scheme(w_url)} select-release 差值 {pw["delta"]}(请求泄漏/卡住)', + "severity": "MEDIUM", + "message": f'{_strip_scheme(w_url)} select-release 差值 {delta}(可能存在在途请求堆积)', "source_layer": "FD 后端", } ) - # 卡住的请求 - if sr_result.get("unmatched_selects"): - diagnoses.append( - { - "severity": "HIGH", - "message": f'{len(sr_result["unmatched_selects"])} 个 select 无对应 release(疑似卡住)', - "source_layer": "FD 后端", - } - ) - # Token 计数器潜在泄漏 for t in token_stats: if 
t.get("alloc_count", 0) > t.get("release_count", 0): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py index 86ba1f0d94f..5cbdc829bf6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -25,8 +25,7 @@ def format_load_report(result): if result["diagnoses"]: sections.append("### 诊断") sections.append("") - for d in result["diagnoses"]: - sections.append(f' [{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + sections.append(f' 共 {len(result["diagnoses"])} 条诊断,见详情: [detail/load_diagnoses.md](../detail/load_diagnoses.md)') sections.append("") detail_sections.append("## 诊断") detail_sections.append("") @@ -39,6 +38,7 @@ def format_load_report(result): if ls: sections.append("### 负载概览 (total_running)") sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") sections.append( f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' @@ -108,6 +108,9 @@ def format_load_report(result): sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) sections.append("") sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: `release prefill tokens` 会被识别为 token-release;worker type 按该 worker URL 在 select 中的类型映射(prefill/decode/mixed)。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") sections.append("") detail_sections.append("## 按类型统计") detail_sections.append("") @@ -174,10 +177,35 @@ def format_load_report(result): if no_correlatable_id: sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") 
sections.append("") + sections.append(" ℹ ReqDelta>0 可能仅表示仍有在途请求(尚未完成推理),需结合时间窗口观察。") + sections.append("") sections.append(" 说明: TokenSelect 按 worker type 统计(prefill + mixed 的 select 都计入),不依赖日志里是否出现 tokens 字段。") sections.append("") detail_sections.append("## Select/Release Per-Worker") detail_sections.append("") + + if sr.get("worker_type_profile"): + sections.append("### Worker URL 类型画像(基于 select)") + sections.append("") + rows = [] + for w, p in sorted(sr["worker_type_profile"].items()): + rows.append( + { + "Worker": _strip_scheme(w), + "Dominant": p.get("dominant_type", "unknown"), + "Prefill": p.get("prefill", 0), + "Decode": p.get("decode", 0), + "Mixed": p.get("mixed", 0), + } + ) + sections.append( + render_table( + rows, + columns=["Worker", "Dominant", "Prefill", "Decode", "Mixed"], + right_align={"Prefill", "Decode", "Mixed"}, + ) + ) + sections.append("") detail_sections.append( render_table( table_data, @@ -187,35 +215,6 @@ def format_load_report(result): ) detail_sections.append("") - if sr.get("unmatched_selects"): - sections.append(f' ⚠ {len(sr["unmatched_selects"])} 个未匹配 select(疑似请求卡住)') - sections.append(" 解释: 出现 request select,但在 request release 口径下找不到匹配。可能是请求卡住、日志缺失、或窗口外释放。") - for u in sr["unmatched_selects"][:3]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") - sections.append("") - detail_sections.append("## 未匹配 select(完整)") - detail_sections.append("") - for u in sr["unmatched_selects"]: - detail_sections.append( - f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' - ) - detail_sections.append("") - - if sr.get("untracked_selects"): - sections.append(f' ℹ {len(sr["untracked_selects"])} 个 select 缺少可关联 ID,未参与卡住判定') - for u in sr["untracked_selects"][:3]: - sections.append(f' [{u.get("select_ts","")}] {_strip_scheme(u["worker"])} ({u["type"]})') - 
sections.append(" > 完整列表见: [details/load_select_release.md](details/load_select_release.md)") - sections.append("") - detail_sections.append("## Untracked selects(缺少可关联 ID)") - detail_sections.append("") - for u in sr["untracked_selects"]: - detail_sections.append( - f'- [{u.get("select_ts","")}] worker={_strip_scheme(u["worker"])} type={u["type"]} note={u.get("note","")}' - ) - detail_sections.append("") - if sr.get("failed_selects"): sections.append(f' ⚠ Failed to select: {len(sr["failed_selects"])} 次') sections.append(" 解释: 路由在该时刻未能选出可用 worker,通常意味着可用池不足或健康状态异常。") @@ -239,4 +238,20 @@ def format_load_report(result): ) sections.append("") + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + sections.append(" 末状态详情见: [detail/load_counter_state.md](../detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( + result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 6c9a0323724..24af9a23500 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -16,6 +16,7 @@ from log_parser import ( extract_tags, extract_ts, + match_select_release, parse_cache_strategy_line, parse_http_line, ) @@ -108,12 +109,14 @@ def analyze_trace(log_file, trace_ids, tail=None): # 解析事件链 events = _parse_event_chain(all_lines) lifecycle_complete = _check_lifecycle_complete(events) - diagnoses 
= _diagnose_trace(events, lifecycle_complete) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, + "sr_check": sr_check, "matched_tag": "session_id" if is_session else "request_id/trace_id", "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], @@ -271,7 +274,7 @@ def _check_lifecycle_complete(events): return has_entry and has_exit and (not has_select or has_release) -def _diagnose_trace(events, lifecycle_complete): +def _diagnose_trace(events, lifecycle_complete, sr_check=None): """生成追踪诊断。""" diagnoses = [] types = [e["type"] for e in events] @@ -294,6 +297,22 @@ def _diagnose_trace(events, lifecycle_complete): if "FAILED_SELECT" in types: diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + return diagnoses @@ -367,7 +386,7 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [details/trace_{safe_tid}.md](details/trace_{safe_tid}.md)") + sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py index 83bb0203432..1eaea1369f8 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -227,9 +227,10 @@ def render_table(data, columns=None, right_align=None): w = col_widths[col] if col in right_align: header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") else: header_parts.append(f" {col:<{w}} ") - sep_parts.append("-" * (w + 2)) + sep_parts.append(":" + "-" * (w + 1)) lines = [] lines.append("|" + "|".join(header_parts) + "|") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 200f976f2ff..2f98511a811 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -501,6 +501,83 @@ def _normalize_worker_type(worker_type): return "unknown" +def _normalize_worker_url_key(url): + if not url: + return "" + return re.sub(r"^https?://", "", str(url).strip().rstrip("/")) + + +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return 
_normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + def match_select_release(lines, fallback_window_s=120): """匹配 select/release worker 事件对。 @@ -525,6 +602,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": tm.group(2), + "worker_key": _normalize_worker_url_key(tm.group(2)), "type": _normalize_worker_type(tm.group(1)), "tags": tags, "tokens": int(tm.group(3)), @@ -536,12 +614,15 @@ def match_select_release(lines, fallback_window_s=120): # Token-bearing release trm = RELEASE_TOKENS_RE.search(line) if trm: - token_type = trm.group(1) or "prefill" + token_type = trm.group(1) releases.append( { "ts": ts, "worker": trm.group(2), - "type": f'{_normalize_worker_type(token_type)}_tokens', + "worker_key": _normalize_worker_url_key(trm.group(2)), + # 文本默认按 prefill 记,再结合同 worker 邻近 select 做纠偏(mixed 场景) + "type": f'{_normalize_worker_type(token_type or "prefill")}_tokens', + "raw_token_type": token_type or "", "tags": tags, "tokens": int(trm.group(3)), "line": line_no, @@ -555,6 
+636,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": sm.group(2), + "worker_key": _normalize_worker_url_key(sm.group(2)), "type": _normalize_worker_type(sm.group(1)), "tags": tags, "tokens": None, @@ -569,6 +651,7 @@ def match_select_release(lines, fallback_window_s=120): { "ts": ts, "worker": rm.group(2), + "worker_key": _normalize_worker_url_key(rm.group(2)), "type": _normalize_worker_type(rm.group(1)), "tags": tags, "tokens": None, @@ -580,20 +663,14 @@ def match_select_release(lines, fallback_window_s=120): if FAILED_SELECT_RE.search(line): failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) - # Match by request_id / alt_id + # Match by worker FIFO(select -> 同 worker 下一条 release) matched = [] unmatched_selects = [] release_used = set() # 请求生命周期匹配只使用 request counter release(排除 token release) + # 说明:request_id 只用于覆盖率观测,不参与 select/release 配对条件。 counter_release_indexes = [i for i, r in enumerate(releases) if not str(r.get("type", "")).endswith("_tokens")] - release_by_key = defaultdict(list) - for i in counter_release_indexes: - r = releases[i] - _, key = _select_match_key(r.get("tags", {})) - if key: - release_by_key[key].append(i) - # 请求 ID 覆盖(按 select 事件近似请求数) total_req_est = len(selects) with_request_id = 0 @@ -601,7 +678,6 @@ def match_select_release(lines, fallback_window_s=120): without_any_id = 0 pending_selects = [] - untracked_selects = [] for s in selects: key_type, key = _select_match_key(s.get("tags", {})) if key_type == "request_id": @@ -611,62 +687,28 @@ def match_select_release(lines, fallback_window_s=120): else: without_any_id += 1 - found = False - if not key: - # 没有任何可用 ID 时,不做退化匹配(只统计可观测信息) - untracked_selects.append( - { - "worker": s["worker"], - "select_ts": s["ts"], - "type": s["type"], - "tags": s["tags"], - "note": "no correlatable id (request_id/req_id/trace_id/session_id)", - } - ) - continue + pending_selects.append(s) - if key and key in release_by_key: - for ri in release_by_key[key]: - 
if ri not in release_used: - r = releases[ri] - matched.append( - { - "request_id": s["tags"].get("request_id", ""), - "worker": s["worker"], - "select_ts": s["ts"], - "release_ts": r["ts"], - "type": s["type"], - "match_method": key_type or "id", - } - ) - release_used.add(ri) - found = True - break - - if not found: - pending_selects.append(s) - - # Fallback: 有 ID 但未匹配时,按 worker + 时间邻近匹配 for s in pending_selects: - sdt = _parse_ts_safe(s["ts"]) + sdt = _parse_ts_safe(s.get("ts")) best_idx = None - best_delta = None + best_ts = None for ri in counter_release_indexes: - r = releases[ri] if ri in release_used: continue - if r.get("worker") != s.get("worker"): + r = releases[ri] + if r.get("worker_key") != s.get("worker_key"): continue rdt = _parse_ts_safe(r.get("ts")) - if sdt and rdt: - delta = (rdt - sdt).total_seconds() - if delta < 0 or delta > fallback_window_s: - continue - else: - delta = 0 - if best_delta is None or delta < best_delta: - best_delta = delta + # 优先选择时间不早于 select 的最早 release;解析失败则按出现顺序 + if sdt and rdt and rdt < sdt: + continue + if best_idx is None: best_idx = ri + best_ts = rdt + elif rdt and best_ts and rdt < best_ts: + best_idx = ri + best_ts = rdt if best_idx is not None: r = releases[best_idx] @@ -677,7 +719,7 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "release_ts": r["ts"], "type": s["type"], - "match_method": "worker_time_fallback", + "match_method": "worker_fifo", } ) release_used.add(best_idx) @@ -688,7 +730,7 @@ def match_select_release(lines, fallback_window_s=120): "select_ts": s["ts"], "type": s["type"], "tags": s["tags"], - "note": "no matching release found (request_id/worker-time)", + "note": "no matching release found (worker FIFO)", } ) @@ -697,14 +739,16 @@ def match_select_release(lines, fallback_window_s=120): per_worker = defaultdict(lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_releases": 0}) for s in selects: s_type = _normalize_worker_type(s.get("type")) - 
per_worker[s["worker"]]["selects"] += 1 + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker[wkey]["selects"] += 1 if s_type in ("prefill", "mixed"): - per_worker[s["worker"]]["token_selects"] += 1 + per_worker[wkey]["token_selects"] += 1 for r in releases: + wkey = r.get("worker_key") or _normalize_worker_url_key(r.get("worker")) if str(r.get("type", "")).endswith("_tokens"): - per_worker[r["worker"]]["token_releases"] += 1 + per_worker[wkey]["token_releases"] += 1 else: - per_worker[r["worker"]]["releases"] += 1 + per_worker[wkey]["releases"] += 1 pw_result = {} for w, counts in per_worker.items(): @@ -716,7 +760,34 @@ def match_select_release(lines, fallback_window_s=120): "token_releases": counts["token_releases"], } - # 按 worker type 分类统计(prefill/decode/mixed) + # 基于 select 构建 worker URL -> dominant type 映射 + per_worker_type_counts = defaultdict(lambda: defaultdict(int)) + for s in selects: + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker_type_counts[wkey][_normalize_worker_type(s.get("type"))] += 1 + worker_dominant_type = {} + for w, counts in per_worker_type_counts.items(): + worker_dominant_type[w] = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] if counts else "unknown" + + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 + mapped_t = worker_dominant_type.get(r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown") + if mapped_t in ("prefill", "decode", "mixed"): + base_t = mapped_t + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = 
_infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) type_summary = defaultdict( lambda: { "counter_selects": 0, @@ -730,17 +801,32 @@ def match_select_release(lines, fallback_window_s=120): type_summary[s_type]["counter_selects"] += 1 if s_type in ("prefill", "mixed"): type_summary[s_type]["token_selects"] += 1 - for r in releases: - r_type = _normalize_worker_type(str(r.get("type", "")).replace("_tokens", "")) - if str(r.get("type", "")).endswith("_tokens"): + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): type_summary[r_type]["token_releases"] += 1 else: type_summary[r_type]["counter_releases"] += 1 + # 每个 worker URL 的类型画像(基于 select) + worker_type_profile = {} + for w, counts in per_worker_type_counts.items(): + dominant = "unknown" + if counts: + dominant = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] + worker_type_profile[w] = { + "dominant_type": dominant, + "prefill": counts.get("prefill", 0), + "decode": counts.get("decode", 0), + "mixed": counts.get("mixed", 0), + "unknown": counts.get("unknown", 0), + } + return { "matched": matched, "unmatched_selects": unmatched_selects, - "untracked_selects": untracked_selects, + "unmatched_releases": [], "failed_selects": failed_selects, "per_worker": pw_result, "id_coverage": { @@ -751,6 +837,7 @@ def match_select_release(lines, fallback_window_s=120): "without_any_id": without_any_id, }, "type_summary": dict(type_summary), + "worker_type_profile": worker_type_profile, } @@ -949,6 +1036,15 @@ def check(name, got, expected): "dial tcp {ip:port}: connection refused", ) + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker 
(mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index a818d31150f..803bf6fba43 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -24,6 +24,7 @@ import os import sys from datetime import datetime +from pathlib import Path # 确保能 import 同级模块 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -46,23 +47,56 @@ def determine_log_file(user_path=None): 3. 
fd-router.log(golang_router 根目录) """ if user_path: - if os.path.isfile(user_path): - return user_path + p = Path(user_path).expanduser() + if p.is_file(): + return str(p) print(f"ERROR: 文件不存在: {user_path}", file=sys.stderr) + print( + "提示: 若路径含空格/括号,请使用引号,例如: " + "python3 scripts/troubleshoot.py 'fastdeploy/golang_router/logs/fd-router (2).log' --load", + file=sys.stderr, + ) sys.exit(1) - # 尝试不同 CWD 下的候选路径 - candidates = [ - "logs/router.log", # CWD = golang_router/ - "fd-router.log", # CWD = golang_router/ - "fastdeploy/golang_router/logs/router.log", # CWD = 项目根 - "fastdeploy/golang_router/fd-router.log", # CWD = 项目根 + # 统一基于脚本位置与当前工作目录搜索,避免 CWD 差异导致找不到日志。 + script_dir = Path(__file__).resolve().parent + golang_router_dir = script_dir.parents[2] # .../fastdeploy/golang_router + cwd = Path.cwd() + + # 精确候选(优先常见命名) + exact_candidates = [ + golang_router_dir / "logs" / "router.log", + golang_router_dir / "fd-router.log", + cwd / "logs" / "router.log", + cwd / "fd-router.log", + cwd / "fastdeploy" / "golang_router" / "logs" / "router.log", + cwd / "fastdeploy" / "golang_router" / "fd-router.log", ] - for path in candidates: - if os.path.isfile(path): - return path + for p in exact_candidates: + if p.is_file(): + return str(p) + + # 模糊候选:支持 fd-router (2).log 等命名 + pattern_roots = [ + golang_router_dir / "logs", + golang_router_dir, + cwd / "logs", + cwd, + cwd / "fastdeploy" / "golang_router" / "logs", + cwd / "fastdeploy" / "golang_router", + ] + dynamic_candidates = [] + for root in pattern_roots: + if not root.is_dir(): + continue + dynamic_candidates.extend(sorted(root.glob("fd-router*.log"))) + dynamic_candidates.extend(sorted(root.glob("router*.log"))) + + if dynamic_candidates: + return str(dynamic_candidates[0]) print("ERROR: 未找到日志文件。请指定路径或检查 logs/ 目录。", file=sys.stderr) + print("已搜索: logs/router.log, fd-router.log, fd-router*.log, router*.log", file=sys.stderr) sys.exit(1) @@ -128,7 +162,14 @@ def determine_status(results): reasons.append(d["message"]) if 
reasons: - return "DEGRADED", ", ".join(reasons) + # 去重但保留完整信息 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + return "DEGRADED", ";".join(deduped) if not results: return "HEALTHY", "无分析数据" @@ -148,19 +189,65 @@ def format_full_report(results, status, status_reason): - 'trace_files': {trace_id: text} 或 {} """ parts = [] - details = {"health_events": None, "load_select_release": None, "trace_files": {}} + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "errors_topn": None, + "trace_files": {}, + } # 状态行 parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) parts.append("=" * 60) parts.append("") # 各维度报告 if "errors" in results: parts.append(format_errors_report(results["errors"])) + if results["errors"].get("error_top_n"): + lines = [ + "# Errors TopN 详情", + "", + "| 模板 | 数量 | 级别 | 来源层 | 影响 |", + "|:--|--:|:--|:--|:--|", + ] + for e in results["errors"]["error_top_n"]: + lines.append( + f'| {e.get("template","")} | {e.get("count",0)} | {e.get("level","")} | {e.get("source_layer","")} | {e.get("impact","-")} |' + ) + lines.append("") + lines.append("## 涉及 URLs") + lines.append("") + for e in results["errors"]["error_top_n"]: + urls = e.get("urls") or [] + if not urls: + continue + lines.append(f'- 模板: {e.get("template","")}') + for u in urls: + lines.append(f' - {u}') + lines.append("") + details["errors_topn"] = "\n".join(lines) if "latency" in results: parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: + 
lines.append(f'[{d.get("severity","")}] {d.get("message","")}') + lines.append("") + details["latency_diagnoses"] = "\n".join(lines) if "health" in results: summary, detail = format_health_report(results["health"]) @@ -173,9 +260,58 @@ def format_full_report(results, status, status_reason): parts.append(summary) if detail: details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) if "cache" in results: - parts.append(format_cache_report(results["cache"])) + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + if c.get("session_stickiness"): + lines = ["# Cache Session 粘性详情", ""] + for sid, s in c["session_stickiness"].items(): + lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + if c.get("suboptimal_selections"): + lines = ["# Cache 非最优选择详情", ""] + for x in c["suboptimal_selections"][:200]: + lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} 
reason={x.get("reason","")}') + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + if c.get("eviction_impact"): + lines = ["# Cache 驱逐影响详情", ""] + for x in c["eviction_impact"][:200]: + lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append("") + details["cache_eviction"] = "\n".join(lines) + if c.get("fallback_reasons"): + lines = ["# Cache Fallback 原因详情", ""] + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + lines.append("") + details["cache_fallback"] = "\n".join(lines) + if c.get("cross_diagnosis"): + lines = ["# Cache 交叉诊断详情", ""] + for x in c["cross_diagnosis"]: + lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -217,6 +353,40 @@ def save_detailed_report(report_text, output_dir, details=None): with open(load_path, "w", encoding="utf-8") as f: f.write(details["load_select_release"]) + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if 
details.get("cache_session_stickiness"): + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal"): + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction"): + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback"): + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross"): + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + if details.get("errors_topn"): + with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: + f.write(details["errors_topn"]) + for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") From 0afcff2beef44a81e350280f6f6d279d756b8e51 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 14:35:02 +0800 Subject: [PATCH 20/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- .../troubleshoot/scripts/analyzers/load.py | 7 +- .../skills/troubleshoot/scripts/log_parser.py | 33 +- fastdeploy/golang_router/cmd/main.go | 10 +- .../config/config.example.yaml | 3 + .../config/config.example.yaml | 3 + .../golang_router/internal/config/config.go | 16 +- .../internal/gateway/completions_test.go | 2 +- .../internal/manager/health_test.go | 2 +- .../internal/middleware/logger_test.go | 2 +- .../scheduler/handler/prefill_cache_aware.go | 3 + .../scheduler/handler/tokenizer_test.go | 12 +- fastdeploy/golang_router/pkg/logger/logger.go | 288 +++++++++++++++++- .../golang_router/pkg/logger/logger_test.go | 15 +- 13 files 
changed, 335 insertions(+), 61 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py index 7b59b6c5f01..83b9c8a05e1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -15,7 +15,6 @@ from log_parser import extract_ts, match_select_release, parse_stats_line from stats import compute_statistics, time_bucket -from analyzers.load_report import format_load_report # ════════════════════════════════════════════════════════════════ # Counter 异常检测正则 @@ -82,7 +81,7 @@ def analyze_load(log_file, tail=None): r"counter preserved|cleanup unhealthy|removed counters|counter already|double-release|preserved counters", tail, ) - h11_lines = _grep_lines(log_file, r"release (?:[a-zA-Z_]+\s+)?tokens", tail) + h11_lines = _grep_lines(log_file, r"release [a-zA-Z_]+ tokens:", tail) # 解析 stats 行 stats_records = [r for line in h7_lines for r in [parse_stats_line(line)] if r] @@ -321,7 +320,7 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, diagnoses.append( { "severity": "MEDIUM", - "message": f'{_strip_scheme(w_url)} select-release 差值 {delta}(可能存在在途请求堆积)', + "message": f"{_strip_scheme(w_url)} select-release 差值 {delta}(可能存在在途请求堆积)", "source_layer": "FD 后端", } ) @@ -345,8 +344,6 @@ def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, # ════════════════════════════════════════════════════════════════ - - # ════════════════════════════════════════════════════════════════ # Grep 工具 # ════════════════════════════════════════════════════════════════ diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py index 44a0f285fe5..99864e1de16 100644 --- 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -506,6 +506,7 @@ def _normalize_worker_url_key(url): return "" return re.sub(r"^https?://", "", str(url).strip().rstrip("/")) + def _infer_release_worker_type(release, selects, fallback_window_s=120): """为未显式标注 type 的 release 近似推断 worker type。 @@ -554,7 +555,11 @@ def _infer_token_release_worker_type(release, selects, fallback_window_s=120): return "unknown" r_ts = _parse_ts_safe(release.get("ts")) - candidates = [s for s in selects if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed")] + candidates = [ + s + for s in selects + if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed") + ] if not candidates: return "unknown" @@ -716,31 +721,14 @@ def match_select_release(lines, fallback_window_s=120): if s_key and r_key: if s_key == r_key: id_check = "match" - id_consistency["both_present_and_equal"] += 1 else: id_check = "mismatch" - id_consistency["both_present_but_mismatch"] += 1 - id_mismatched_matches.append( - { - "worker": s["worker"], - "select_ts": s["ts"], - "release_ts": r["ts"], - "select_id_key": s_key_type, - "select_id": s_key, - "release_id_key": r_key_type, - "release_id": r_key, - "note": "worker FIFO matched, but ID mismatched", - } - ) elif s_key and not r_key: id_check = "select_only" - id_consistency["only_select_has_id"] += 1 elif (not s_key) and r_key: id_check = "release_only" - id_consistency["only_release_has_id"] += 1 else: id_check = "both_missing" - id_consistency["both_missing"] += 1 matched.append( { @@ -750,6 +738,7 @@ def match_select_release(lines, fallback_window_s=120): "release_ts": r["ts"], "type": s["type"], "match_method": "worker_fifo", + "id_check": id_check, } ) release_used.add(best_idx) @@ -806,7 +795,9 @@ def match_select_release(lines, fallback_window_s=120): if 
r_type_raw.endswith("_tokens"): base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 - mapped_t = worker_dominant_type.get(r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown") + mapped_t = worker_dominant_type.get( + r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown" + ) if mapped_t in ("prefill", "decode", "mixed"): base_t = mapped_t inferred_release_types[i] = f"{base_t}_tokens" @@ -866,8 +857,6 @@ def match_select_release(lines, fallback_window_s=120): "with_alt_id": with_alt_id, "without_any_id": without_any_id, }, - "id_consistency": id_consistency, - "id_mismatched_matches": id_mismatched_matches, "type_summary": dict(type_summary), "worker_type_profile": worker_type_profile, } @@ -1077,7 +1066,7 @@ def check(name, got, expected): msr = match_select_release(sample_lines) check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) - + print(f'\n{"=" * 40}') print(f"Results: {passed} passed, {failed} failed") if failed: diff --git a/fastdeploy/golang_router/cmd/main.go b/fastdeploy/golang_router/cmd/main.go index e0e8c98e137..6664436823c 100644 --- a/fastdeploy/golang_router/cmd/main.go +++ b/fastdeploy/golang_router/cmd/main.go @@ -41,7 +41,14 @@ func main() { } // Initialize logger - logger.Init(cfg.Log.Level, cfg.Log.Output) + logCfg := logger.Config{ + Level: cfg.Log.Level, + Output: cfg.Log.Output, + MaxAgeDays: cfg.Log.MaxAgeDays, + MaxTotalSizeMB: cfg.Log.MaxTotalSizeMB, + CleanupIntervalSecs: cfg.Log.CleanupIntervalSecs, + } + logger.Init(logCfg) defer logger.CloseLogFile() // Initialize manager @@ -59,6 +66,7 @@ func main() { go scheduler_handler.StartBackupCleanupTask(context.Background(), intervalCleanupSecs) statsIntervalSecs := cfg.Scheduler.StatsIntervalSecs go 
scheduler_handler.StartStatsReporter(context.Background(), statsIntervalSecs) + go logger.StartLogCleanup(context.Background(), logCfg) // Start server addr := ":" + cfg.Server.Port diff --git a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml index be4b11227d2..5e1091b0eef 100644 --- a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml @@ -29,3 +29,6 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + max-age-days: 7 # max days to keep log files; default: 7 + max-total-size-mb: 500 # max total log size in MB; default: 500 + cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml index be4b11227d2..5e1091b0eef 100644 --- a/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml @@ -29,3 +29,6 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + max-age-days: 7 # max days to keep log files; default: 7 + max-total-size-mb: 500 # max total log size in MB; default: 500 + cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/internal/config/config.go b/fastdeploy/golang_router/internal/config/config.go index 2cb8226961d..f184a5b16da 100644 --- a/fastdeploy/golang_router/internal/config/config.go +++ b/fastdeploy/golang_router/internal/config/config.go @@ -49,8 +49,11 @@ type SchedulerConfig struct { } type LogConfig struct { - Level string `yaml:"level"` // debug, info, warn, error - Output string 
`yaml:"output"` // stdout, file + Level string `yaml:"level"` // debug, info, warn, error + Output string `yaml:"output"` // stdout, file + MaxAgeDays int `yaml:"max-age-days"` // max days to keep log files; 0 = use default (7) + MaxTotalSizeMB int `yaml:"max-total-size-mb"` // max total log size in MB; 0 = use default (500) + CleanupIntervalSecs float64 `yaml:"cleanup-interval-secs"` // cleanup check interval in seconds; 0 = use default (3600) } func Load(configPath, listenPort string, isSplitwise bool) (*Config, error) { @@ -81,6 +84,15 @@ func Load(configPath, listenPort string, isSplitwise bool) (*Config, error) { if cfg.Log.Level == "" { cfg.Log.Level = "info" } + if cfg.Log.MaxAgeDays == 0 { + cfg.Log.MaxAgeDays = 7 + } + if cfg.Log.MaxTotalSizeMB == 0 { + cfg.Log.MaxTotalSizeMB = 500 + } + if cfg.Log.CleanupIntervalSecs == 0 { + cfg.Log.CleanupIntervalSecs = 3600 + } if cfg.Manager.HealthCheckEndpoint == "" { cfg.Manager.HealthCheckEndpoint = "/health" } diff --git a/fastdeploy/golang_router/internal/gateway/completions_test.go b/fastdeploy/golang_router/internal/gateway/completions_test.go index 825544ff5e3..4fea9736ad6 100644 --- a/fastdeploy/golang_router/internal/gateway/completions_test.go +++ b/fastdeploy/golang_router/internal/gateway/completions_test.go @@ -20,7 +20,7 @@ import ( ) func TestMain(m *testing.M) { - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) gin.SetMode(gin.TestMode) os.Exit(m.Run()) } diff --git a/fastdeploy/golang_router/internal/manager/health_test.go b/fastdeploy/golang_router/internal/manager/health_test.go index bc42031d85f..f50ea2d00b2 100644 --- a/fastdeploy/golang_router/internal/manager/health_test.go +++ b/fastdeploy/golang_router/internal/manager/health_test.go @@ -15,7 +15,7 @@ import ( func init() { // Initialize logger for all tests - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) } func TestCheckServiceHealth(t *testing.T) { 
diff --git a/fastdeploy/golang_router/internal/middleware/logger_test.go b/fastdeploy/golang_router/internal/middleware/logger_test.go index da9c7290567..47b63742547 100644 --- a/fastdeploy/golang_router/internal/middleware/logger_test.go +++ b/fastdeploy/golang_router/internal/middleware/logger_test.go @@ -12,7 +12,7 @@ import ( func init() { // Initialize logger to avoid nil pointer dereference in recovery middleware - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) } func TestLoggerMiddleware(t *testing.T) { diff --git a/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go b/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go index 48737c03c72..2259087d619 100644 --- a/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go +++ b/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go @@ -384,6 +384,9 @@ func (c *radixPrefixCache) Record(tokens []int, worker string) { // evictionWorker periodically evicts inactive nodes func (c *radixPrefixCache) evictionWorker(interval time.Duration) { + if interval <= 0 { + return + } ticker := time.NewTicker(interval) defer ticker.Stop() for { diff --git a/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go b/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go index d3b6dacfdc4..e1155e3686b 100644 --- a/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go +++ b/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go @@ -586,13 +586,13 @@ func TestParseTokensFromBody(t *testing.T) { name: "invalid JSON format", input: []byte(`invalid json`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, { name: "empty body", input: []byte(``), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal 
failed"), }, { name: "large array of tokens", @@ -610,13 +610,13 @@ func TestParseTokensFromBody(t *testing.T) { name: "non-array input_ids", input: []byte(`{"input_ids": "not an array"}`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, { name: "malformed array", input: []byte(`{"input_ids": [1, "two", 3]}`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, } @@ -629,8 +629,8 @@ func TestParseTokensFromBody(t *testing.T) { t.Errorf("parseTokensFromBody() error = %v, wantErr %v", err, tt.err) return } - if err != nil && tt.err != nil && err.Error() != tt.err.Error() { - t.Errorf("parseTokensFromBody() error message = %v, want %v", err.Error(), tt.err.Error()) + if err != nil && tt.err != nil && !strings.Contains(err.Error(), tt.err.Error()) { + t.Errorf("parseTokensFromBody() error message = %v, want containing %v", err.Error(), tt.err.Error()) return } diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 8e213fc0c9f..07412670628 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -1,12 +1,26 @@ package logger import ( + "context" + "fmt" "log" "os" + "path/filepath" + "sort" + "strings" "sync" - "context" + "time" ) +// Config holds logger configuration. +type Config struct { + Level string + Output string + MaxAgeDays int + MaxTotalSizeMB int + CleanupIntervalSecs float64 +} + var ( infoLogger *log.Logger errorLogger *log.Logger @@ -14,37 +28,166 @@ var ( debugLogger *log.Logger level string once sync.Once - logFile *os.File + writer *rotatingWriter // nil when output is stdout ) +// nowFunc is overridable in tests for time-dependent logic. 
+var nowFunc = time.Now + type contextKey string + const TraceIDKey contextKey = "trace_id" const ReqIDKey contextKey = "req_id" const RequestIDKey contextKey = "request_id" const SessionIDKey contextKey = "session_id" -// Init initialize logger -func Init(logLevel, output string) { - once.Do(func() { - level = logLevel +// gracePeriod is how long we keep the previous day's file open after rotation. +const gracePeriod = 5 * time.Minute + +// rotatingWriter implements io.Writer with day-level rotation and dual-file writes. +// Current day's log is always "router.log"; on day change it is renamed to +// "router-YYYY-MM-DD.log" and a new "router.log" is created. During a short +// grace period after rotation, log lines whose timestamp belongs to the previous +// day are written to the archived file. +type rotatingWriter struct { + mu sync.Mutex + currentFile *os.File // today's router.log + prevFile *os.File // previous day's router-.log during grace period (may be nil) + currentDate string // "2006-01-02" + prevDate string // previous date during grace period + graceUntil time.Time // when to close prevFile + logDir string +} + +func newRotatingWriter(logDir string) (*rotatingWriter, error) { + today := nowFunc().Format("2006-01-02") + f, err := os.OpenFile(filepath.Join(logDir, "router.log"), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + return nil, err + } + return &rotatingWriter{ + currentFile: f, + currentDate: today, + logDir: logDir, + }, nil +} + +func (w *rotatingWriter) Write(p []byte) (n int, err error) { + w.mu.Lock() + defer w.mu.Unlock() + + today := nowFunc().Format("2006-01-02") + + // Detect day change and rotate. + if today != w.currentDate { + w.rotateLocked(today) + } + + // Close previous file if grace period expired. + if w.prevFile != nil && nowFunc().After(w.graceUntil) { + w.prevFile.Close() + w.prevFile = nil + w.prevDate = "" + } + + // During grace period, route log lines to the correct file based on timestamp. 
+ target := w.currentFile + if w.prevFile != nil { + if logDate := parseLogDate(p); logDate == w.prevDate { + target = w.prevFile + } + } + + return target.Write(p) +} + +func (w *rotatingWriter) Close() error { + w.mu.Lock() + defer w.mu.Unlock() + if w.prevFile != nil { + w.prevFile.Close() + w.prevFile = nil + } + if w.currentFile != nil { + return w.currentFile.Close() + } + return nil +} + +// rotateLocked performs the actual file rotation. Must be called with w.mu held. +func (w *rotatingWriter) rotateLocked(newDate string) { + // Close any lingering previous file. + if w.prevFile != nil { + w.prevFile.Close() + w.prevFile = nil + } + + // Close current router.log so we can rename it. + if w.currentFile != nil { + w.currentFile.Close() + } + + // Rename router.log -> router-.log + oldPath := filepath.Join(w.logDir, "router.log") + archivePath := filepath.Join(w.logDir, "router-"+w.currentDate+".log") + if err := os.Rename(oldPath, archivePath); err != nil { + // Rename failed; try to reopen router.log and continue without rotation. + w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + return + } + + // Open the archived file for dual-write grace period. + w.prevFile, _ = os.OpenFile(archivePath, os.O_WRONLY|os.O_APPEND, 0666) + w.prevDate = w.currentDate + w.graceUntil = nowFunc().Add(gracePeriod) + + // Create new router.log for the new day. + w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + w.currentDate = newDate +} + +// parseLogDate extracts the date from a log line produced by log.LstdFlags. +// Format: "[LEVEL] 2006/01/02 15:04:05 ..." +// Returns "2006-01-02" or empty string on parse failure. +func parseLogDate(p []byte) string { + // Find the date pattern "YYYY/MM/DD" in the log prefix. + // log.LstdFlags produces: "2006/01/02 15:04:05" after the logger prefix. + // The prefix is like "[INFO] " (7 chars), so the date starts around index 7. 
+ s := string(p) + for i := 0; i+10 <= len(s); i++ { + c := s[i] + if c >= '1' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { + // Found a candidate "YYYY/MM/DD" + year := s[i : i+4] + month := s[i+5 : i+7] + day := s[i+8 : i+10] + return year + "-" + month + "-" + day + } + } + return "" +} +// Init initializes the logger. +func Init(cfg Config) { + once.Do(func() { + level = cfg.Level flags := log.LstdFlags | log.Lshortfile - if output == "file" { - // Check if logs directory exists + if cfg.Output == "file" { if _, err := os.Stat("logs"); os.IsNotExist(err) { if err := os.MkdirAll("logs", 0755); err != nil { log.Fatalln("Failed to create logs directory:", err) } } - logFile, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + var err error + writer, err = newRotatingWriter("logs") if err != nil { - log.Fatalln("Failed to open log file:", err) + log.Fatalln("Failed to create rotating log writer:", err) } - infoLogger = log.New(logFile, "[INFO] ", flags) - errorLogger = log.New(logFile, "[ERROR] ", flags) - warnLogger = log.New(logFile, "[WARN] ", flags) - debugLogger = log.New(logFile, "[DEBUG] ", flags) + infoLogger = log.New(writer, "[INFO] ", flags) + errorLogger = log.New(writer, "[ERROR] ", flags) + warnLogger = log.New(writer, "[WARN] ", flags) + debugLogger = log.New(writer, "[DEBUG] ", flags) } else { infoLogger = log.New(os.Stdout, "[INFO] ", flags) errorLogger = log.New(os.Stderr, "[ERROR] ", flags) @@ -54,9 +197,122 @@ func Init(logLevel, output string) { }) } +// CloseLogFile closes the log file if in file output mode. func CloseLogFile() { - if logFile != nil { - logFile.Close() + if writer != nil { + writer.Close() + } +} + +// StartLogCleanup runs periodic log cleanup in a background goroutine. +// It deletes archived log files older than MaxAgeDays and trims total log size +// to stay under MaxTotalSizeMB. 
+func StartLogCleanup(ctx context.Context, cfg Config) { + if cfg.Output != "file" { + return + } + if cfg.CleanupIntervalSecs <= 0 { + return + } + + ticker := time.NewTicker(time.Duration(cfg.CleanupIntervalSecs * float64(time.Second))) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + cleanupLogs("logs", cfg.MaxAgeDays, cfg.MaxTotalSizeMB) + } + } +} + +type logFileInfo struct { + name string + path string + date time.Time + size int64 +} + +// cleanupLogs removes archived log files based on age and total size limits. +func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { + entries, err := os.ReadDir(logDir) + if err != nil { + fmt.Fprintf(os.Stderr, "[WARN] Failed to read log directory for cleanup: %v\n", err) + return + } + + now := nowFunc() + var archives []logFileInfo + var routerLogSize int64 + + for _, entry := range entries { + if entry.IsDir() { + continue + } + name := entry.Name() + info, err := entry.Info() + if err != nil { + continue + } + + // Count router.log size but never delete it. + if name == "router.log" { + routerLogSize = info.Size() + continue + } + + // Match archived files: router-YYYY-MM-DD.log + if !strings.HasPrefix(name, "router-") || !strings.HasSuffix(name, ".log") { + continue + } + dateStr := strings.TrimPrefix(name, "router-") + dateStr = strings.TrimSuffix(dateStr, ".log") + fileDate, err := time.Parse("2006-01-02", dateStr) + if err != nil { + continue + } + archives = append(archives, logFileInfo{ + name: name, + path: filepath.Join(logDir, name), + date: fileDate, + size: info.Size(), + }) + } + + // Sort by date ascending (oldest first). + sort.Slice(archives, func(i, j int) bool { + return archives[i].date.Before(archives[j].date) + }) + + // Phase 1: Age-based cleanup. 
+ if maxAgeDays > 0 { + cutoff := now.AddDate(0, 0, -maxAgeDays) + remaining := archives[:0] + for _, f := range archives { + if f.date.Before(cutoff) { + os.Remove(f.path) + } else { + remaining = append(remaining, f) + } + } + archives = remaining + } + + // Phase 2: Size-based cleanup. + if maxTotalSizeMB > 0 { + maxBytes := int64(maxTotalSizeMB) * 1024 * 1024 + var totalSize int64 = routerLogSize + for _, f := range archives { + totalSize += f.size + } + for len(archives) > 0 && totalSize > maxBytes { + oldest := archives[0] + os.Remove(oldest.path) + totalSize -= oldest.size + archives = archives[1:] + } } } diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index 59faeee2a4d..fea0b853cf7 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -10,7 +10,7 @@ import ( func TestLoggerInit(t *testing.T) { t.Run("stdout output", func(t *testing.T) { - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) if infoLogger == nil || errorLogger == nil || warnLogger == nil || debugLogger == nil { t.Error("Loggers should be initialized") @@ -117,7 +117,7 @@ func TestLogLevels(t *testing.T) { func TestLogFunctions(t *testing.T) { var buf bytes.Buffer - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) level = "debug" // Redirect output @@ -132,7 +132,7 @@ func TestLogFunctions(t *testing.T) { } func TestContextPrefix(t *testing.T) { - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) level = "debug" t.Run("nil context produces no prefix", func(t *testing.T) { @@ -151,7 +151,7 @@ func TestContextPrefix(t *testing.T) { } }) - t.Run("context without request_id produces [request_id:null]", func(t *testing.T) { + t.Run("context without request_id produces no request_id prefix", func(t *testing.T) { var buf bytes.Buffer oldOutput := infoLogger.Writer() defer func() { 
infoLogger.SetOutput(oldOutput) }() @@ -160,8 +160,11 @@ func TestContextPrefix(t *testing.T) { ctx := context.Background() Info(ctx, "mixed mode log") output := buf.String() - if !strings.Contains(output, "[request_id:null]") { - t.Errorf("context without request_id should produce [request_id:null], got: %s", output) + if strings.Contains(output, "[request_id:") { + t.Errorf("context without request_id should not produce request_id prefix, got: %s", output) + } + if !strings.Contains(output, "mixed mode log") { + t.Errorf("message should be present, got: %s", output) } }) From e03b69f559e50fa25c3c0101042f99e8c18aabfb Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 14:58:29 +0800 Subject: [PATCH 21/40] docs: add router troubleshoot playbook with skill workflow --- docs/zh/online_serving/router_faq.md | 1 + .../router_troubleshoot_playbook.md | 190 ++++++++++++++++++ .../.claude/skills/troubleshoot/SKILL.md | 5 +- .../troubleshoot/scripts/analyzers/trace.py | 141 ++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 2 + 5 files changed, 328 insertions(+), 11 deletions(-) create mode 100644 docs/zh/online_serving/router_troubleshoot_playbook.md diff --git a/docs/zh/online_serving/router_faq.md b/docs/zh/online_serving/router_faq.md index 9c32726f4dc..a431065dbf0 100644 --- a/docs/zh/online_serving/router_faq.md +++ b/docs/zh/online_serving/router_faq.md @@ -5,6 +5,7 @@ 本文档基于 [Golang Router](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/golang_router) 的代码实现,汇总了 Router 在使用过程中常见的日志信息、返回输出及问题排查方法,帮助用户快速定位和解决问题。 Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 +如需按“日志定位 + troubleshoot skill”流程化排查,请参考 [Router 问题排查实战手册](router_troubleshoot_playbook.md)。 ## 常见日志分析 diff --git a/docs/zh/online_serving/router_troubleshoot_playbook.md b/docs/zh/online_serving/router_troubleshoot_playbook.md new file mode 100644 index 00000000000..0ccee9c6d55 --- /dev/null +++ b/docs/zh/online_serving/router_troubleshoot_playbook.md @@ 
-0,0 +1,190 @@ +# Router 问题排查实战手册(日志定位 + troubleshoot skill) + +本文档结合以下两部分信息整理: +- Router 常见问题与日志语义:[`docs/zh/online_serving/router_faq.md`](router_faq.md) +- `fastdeploy/golang_router/.claude/skills/troubleshoot` 的脚本能力与使用方式 + +目标:给出一套可落地的排查流程,帮助你从“现象”快速定位到“日志证据”和“处理建议”。 + +--- + +## 1. 先定范围:全量 / 尾部 / 指定时间段 + +建议先根据问题发生时间选择分析范围(这是和分析模式并列的维度): + +- **全量分析**:适合历史慢性问题、趋势问题。 +- **尾部分析(`--tail`)**:适合刚发生的故障,优先看最近 N 行或 N 分钟。 +- **指定时间段(`--start/--end`)**:适合已知故障窗口(例如 14:05~14:20)。 + +> 说明:`--tail` 与 `--start/--end` 互斥,二选一。 + +--- + +## 2. 先看健康与注册,再看调度与请求 + +根据 `router_faq.md` 的建议,先确认“有没有可用实例”,再看“请求是否调度成功”。 + +### 2.1 健康与注册检查(必做) + +```bash +# 已注册实例列表 +curl -X GET http://{router_url}/registered + +# 已注册实例数量 +curl -X GET http://{router_url}/registered_number + +# 从 Router 机器检查后端健康 +curl -X GET http://{server_url}/health +``` + +重点日志关键词: +- 健康移除:`Removed unhealthy ... instance` +- 注册失败:`Failed to register instance` +- 健康检查失败:`failed to send request to ...` / `Server ... is not healthy` + +若实例都不健康或未注册,后续 502/503 多数是结果,不是根因。 + +### 2.2 调度失败检查 + +常见错误: +- `Failed to select worker` +- `Failed to select worker pair` +- `No available prefill/decode workers` + +这类问题先确认: +1) 注册数量是否为 0; +2) 调度策略与部署模式是否匹配; +3) `fd_metrics_score` 依赖的 `/metrics` 是否可访问。 + +### 2.3 请求链路与后端请求失败 + +常见日志: +- `Failed to connect to backend service` +- `Request failed (attempt n/max)` +- `Decode/Prefill/Backend request failed for {url}` +- `Panic recovered` + +这类问题通常需要结合 trace(ID 级别)看完整链路。 + +--- + +## 3. 
使用 troubleshoot skill 的标准方式 + +脚本入口(在 `fastdeploy/golang_router/` 下): + +```bash +SCRIPTS=.claude/skills/troubleshoot/scripts +python3 $SCRIPTS/troubleshoot.py [options] +``` + +### 3.1 全量体检(默认推荐首轮) + +```bash +python3 $SCRIPTS/troubleshoot.py +``` + +会同时输出:errors / latency / health / cache / load 的综合结果。 + +### 3.2 指定维度分析(精准打点) + +```bash +python3 $SCRIPTS/troubleshoot.py --errors +python3 $SCRIPTS/troubleshoot.py --latency +python3 $SCRIPTS/troubleshoot.py --health +python3 $SCRIPTS/troubleshoot.py --cache +python3 $SCRIPTS/troubleshoot.py --load +``` + +### 3.3 请求追踪(ID 级排查) + +```bash +# 单个 ID +python3 $SCRIPTS/troubleshoot.py --trace + +# 多个 ID +python3 $SCRIPTS/troubleshoot.py --trace "id1,id2,id3" +``` + +trace 会展示: +- 匹配到的 tag 类型(request_id / trace_id / session_id / req_id) +- 生命周期完整性 +- 事件链(含原始日志 RAW) +- 仅 request_id / 仅 session_id / 仅 trace_id 的统计 +- 各标签组合形式(detail 中给出组合与对应 ID) + +### 3.4 范围过滤与 trace 组合 + +当你要“在某个时间窗内追踪某个 ID”时,使用范围参数和 trace 组合: + +```bash +python3 $SCRIPTS/troubleshoot.py --start "2026/04/13 14:05:00" --end "2026/04/13 14:20:00" --trace "" +``` + +这符合“范围维度(全量/尾部/时间段)”与“模式维度(含 trace)”分离的使用方式。 + +--- + +## 4. 一套可复制的故障定位流程 + +### 步骤 A:确认故障窗口与错误现象 +- 收集用户报错时间、HTTP 状态码(502/503/500/400)和请求路径。 + +### 步骤 B:先跑时间窗综合分析 +```bash +python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" +``` +- 看 STATUS(HEALTHY / DEGRADED / CRITICAL)。 +- 优先看 errors、health 章节,判断是否是后端健康/注册问题。 + +### 步骤 C:按症状进入专项 +- 502/503:`--errors --health --load` +- 延迟突增:`--latency --load --cache` +- 单请求失败:`--trace `(可叠加步骤 B 的时间窗) + +### 步骤 D:在 detail 文件中取证 +报告目录默认: +`skill_output/troubleshoot//` + +重点文件: +- `summary/troubleshoot_report.md` +- `detail/trace_.md` +- `detail/health_events.md` +- `detail/load_select_release.md` + +--- + +## 5. 
现象到日志的快速映射 + +| 现象 | 优先看日志/关键词 | 推荐命令 | +|---|---|---| +| 503 无可用 worker | `No available prefill/decode workers`, `Removed unhealthy ...` | `--health --errors` | +| 502 调度失败 | `Failed to select worker`, `Failed to select worker pair` | `--errors --health --load` | +| 502 后端连接失败 | `Failed to connect to backend service`, `Request failed (attempt ...)` | `--errors --trace ` | +| 请求卡住/链路不完整 | 有 select 无 release、无 `Request completed successfully.` | `--trace ` | +| 延迟抖动 | HTTP latency、`[stats] total_running...` | `--latency --load --cache` | + +--- + +## 6. 常见误区 + +1. **只看 502/503 响应,不看健康与注册日志**:容易把“结果”当“根因”。 +2. **不限定时间窗口**:日志噪音大,容易误判。 +3. **trace 只看结构化事件,不看 RAW**:可能漏掉关键上下文(例如同一秒的 WARN/ERROR 细节)。 +4. **把范围维度和模式维度混在一起**:建议先定范围(全量/尾部/时间段),再定模式(完整/多维/trace)。 + +--- + +## 7. 推荐排查命令模板 + +```bash +# 模板 1:故障窗口综合体检 +python3 $SCRIPTS/troubleshoot.py --start "YYYY/MM/DD HH:MM:SS" --end "YYYY/MM/DD HH:MM:SS" + +# 模板 2:最近 30 分钟快速巡检 +python3 $SCRIPTS/troubleshoot.py --tail 30m + +# 模板 3:单请求深挖(配合时间窗) +python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" --trace "" +``` + +如果你已经知道故障集中在特定 ID,优先从模板 3 入手,然后回到模板 1 看全局背景。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 7f7a5793e91..2ea74156c82 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -55,10 +55,13 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) - 选项 2: `单维度/多维度分析` — 选择特定维度(errors / latency / health / cache / load),可选多个 -- 选项 3: `请求追踪` — 追踪特定请求 ID(需提供 ID) +- 选项 3: `请求追踪` — 追踪特定请求 ID 如果用户未选择,默认使用完整分析。 +当用户选择“请求追踪”选项时,AskUserQuestion 的选项文案应直接提示可输入: +- `trace_id/request_id/session_id`(逗号分隔多 ID) + ### 4. 
输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 24af9a23500..37006121994 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -112,15 +112,21 @@ def analyze_trace(log_file, trace_ids, tail=None): sr_check = match_select_release(all_lines) diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) + tag_coverage = _build_id_coverage_stats(all_lines) + tag_combos = _build_id_combo_stats(all_lines) + matched_tags = _detect_matched_tags(all_lines, tid) traces[tid] = { "events": events, "lifecycle_complete": lifecycle_complete, "diagnoses": diagnoses, "sr_check": sr_check, - "matched_tag": "session_id" if is_session else "request_id/trace_id", + "matched_tag": _format_matched_tag(matched_tags), + "matched_tags": matched_tags, "related_ids": { "request_ids": sorted(related_request_ids) if is_session else [], }, + "id_coverage": tag_coverage, + "id_combos": tag_combos, } total_traced = len(traces) @@ -152,13 +158,14 @@ def _parse_event_chain(lines): "path": http["path"], "status": http["status"], "latency_ms": http["latency_ms"], + "raw": line.strip(), } ) continue # Parsing completed if PARSING_COMPLETE_RE.search(line): - events.append({"ts": ts, "type": "PARSING_COMPLETE", "tags": tags}) + events.append({"ts": ts, "type": "PARSING_COMPLETE", "tags": tags, "raw": line.strip()}) continue # Cache-aware strategy @@ -172,6 +179,7 @@ def _parse_event_chain(lines): "strategy": strategy.get("strategy"), "selected": strategy.get("selected", ""), "selected_hitRatio": strategy.get("selected_hitRatio", 0), + "raw": line.strip(), } ) continue @@ -186,6 +194,7 @@ def 
_parse_event_chain(lines): "tags": tags, "worker_type": m.group(1) or "unknown", "worker": m.group(2), + "raw": line.strip(), } ) continue @@ -200,6 +209,7 @@ def _parse_event_chain(lines): "tags": tags, "worker_type": m.group(1) or "unknown", "worker": m.group(2), + "raw": line.strip(), } ) continue @@ -214,6 +224,7 @@ def _parse_event_chain(lines): "tags": tags, "worker": m.group(1), "tokens": int(m.group(2)), + "raw": line.strip(), } ) continue @@ -221,39 +232,45 @@ def _parse_event_chain(lines): # Prefill events m = PREFILL_FIRST_CHUNK_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1)}) + events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1), "raw": line.strip()}) continue m = PREFILL_DONE_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_DONE", "tags": tags, "worker": m.group(1)}) + events.append({"ts": ts, "type": "PREFILL_DONE", "tags": tags, "worker": m.group(1), "raw": line.strip()}) continue m = PREFILL_ERROR_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2)}) + events.append( + {"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2), "raw": line.strip()} + ) continue m = PREFILL_DEFER_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_DEFER_RELEASE", "tags": tags, "worker": m.group(1)}) + events.append( + {"ts": ts, "type": "PREFILL_DEFER_RELEASE", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) continue m = PREFILL_ERR_PATH_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1)}) + events.append( + {"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) continue # Request completed if REQUEST_COMPLETE_RE.search(line): - events.append({"ts": ts, "type": "REQUEST_COMPLETE", 
"tags": tags}) + events.append({"ts": ts, "type": "REQUEST_COMPLETE", "tags": tags, "raw": line.strip()}) continue # ts_ms m = TS_MS_RE.search(line) if m: - events.append({"ts": ts, "type": "TS_MS", "tags": tags, "ts_ms": m.group(1)}) + events.append({"ts": ts, "type": "TS_MS", "tags": tags, "ts_ms": m.group(1), "raw": line.strip()}) continue # Failed to select if FAILED_SELECT_RE.search(line): - events.append({"ts": ts, "type": "FAILED_SELECT", "tags": tags}) + events.append({"ts": ts, "type": "FAILED_SELECT", "tags": tags, "raw": line.strip()}) continue # 按时间排序 @@ -339,6 +356,12 @@ def format_trace_report(result): sections.append(f"### ID: {tid}") if trace.get("matched_tag"): sections.append(f' 匹配类型: {trace["matched_tag"]}') + if trace.get("id_coverage"): + c = trace["id_coverage"] + sections.append( + " ID统计: " + f'request_only={c["request_only"]}, session_only={c["session_only"]}, trace_only={c["trace_only"]}' + ) if trace.get("related_ids", {}).get("request_ids"): sections.append(f' 关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') @@ -357,6 +380,19 @@ def format_trace_report(result): detail_lines = [f"# 请求追踪事件链: {tid}", ""] if trace.get("matched_tag"): detail_lines.append(f'匹配类型: {trace["matched_tag"]}') + if trace.get("id_coverage"): + c = trace["id_coverage"] + detail_lines.append("ID覆盖统计:") + detail_lines.append( + f'- only_request_id: {c["request_only"]} | only_session_id: {c["session_only"]} | only_trace_id: {c["trace_only"]}' + ) + if trace.get("id_combos"): + detail_lines.append("") + detail_lines.append("标签组合明细(按唯一ID计数):") + for item in trace["id_combos"]: + detail_lines.append( + f'- combo={item["combo"]} | count={item["count"]} | ids={", ".join(item["ids"])}' + ) if trace.get("related_ids", {}).get("request_ids"): detail_lines.append(f'关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') detail_lines.append(f"生命周期: {status}") @@ -379,7 +415,11 @@ def format_trace_report(result): line += f' tokens={evt["tokens"]}' if 
evt.get("error"): line += f' error={evt["error"]}' + if evt.get("ts_ms"): + line += f' ts_ms={evt["ts_ms"]}' detail_lines.append(line) + if evt.get("raw"): + detail_lines.append(f' RAW: {evt["raw"]}') detail_lines.append("") detail_dict[tid] = "\n".join(detail_lines) @@ -413,3 +453,84 @@ def _grep_lines(log_file, pattern, tail=None): def _shell_quote(s): return "'" + s.replace("'", "'\\''") + "'" + + +def _detect_matched_tags(lines, target_id): + matched = set() + for line in lines: + tags = extract_tags(line) + for key in ("request_id", "trace_id", "session_id", "req_id"): + if tags.get(key) == target_id: + matched.add(key) + return sorted(matched) + + +def _format_matched_tag(matched_tags): + if not matched_tags: + return "unknown" + if len(matched_tags) == 1: + return matched_tags[0] + return "+".join(matched_tags) + + +def _build_id_coverage_stats(lines): + request_only_ids = set() + session_only_ids = set() + trace_only_ids = set() + + for line in lines: + tags = extract_tags(line) + req_val = tags.get("request_id") or tags.get("req_id") + session_val = tags.get("session_id") + trace_val = tags.get("trace_id") + has_request = bool(req_val) + has_session = bool(session_val) + has_trace = bool(trace_val) + + if has_request and not has_session and not has_trace: + request_only_ids.add(req_val) + if has_session and not has_request and not has_trace: + session_only_ids.add(session_val) + if has_trace and not has_request and not has_session: + trace_only_ids.add(trace_val) + + return { + "request_only": len(request_only_ids), + "session_only": len(session_only_ids), + "trace_only": len(trace_only_ids), + } + + +def _build_id_combo_stats(lines): + combo_to_ids = {} + for line in lines: + tags = extract_tags(line) + keys = [] + if tags.get("request_id"): + keys.append("request_id") + if tags.get("req_id"): + keys.append("req_id") + if tags.get("session_id"): + keys.append("session_id") + if tags.get("trace_id"): + keys.append("trace_id") + combo = "+".join(keys) if 
keys else "no_id_tag" + + ids = [] + if tags.get("request_id"): + ids.append(tags["request_id"]) + if tags.get("req_id"): + ids.append(tags["req_id"]) + if tags.get("session_id"): + ids.append(tags["session_id"]) + if tags.get("trace_id"): + ids.append(tags["trace_id"]) + id_key = "|".join(ids) if ids else "" + + combo_to_ids.setdefault(combo, set()).add(id_key) + + rows = [] + for combo, ids in combo_to_ids.items(): + rows.append({"combo": combo, "count": len(ids), "ids": sorted(ids)}) + rows.sort(key=lambda x: x["count"], reverse=True) + return rows diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 803bf6fba43..8378cbe20a1 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -430,6 +430,8 @@ def main(): # 时间范围预过滤(--start 和 --end 可单独或同时指定) import atexit + start_ts = None + end_ts = None if args.start or args.end: start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None From 09c18242446243c966f9d54f334c4cc164149496 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 15:29:26 +0800 Subject: [PATCH 22/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- .../troubleshoot/scripts/analyzers/cache.py | 41 +++++++++++-------- .../troubleshoot/scripts/troubleshoot.py | 34 +++++++++++---- 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 3a5c19ad00b..57a1490d3fd 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -456,7 +456,7 @@ def format_cache_report(result): sections.append("") table_data = [ { - "Session": sid[:16], + "Session": sid, "请求数": str(s["total_requests"]), "粘性率": f'{s["stickiness_pct"]}%', "切换次数": str(s["switches"]), @@ -530,28 +530,35 @@ def format_cache_report(result): detail_sections.append( render_table( result["cross_diagnosis"], - columns=["avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout", "diagnosis", "action"], + columns=[ + "avg_stickiness_pct", + "mean_hitRatio_pct", + "fallback_pct", + "evicted_after_timeout", + "diagnosis", + "action", + ], right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, ) ) detail_sections.append("") - if any( - [ - result.get("session_stickiness"), - result.get("suboptimal_selections"), - result.get("eviction_impact"), - result.get("cross_diagnosis"), - result.get("diagnoses"), - ] - ): + # 只显示实际生成了文件的链接 + detail_links = [] + if result.get("session_stickiness"): + detail_links.append("[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") + if result.get("suboptimal_selections"): + detail_links.append("[detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") + if result.get("eviction_impact"): + detail_links.append("[detail/cache_eviction.md](../detail/cache_eviction.md)") + if result.get("fallback_reasons"): + detail_links.append("[detail/cache_fallback.md](../detail/cache_fallback.md)") + if result.get("cross_diagnosis"): + detail_links.append("[detail/cache_cross.md](../detail/cache_cross.md)") + + if detail_links: sections.append( - "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " - "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " - "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " - "[detail/cache_eviction.md](../detail/cache_eviction.md) | " - 
"[detail/cache_fallback.md](../detail/cache_fallback.md) | " - "[detail/cache_cross.md](../detail/cache_cross.md)" + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + " | ".join(detail_links) ) sections.append("") diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 8378cbe20a1..d869f9c71cc 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -33,9 +33,14 @@ from analyzers.errors import analyze_errors, format_errors_report from analyzers.health import analyze_health, format_health_report from analyzers.latency import analyze_latency, format_latency_report -from analyzers.load import analyze_load, format_load_report +from analyzers.load import analyze_load +from analyzers.load_report import format_load_report from analyzers.trace import analyze_trace, format_trace_report -from log_parser import complete_time_arg, filter_file_by_recent_minutes, filter_file_by_time_range +from log_parser import ( + complete_time_arg, + filter_file_by_recent_minutes, + filter_file_by_time_range, +) def determine_log_file(user_path=None): @@ -236,7 +241,7 @@ def format_full_report(results, status, status_reason): continue lines.append(f'- 模板: {e.get("template","")}') for u in urls: - lines.append(f' - {u}') + lines.append(f" - {u}") lines.append("") details["errors_topn"] = "\n".join(lines) @@ -268,7 +273,12 @@ def format_full_report(results, status, status_reason): details["load_diagnoses"] = "\n".join(lines) if results["load"].get("counter_last_state"): rows = results["load"]["counter_last_state"] - lines = ["# Load Counter 末状态", "", "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", "|:--|:--|--:|:--|--:|:--|"] + lines = [ + "# Load Counter 末状态", + "", + "| worker | 
req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", + "|:--|:--|--:|:--|--:|:--|", + ] for r in rows: lines.append( f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' @@ -285,19 +295,25 @@ def format_full_report(results, status, status_reason): if c.get("session_stickiness"): lines = ["# Cache Session 粘性详情", ""] for sid, s in c["session_stickiness"].items(): - lines.append(f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}') + lines.append( + f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}' + ) lines.append("") details["cache_session_stickiness"] = "\n".join(lines) if c.get("suboptimal_selections"): lines = ["# Cache 非最优选择详情", ""] for x in c["suboptimal_selections"][:200]: - lines.append(f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}') + lines.append( + f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}' + ) lines.append("") details["cache_suboptimal"] = "\n".join(lines) if c.get("eviction_impact"): lines = ["# Cache 驱逐影响详情", ""] for x in c["eviction_impact"][:200]: - lines.append(f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}') + lines.append( + f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}' + ) lines.append("") details["cache_eviction"] = "\n".join(lines) if c.get("fallback_reasons"): @@ -309,7 +325,9 @@ def format_full_report(results, status, status_reason): if c.get("cross_diagnosis"): lines = ["# Cache 交叉诊断详情", ""] for x in 
c["cross_diagnosis"]: - lines.append(f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%') + lines.append( + f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%' + ) lines.append("") details["cache_cross"] = "\n".join(lines) From fe3c0d1d0bece7395b9da49ead94533e969689b8 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 16:05:55 +0800 Subject: [PATCH 23/40] Adjust trace input flow to direct prompt instead of AskUserQuestion --- .../.claude/skills/troubleshoot/SKILL.md | 25 ++++- .../references/report_templates.md | 5 +- .../troubleshoot/scripts/analyzers/cache.py | 97 +++++++++++-------- .../troubleshoot/scripts/analyzers/trace.py | 69 ++++++++++++- .../troubleshoot/scripts/troubleshoot.py | 60 +++++++----- 5 files changed, 184 insertions(+), 72 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 2ea74156c82..919e25a1101 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -11,14 +11,14 @@ description: > 关键词:troubleshoot、排查、router 问题、全量扫描、综合分析、error、502、latency、 health、load、cache、trace、/troubleshoot。 -IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格式和提取规则。 -错误分类时参考 references/error_catalog.md。涉及后端问题时参考 references/fastdeploy_cross_reference.md。 --- # Router Troubleshooting 综合排查 FastDeploy Go Router 问题,输出完整诊断报告。 +> IMPORTANT: 执行前务必先读取 `references/log_patterns.md` 了解日志格式和提取规则。错误分类时参考 `references/error_catalog.md`。涉及后端问题时参考 `references/fastdeploy_cross_reference.md`。 + ## 执行前交互 运行脚本前,Claude 必须按以下顺序向用户确认参数: @@ -51,6 +51,16 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 `--start/--end` 与 `--tail` 互斥。 +当用户选择“指定时间段”时,必须再发起一次 **AskUserQuestion**(离散选项)引导时间输入: +- 选项 1: 
`当天(00:00:00 到当前)`(推荐) +- 选项 2: `最近半小时`(自动换算为 `--start now-30m --end now` 语义) + +用户若通过客户端默认 `Other` 输入时间,则将该输入直接作为时间范围参数解析。 +可补充一条简短示例引导: +- 示例 1:`16:00-16:30` +- 示例 2:`03/31 16:00 ~ 03/31 18:00` +- 示例 3:`2026/03/31 16:00:00`(仅起始) + ### 3. 分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) @@ -59,8 +69,12 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 如果用户未选择,默认使用完整分析。 -当用户选择“请求追踪”选项时,AskUserQuestion 的选项文案应直接提示可输入: -- `trace_id/request_id/session_id`(逗号分隔多 ID) +当用户选择“请求追踪”后,**不要再发 AskUserQuestion** 收集 trace ID。 +直接发一条提示并等待用户输入完成后再继续执行即可。 + +提示文案建议: +- `请输入要追踪的 ID(支持 trace_id / request_id / session_id,多个用逗号分隔;输入 all 可全量追踪)` +- 示例:`a1b2c3d4` / `trace-001,trace-002` / `session-abc-123` / `all` ### 4. 输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 @@ -86,6 +100,7 @@ python3 $SCRIPTS/troubleshoot.py --load # 请求追踪(需指定 ID,支持逗号分隔多 ID) python3 $SCRIPTS/troubleshoot.py --trace python3 $SCRIPTS/troubleshoot.py --trace "id1,id2" +python3 $SCRIPTS/troubleshoot.py --trace all # 尾部分析 python3 $SCRIPTS/troubleshoot.py --tail 5000 @@ -110,6 +125,8 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro - **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` - 逐分钟事件详情拆分到 `detail/health_events.md` - 请求追踪事件链拆分到 `detail/trace_.md` +- **Cache 明细要求**:`cache_session_stickiness.md` / `cache_suboptimal.md` / `cache_eviction.md` / `cache_fallback.md` / `cache_cross.md` + 必须始终生成(即使无异常也写“未发现/样本不足”总结,避免链接缺失) - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index cd705d02816..2ec683f2299 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -102,15 
+102,18 @@ Worker 可用性时间线 可用性统计 ``` -### Cache(调度诊断)— 待实现 +### Cache(调度诊断) ``` 调度策略分布 Session 粘性分析 非最优选择分析 Fallback 原因分类 +驱逐影响与交叉诊断 ``` +要求:即使某项计数为 0(例如“非最优选择”),也要输出该小节并给出“未发现/样本不足”总结,保证 detail 链接稳定存在。 + ### Load(负载分析)— 待实现 ``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 57a1490d3fd..a12341967a0 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -449,11 +449,11 @@ def format_cache_report(result): # Session 粘性 stickiness = result.get("session_stickiness", {}) + sections.append("### Session 粘性") + sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") + sections.append("") if stickiness: - sections.append("### Session 粘性") - sections.append("") - sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") table_data = [ { "Session": sid, @@ -473,14 +473,21 @@ def format_cache_report(result): ) ) detail_sections.append("") + else: + sections.append(" 未检测到可计算粘性的多请求 Session。") + sections.append("") + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + detail_sections.append("") # 非最优选择 - if result.get("suboptimal_selections"): - subs = result["suboptimal_selections"] - sections.append(f"### 非最优选择 ({len(subs)} 次)") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + subs = result.get("suboptimal_selections") or [] + sections.append(f"### 非最优选择 ({len(subs)} 次)") + sections.append("") + sections.append(" 详情见: [detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") + sections.append("") + if subs: reason_counts = 
defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 @@ -494,15 +501,22 @@ def format_cache_report(result): f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' ) detail_sections.append("") + else: + sections.append(" 未发现非最优选择(selected_hitRatio 始终为当次最高)。") + sections.append("") + detail_sections.append("## 非最优选择") + detail_sections.append("") + detail_sections.append("- 未发现非最优选择。") + detail_sections.append("") # 驱逐影响 - if result.get("eviction_impact"): - evictions = result["eviction_impact"] - evicted = [e for e in evictions if e["evicted"]] - sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + evictions = result.get("eviction_impact") or [] + evicted = [e for e in evictions if e["evicted"]] + sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") + sections.append("") + sections.append(" 详情见: [detail/cache_eviction.md](../detail/cache_eviction.md)") + sections.append("") + if evictions: detail_sections.append("## 驱逐影响") detail_sections.append("") for e in evictions[:50]: @@ -510,6 +524,13 @@ def format_cache_report(result): f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' ) detail_sections.append("") + else: + sections.append(" 未检测到超时导致的潜在驱逐影响。") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + detail_sections.append("- 未检测到超时驱逐样本。") + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: @@ -520,11 +541,11 @@ def format_cache_report(result): detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') detail_sections.append("") + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: 
[detail/cache_cross.md](../detail/cache_cross.md)") + sections.append("") if result.get("cross_diagnosis"): - sections.append("### 交叉诊断") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") detail_sections.append("## 交叉诊断") detail_sections.append("") detail_sections.append( @@ -542,25 +563,23 @@ def format_cache_report(result): ) ) detail_sections.append("") - - # 只显示实际生成了文件的链接 - detail_links = [] - if result.get("session_stickiness"): - detail_links.append("[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") - if result.get("suboptimal_selections"): - detail_links.append("[detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") - if result.get("eviction_impact"): - detail_links.append("[detail/cache_eviction.md](../detail/cache_eviction.md)") - if result.get("fallback_reasons"): - detail_links.append("[detail/cache_fallback.md](../detail/cache_fallback.md)") - if result.get("cross_diagnosis"): - detail_links.append("[detail/cache_cross.md](../detail/cache_cross.md)") - - if detail_links: - sections.append( - "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + " | ".join(detail_links) - ) + else: + sections.append(" 样本不足,未生成交叉诊断。") sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append("- 样本不足,未生成交叉诊断。") + detail_sections.append("") + + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") return "\n".join(sections), "\n".join(detail_sections) diff --git 
a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 37006121994..d9a599b305c 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -62,8 +62,13 @@ def analyze_trace(log_file, trace_ids, tail=None): Returns: dict: {traces: {id: {events, lifecycle_complete, diagnoses}}, summary} """ + auto_discovery_summary = "" if isinstance(trace_ids, str): - trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] + normalized = trace_ids.strip().lower() + if normalized in ("all", "full", "all_ids", "全部", "全量"): + trace_ids, auto_discovery_summary = _discover_full_trace_targets(log_file, tail=tail) + else: + trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] if not trace_ids: return {"traces": {}, "summary": "未指定追踪 ID"} @@ -132,10 +137,64 @@ def analyze_trace(log_file, trace_ids, tail=None): total_traced = len(traces) complete = sum(1 for t in traces.values() if t["lifecycle_complete"]) - return { - "traces": traces, - "summary": f"{total_traced} ID(s) 追踪, {complete} 生命周期完整", - } + summary = f"{total_traced} ID(s) 追踪, {complete} 生命周期完整" + if auto_discovery_summary: + summary += f" | {auto_discovery_summary}" + + return {"traces": traces, "summary": summary} + + +def _discover_full_trace_targets(log_file, tail=None): + """全量追踪目标发现。 + + 规则: + 1) 有 session_id 的优先按 session_id 追踪 + 2) 无 session 但有 trace_id 的按 trace_id 追踪 + 3) 剩余“孤立”的 request_id/req_id 单独追踪 + """ + lines = _grep_lines(log_file, r"session_id:|trace_id:|request_id:|req_id:", tail=tail) + if not lines: + return [], "全量追踪未发现任何可用 ID" + + session_ids = set() + trace_ids = set() + all_request_ids = set() + request_ids_with_session_or_trace = set() + + for line in lines: + tags = extract_tags(line) + sid = tags.get("session_id") + tid = 
tags.get("trace_id") + rid = tags.get("request_id") or tags.get("req_id") + has_session = bool(sid) + has_trace = bool(tid) + has_request = bool(rid) + + if has_session: + session_ids.add(sid) + if has_trace: + trace_ids.add(tid) + if has_request: + all_request_ids.add(rid) + if has_session or has_trace: + request_ids_with_session_or_trace.add(rid) + + standalone_request_ids = all_request_ids - request_ids_with_session_or_trace + + targets = [] + chosen = set() + for bucket in (sorted(session_ids), sorted(trace_ids), sorted(standalone_request_ids)): + for _id in bucket: + if _id and _id not in chosen: + chosen.add(_id) + targets.append(_id) + + summary = ( + "全量ID发现: " + f"session={len(session_ids)}, trace={len(trace_ids)}, " + f"standalone_request={len(standalone_request_ids)}, total_targets={len(targets)}" + ) + return targets, summary def _parse_event_chain(lines): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index d869f9c71cc..96a37ff9577 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -11,7 +11,7 @@ --health 仅分析 Worker 健康 --cache 仅分析 Cache 调度 --load 仅分析负载与计数器 - --trace ID 追踪指定请求(支持逗号分隔多 ID) + --trace ID 追踪指定请求(支持逗号分隔多 ID;传 all 可全量追踪) --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") @@ -292,44 +292,58 @@ def format_full_report(results, status, status_reason): if detail: details["cache_diagnosis"] = detail c = results["cache"] + lines = ["# Cache Session 粘性详情", ""] if c.get("session_stickiness"): - lines = ["# Cache Session 粘性详情", ""] for sid, s in c["session_stickiness"].items(): lines.append( f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}' ) - lines.append("") 
- details["cache_session_stickiness"] = "\n".join(lines) + else: + lines.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + + lines = ["# Cache 非最优选择详情", ""] if c.get("suboptimal_selections"): - lines = ["# Cache 非最优选择详情", ""] for x in c["suboptimal_selections"][:200]: lines.append( f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}' ) - lines.append("") - details["cache_suboptimal"] = "\n".join(lines) + else: + lines.append("- 未发现非最优选择。") + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + + lines = ["# Cache 驱逐影响详情", ""] if c.get("eviction_impact"): - lines = ["# Cache 驱逐影响详情", ""] for x in c["eviction_impact"][:200]: lines.append( f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}' ) - lines.append("") - details["cache_eviction"] = "\n".join(lines) + else: + lines.append("- 未检测到超时驱逐样本。") + lines.append("") + details["cache_eviction"] = "\n".join(lines) + + lines = ["# Cache Fallback 原因详情", ""] if c.get("fallback_reasons"): - lines = ["# Cache Fallback 原因详情", ""] for x in c["fallback_reasons"]: lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') - lines.append("") - details["cache_fallback"] = "\n".join(lines) + else: + lines.append("- 未出现 fallback 记录。") + lines.append("") + details["cache_fallback"] = "\n".join(lines) + + lines = ["# Cache 交叉诊断详情", ""] if c.get("cross_diagnosis"): - lines = ["# Cache 交叉诊断详情", ""] for x in c["cross_diagnosis"]: lines.append( f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%' ) - lines.append("") - details["cache_cross"] = "\n".join(lines) + else: + lines.append("- 样本不足,未生成交叉诊断。") + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = 
format_trace_report(results["trace"]) @@ -386,19 +400,19 @@ def save_detailed_report(report_text, output_dir, details=None): if details.get("load_counter_state"): with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: f.write(details["load_counter_state"]) - if details.get("cache_session_stickiness"): + if details.get("cache_session_stickiness") is not None: with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: f.write(details["cache_session_stickiness"]) - if details.get("cache_suboptimal"): + if details.get("cache_suboptimal") is not None: with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: f.write(details["cache_suboptimal"]) - if details.get("cache_eviction"): + if details.get("cache_eviction") is not None: with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: f.write(details["cache_eviction"]) - if details.get("cache_fallback"): + if details.get("cache_fallback") is not None: with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: f.write(details["cache_fallback"]) - if details.get("cache_cross"): + if details.get("cache_cross") is not None: with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: f.write(details["cache_cross"]) if details.get("errors_topn"): @@ -426,7 +440,7 @@ def main(): parser.add_argument("--health", action="store_true", help="仅分析 Worker 健康") parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度") parser.add_argument("--load", action="store_true", help="仅分析负载与计数器") - parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID)") + parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID;传 all 可全量追踪)") parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)") parser.add_argument( "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' @@ -478,7 +492,7 @@ def main(): 
run_health = args.health or (not any_mode) run_load = args.load or (not any_mode) run_cache = args.cache or (not any_mode) - run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 + run_trace = bool(args.trace) # trace 需要指定 ID(支持 all),全量扫描不自动调用 results = {} step = 0 From b65a31f03b2decd979b4de641ebca88aecddeb43 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 16:20:57 +0800 Subject: [PATCH 24/40] Store trace detail markdowns under detail/trace subfolder --- .../.claude/skills/troubleshoot/SKILL.md | 27 +++++- .../references/report_templates.md | 7 +- .../troubleshoot/scripts/analyzers/cache.py | 97 +++++++++++-------- .../troubleshoot/scripts/analyzers/trace.py | 73 ++++++++++++-- .../troubleshoot/scripts/troubleshoot.py | 67 ++++++++----- 5 files changed, 194 insertions(+), 77 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 2ea74156c82..00c94a2f487 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -11,14 +11,14 @@ description: > 关键词:troubleshoot、排查、router 问题、全量扫描、综合分析、error、502、latency、 health、load、cache、trace、/troubleshoot。 -IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格式和提取规则。 -错误分类时参考 references/error_catalog.md。涉及后端问题时参考 references/fastdeploy_cross_reference.md。 --- # Router Troubleshooting 综合排查 FastDeploy Go Router 问题,输出完整诊断报告。 +> IMPORTANT: 执行前务必先读取 `references/log_patterns.md` 了解日志格式和提取规则。错误分类时参考 `references/error_catalog.md`。涉及后端问题时参考 `references/fastdeploy_cross_reference.md`。 + ## 执行前交互 运行脚本前,Claude 必须按以下顺序向用户确认参数: @@ -51,6 +51,16 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 `--start/--end` 与 `--tail` 互斥。 +当用户选择“指定时间段”时,必须再发起一次 **AskUserQuestion**(离散选项)引导时间输入: +- 选项 1: `当天(00:00:00 到当前)`(推荐) +- 选项 2: `最近半小时`(自动换算为 `--start now-30m --end now` 语义) + +用户若通过客户端默认 `Other` 
输入时间,则将该输入直接作为时间范围参数解析。 +可补充一条简短示例引导: +- 示例 1:`16:00-16:30` +- 示例 2:`03/31 16:00 ~ 03/31 18:00` +- 示例 3:`2026/03/31 16:00:00`(仅起始) + ### 3. 分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) @@ -59,8 +69,12 @@ IMPORTANT: 执行前务必先读取 references/log_patterns.md 了解日志格 如果用户未选择,默认使用完整分析。 -当用户选择“请求追踪”选项时,AskUserQuestion 的选项文案应直接提示可输入: -- `trace_id/request_id/session_id`(逗号分隔多 ID) +当用户选择“请求追踪”后,**不要再发 AskUserQuestion** 收集 trace ID。 +直接发一条提示并等待用户输入完成后再继续执行即可。 + +提示文案建议: +- `请输入要追踪的 ID(支持 trace_id / request_id / session_id,多个用逗号分隔;输入 all 可全量追踪)` +- 示例:`a1b2c3d4` / `trace-001,trace-002` / `session-abc-123` / `all` ### 4. 输出目录 诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 @@ -86,6 +100,7 @@ python3 $SCRIPTS/troubleshoot.py --load # 请求追踪(需指定 ID,支持逗号分隔多 ID) python3 $SCRIPTS/troubleshoot.py --trace python3 $SCRIPTS/troubleshoot.py --trace "id1,id2" +python3 $SCRIPTS/troubleshoot.py --trace all # 尾部分析 python3 $SCRIPTS/troubleshoot.py --tail 5000 @@ -109,7 +124,9 @@ python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --erro - **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 - **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` - 逐分钟事件详情拆分到 `detail/health_events.md` - - 请求追踪事件链拆分到 `detail/trace_.md` + - 请求追踪事件链拆分到 `detail/trace/trace_.md` +- **Cache 明细要求**:`cache_session_stickiness.md` / `cache_suboptimal.md` / `cache_eviction.md` / `cache_fallback.md` / `cache_cross.md` + 必须始终生成(即使无异常也写“未发现/样本不足”总结,避免链接缺失) - **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` ## 三层诊断框架 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md index cd705d02816..61db59ec7e6 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -62,7 
+62,7 @@ - `detail/latency_diagnoses.md` — 延迟诊断详情 - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) - `detail/cache_session_stickiness.md` / `detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 - - `detail/trace_.md` — 请求追踪事件链 + - `detail/trace/trace_.md` — 请求追踪事件链 --- @@ -102,15 +102,18 @@ Worker 可用性时间线 可用性统计 ``` -### Cache(调度诊断)— 待实现 +### Cache(调度诊断) ``` 调度策略分布 Session 粘性分析 非最优选择分析 Fallback 原因分类 +驱逐影响与交叉诊断 ``` +要求:即使某项计数为 0(例如“非最优选择”),也要输出该小节并给出“未发现/样本不足”总结,保证 detail 链接稳定存在。 + ### Load(负载分析)— 待实现 ``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py index 57a1490d3fd..a12341967a0 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -449,11 +449,11 @@ def format_cache_report(result): # Session 粘性 stickiness = result.get("session_stickiness", {}) + sections.append("### Session 粘性") + sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") + sections.append("") if stickiness: - sections.append("### Session 粘性") - sections.append("") - sections.append(" Session 粘性详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") table_data = [ { "Session": sid, @@ -473,14 +473,21 @@ def format_cache_report(result): ) ) detail_sections.append("") + else: + sections.append(" 未检测到可计算粘性的多请求 Session。") + sections.append("") + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + detail_sections.append("") # 非最优选择 - if result.get("suboptimal_selections"): - subs = result["suboptimal_selections"] - sections.append(f"### 非最优选择 ({len(subs)} 次)") - 
sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + subs = result.get("suboptimal_selections") or [] + sections.append(f"### 非最优选择 ({len(subs)} 次)") + sections.append("") + sections.append(" 详情见: [detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") + sections.append("") + if subs: reason_counts = defaultdict(int) for s in subs: reason_counts[s["reason"]] += 1 @@ -494,15 +501,22 @@ def format_cache_report(result): f'- [{s.get("ts","")}] selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' ) detail_sections.append("") + else: + sections.append(" 未发现非最优选择(selected_hitRatio 始终为当次最高)。") + sections.append("") + detail_sections.append("## 非最优选择") + detail_sections.append("") + detail_sections.append("- 未发现非最优选择。") + detail_sections.append("") # 驱逐影响 - if result.get("eviction_impact"): - evictions = result["eviction_impact"] - evicted = [e for e in evictions if e["evicted"]] - sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") + evictions = result.get("eviction_impact") or [] + evicted = [e for e in evictions if e["evicted"]] + sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") + sections.append("") + sections.append(" 详情见: [detail/cache_eviction.md](../detail/cache_eviction.md)") + sections.append("") + if evictions: detail_sections.append("## 驱逐影响") detail_sections.append("") for e in evictions[:50]: @@ -510,6 +524,13 @@ def format_cache_report(result): f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' ) detail_sections.append("") + else: + sections.append(" 未检测到超时导致的潜在驱逐影响。") + sections.append("") + 
detail_sections.append("## 驱逐影响") + detail_sections.append("") + detail_sections.append("- 未检测到超时驱逐样本。") + detail_sections.append("") # 冷启动 if result.get("cold_starts", 0) > 0: @@ -520,11 +541,11 @@ def format_cache_report(result): detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') detail_sections.append("") + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_cross.md](../detail/cache_cross.md)") + sections.append("") if result.get("cross_diagnosis"): - sections.append("### 交叉诊断") - sections.append("") - sections.append(" 详情见: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") - sections.append("") detail_sections.append("## 交叉诊断") detail_sections.append("") detail_sections.append( @@ -542,25 +563,23 @@ def format_cache_report(result): ) ) detail_sections.append("") - - # 只显示实际生成了文件的链接 - detail_links = [] - if result.get("session_stickiness"): - detail_links.append("[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") - if result.get("suboptimal_selections"): - detail_links.append("[detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") - if result.get("eviction_impact"): - detail_links.append("[detail/cache_eviction.md](../detail/cache_eviction.md)") - if result.get("fallback_reasons"): - detail_links.append("[detail/cache_fallback.md](../detail/cache_fallback.md)") - if result.get("cross_diagnosis"): - detail_links.append("[detail/cache_cross.md](../detail/cache_cross.md)") - - if detail_links: - sections.append( - "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + " | ".join(detail_links) - ) + else: + sections.append(" 样本不足,未生成交叉诊断。") sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append("- 样本不足,未生成交叉诊断。") + detail_sections.append("") + + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + 
"[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index 37006121994..d0dcbdca6d9 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -62,8 +62,13 @@ def analyze_trace(log_file, trace_ids, tail=None): Returns: dict: {traces: {id: {events, lifecycle_complete, diagnoses}}, summary} """ + auto_discovery_summary = "" if isinstance(trace_ids, str): - trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] + normalized = trace_ids.strip().lower() + if normalized in ("all", "full", "all_ids", "全部", "全量"): + trace_ids, auto_discovery_summary = _discover_full_trace_targets(log_file, tail=tail) + else: + trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] if not trace_ids: return {"traces": {}, "summary": "未指定追踪 ID"} @@ -132,10 +137,64 @@ def analyze_trace(log_file, trace_ids, tail=None): total_traced = len(traces) complete = sum(1 for t in traces.values() if t["lifecycle_complete"]) - return { - "traces": traces, - "summary": f"{total_traced} ID(s) 追踪, {complete} 生命周期完整", - } + summary = f"{total_traced} ID(s) 追踪, {complete} 生命周期完整" + if auto_discovery_summary: + summary += f" | {auto_discovery_summary}" + + return {"traces": traces, "summary": summary} + + +def _discover_full_trace_targets(log_file, tail=None): + """全量追踪目标发现。 + + 规则: + 1) 有 session_id 的优先按 session_id 追踪 + 2) 无 session 
但有 trace_id 的按 trace_id 追踪 + 3) 剩余“孤立”的 request_id/req_id 单独追踪 + """ + lines = _grep_lines(log_file, r"session_id:|trace_id:|request_id:|req_id:", tail=tail) + if not lines: + return [], "全量追踪未发现任何可用 ID" + + session_ids = set() + trace_ids = set() + all_request_ids = set() + request_ids_with_session_or_trace = set() + + for line in lines: + tags = extract_tags(line) + sid = tags.get("session_id") + tid = tags.get("trace_id") + rid = tags.get("request_id") or tags.get("req_id") + has_session = bool(sid) + has_trace = bool(tid) + has_request = bool(rid) + + if has_session: + session_ids.add(sid) + if has_trace: + trace_ids.add(tid) + if has_request: + all_request_ids.add(rid) + if has_session or has_trace: + request_ids_with_session_or_trace.add(rid) + + standalone_request_ids = all_request_ids - request_ids_with_session_or_trace + + targets = [] + chosen = set() + for bucket in (sorted(session_ids), sorted(trace_ids), sorted(standalone_request_ids)): + for _id in bucket: + if _id and _id not in chosen: + chosen.add(_id) + targets.append(_id) + + summary = ( + "全量ID发现: " + f"session={len(session_ids)}, trace={len(trace_ids)}, " + f"standalone_request={len(standalone_request_ids)}, total_targets={len(targets)}" + ) + return targets, summary def _parse_event_chain(lines): @@ -426,7 +485,9 @@ def format_trace_report(result): # 主报告中添加引用和摘要 safe_tid = tid.replace("/", "_") sections.append(f' 事件数: {len(trace["events"])}') - sections.append(f" > 完整事件链: [detail/trace_{safe_tid}.md](../detail/trace_{safe_tid}.md)") + sections.append( + f" > 完整事件链: [detail/trace/trace_{safe_tid}.md](../detail/trace/trace_{safe_tid}.md)" + ) sections.append("") return "\n".join(sections), detail_dict diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index d869f9c71cc..251a21c7e81 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ 
b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -11,7 +11,7 @@ --health 仅分析 Worker 健康 --cache 仅分析 Cache 调度 --load 仅分析负载与计数器 - --trace ID 追踪指定请求(支持逗号分隔多 ID) + --trace ID 追踪指定请求(支持逗号分隔多 ID;传 all 可全量追踪) --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") @@ -191,7 +191,7 @@ def format_full_report(results, status, status_reason): details: dict 包含需要拆分到独立文件的详情数据 - 'health_events': str 或 None - 'load_select_release': str 或 None - - 'trace_files': {trace_id: text} 或 {} + - 'trace_files': {trace_id: text} 或 {}(写入 detail/trace/) """ parts = [] details = { @@ -292,44 +292,58 @@ def format_full_report(results, status, status_reason): if detail: details["cache_diagnosis"] = detail c = results["cache"] + lines = ["# Cache Session 粘性详情", ""] if c.get("session_stickiness"): - lines = ["# Cache Session 粘性详情", ""] for sid, s in c["session_stickiness"].items(): lines.append( f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}' ) - lines.append("") - details["cache_session_stickiness"] = "\n".join(lines) + else: + lines.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + + lines = ["# Cache 非最优选择详情", ""] if c.get("suboptimal_selections"): - lines = ["# Cache 非最优选择详情", ""] for x in c["suboptimal_selections"][:200]: lines.append( f'- [{x.get("ts","")}] selected={x.get("selected","")} best={x.get("best_hr_worker","")} reason={x.get("reason","")}' ) - lines.append("") - details["cache_suboptimal"] = "\n".join(lines) + else: + lines.append("- 未发现非最优选择。") + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + + lines = ["# Cache 驱逐影响详情", ""] if c.get("eviction_impact"): - lines = ["# Cache 驱逐影响详情", ""] for x in c["eviction_impact"][:200]: lines.append( f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m 
hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}' ) - lines.append("") - details["cache_eviction"] = "\n".join(lines) + else: + lines.append("- 未检测到超时驱逐样本。") + lines.append("") + details["cache_eviction"] = "\n".join(lines) + + lines = ["# Cache Fallback 原因详情", ""] if c.get("fallback_reasons"): - lines = ["# Cache Fallback 原因详情", ""] for x in c["fallback_reasons"]: lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') - lines.append("") - details["cache_fallback"] = "\n".join(lines) + else: + lines.append("- 未出现 fallback 记录。") + lines.append("") + details["cache_fallback"] = "\n".join(lines) + + lines = ["# Cache 交叉诊断详情", ""] if c.get("cross_diagnosis"): - lines = ["# Cache 交叉诊断详情", ""] for x in c["cross_diagnosis"]: lines.append( f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%' ) - lines.append("") - details["cache_cross"] = "\n".join(lines) + else: + lines.append("- 样本不足,未生成交叉诊断。") + lines.append("") + details["cache_cross"] = "\n".join(lines) if "trace" in results: summary, detail_dict = format_trace_report(results["trace"]) @@ -386,28 +400,31 @@ def save_detailed_report(report_text, output_dir, details=None): if details.get("load_counter_state"): with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: f.write(details["load_counter_state"]) - if details.get("cache_session_stickiness"): + if details.get("cache_session_stickiness") is not None: with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: f.write(details["cache_session_stickiness"]) - if details.get("cache_suboptimal"): + if details.get("cache_suboptimal") is not None: with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: f.write(details["cache_suboptimal"]) - if details.get("cache_eviction"): + if details.get("cache_eviction") is not None: with open(os.path.join(detail_dir, 
"cache_eviction.md"), "w", encoding="utf-8") as f: f.write(details["cache_eviction"]) - if details.get("cache_fallback"): + if details.get("cache_fallback") is not None: with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: f.write(details["cache_fallback"]) - if details.get("cache_cross"): + if details.get("cache_cross") is not None: with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: f.write(details["cache_cross"]) if details.get("errors_topn"): with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: f.write(details["errors_topn"]) + trace_detail_dir = os.path.join(detail_dir, "trace") + if details.get("trace_files"): + os.makedirs(trace_detail_dir, exist_ok=True) for trace_id, trace_text in details.get("trace_files", {}).items(): safe_id = trace_id.replace("/", "_") - trace_path = os.path.join(detail_dir, f"trace_{safe_id}.md") + trace_path = os.path.join(trace_detail_dir, f"trace_{safe_id}.md") with open(trace_path, "w", encoding="utf-8") as f: f.write(trace_text) @@ -426,7 +443,7 @@ def main(): parser.add_argument("--health", action="store_true", help="仅分析 Worker 健康") parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度") parser.add_argument("--load", action="store_true", help="仅分析负载与计数器") - parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID)") + parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID;传 all 可全量追踪)") parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)") parser.add_argument( "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' @@ -478,7 +495,7 @@ def main(): run_health = args.health or (not any_mode) run_load = args.load or (not any_mode) run_cache = args.cache or (not any_mode) - run_trace = bool(args.trace) # trace 需要指定 ID,全量扫描不自动调用 + run_trace = bool(args.trace) # trace 需要指定 ID(支持 all),全量扫描不自动调用 results = {} step = 0 From 41f56f75932a6e90d68790a1faa19bf2acdb7d9e Mon 
Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 17:07:36 +0800 Subject: [PATCH 25/40] stat-cache-hitrate: remove watch mode and loop guidance --- .../skills/stat-cache-hitrate/SKILL.md | 15 ++-- .../references/report_templates.md | 10 --- .../scripts/stat_cache_hitrate.py | 68 ++++++++++++++++--- 3 files changed, 64 insertions(+), 29 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 251cbb04c2a..3d52bc4b2be 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -4,7 +4,7 @@ description: > 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 - 持续监控模式、指定时间段统计(--start/--end)。 + 指定时间段统计(--start/--end)。 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 @@ -37,9 +37,8 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 2. 
分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): - 选项 1: `全量统计(默认)` — 扫描完整日志 -- 选项 2: `快速查看尾部` — 只看最近的数据(可指定行数如 2000 或时间如 30m) -- 选项 3: `持续监控` — 全量分析后提示监控命令 -- 选项 4: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) +- 选项 2: `快速查看尾部` — 只看最近的数据(可指定 `2000/2k` 行,或 `30m/2h/1d` 时间窗口) +- 选项 3: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) 若用户选择“指定时间段”,直接让用户填写: - 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); @@ -66,10 +65,10 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 # 快速查看尾部数据 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail # 默认最后 2000 行 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 5000 # 指定行数 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 指定时间 - -# 持续监控 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --watch +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2k # 行数缩写 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 分钟窗口 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2h # 小时窗口 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 1d # 天窗口 # 指定时间段(--start 和 --end 可单独或同时使用) python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "16:00:00" --end "17:00:00" diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md index f5a0def5f55..ebca39be2c4 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -198,14 +198,4 @@ output = f"{bar} {percentage}% (N={count})" +---+---+---+---+---→ -5m -4m -3m -2m -1m 
-💡 持续跟踪: /loop 30s /analyze-cache-hitrate --tail -``` - -## --watch 持续监控模板 - -`--watch` 模式先输出完整报告(同终端概览报告模板),末尾额外提示: - -``` -💡 全量分析完成。持续跟踪后续变化: - /loop 30s /analyze-cache-hitrate --tail ``` diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index bd85730b7d1..fc729af47c7 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -8,11 +8,12 @@ 3. Per-Worker Stats — 各 worker 缓存利用排名 用法: - python3 stat_cache_hitrate.py [--tail N|Nm] [--watch] [--output DIR] + python3 stat_cache_hitrate.py [--tail N|2k|30m|2h|1d] [--output DIR] """ import argparse import json +import math import os import re import subprocess @@ -170,7 +171,7 @@ def count_lines(filepath): def read_lines(filepath, tail=None): """读取日志文件,支持 tail 模式。""" - if tail: + if tail is not None: if isinstance(tail, str) and tail.endswith("m"): # 按时间 tail:读取全部行,过滤最近 N 分钟 minutes = int(tail[:-1]) @@ -282,7 +283,7 @@ def extract_data(filepath, tail=None): strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy", tail) stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats", tail) inference_count = grep_count(filepath, r"\] \[POST\] /v1/chat/completions |\] \[POST\] /v1/completions ", tail) - line_count = int(tail) if tail and not (isinstance(tail, str) and tail.endswith("m")) else total + line_count = int(tail) if tail is not None and not (isinstance(tail, str) and tail.endswith("m")) else total return strategy_recs, stats_recs, inference_count, line_count @@ -984,8 +985,12 @@ def parse_args(): epilog=__doc__, ) parser.add_argument("log_file", help="日志文件路径") - parser.add_argument("--tail", nargs="?", const="2000", help="只分析尾部数据(行数如 2000,或时间如 30m)") - parser.add_argument("--watch", action="store_true", 
help="全量分析后提示持续监控命令") + parser.add_argument( + "--tail", + nargs="?", + const="2000", + help="只分析尾部数据(支持 2000/2k 行,或 30m/2h/1d 时间窗口)", + ) parser.add_argument( "--output", default=None, help="详细报告输出目录(默认:skill_output/stat-cache-hitrate//)" ) @@ -996,6 +1001,45 @@ def parse_args(): return parser.parse_args() +def parse_tail_arg(tail_str): + """解析 --tail 参数,返回 int(行数) 或 'm'(时间窗口)。""" + if tail_str is None: + return None + + s = str(tail_str).strip().lower() + if not s: + raise ValueError("--tail 不能为空") + + # 行数: 2000 + if re.fullmatch(r"\d+", s): + value = int(s) + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return value + + # 行数缩写: 2k => 2000 + m = re.fullmatch(r"(\d+)k", s) + if m: + value = int(m.group(1)) * 1000 + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return value + + # 时间窗口: 30m/2h/1d(最终统一成分钟) + m = re.fullmatch(r"(\d+)(m|h|d)", s) + if m: + num = int(m.group(1)) + unit = m.group(2) + if num <= 0: + raise ValueError("--tail 时间窗口必须 > 0") + factor = {"m": 1, "h": 60, "d": 1440}[unit] + minutes = num * factor + minutes = max(1, math.ceil(minutes)) + return f"{minutes}m" + + raise ValueError("不支持的 --tail 格式:请使用 2000/2k 或 30m/2h/1d") + + def main(): args = parse_args() @@ -1009,6 +1053,12 @@ def main(): print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr) sys.exit(1) + try: + tail = parse_tail_arg(args.tail) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + # 时间范围预过滤(--start 和 --end 可单独或同时指定) import atexit @@ -1023,7 +1073,7 @@ def main(): print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) # Phase 2: 提取 + 解析 - strategy_recs, stats_recs, inference_count, line_count = extract_data(log_file, args.tail) + strategy_recs, stats_recs, inference_count, line_count = extract_data(log_file, tail) if not strategy_recs and not stats_recs: print( @@ -1039,7 +1089,7 @@ def main(): diagnosis = cross_diagnose(prefix_hr, session_hr) # Phase 4: 输出 - if args.tail: + if tail is not None: 
print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) else: time_span = compute_time_span(strategy_recs, stats_recs) @@ -1094,9 +1144,5 @@ def main(): print(f" - Session 明细: {session_abs}") print(f" URI: {session_uri}") - if args.watch: - print("\n\U0001f4a1 持续跟踪: /loop 30s /stat-cache-hitrate --tail") - - if __name__ == "__main__": main() From 984a925ef3d5b1d7e670445cd46adc86cb1c3214 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Mon, 13 Apr 2026 17:38:32 +0800 Subject: [PATCH 26/40] skills: generalize tail shorthand parsing for line counts --- .../skills/stat-cache-hitrate/SKILL.md | 11 +-- .../scripts/stat_cache_hitrate.py | 99 +++++-------------- .../.claude/skills/troubleshoot/SKILL.md | 9 +- .../troubleshoot/scripts/troubleshoot.py | 33 ++++--- 4 files changed, 50 insertions(+), 102 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index 3d52bc4b2be..ad9b3f29fd2 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -37,7 +37,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 ### 2. 
分析模式 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): - 选项 1: `全量统计(默认)` — 扫描完整日志 -- 选项 2: `快速查看尾部` — 只看最近的数据(可指定 `2000/2k` 行,或 `30m/2h/1d` 时间窗口) +- 选项 2: `快速查看尾部` — 只看最近的数据(支持 `2000`、`1k`、`1w` 等行数写法) - 选项 3: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) 若用户选择“指定时间段”,直接让用户填写: @@ -47,6 +47,7 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 如果用户未选择,默认使用全量统计。 `--start/--end` 与 `--tail` 互斥。`--start` 和 `--end` 可单独或同时指定。 +`--tail` 仅支持“行数”语义(如 `2000`,也兼容 `1k/1w` 自动换算),不再支持 `30m/2h/1d` 这类时间窗口;按时间请使用 `--start/--end`。 时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 缺失部分自动从日志首末行推断。 @@ -65,12 +66,8 @@ python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志 # 快速查看尾部数据 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail # 默认最后 2000 行 python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 5000 # 指定行数 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2k # 行数缩写 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 30m # 分钟窗口 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 2h # 小时窗口 -python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 1d # 天窗口 - -# 指定时间段(--start 和 --end 可单独或同时使用) +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 1k # 行数缩写(自动换算) +# 指定时间段(需要按时间筛选时使用;--start 和 --end 可单独或同时使用) python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "16:00:00" --end "17:00:00" python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "2026/03/31 16:00:00" python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index fc729af47c7..1e27f96a476 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -8,12 +8,11 @@ 3. Per-Worker Stats — 各 worker 缓存利用排名 用法: - python3 stat_cache_hitrate.py [--tail N|2k|30m|2h|1d] [--output DIR] + python3 stat_cache_hitrate.py [--tail N|Nk|Nw] [--output DIR] """ import argparse import json -import math import os import re import subprocess @@ -28,7 +27,6 @@ from chart import render_bar, render_sparkline, render_table from log_parser import ( complete_time_arg, - extract_ts, filter_file_by_time_range, parse_cache_strategy_line, parse_stats_line, @@ -172,16 +170,10 @@ def count_lines(filepath): def read_lines(filepath, tail=None): """读取日志文件,支持 tail 模式。""" if tail is not None: - if isinstance(tail, str) and tail.endswith("m"): - # 按时间 tail:读取全部行,过滤最近 N 分钟 - minutes = int(tail[:-1]) - all_lines = _read_file_lines(filepath) - return _filter_by_time(all_lines, minutes) - else: - # 按行数 tail - n = int(tail) - result = subprocess.run(["tail", "-n", str(n), filepath], capture_output=True, text=True) - return result.stdout.splitlines() if result.returncode == 0 else [] + # 按行数 tail + n = int(tail) + result = subprocess.run(["tail", "-n", str(n), filepath], capture_output=True, text=True) + return result.stdout.splitlines() if result.returncode == 0 else [] return _read_file_lines(filepath) @@ -190,35 +182,6 @@ def _read_file_lines(filepath): return f.readlines() -def _filter_by_time(lines, minutes): - """过滤最近 N 分钟的日志行。""" - # 找最后一行的时间戳作为基准 - last_ts = None - for line in reversed(lines): - ts = extract_ts(line) - if ts: - last_ts = parse_ts(ts) - break - if not last_ts: - return lines - - from datetime import timedelta - - cutoff = last_ts - timedelta(minutes=minutes) - result = [] - for line in lines: - ts = extract_ts(line) 
- if ts: - try: - if parse_ts(ts) >= cutoff: - result.append(line) - except ValueError: - result.append(line) - else: - result.append(line) - return result - - # ════════════════════════════════════════════════════════════════ # Phase 2: 日志提取与解析 # ════════════════════════════════════════════════════════════════ @@ -237,7 +200,7 @@ def grep_and_parse(filepath, grep_pattern, parse_cmd, tail=None): """大文件模式:grep 过滤 + log_parser.py CLI 管道解析。""" parser_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log_parser.py") - if tail and not (isinstance(tail, str) and tail.endswith("m")): + if tail: grep_cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -F {_shell_quote(grep_pattern)} | python3 {_shell_quote(parser_path)} {parse_cmd}" else: grep_cmd = f"grep -F {_shell_quote(grep_pattern)} {_shell_quote(filepath)} | python3 {_shell_quote(parser_path)} {parse_cmd}" @@ -255,7 +218,7 @@ def grep_and_parse(filepath, grep_pattern, parse_cmd, tail=None): def grep_count(filepath, grep_pattern, tail=None): """大文件模式:grep 计数。""" - if tail and not (isinstance(tail, str) and tail.endswith("m")): + if tail: cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -cE {_shell_quote(grep_pattern)}" else: cmd = f"grep -cE {_shell_quote(grep_pattern)} {_shell_quote(filepath)}" @@ -283,7 +246,7 @@ def extract_data(filepath, tail=None): strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy", tail) stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats", tail) inference_count = grep_count(filepath, r"\] \[POST\] /v1/chat/completions |\] \[POST\] /v1/completions ", tail) - line_count = int(tail) if tail is not None and not (isinstance(tail, str) and tail.endswith("m")) else total + line_count = int(tail) if tail is not None else total return strategy_recs, stats_recs, inference_count, line_count @@ -989,7 +952,7 @@ def parse_args(): "--tail", nargs="?", const="2000", - help="只分析尾部数据(支持 2000/2k 行,或 30m/2h/1d 时间窗口)", + help="只分析尾部数据(支持 
2000、1k、1w 等行数写法)。按时间请使用 --start/--end", ) parser.add_argument( "--output", default=None, help="详细报告输出目录(默认:skill_output/stat-cache-hitrate//)" @@ -1002,7 +965,7 @@ def parse_args(): def parse_tail_arg(tail_str): - """解析 --tail 参数,返回 int(行数) 或 'm'(时间窗口)。""" + """解析 --tail 参数,返回行数 int。支持数字及 k/w 缩写。""" if tail_str is None: return None @@ -1010,34 +973,20 @@ def parse_tail_arg(tail_str): if not s: raise ValueError("--tail 不能为空") - # 行数: 2000 - if re.fullmatch(r"\d+", s): - value = int(s) - if value <= 0: - raise ValueError("--tail 行数必须 > 0") - return value - - # 行数缩写: 2k => 2000 - m = re.fullmatch(r"(\d+)k", s) - if m: - value = int(m.group(1)) * 1000 - if value <= 0: - raise ValueError("--tail 行数必须 > 0") - return value - - # 时间窗口: 30m/2h/1d(最终统一成分钟) - m = re.fullmatch(r"(\d+)(m|h|d)", s) - if m: - num = int(m.group(1)) - unit = m.group(2) - if num <= 0: - raise ValueError("--tail 时间窗口必须 > 0") - factor = {"m": 1, "h": 60, "d": 1440}[unit] - minutes = num * factor - minutes = max(1, math.ceil(minutes)) - return f"{minutes}m" - - raise ValueError("不支持的 --tail 格式:请使用 2000/2k 或 30m/2h/1d") + m = re.fullmatch(r"(\d+)([kw])?", s) + if not m: + raise ValueError("不支持的 --tail 格式:请使用 2000、1k、1w 等行数写法。按时间请改用 --start/--end") + + value = int(m.group(1)) + unit = m.group(2) + if unit == "k": + value *= 1000 + elif unit == "w": + value *= 10000 + + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return value def main(): diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md index 00c94a2f487..ecb27c1436a 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -38,7 +38,7 @@ description: > ### 2. 
分析范围 必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): - 选项 1: `全量分析(默认)` — 分析整个日志文件 -- 选项 2: `尾部分析` — 只分析最近数据(可指定行数或时间如 `--tail 5000` 或 `--tail 30m`) +- 选项 2: `尾部分析` — 只分析最近数据(仅支持行数,如 `--tail 5000`) - 选项 3: `指定时间段` — 分析特定时间范围内的日志 如果用户未选择,默认使用全量分析。 @@ -50,10 +50,11 @@ description: > 时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 `--start/--end` 与 `--tail` 互斥。 +`--tail` 仅支持“行数”语义(如 `5000`,也兼容 `1k/1w` 自动换算),不再支持 `30m` 这类时间写法;凡是按时间筛选都使用 `--start/--end`。 当用户选择“指定时间段”时,必须再发起一次 **AskUserQuestion**(离散选项)引导时间输入: - 选项 1: `当天(00:00:00 到当前)`(推荐) -- 选项 2: `最近半小时`(自动换算为 `--start now-30m --end now` 语义) +- 选项 2: `自定义时间段`(由用户直接输入起止时间) 用户若通过客户端默认 `Other` 输入时间,则将该输入直接作为时间范围参数解析。 可补充一条简短示例引导: @@ -104,9 +105,7 @@ python3 $SCRIPTS/troubleshoot.py --trace all # 尾部分析 python3 $SCRIPTS/troubleshoot.py --tail 5000 -python3 $SCRIPTS/troubleshoot.py --tail 30m - -# 指定时间段(--start 和 --end 可单独或同时使用) +# 指定时间段(需要按时间筛选时使用;--start 和 --end 可单独或同时使用) python3 $SCRIPTS/troubleshoot.py --start "16:00:00" --end "17:00:00" python3 $SCRIPTS/troubleshoot.py --start "2026/03/31 16:00:00" python3 $SCRIPTS/troubleshoot.py --start "03/31" --end "03/31 18:00" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py index 251a21c7e81..b00521e6b01 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -12,7 +12,7 @@ --cache 仅分析 Cache 调度 --load 仅分析负载与计数器 --trace ID 追踪指定请求(支持逗号分隔多 ID;传 all 可全量追踪) - --tail N 仅分析尾部 N 行(支持 N 或 Nm 格式如 30m) + --tail N 仅分析尾部 N 行(支持 5000/1k/1w 等行数写法) --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") --output DIR 详细报告导出目录(默认: skill_output/troubleshoot//) @@ -21,6 +21,7 @@ """ import argparse +import re import os import sys from datetime import datetime @@ -38,7 
+39,6 @@ from analyzers.trace import analyze_trace, format_trace_report from log_parser import ( complete_time_arg, - filter_file_by_recent_minutes, filter_file_by_time_range, ) @@ -106,12 +106,22 @@ def determine_log_file(user_path=None): def parse_tail_arg(tail_str): - """解析 --tail 参数:支持纯数字(行数)或 Nm(分钟)格式。""" + """解析 --tail 参数:支持数字及 k/w 缩写。""" if tail_str is None: return None - if tail_str.endswith("m"): - return {"type": "minutes", "value": int(tail_str[:-1])} - return {"type": "lines", "value": int(tail_str)} + s = str(tail_str).strip().lower() + m = re.fullmatch(r"(\d+)([kw])?", s) + if not m: + raise ValueError("--tail 仅支持行数(如 5000、1k、1w)。按时间请改用 --start/--end") + value = int(m.group(1)) + unit = m.group(2) + if unit == "k": + value *= 1000 + elif unit == "w": + value *= 10000 + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return {"type": "lines", "value": value} def determine_status(results): @@ -444,7 +454,7 @@ def main(): parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度") parser.add_argument("--load", action="store_true", help="仅分析负载与计数器") parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID;传 all 可全量追踪)") - parser.add_argument("--tail", help="尾部行数或分钟数 (如 5000 或 30m)") + parser.add_argument("--tail", help="尾部行数(如 5000、1k、1w)。按时间请使用 --start/--end") parser.add_argument( "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' ) @@ -478,14 +488,7 @@ def main(): tail_arg = parse_tail_arg(args.tail) tail = None - # --tail Nm 采用真实时间窗口过滤,再全量分析过滤后的临时文件 - if tail_arg and tail_arg["type"] == "minutes": - filtered_path, is_temp = filter_file_by_recent_minutes(log_file, tail_arg["value"]) - if is_temp: - atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) - log_file = filtered_path - print(f"--tail {tail_arg['value']}m: 使用日志时间戳过滤最近窗口", file=sys.stderr) - elif tail_arg and tail_arg["type"] == "lines": + if tail_arg and tail_arg["type"] == "lines": tail = 
tail_arg["value"] # 确定分析模式 From b68181da133e19b3e406d4d03d2792b31db75390 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 16:09:51 +0800 Subject: [PATCH 27/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- .../troubleshoot/scripts/analyzers/trace.py | 29 +++++++++++++++---- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py index d0dcbdca6d9..ba4c7bd1051 100644 --- a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -291,7 +291,9 @@ def _parse_event_chain(lines): # Prefill events m = PREFILL_FIRST_CHUNK_RE.search(line) if m: - events.append({"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1), "raw": line.strip()}) + events.append( + {"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) continue m = PREFILL_DONE_RE.search(line) if m: @@ -300,7 +302,14 @@ def _parse_event_chain(lines): m = PREFILL_ERROR_RE.search(line) if m: events.append( - {"ts": ts, "type": "PREFILL_ERROR", "tags": tags, "error": m.group(1), "worker": m.group(2), "raw": line.strip()} + { + "ts": ts, + "type": "PREFILL_ERROR", + "tags": tags, + "error": m.group(1), + "worker": m.group(2), + "raw": line.strip(), + } ) continue m = PREFILL_DEFER_RE.search(line) @@ -312,7 +321,13 @@ def _parse_event_chain(lines): m = PREFILL_ERR_PATH_RE.search(line) if m: events.append( - {"ts": ts, "type": "PREFILL_ERROR_PATH_RELEASE", "tags": tags, "worker": m.group(1), "raw": line.strip()} + { + "ts": ts, + "type": "PREFILL_ERROR_PATH_RELEASE", + "tags": tags, + "worker": m.group(1), + "raw": line.strip(), + } ) continue @@ -456,7 +471,7 @@ def format_trace_report(result): detail_lines.append(f'关联 request_ids: {", 
".join(trace["related_ids"]["request_ids"])}') detail_lines.append(f"生命周期: {status}") detail_lines.append("") - detail_lines.append("## 事件链") + detail_lines.append("## 事件链(整理)") detail_lines.append("") for evt in trace["events"]: line = f' [{evt.get("ts","")}] {evt["type"]}' @@ -477,8 +492,12 @@ def format_trace_report(result): if evt.get("ts_ms"): line += f' ts_ms={evt["ts_ms"]}' detail_lines.append(line) + detail_lines.append("") + detail_lines.append("## 原始日志 RAW") + detail_lines.append("") + for evt in trace["events"]: if evt.get("raw"): - detail_lines.append(f' RAW: {evt["raw"]}') + detail_lines.append(evt["raw"]) detail_lines.append("") detail_dict[tid] = "\n".join(detail_lines) From 87a79104b09d12bbacd900ff4d6f61bc9ff2e28f Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 17:47:47 +0800 Subject: [PATCH 28/40] [Feature] Add troubleshoot and stats-cache-hitratio skills --- fastdeploy/golang_router/pkg/logger/logger.go | 73 +++++++++++-------- 1 file changed, 41 insertions(+), 32 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 07412670628..c14565e348d 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -45,13 +45,13 @@ const SessionIDKey contextKey = "session_id" const gracePeriod = 5 * time.Minute // rotatingWriter implements io.Writer with day-level rotation and dual-file writes. -// Current day's log is always "router.log"; on day change it is renamed to -// "router-YYYY-MM-DD.log" and a new "router.log" is created. During a short -// grace period after rotation, log lines whose timestamp belongs to the previous -// day are written to the archived file. +// Current day's log is written to "router-YYYY-MM-DD.log" and "router.log" is a +// symlink pointing to the current day's file. On day change a new date file is +// created and the symlink is updated. 
During a short grace period after rotation, +// log lines whose timestamp belongs to the previous day are written to the old file. type rotatingWriter struct { mu sync.Mutex - currentFile *os.File // today's router.log + currentFile *os.File // today's router-.log prevFile *os.File // previous day's router-.log during grace period (may be nil) currentDate string // "2006-01-02" prevDate string // previous date during grace period @@ -61,10 +61,24 @@ type rotatingWriter struct { func newRotatingWriter(logDir string) (*rotatingWriter, error) { today := nowFunc().Format("2006-01-02") - f, err := os.OpenFile(filepath.Join(logDir, "router.log"), os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + datePath := filepath.Join(logDir, "router-"+today+".log") + symlinkPath := filepath.Join(logDir, "router.log") + + // Migration: if router.log is a regular file (legacy), rename it to the date file. + if info, err := os.Lstat(symlinkPath); err == nil && info.Mode().IsRegular() { + os.Rename(symlinkPath, datePath) + } + + // Open the date file (append mode). + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) if err != nil { return nil, err } + + // Create/update symlink: router.log -> router-.log + os.Remove(symlinkPath) + os.Symlink("router-"+today+".log", symlinkPath) + return &rotatingWriter{ currentFile: f, currentDate: today, @@ -122,28 +136,20 @@ func (w *rotatingWriter) rotateLocked(newDate string) { w.prevFile = nil } - // Close current router.log so we can rename it. - if w.currentFile != nil { - w.currentFile.Close() - } - - // Rename router.log -> router-.log - oldPath := filepath.Join(w.logDir, "router.log") - archivePath := filepath.Join(w.logDir, "router-"+w.currentDate+".log") - if err := os.Rename(oldPath, archivePath); err != nil { - // Rename failed; try to reopen router.log and continue without rotation. 
- w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - return - } - - // Open the archived file for dual-write grace period. - w.prevFile, _ = os.OpenFile(archivePath, os.O_WRONLY|os.O_APPEND, 0666) + // Keep the old date file open for grace period writes. + w.prevFile = w.currentFile w.prevDate = w.currentDate w.graceUntil = nowFunc().Add(gracePeriod) - // Create new router.log for the new day. - w.currentFile, _ = os.OpenFile(oldPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + // Open new date file for the new day. + datePath := filepath.Join(w.logDir, "router-"+newDate+".log") + w.currentFile, _ = os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) w.currentDate = newDate + + // Update symlink: router.log -> router-.log + symlinkPath := filepath.Join(w.logDir, "router.log") + os.Remove(symlinkPath) + os.Symlink("router-"+newDate+".log", symlinkPath) } // parseLogDate extracts the date from a log line produced by log.LstdFlags. @@ -244,22 +250,17 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { } now := nowFunc() + today := now.Format("2006-01-02") var archives []logFileInfo - var routerLogSize int64 for _, entry := range entries { if entry.IsDir() { continue } name := entry.Name() - info, err := entry.Info() - if err != nil { - continue - } - // Count router.log size but never delete it. + // router.log is now a symlink; skip it. if name == "router.log" { - routerLogSize = info.Size() continue } @@ -273,6 +274,14 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { if err != nil { continue } + // Never delete today's active date file. + if dateStr == today { + continue + } + info, err := entry.Info() + if err != nil { + continue + } archives = append(archives, logFileInfo{ name: name, path: filepath.Join(logDir, name), @@ -303,7 +312,7 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { // Phase 2: Size-based cleanup. 
if maxTotalSizeMB > 0 { maxBytes := int64(maxTotalSizeMB) * 1024 * 1024 - var totalSize int64 = routerLogSize + var totalSize int64 for _, f := range archives { totalSize += f.size } From 109a8e5b2aee056ba17a95e59c22924fe9bd9e23 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 19:22:06 +0800 Subject: [PATCH 29/40] [Feature] Add skills and Add logging cleanup --- .../skills/stat-cache-hitrate/SKILL.md | 14 ++- .../stat-cache-hitrate/scripts/log_parser.py | 65 +++++++++++ .../scripts/stat_cache_hitrate.py | 102 ++++++++++-------- 3 files changed, 133 insertions(+), 48 deletions(-) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md index ad9b3f29fd2..097a10f8163 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -40,15 +40,21 @@ IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析 - 选项 2: `快速查看尾部` — 只看最近的数据(支持 `2000`、`1k`、`1w` 等行数写法) - 选项 3: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) -若用户选择“指定时间段”,直接让用户填写: -- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +**若用户选择"快速查看尾部",必须再询问行数**,提供选项: +- 选项 1: `2000 行(默认)` +- 选项 2: `5000 行` +- 选项 3: `1万行` + +若用户选择”指定时间段”,直接让用户填写: +- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +- 支持相对时间写法:`30m`、`2h`、`1d`、`最后30分钟` 等(换算为绝对时间) - 然后映射为 `--start/--end` 参数执行。 如果用户未选择,默认使用全量统计。 `--start/--end` 与 `--tail` 互斥。`--start` 和 `--end` 可单独或同时指定。 -`--tail` 仅支持“行数”语义(如 `2000`,也兼容 `1k/1w` 自动换算),不再支持 `30m/2h/1d` 这类时间窗口;按时间请使用 `--start/--end`。 -时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 +`--tail` 仅支持”行数”语义(如 `2000`,也兼容 `1k/1w` 自动换算),不再支持 `30m/2h/1d` 这类时间窗口;按时间请使用 `--start/--end`。 +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`、相对时间(`30m`、`2h`、`1d`、`最后30分钟`)。 缺失部分自动从日志首末行推断。 ### 3. 
输出目录 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py index d43d6909c64..bb31235f3fa 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py @@ -76,6 +76,63 @@ def parse_ts(ts_str): _SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$") _TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$") +# 相对时间正则:支持 30m、30分钟、2h、2小时、1d、1天、last 30m、最后30分钟 +_RELATIVE_TIME_RE = re.compile(r"^(?:last|最后)?\s*(\d+)\s*(m|分钟|mins?|h|小时|hours?|d|天|days?)$", re.IGNORECASE) + + +def _parse_relative_time(time_str): + """解析相对时间字符串,返回 timedelta。 + + 支持格式:30m、30分钟、2h、2小时、1d、1天、last 30m、最后30分钟 + """ + m = _RELATIVE_TIME_RE.match(time_str.strip()) + if not m: + return None + + value = int(m.group(1)) + unit = m.group(2).lower() + + if unit.startswith("m") and "in" not in unit: # m, min, mins + from datetime import timedelta + + return timedelta(minutes=value) + elif unit.startswith("h"): # h, hour, hours + from datetime import timedelta + + return timedelta(hours=value) + else: # d, day, days + from datetime import timedelta + + return timedelta(days=value) + + +def _relative_to_absolute(time_str, log_file, is_end=False): + """将相对时间转换为绝对时间,基于日志文件的时间边界。 + + - start: 从日志末行时间往前推 + - end: 直接使用日志末行时间(或当前时间) + """ + relative_delta = _parse_relative_time(time_str) + if not relative_delta: + return None + + # 获取日志文件末行时间作为基准 + boundary_ts = _get_log_boundary_ts(log_file, "last") + if not boundary_ts: + return None + + # 解析为 datetime + dt = datetime.strptime(boundary_ts, "%Y/%m/%d %H:%M:%S") + + if is_end: + # end 时间:直接使用日志末行时间 + return boundary_ts + else: + # start 时间:末行时间减去 duration + + abs_time = dt - relative_delta + return abs_time.strftime("%Y/%m/%d %H:%M:%S") + def _get_log_boundary_ts(log_file, 
which="first"): """从日志文件首行或末行提取时间戳。""" @@ -93,11 +150,13 @@ def complete_time_arg(time_str, log_file, is_end=False): 支持格式: 'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 'YYYY/MM/DD', 'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM' + 相对时间:30m、2h、1d、最后30分钟 等(从日志末行时间算起) 补全规则: - 缺年份:从日志首行取 - 缺日期:从日志末行取 - 缺时间:start→00:00:00, end→23:59:59 + - 相对时间:start 从日志末行往前推,end 直接用日志末行时间 Returns: 'YYYY/MM/DD HH:MM:SS' 格式字符串 """ @@ -105,6 +164,12 @@ def complete_time_arg(time_str, log_file, is_end=False): return None time_str = time_str.strip() + # Case 0: 相对时间处理(如 "30m"、"最后30分钟"、"2h") + # 从日志文件末行时间开始算起 + relative_result = _relative_to_absolute(time_str, log_file, is_end) + if relative_result: + return relative_result + # Case 1: 完整日期时间 m = _FULL_DT_RE.match(time_str) if m: diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py index 1e27f96a476..7c6e0d40ecf 100644 --- a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -17,10 +17,10 @@ import re import subprocess import sys -from pathlib import Path -from urllib.parse import quote from collections import defaultdict from datetime import datetime +from pathlib import Path +from urllib.parse import quote # 同目录模块导入 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) @@ -154,6 +154,7 @@ def _summarize_id_type_ranges(rows_with_seq): ranges.append((start_id, end_id, current_type, start_ts, end_ts)) return ranges + # ════════════════════════════════════════════════════════════════ # Phase 1: 日志读取 # ════════════════════════════════════════════════════════════════ @@ -424,7 +425,9 @@ def _quartile_trend(trend, value_field): return f"Q1={quartiles[0]}% \u2192 Q2={quartiles[1]}% \u2192 Q3={quartiles[2]}% \u2192 Q4={quartiles[3]}% {arrow}" -def 
format_full_report(filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None, window_rows=None): +def format_full_report( + filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None, window_rows=None +): """格式化完整终端报告。""" parts = [] @@ -667,13 +670,17 @@ def save_detailed_report( trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") if trend_str: parts.append(f"- 趋势: {trend_str}") - dist_data = [{"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"]] + dist_data = [ + {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] + ] parts.append("") parts.append("```text") parts.append("Unicode 柱状图(Prefix HR 分布)") parts.append(render_bar(dist_data, show_count=True)) if prefix_hr["trend"]: - sparkline_data = [{"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"]] + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] parts.append("") parts.append("ASCII 折线图(Prefix HR 趋势)") parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) @@ -761,7 +768,9 @@ def save_detailed_report( f.write("\n".join(detail_parts)) if session_rows: - parts.append(f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)") + parts.append( + f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)" + ) parts.append("") all_rows_with_seq = [] @@ -790,7 +799,9 @@ def save_detailed_report( if start_id == end_id: session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") else: - session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + session_parts.append( + f"- 
`{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)" + ) session_parts.append("") session_parts.append("## 概览") session_parts.append("- 字段说明:`avg-hit` = `avg_hit(excl_first)`(去除首请求后的平均命中率)") @@ -1038,11 +1049,24 @@ def main(): diagnosis = cross_diagnose(prefix_hr, session_hr) # Phase 4: 输出 + # 无论 tail 还是全量模式,都生成详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + if args.output: + output_base = args.output + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) + output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") + output_dir = os.path.join(output_base, run_timestamp) + + time_span = compute_time_span(strategy_recs, stats_recs) + window_rows = build_per_window_rows(strategy_recs, stats_recs) + if tail is not None: + # tail 精简模式:打印摘要 + 生成详细报告 print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) else: - time_span = compute_time_span(strategy_recs, stats_recs) - window_rows = build_per_window_rows(strategy_recs, stats_recs) + # 全量模式:打印完整报告 print( format_full_report( args.log_file, @@ -1057,41 +1081,31 @@ def main(): ) ) - # 导出详细报告 - run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - if args.output: - output_base = args.output - else: - script_dir = os.path.dirname(os.path.abspath(__file__)) - golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) - output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") - output_dir = os.path.join(output_base, run_timestamp) - report_path = save_detailed_report( - args.log_file, - strategy_recs, - stats_recs, - prefix_hr, - session_hr, - per_worker, - scheduling, - diagnosis, - output_dir, - time_span=time_span, - ) - print("\n\U0001f4c4 详细数据见:") - report_abs, report_uri = _build_path_links(report_path) - print(f" - 报告文件: {report_abs}") - print(f" URI: {report_uri}") 
- details_path = os.path.join(output_dir, "detail", "per_window_data.md") - if os.path.exists(details_path): - details_abs, details_uri = _build_path_links(details_path) - print(f" - 窗口明细: {details_abs}") - print(f" URI: {details_uri}") - session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") - if os.path.exists(session_detail_path): - session_abs, session_uri = _build_path_links(session_detail_path) - print(f" - Session 明细: {session_abs}") - print(f" URI: {session_uri}") + # 导出详细报告(tail 和全量都生成) + report_path = save_detailed_report( + args.log_file, + strategy_recs, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=time_span, + ) + print("\n\U0001f4c4 详细数据见:") + report_abs, report_uri = _build_path_links(report_path) + print(f" - 报告文件: [{report_abs}]({report_uri})") + details_path = os.path.join(output_dir, "detail", "per_window_data.md") + if os.path.exists(details_path): + details_abs, details_uri = _build_path_links(details_path) + print(f" - 窗口明细: [{details_abs}]({details_uri})") + session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") + if os.path.exists(session_detail_path): + session_abs, session_uri = _build_path_links(session_detail_path) + print(f" - Session 明细: [{session_abs}]({session_uri})") + if __name__ == "__main__": main() From 888b0ac63d572b35702f9bb417a67a13c9062a3f Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 19:39:08 +0800 Subject: [PATCH 30/40] [Feature] Add skills and logging cleanup --- .../router_troubleshoot_playbook.md | 190 ------------------ 1 file changed, 190 deletions(-) delete mode 100644 docs/zh/online_serving/router_troubleshoot_playbook.md diff --git a/docs/zh/online_serving/router_troubleshoot_playbook.md b/docs/zh/online_serving/router_troubleshoot_playbook.md deleted file mode 100644 index 0ccee9c6d55..00000000000 --- a/docs/zh/online_serving/router_troubleshoot_playbook.md +++ /dev/null @@ -1,190 
+0,0 @@ -# Router 问题排查实战手册(日志定位 + troubleshoot skill) - -本文档结合以下两部分信息整理: -- Router 常见问题与日志语义:[`docs/zh/online_serving/router_faq.md`](router_faq.md) -- `fastdeploy/golang_router/.claude/skills/troubleshoot` 的脚本能力与使用方式 - -目标:给出一套可落地的排查流程,帮助你从“现象”快速定位到“日志证据”和“处理建议”。 - ---- - -## 1. 先定范围:全量 / 尾部 / 指定时间段 - -建议先根据问题发生时间选择分析范围(这是和分析模式并列的维度): - -- **全量分析**:适合历史慢性问题、趋势问题。 -- **尾部分析(`--tail`)**:适合刚发生的故障,优先看最近 N 行或 N 分钟。 -- **指定时间段(`--start/--end`)**:适合已知故障窗口(例如 14:05~14:20)。 - -> 说明:`--tail` 与 `--start/--end` 互斥,二选一。 - ---- - -## 2. 先看健康与注册,再看调度与请求 - -根据 `router_faq.md` 的建议,先确认“有没有可用实例”,再看“请求是否调度成功”。 - -### 2.1 健康与注册检查(必做) - -```bash -# 已注册实例列表 -curl -X GET http://{router_url}/registered - -# 已注册实例数量 -curl -X GET http://{router_url}/registered_number - -# 从 Router 机器检查后端健康 -curl -X GET http://{server_url}/health -``` - -重点日志关键词: -- 健康移除:`Removed unhealthy ... instance` -- 注册失败:`Failed to register instance` -- 健康检查失败:`failed to send request to ...` / `Server ... is not healthy` - -若实例都不健康或未注册,后续 502/503 多数是结果,不是根因。 - -### 2.2 调度失败检查 - -常见错误: -- `Failed to select worker` -- `Failed to select worker pair` -- `No available prefill/decode workers` - -这类问题先确认: -1) 注册数量是否为 0; -2) 调度策略与部署模式是否匹配; -3) `fd_metrics_score` 依赖的 `/metrics` 是否可访问。 - -### 2.3 请求链路与后端请求失败 - -常见日志: -- `Failed to connect to backend service` -- `Request failed (attempt n/max)` -- `Decode/Prefill/Backend request failed for {url}` -- `Panic recovered` - -这类问题通常需要结合 trace(ID 级别)看完整链路。 - ---- - -## 3. 
使用 troubleshoot skill 的标准方式 - -脚本入口(在 `fastdeploy/golang_router/` 下): - -```bash -SCRIPTS=.claude/skills/troubleshoot/scripts -python3 $SCRIPTS/troubleshoot.py [options] -``` - -### 3.1 全量体检(默认推荐首轮) - -```bash -python3 $SCRIPTS/troubleshoot.py -``` - -会同时输出:errors / latency / health / cache / load 的综合结果。 - -### 3.2 指定维度分析(精准打点) - -```bash -python3 $SCRIPTS/troubleshoot.py --errors -python3 $SCRIPTS/troubleshoot.py --latency -python3 $SCRIPTS/troubleshoot.py --health -python3 $SCRIPTS/troubleshoot.py --cache -python3 $SCRIPTS/troubleshoot.py --load -``` - -### 3.3 请求追踪(ID 级排查) - -```bash -# 单个 ID -python3 $SCRIPTS/troubleshoot.py --trace - -# 多个 ID -python3 $SCRIPTS/troubleshoot.py --trace "id1,id2,id3" -``` - -trace 会展示: -- 匹配到的 tag 类型(request_id / trace_id / session_id / req_id) -- 生命周期完整性 -- 事件链(含原始日志 RAW) -- 仅 request_id / 仅 session_id / 仅 trace_id 的统计 -- 各标签组合形式(detail 中给出组合与对应 ID) - -### 3.4 范围过滤与 trace 组合 - -当你要“在某个时间窗内追踪某个 ID”时,使用范围参数和 trace 组合: - -```bash -python3 $SCRIPTS/troubleshoot.py --start "2026/04/13 14:05:00" --end "2026/04/13 14:20:00" --trace "" -``` - -这符合“范围维度(全量/尾部/时间段)”与“模式维度(含 trace)”分离的使用方式。 - ---- - -## 4. 一套可复制的故障定位流程 - -### 步骤 A:确认故障窗口与错误现象 -- 收集用户报错时间、HTTP 状态码(502/503/500/400)和请求路径。 - -### 步骤 B:先跑时间窗综合分析 -```bash -python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" -``` -- 看 STATUS(HEALTHY / DEGRADED / CRITICAL)。 -- 优先看 errors、health 章节,判断是否是后端健康/注册问题。 - -### 步骤 C:按症状进入专项 -- 502/503:`--errors --health --load` -- 延迟突增:`--latency --load --cache` -- 单请求失败:`--trace `(可叠加步骤 B 的时间窗) - -### 步骤 D:在 detail 文件中取证 -报告目录默认: -`skill_output/troubleshoot//` - -重点文件: -- `summary/troubleshoot_report.md` -- `detail/trace_.md` -- `detail/health_events.md` -- `detail/load_select_release.md` - ---- - -## 5. 
现象到日志的快速映射 - -| 现象 | 优先看日志/关键词 | 推荐命令 | -|---|---|---| -| 503 无可用 worker | `No available prefill/decode workers`, `Removed unhealthy ...` | `--health --errors` | -| 502 调度失败 | `Failed to select worker`, `Failed to select worker pair` | `--errors --health --load` | -| 502 后端连接失败 | `Failed to connect to backend service`, `Request failed (attempt ...)` | `--errors --trace ` | -| 请求卡住/链路不完整 | 有 select 无 release、无 `Request completed successfully.` | `--trace ` | -| 延迟抖动 | HTTP latency、`[stats] total_running...` | `--latency --load --cache` | - ---- - -## 6. 常见误区 - -1. **只看 502/503 响应,不看健康与注册日志**:容易把“结果”当“根因”。 -2. **不限定时间窗口**:日志噪音大,容易误判。 -3. **trace 只看结构化事件,不看 RAW**:可能漏掉关键上下文(例如同一秒的 WARN/ERROR 细节)。 -4. **把范围维度和模式维度混在一起**:建议先定范围(全量/尾部/时间段),再定模式(完整/多维/trace)。 - ---- - -## 7. 推荐排查命令模板 - -```bash -# 模板 1:故障窗口综合体检 -python3 $SCRIPTS/troubleshoot.py --start "YYYY/MM/DD HH:MM:SS" --end "YYYY/MM/DD HH:MM:SS" - -# 模板 2:最近 30 分钟快速巡检 -python3 $SCRIPTS/troubleshoot.py --tail 30m - -# 模板 3:单请求深挖(配合时间窗) -python3 $SCRIPTS/troubleshoot.py --start "HH:MM:SS" --end "HH:MM:SS" --trace "" -``` - -如果你已经知道故障集中在特定 ID,优先从模板 3 入手,然后回到模板 1 看全局背景。 From e16652d3a988d81f42437b1bb2c0676c6a5a4726 Mon Sep 17 00:00:00 2001 From: mouxin Date: Mon, 13 Apr 2026 19:40:52 +0800 Subject: [PATCH 31/40] [Feature] Add skills and logging cleanup --- docs/zh/online_serving/router_faq.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/zh/online_serving/router_faq.md b/docs/zh/online_serving/router_faq.md index a431065dbf0..9c32726f4dc 100644 --- a/docs/zh/online_serving/router_faq.md +++ b/docs/zh/online_serving/router_faq.md @@ -5,7 +5,6 @@ 本文档基于 [Golang Router](https://github.com/PaddlePaddle/FastDeploy/tree/develop/fastdeploy/golang_router) 的代码实现,汇总了 Router 在使用过程中常见的日志信息、返回输出及问题排查方法,帮助用户快速定位和解决问题。 Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 -如需按“日志定位 + troubleshoot skill”流程化排查,请参考 [Router 问题排查实战手册](router_troubleshoot_playbook.md)。 ## 常见日志分析 From 38b6ea050fe9886b44660bbd2f199abe6f6f58ca Mon Sep 17 
00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 11:28:53 +0800 Subject: [PATCH 32/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index c14565e348d..a3d64a0714d 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -143,7 +143,18 @@ func (w *rotatingWriter) rotateLocked(newDate string) { // Open new date file for the new day. datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - w.currentFile, _ = os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + log.Printf("[ERROR] failed to open new log file %s: %v, keeping current file", datePath, err) + if w.prevFile != nil { + w.currentFile = w.prevFile + w.currentDate = w.prevDate + w.prevFile = nil + w.prevDate = "" + } + return + } + w.currentFile = f w.currentDate = newDate // Update symlink: router.log -> router-.log @@ -162,7 +173,7 @@ func parseLogDate(p []byte) string { s := string(p) for i := 0; i+10 <= len(s); i++ { c := s[i] - if c >= '1' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { + if c >= '0' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { // Found a candidate "YYYY/MM/DD" year := s[i : i+4] month := s[i+5 : i+7] From 5582779cc96b61da523c40b980d326019fa01846 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 14:33:38 +0800 Subject: [PATCH 33/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 16 ++++++--- .../golang_router/pkg/logger/logger_test.go | 34 +++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 
a3d64a0714d..30cbc747ae0 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -76,7 +76,9 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { } // Create/update symlink: router.log -> router-.log - os.Remove(symlinkPath) + if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) + } os.Symlink("router-"+today+".log", symlinkPath) return &rotatingWriter{ @@ -159,7 +161,9 @@ func (w *rotatingWriter) rotateLocked(newDate string) { // Update symlink: router.log -> router-.log symlinkPath := filepath.Join(w.logDir, "router.log") - os.Remove(symlinkPath) + if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) + } os.Symlink("router-"+newDate+".log", symlinkPath) } @@ -312,7 +316,9 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { remaining := archives[:0] for _, f := range archives { if f.date.Before(cutoff) { - os.Remove(f.path) + if err := os.Remove(f.path); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove log file %s: %v\n", f.path, err) + } } else { remaining = append(remaining, f) } @@ -329,7 +335,9 @@ func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { } for len(archives) > 0 && totalSize > maxBytes { oldest := archives[0] - os.Remove(oldest.path) + if err := os.Remove(oldest.path); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove log file %s: %v\n", oldest.path, err) + } totalSize -= oldest.size archives = archives[1:] } diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index fea0b853cf7..b426bc00988 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -182,3 +182,37 @@ func 
TestContextPrefix(t *testing.T) { } }) } + +func TestParseLogDate(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {"standard INFO log line", "[INFO] 2024/03/15 10:30:45 some message", "2024-03-15"}, + {"standard ERROR log line", "[ERROR] 2024/01/02 09:00:00 error occurred", "2024-01-02"}, + {"standard WARN log line", "[WARN] 2025/12/31 23:59:59 warning msg", "2025-12-31"}, + {"standard DEBUG log line", "[DEBUG] 2024/06/01 00:00:00 debug info", "2024-06-01"}, + {"empty string", "", ""}, + {"no date pattern", "no date here at all", ""}, + {"incomplete date - only year", "2024/", ""}, + {"incomplete date - year and month", "[INFO] 2024/03", ""}, + {"short input", "abc", ""}, + {"date without log prefix", "2024/03/15 10:30:45 message", "2024-03-15"}, + {"date at different position", "prefix 2024/11/20 rest", "2024-11-20"}, + {"slash but not date", "path/to/file is not a date", ""}, + {"single character input", "x", ""}, + {"exactly 10 chars non-date", "abcdefghij", ""}, + {"boundary - first day of year", "[INFO] 2024/01/01 00:00:00 new year", "2024-01-01"}, + {"boundary - last day of year", "[INFO] 2024/12/31 23:59:59 year end", "2024-12-31"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := parseLogDate([]byte(tt.input)) + if got != tt.expected { + t.Errorf("parseLogDate(%q) = %q, want %q", tt.input, got, tt.expected) + } + }) + } +} From fd56b0ac357bc28a53fa55e879a7f00657d0bbdd Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 14:57:01 +0800 Subject: [PATCH 34/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 30cbc747ae0..c4e8191d598 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -79,7 +79,9 @@ func 
newRotatingWriter(logDir string) (*rotatingWriter, error) { if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) } - os.Symlink("router-"+today+".log", symlinkPath) + if err := os.Symlink("router-"+today+".log", symlinkPath); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + } return &rotatingWriter{ currentFile: f, @@ -164,7 +166,9 @@ func (w *rotatingWriter) rotateLocked(newDate string) { if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) } - os.Symlink("router-"+newDate+".log", symlinkPath) + if err := os.Symlink("router-"+newDate+".log", symlinkPath); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + } } // parseLogDate extracts the date from a log line produced by log.LstdFlags. 
From cc3864089d559f8129a85ea48cf33db919ffc2b3 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 15:18:22 +0800 Subject: [PATCH 35/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index c4e8191d598..0a0e50ca686 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -76,11 +76,8 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { } // Create/update symlink: router.log -> router-.log - if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) - } - if err := os.Symlink("router-"+today+".log", symlinkPath); err != nil { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + if err := updateSymlink(symlinkPath, "router-"+today+".log"); err != nil { + fmt.Fprintf(os.Stderr, "[WARN] Symlink %s may be stale: %v\n", symlinkPath, err) } return &rotatingWriter{ @@ -163,12 +160,32 @@ func (w *rotatingWriter) rotateLocked(newDate string) { // Update symlink: router.log -> router-.log symlinkPath := filepath.Join(w.logDir, "router.log") + if err := updateSymlink(symlinkPath, "router-"+newDate+".log"); err != nil { + fmt.Fprintf(os.Stderr, "[WARN] Symlink %s may be stale (points to old date): %v\n", symlinkPath, err) + } +} + +// updateSymlink atomically replaces symlinkPath to point to target. +// It tries os.Remove + os.Symlink first; if remove fails (e.g. permission denied) +// it falls back to a temp-symlink + os.Rename for an atomic swap attempt. +func updateSymlink(symlinkPath, target string) error { + // Fast path: remove old, create new. 
if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove symlink %s: %v\n", symlinkPath, err) + // Remove failed (e.g. permission issue). Try atomic rename as fallback. + tmp := symlinkPath + ".tmp" + if err2 := os.Symlink(target, tmp); err2 != nil { + return fmt.Errorf("remove old symlink: %w; create temp symlink: %v", err, err2) + } + if err2 := os.Rename(tmp, symlinkPath); err2 != nil { + os.Remove(tmp) // best-effort cleanup + return fmt.Errorf("remove old symlink: %w; rename temp symlink: %v", err, err2) + } + return nil } - if err := os.Symlink("router-"+newDate+".log", symlinkPath); err != nil { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to create symlink %s: %v\n", symlinkPath, err) + if err := os.Symlink(target, symlinkPath); err != nil { + return fmt.Errorf("create symlink: %w", err) } + return nil } // parseLogDate extracts the date from a log line produced by log.LstdFlags. From 06a886eaae52ddf5578f78d5a7daad2e5a8debe4 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 15:59:03 +0800 Subject: [PATCH 36/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 0a0e50ca686..1ec91533826 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -131,6 +131,16 @@ func (w *rotatingWriter) Close() error { // rotateLocked performs the actual file rotation. Must be called with w.mu held. func (w *rotatingWriter) rotateLocked(newDate string) { + // Open new date file for the new day first, before touching any state. 
+ datePath := filepath.Join(w.logDir, "router-"+newDate+".log") + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + if err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) + // Advance currentDate so we don't retry on every Write call. + w.currentDate = newDate + return + } + // Close any lingering previous file. if w.prevFile != nil { w.prevFile.Close() @@ -142,19 +152,6 @@ func (w *rotatingWriter) rotateLocked(newDate string) { w.prevDate = w.currentDate w.graceUntil = nowFunc().Add(gracePeriod) - // Open new date file for the new day. - datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) - if err != nil { - log.Printf("[ERROR] failed to open new log file %s: %v, keeping current file", datePath, err) - if w.prevFile != nil { - w.currentFile = w.prevFile - w.currentDate = w.prevDate - w.prevFile = nil - w.prevDate = "" - } - return - } w.currentFile = f w.currentDate = newDate From ce775c9a9cfd0791ae82d0df6f3b7789b56cdb91 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 16:48:33 +0800 Subject: [PATCH 37/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 6 +++--- fastdeploy/golang_router/pkg/logger/logger_test.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 1ec91533826..7822095af36 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -70,7 +70,7 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { } // Open the date file (append mode). 
- f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { return nil, err } @@ -133,7 +133,7 @@ func (w *rotatingWriter) Close() error { func (w *rotatingWriter) rotateLocked(newDate string) { // Open new date file for the new day first, before touching any state. datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) // Advance currentDate so we don't retry on every Write call. @@ -243,7 +243,7 @@ func CloseLogFile() { } } -// StartLogCleanup runs periodic log cleanup in a background goroutine. +// StartLogCleanup blocks running periodic log cleanup; call it in a goroutine. // It deletes archived log files older than MaxAgeDays and trims total log size // to stay under MaxTotalSizeMB. 
func StartLogCleanup(ctx context.Context, cfg Config) { diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index b426bc00988..921f26aa166 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -24,7 +24,7 @@ func TestLoggerInit(t *testing.T) { defer os.RemoveAll("logs") // sync.Once prevents re-init, so manually verify file creation logic - f, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { t.Fatalf("Failed to create log file: %v", err) } From ee0162a5b09568f9b5d798ecf382bd227d64194a Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 17:31:47 +0800 Subject: [PATCH 38/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/pkg/logger/logger.go | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 7822095af36..bd1cfdcb7eb 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -56,6 +56,7 @@ type rotatingWriter struct { currentDate string // "2006-01-02" prevDate string // previous date during grace period graceUntil time.Time // when to close prevFile + retryAfter time.Time // earliest time to retry a failed rotation (backoff) logDir string } @@ -93,8 +94,8 @@ func (w *rotatingWriter) Write(p []byte) (n int, err error) { today := nowFunc().Format("2006-01-02") - // Detect day change and rotate. - if today != w.currentDate { + // Detect day change and rotate. Also retry failed rotations after backoff. 
+ if today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter)) { w.rotateLocked(today) } @@ -136,11 +137,15 @@ func (w *rotatingWriter) rotateLocked(newDate string) { f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) - // Advance currentDate so we don't retry on every Write call. - w.currentDate = newDate + // Don't advance currentDate — keep writing to the old file and retry + // after a backoff to avoid hammering the filesystem on every Write call. + w.retryAfter = nowFunc().Add(30 * time.Second) return } + // Rotation succeeded — clear any retry backoff. + w.retryAfter = time.Time{} + // Close any lingering previous file. if w.prevFile != nil { w.prevFile.Close() From cbdb5484a69fef0c3e2caebfb62ae240879bc847 Mon Sep 17 00:00:00 2001 From: mouxin Date: Tue, 14 Apr 2026 19:06:44 +0800 Subject: [PATCH 39/40] [Feature] Update logging cleanup --- fastdeploy/golang_router/cmd/main.go | 1 + .../config/config.example.yaml | 1 + .../golang_router/internal/config/config.go | 1 + fastdeploy/golang_router/pkg/logger/logger.go | 98 +++++++++++++++---- 4 files changed, 80 insertions(+), 21 deletions(-) diff --git a/fastdeploy/golang_router/cmd/main.go b/fastdeploy/golang_router/cmd/main.go index 6664436823c..c3670622ab2 100644 --- a/fastdeploy/golang_router/cmd/main.go +++ b/fastdeploy/golang_router/cmd/main.go @@ -44,6 +44,7 @@ func main() { logCfg := logger.Config{ Level: cfg.Log.Level, Output: cfg.Log.Output, + Dir: cfg.Log.Dir, MaxAgeDays: cfg.Log.MaxAgeDays, MaxTotalSizeMB: cfg.Log.MaxTotalSizeMB, CleanupIntervalSecs: cfg.Log.CleanupIntervalSecs, diff --git a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml index 5e1091b0eef..075d8eec5fd 100644 --- 
a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml @@ -29,6 +29,7 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + dir: "logs" # log directory; default: logs max-age-days: 7 # max days to keep log files; default: 7 max-total-size-mb: 500 # max total log size in MB; default: 500 cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/internal/config/config.go b/fastdeploy/golang_router/internal/config/config.go index f184a5b16da..7a6dc3fc504 100644 --- a/fastdeploy/golang_router/internal/config/config.go +++ b/fastdeploy/golang_router/internal/config/config.go @@ -51,6 +51,7 @@ type SchedulerConfig struct { type LogConfig struct { Level string `yaml:"level"` // debug, info, warn, error Output string `yaml:"output"` // stdout, file + Dir string `yaml:"dir"` // log directory; defaults to "logs" MaxAgeDays int `yaml:"max-age-days"` // max days to keep log files; 0 = use default (7) MaxTotalSizeMB int `yaml:"max-total-size-mb"` // max total log size in MB; 0 = use default (500) CleanupIntervalSecs float64 `yaml:"cleanup-interval-secs"` // cleanup check interval in seconds; 0 = use default (3600) diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index bd1cfdcb7eb..daa23d55450 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -16,6 +16,7 @@ import ( type Config struct { Level string Output string + Dir string // log directory; defaults to "logs" MaxAgeDays int MaxTotalSizeMB int CleanupIntervalSecs float64 @@ -88,15 +89,53 @@ func newRotatingWriter(logDir string) (*rotatingWriter, error) { }, nil } -func (w *rotatingWriter) Write(p []byte) (n int, err error) { +// needsRotate checks if rotation is needed under the lock. 
+func (w *rotatingWriter) needsRotate(today string) (bool, string) { w.mu.Lock() defer w.mu.Unlock() + needs := today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter)) + return needs, w.logDir +} + +// tryOpenRotateFile checks if rotation is needed and pre-opens the new log file +// outside the lock to avoid blocking other writers on slow file I/O. +func (w *rotatingWriter) tryOpenRotateFile(today string) *os.File { + needs, logDir := w.needsRotate(today) + if !needs { + return nil + } + + datePath := filepath.Join(logDir, "router-"+today+".log") + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) + return nil + } + return f +} +func (w *rotatingWriter) Write(p []byte) (n int, err error) { today := nowFunc().Format("2006-01-02") - // Detect day change and rotate. Also retry failed rotations after backoff. + // Pre-open new file outside the lock to reduce lock-held I/O time. + preOpened := w.tryOpenRotateFile(today) + + w.mu.Lock() + defer w.mu.Unlock() + + // Authoritative rotation check under lock. if today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter)) { - w.rotateLocked(today) + if preOpened != nil { + w.commitRotate(today, preOpened) + preOpened = nil // ownership transferred + } else { + // File open failed; set backoff so we don't retry on every Write. + w.retryAfter = nowFunc().Add(30 * time.Second) + } + } + // If another goroutine already rotated, close the unused pre-opened file. + if preOpened != nil { + preOpened.Close() } // Close previous file if grace period expired. @@ -130,19 +169,8 @@ func (w *rotatingWriter) Close() error { return nil } -// rotateLocked performs the actual file rotation. Must be called with w.mu held. 
-func (w *rotatingWriter) rotateLocked(newDate string) { - // Open new date file for the new day first, before touching any state. - datePath := filepath.Join(w.logDir, "router-"+newDate+".log") - f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) - if err != nil { - fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) - // Don't advance currentDate — keep writing to the old file and retry - // after a backoff to avoid hammering the filesystem on every Write call. - w.retryAfter = nowFunc().Add(30 * time.Second) - return - } - +// commitRotate finalises the rotation with a pre-opened file. Must be called with w.mu held. +func (w *rotatingWriter) commitRotate(newDate string, f *os.File) { // Rotation succeeded — clear any retry backoff. w.retryAfter = time.Time{} @@ -201,16 +229,35 @@ func parseLogDate(p []byte) string { for i := 0; i+10 <= len(s); i++ { c := s[i] if c >= '0' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' { - // Found a candidate "YYYY/MM/DD" + // Found a candidate "YYYY/MM/DD" — validate it. year := s[i : i+4] month := s[i+5 : i+7] day := s[i+8 : i+10] + if !isAllDigits(month) || !isAllDigits(day) { + continue + } + m := (month[0]-'0')*10 + (month[1] - '0') + d := (day[0]-'0')*10 + (day[1] - '0') + if m < 1 || m > 12 || d < 1 || d > 31 { + continue + } + _ = year // year already starts with a digit; any 4-digit year is acceptable return year + "-" + month + "-" + day } } return "" } +// isAllDigits returns true if every byte in s is an ASCII digit. +func isAllDigits(s string) bool { + for i := 0; i < len(s); i++ { + if s[i] < '0' || s[i] > '9' { + return false + } + } + return true +} + // Init initializes the logger. 
func Init(cfg Config) { once.Do(func() { @@ -218,13 +265,17 @@ func Init(cfg Config) { flags := log.LstdFlags | log.Lshortfile if cfg.Output == "file" { - if _, err := os.Stat("logs"); os.IsNotExist(err) { - if err := os.MkdirAll("logs", 0755); err != nil { + logDir := cfg.Dir + if logDir == "" { + logDir = "logs" + } + if _, err := os.Stat(logDir); os.IsNotExist(err) { + if err := os.MkdirAll(logDir, 0755); err != nil { log.Fatalln("Failed to create logs directory:", err) } } var err error - writer, err = newRotatingWriter("logs") + writer, err = newRotatingWriter(logDir) if err != nil { log.Fatalln("Failed to create rotating log writer:", err) } @@ -259,6 +310,11 @@ func StartLogCleanup(ctx context.Context, cfg Config) { return } + logDir := cfg.Dir + if logDir == "" { + logDir = "logs" + } + ticker := time.NewTicker(time.Duration(cfg.CleanupIntervalSecs * float64(time.Second))) defer ticker.Stop() @@ -267,7 +323,7 @@ func StartLogCleanup(ctx context.Context, cfg Config) { case <-ctx.Done(): return case <-ticker.C: - cleanupLogs("logs", cfg.MaxAgeDays, cfg.MaxTotalSizeMB) + cleanupLogs(logDir, cfg.MaxAgeDays, cfg.MaxTotalSizeMB) } } } From fd3c013b90284d9c11181509e1485b60c5740926 Mon Sep 17 00:00:00 2001 From: mouxin <494624263qq@gmail.com> Date: Tue, 14 Apr 2026 19:35:29 +0800 Subject: [PATCH 40/40] test(golang_router): cover cleanup loop and cross-day log rolling --- .../golang_router/pkg/logger/logger_test.go | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index 921f26aa166..1d9874ded6f 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -4,8 +4,10 @@ import ( "bytes" "context" "os" + "path/filepath" "strings" "testing" + "time" ) func TestLoggerInit(t *testing.T) { @@ -216,3 +218,142 @@ func TestParseLogDate(t *testing.T) { }) } } + +func TestStartLogCleanup(t 
*testing.T) { + t.Run("cleanup runs for file output and respects cancellation", func(t *testing.T) { + tmpDir := t.TempDir() + + originalNowFunc := nowFunc + fixedNow := time.Date(2026, 4, 10, 12, 0, 0, 0, time.UTC) + nowFunc = func() time.Time { return fixedNow } + defer func() { nowFunc = originalNowFunc }() + + // Create archived logs: one older than 1 day and one recent. + oldLog := filepath.Join(tmpDir, "router-2026-04-07.log") + recentLog := filepath.Join(tmpDir, "router-2026-04-09.log") + todayLog := filepath.Join(tmpDir, "router-2026-04-10.log") + for _, p := range []string{oldLog, recentLog, todayLog} { + if err := os.WriteFile(p, []byte("test"), 0644); err != nil { + t.Fatalf("failed to create test log %s: %v", p, err) + } + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + defer close(done) + StartLogCleanup(ctx, Config{ + Output: "file", + Dir: tmpDir, + MaxAgeDays: 2, + CleanupIntervalSecs: 0.01, + }) + }() + + waitForCondition(t, 500*time.Millisecond, func() bool { + _, err := os.Stat(oldLog) + return os.IsNotExist(err) + }, "old log should be removed by StartLogCleanup") + + if _, err := os.Stat(recentLog); err != nil { + t.Fatalf("recent log should be kept, stat err: %v", err) + } + if _, err := os.Stat(todayLog); err != nil { + t.Fatalf("today log should be kept, stat err: %v", err) + } + + cancel() + select { + case <-done: + case <-time.After(500 * time.Millisecond): + t.Fatal("StartLogCleanup did not stop after context cancellation") + } + }) + + t.Run("non-file output returns immediately", func(t *testing.T) { + done := make(chan struct{}) + go func() { + defer close(done) + StartLogCleanup(context.Background(), Config{Output: "stdout", CleanupIntervalSecs: 1}) + }() + select { + case <-done: + case <-time.After(200 * time.Millisecond): + t.Fatal("StartLogCleanup should return immediately for non-file output") + } + }) +} + +func TestRotatingWriterCrossDayGracePeriodIntegration(t 
*testing.T) { + tmpDir := t.TempDir() + + originalNowFunc := nowFunc + defer func() { nowFunc = originalNowFunc }() + + current := time.Date(2026, 4, 10, 23, 59, 59, 0, time.UTC) + nowFunc = func() time.Time { return current } + + w, err := newRotatingWriter(tmpDir) + if err != nil { + t.Fatalf("failed to create rotating writer: %v", err) + } + defer w.Close() + + if _, err = w.Write([]byte("[INFO] 2026/04/10 23:59:59 first day line\n")); err != nil { + t.Fatalf("failed to write day-1 line: %v", err) + } + + current = time.Date(2026, 4, 11, 0, 0, 1, 0, time.UTC) + if _, err = w.Write([]byte("[INFO] 2026/04/11 00:00:01 second day line\n")); err != nil { + t.Fatalf("failed to write day-2 line: %v", err) + } + + if _, err = w.Write([]byte("[INFO] 2026/04/10 23:59:58 late previous-day line\n")); err != nil { + t.Fatalf("failed to write late previous-day line: %v", err) + } + + day1Bytes, err := os.ReadFile(filepath.Join(tmpDir, "router-2026-04-10.log")) + if err != nil { + t.Fatalf("failed to read day-1 log: %v", err) + } + day1Content := string(day1Bytes) + if !strings.Contains(day1Content, "first day line") { + t.Fatalf("day-1 log missing initial line, content: %s", day1Content) + } + if !strings.Contains(day1Content, "late previous-day line") { + t.Fatalf("day-1 log missing late previous-day line, content: %s", day1Content) + } + + day2Bytes, err := os.ReadFile(filepath.Join(tmpDir, "router-2026-04-11.log")) + if err != nil { + t.Fatalf("failed to read day-2 log: %v", err) + } + day2Content := string(day2Bytes) + if !strings.Contains(day2Content, "second day line") { + t.Fatalf("day-2 log missing day-2 line, content: %s", day2Content) + } + if strings.Contains(day2Content, "late previous-day line") { + t.Fatalf("late previous-day line should not be in day-2 file, content: %s", day2Content) + } + + symlinkTarget, err := os.Readlink(filepath.Join(tmpDir, "router.log")) + if err != nil { + t.Fatalf("failed to read symlink: %v", err) + } + if symlinkTarget != 
"router-2026-04-11.log" { + t.Fatalf("router.log symlink target = %s, want router-2026-04-11.log", symlinkTarget) + } +} + +func waitForCondition(t *testing.T, timeout time.Duration, cond func() bool, msg string) { + t.Helper() + + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if cond() { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatal(msg) +}