diff --git a/docs/online_serving/router.md b/docs/online_serving/router.md index 82940e5680c..7abc9c06af3 100644 --- a/docs/online_serving/router.md +++ b/docs/online_serving/router.md @@ -194,7 +194,7 @@ scheduler: policy: "power_of_two" # Scheduling policy (optional): random, power_of_two, round_robin, process_tokens, request_num, cache_aware, remote_cache_aware, fd_metrics_score, fd_remote_metrics_score prefill-policy: "cache_aware" # Prefill scheduling policy in PD mode decode-policy: "request_num" # Decode scheduling policy in PD mode - eviction-interval-secs: 60 # Cache eviction interval for CacheAware scheduling + eviction-interval-secs: 60 # Counter eviction interval for CacheAware scheduling eviction-duration-mins: 30 # Eviction duration for cache-aware radix tree nodes (minutes); default: 30 balance-abs-threshold: 1 # Absolute threshold for CacheAware balancing balance-rel-threshold: 0.2 # Relative threshold for CacheAware balancing diff --git a/docs/online_serving/router_faq.md b/docs/online_serving/router_faq.md index 49083539d4c..c0fb8cba4bf 100644 --- a/docs/online_serving/router_faq.md +++ b/docs/online_serving/router_faq.md @@ -29,6 +29,24 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `empty baseURL provided` | Health check received an empty base URL | Health check cannot be performed | Registration parameters | | `failed to create request: {error}` | Failed to create health check request | The instance may be marked as unhealthy | Network environment | | `failed to read response body: {error}` | Failed to read health check response body | The instance may be marked as unhealthy | Backend instance status | +| `Failed to select mixed worker: {error}` | Failed to select Mixed worker in centralized mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to select prefill worker: {error}` | Failed to select Prefill worker in PD disaggregated mode | Current request returns 502 | Health 
status, scheduling strategy | +| `Failed to read register request body: {error}` | Failed to read registration request body | Registration request returns 400 | Request format | +| `Failed to unmarshal register request JSON: {error}` | Failed to parse registration request JSON | Registration request returns 400 | Request format | +| `Failed to create decode request for {url}: {error}` | Failed to create HTTP request to Decode instance | Current request fails | Network environment | +| `Failed to create prefill request for {url}: {error}` | Failed to create HTTP request to Prefill instance | Current request fails | Network environment | +| `Decode request failed for {url}: {error}` | Request to Decode instance failed | Current request fails | Backend instance status, network connectivity | +| `Prefill request failed for {url}: {error}` | Request to Prefill instance failed | Current request fails | Backend instance status, network connectivity | +| `Failed to read request body: {error}` | Failed to read inference request body | Current request returns 400 | Request format | +| `Failed to unmarshal request JSON: {error}` | Failed to parse inference request JSON | Current request returns 400 | Request format | +| `Failed to select worker pair: {error}` | Failed to select worker pair in PD disaggregated mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to build disaggregate_info: {error}` | Failed to build PD disaggregation communication info | Current request returns 500 | Registration parameters (connector_port, device_ids, etc.) 
| +| `Failed to encode modified request: {error}` | Failed to encode modified request body | Current request returns 500 | Request content | +| `Failed to select worker: {error}` | Failed to select worker in centralized mode | Current request returns 502 | Health status, scheduling strategy | +| `Failed to connect to backend service: {error}` | Failed to connect to backend inference instance (after 3 retries) | Current request returns 502 | Backend instance status, network connectivity | +| `Request failed (attempt {n}/{max}): {error}` | Request attempt {n} failed | If retries exhausted, request returns 502 | Backend instance status, network connectivity | +| `Failed to create backend request for {url}: {error}` | Failed to create HTTP request to backend | Current request fails | Network environment | +| `Backend request failed for {url}: {error}` | Request to backend instance failed | Current request fails | Backend instance status, network connectivity | ### Warn-Level Logs @@ -37,8 +55,9 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `Server {url} is not healthy` | The instance at this URL failed health check | Router cannot register the instance, or will remove it from the registered list | Health status | | `Instance {url} role is unknown` | Instance role cannot be recognized | The instance will not be added to the scheduling list | Registration parameters | | `cache-aware prefill: tokenizer failed, fallback to char tokens: {error}` | Tokenizer service call failed, automatically falling back to character-based tokenization | cache_aware strategy remains active, using character-based tokenization for cache matching instead of the Tokenizer; normal request processing is not affected | Tokenizer service status | -| `cache-aware prefill: tokenize failed, fallback to process_tokens: {error}` | Tokenization completely failed (e.g., empty input), falling back to process_tokens strategy | Prefill scheduling temporarily does not 
use cache_aware strategy; normal request processing is not affected | Request content, Tokenizer service status | -| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | Tokenization failed (new format), falling back to process_tokens strategy | Prefill scheduling temporarily does not use cache_aware strategy; normal request processing is not affected | Request content, Tokenizer service status | +| `GetRemoteMetrics failed for {url}, falling back to local counter: {error}` | Failed to fetch remote metrics, falling back to local counter | Scheduling accuracy may decrease; normal request processing is not affected | Backend instance metrics port, network connectivity | +| `release worker: {url} skipped, counter already cleaned up` | Worker counter was already cleaned up when trying to release | May occur when a worker is removed by health check while requests are still in-flight | Health status, request timing | +| `release worker: {url} skipped, counter already zero (possible double-release)` | Worker counter is already zero when trying to release | Possible duplicate counter release | Request processing logic | ### Info-Level Logs @@ -49,7 +68,6 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `Successfully registered instance from index {index}` | Instance from config file registered successfully | Normal startup log | | `No instances found in config file {path}` | No instances found in the registration config file | Check whether register.yaml is empty | | `Request completed successfully.` | Request processing completed | Normal operation log | -| `Request failed, retrying...` | Request failed, retrying | Router will retry up to 3 times | | `select worker (prefill): {url}, tokens: {tokens}` | Prefill scheduler selected a worker, showing current token processing count | Normal operation log | | `select worker ({type}): {url}, count: {count}` | Decode/Mixed scheduler selected a 
worker, showing current request concurrency | Normal operation log | | `release worker: {url}, count: {count}` | Request ended, worker counter released | Normal operation log | @@ -58,7 +76,6 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `removed counters for {count} unhealthy workers: {urls}` | Batch cleanup of counters for unhealthy workers | Normal operation log | | `[stats] total_running={n}, workers: [{loads}], cache_hit_rate={rate}% (hits={hits}/total={total})` | Periodic stats: total requests, worker loads, cache hit rate | Normal operation log, useful for monitoring and tuning | | `Parsing completed; starting worker selection.` | Request parsing completed, starting worker selection | Normal operation log | -| `Request completed with an error.` | Request processing completed with an error | Check backend instance status | | `[SelectWorkerPair] decode selection failed, releasing prefill counter url={url}` | Decode selection failed in PD disaggregated mode, releasing Prefill counter | Error handling log | | `[prefill] first chunk received, release counter url={url}` | Prefill streaming response received first chunk, counter released | Normal operation log | | `[prefill] non-stream prefill response done, release counter url={url}` | Prefill non-streaming response completed, counter released | Normal operation log | @@ -66,12 +83,17 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `[prefill] release in defer (fallback) url={url}, isStream={bool}` | Fallback resource release when Prefill request exits abnormally | Error handling log | | `[prefill] release in CommonCompletions defer (error path) url={url}` | Prefill resource release on error path | Error handling log | | `cache-aware prefill: final strategy: process_tokens, reason: strategy not initialized` | cache_aware strategy not initialized, falling back to process_tokens | Check cache_aware configuration | +| `cache-aware prefill: 
final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | Tokenization failed, falling back to process_tokens strategy | Prefill scheduling temporarily does not use cache_aware strategy; normal request processing is not affected | | `cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads={loads}. ts_ms={ts}` | Load imbalanced across instances, falling back to process_tokens strategy | Normal operation log, automatic load balancing switch | | `cache-aware prefill: final strategy: cache_aware_scoring, selected={url}, loads={loads}, hitRatios={ratios}. ts_ms={ts}` | cache_aware scoring strategy selected a worker | Normal operation log, showing loads and hit ratios | | `[{method}] {path} {proto} {status} {latency} {clientIP}` | HTTP request access log | Normal operation log, records basic info for each request | | `before SelectWorker prefill. ts_ms={ts}` | Starting Prefill worker selection in PD disaggregated mode | Normal operation log, for performance tracing | | `before SelectWorker decode, after prefill. ts_ms={ts}` | Starting Decode worker selection after Prefill selection | Normal operation log, for performance tracing | | `after SelectWorker decode, before return. 
ts_ms={ts}` | Decode worker selection completed | Normal operation log, for performance tracing | +| `unhealthy worker counter preserved (inflight requests): {url}, count: {count}` | Unhealthy worker still has in-flight requests, counter temporarily preserved | Normal operation log, will be auto-cleaned after in-flight requests complete | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {tokens}` | Unhealthy worker still has in-flight token load, token counter temporarily preserved | Normal operation log, will be auto-cleaned after in-flight requests complete | +| `cleanup unhealthy worker token counter: {url}` | Cleaned up token counter for unhealthy worker | Normal operation log | +| `preserved counters for {count} workers with inflight requests: {urls}` | Batch preserved counters for workers with in-flight requests | Normal operation log | ### Debug-Level Logs @@ -100,6 +122,10 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `{"error": "Failed to build disaggregate_info"}` | 500 | Failed to build PD disaggregation communication info | Registration parameters (connector_port, device_ids, etc.) | | `{"error": "Invalid request body"}` | 400 | Failed to read request body | Request format | | `{"error": "Invalid JSON format"}` | 400 | Failed to parse request body JSON | Request format | +| `{"error": "Failed to encode modified request: {error}"}` | 500 | Failed to encode modified request body | Request content | +| `{"code": 500, "msg": "Internal server error"}` | 500 | A panic occurred during request processing and was recovered | Backend instance status, request content | + +> **Note**: In PD disaggregated (splitwise) mode, the above error responses include an additional `request_id` field, e.g., `{"error": "...", "request_id": "xxx"}`. Additionally, `Invalid request body` and `Invalid JSON format` responses include specific error details, e.g., `{"error": "Invalid request body: EOF"}`. 
### Registration Request Errors (/register) @@ -111,6 +137,7 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `{"code": 400, "msg": "splitwise mode only supports PREFILL/DECODE instances"}` | 400 | MIXED instances are not allowed in PD disaggregated mode | Deployment mode, instance role | | `{"code": 400, "msg": "only MIXED instances are allowed"}` | 400 | Only MIXED instances are allowed in centralized mode | Deployment mode, instance role | | `{"code": 400, "msg": "invalid InstanceInfo format: {error}"}` | 400 | Instance registration info validation failed | Registration parameters | +| `{"code": 400, "msg": "DefaultManager is nil"}` | 400 | Router internal manager not initialized | Router startup status | | `{"code": 200, "msg": "Register success"}` | 200 | Registration successful | — | ### Common Registration Parameter Validation Errors @@ -124,6 +151,10 @@ For basic Router usage, please refer to [Load-Balancing Scheduling Router](route | `port is required` | Missing port field | Add the port field | | `invalid port: {port}` | port is not a valid port number | Provide a port number in the range 1-65535 | | `invalid protocol: {protocol}` | Invalid transfer protocol | Use a valid protocol value: ipc / rdma | +| `invalid connector_port: {port}` | connector_port is not a valid port number | Provide a port number in the range 1-65535 | +| `invalid engine_worker_queue_port: {port}` | engine_worker_queue_port is not a valid port number | Provide a port number in the range 1-65535 | +| `invalid metrics_port: {port}` | metrics_port is not a valid port number | Provide a port number in the range 1-65535 | +| `rdma_ports[{index}] invalid port: {port}` | Port at index {index} in RDMA ports list is not valid | Provide a port number in the range 1-65535 | ## Troubleshooting Guide @@ -236,7 +267,7 @@ If `Failed to start server` appears in startup logs, check: When using the `cache_aware` scheduling strategy, the Router calls a Tokenizer 
service to tokenize requests for cache hit ratio computation. When the Tokenizer service is unavailable, the Router has a two-level degradation mechanism: 1. **Fallback to character-based tokenization** (common case): The log will show `tokenizer failed, fallback to char tokens`. The cache_aware strategy remains active, using character-based tokenization for cache matching instead of the Tokenizer. Cache hit accuracy may decrease, but normal request processing is not affected. -2. **Fallback to process_tokens strategy** (extreme case): When tokenization completely fails (e.g., empty request content), the log will show `tokenize failed, fallback to process_tokens`. The cache_aware strategy temporarily becomes inactive, and scheduling falls back to token processing volume. Normal request processing is not affected. +2. **Fallback to process_tokens strategy** (extreme case): When tokenization completely fails (e.g., empty request content), the log will show `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` (Info level). The cache_aware strategy temporarily becomes inactive, and scheduling falls back to token processing volume. Normal request processing is not affected. 
To restore full cache_aware functionality: diff --git a/docs/zh/online_serving/router.md b/docs/zh/online_serving/router.md index 0ace28c2da1..375f036ad2c 100644 --- a/docs/zh/online_serving/router.md +++ b/docs/zh/online_serving/router.md @@ -194,7 +194,7 @@ scheduler: policy: "power_of_two" # 调度策略(可选): random, power_of_two, round_robin, process_tokens, request_num, cache_aware, remote_cache_aware, fd_metrics_score, fd_remote_metrics_score; 默认: request_num prefill-policy: "cache_aware" # pd分离模式下prefill节点调度策略; 默认: process_tokens decode-policy: "request_num" # pd分离模式下decode节点调度策略; 默认: request_num - eviction-interval-secs: 60 # cache-aware策略清理过期cache的间隔时间 + eviction-interval-secs: 60 # cache-aware策略清理过期计数器的间隔时间 eviction-duration-mins: 30 # cache-aware策略radix tree节点驱逐时间(分钟); 默认: 30 balance-abs-threshold: 1 # cache-aware策略绝对阈值 balance-rel-threshold: 0.2 # cache-aware策略相对阈值 diff --git a/docs/zh/online_serving/router_faq.md b/docs/zh/online_serving/router_faq.md index a42ed015283..9c32726f4dc 100644 --- a/docs/zh/online_serving/router_faq.md +++ b/docs/zh/online_serving/router_faq.md @@ -29,6 +29,24 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `empty baseURL provided` | 健康检查时传入了空的基础 URL | 健康检查无法执行 | 注册参数 | | `failed to create request: {error}` | 创建健康检查请求失败 | 该实例可能被判定为不健康 | 网络环境 | | `failed to read response body: {error}` | 读取健康检查响应体失败 | 该实例可能被判定为不健康 | 后端实例状态 | +| `Failed to select mixed worker: {error}` | 集中式模式下选择 Mixed Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to select prefill worker: {error}` | PD 分离模式下选择 Prefill Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to read register request body: {error}` | 读取注册请求体失败 | 该注册请求返回 400 | 请求格式 | +| `Failed to unmarshal register request JSON: {error}` | 解析注册请求 JSON 失败 | 该注册请求返回 400 | 请求格式 | +| `Failed to create decode request for {url}: {error}` | 创建发往 Decode 实例的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Failed to create prefill request for {url}: {error}` | 创建发往 Prefill 实例的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Decode request failed 
for {url}: {error}` | 发往 Decode 实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | +| `Prefill request failed for {url}: {error}` | 发往 Prefill 实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | +| `Failed to read request body: {error}` | 读取推理请求体失败 | 当前请求返回 400 | 请求格式 | +| `Failed to unmarshal request JSON: {error}` | 解析推理请求 JSON 失败 | 当前请求返回 400 | 请求格式 | +| `Failed to select worker pair: {error}` | PD 分离模式下选择 Worker 对失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to build disaggregate_info: {error}` | 构建 PD 分离通信信息失败 | 当前请求返回 500 | 注册参数(connector_port、device_ids 等) | +| `Failed to encode modified request: {error}` | 编码修改后的请求体失败 | 当前请求返回 500 | 请求内容 | +| `Failed to select worker: {error}` | 集中式模式下选择 Worker 失败 | 当前请求返回 502 | 健康状况、调度策略 | +| `Failed to connect to backend service: {error}` | 连接后端推理实例失败(已重试 3 次仍失败) | 当前请求返回 502 | 后端实例状态、网络连通性 | +| `Request failed (attempt {n}/{max}): {error}` | 请求发送第 {n} 次尝试失败 | 若重试耗尽则请求返回 502 | 后端实例状态、网络连通性 | +| `Failed to create backend request for {url}: {error}` | 创建发往后端的 HTTP 请求失败 | 当前请求失败 | 网络环境 | +| `Backend request failed for {url}: {error}` | 发往后端实例的请求失败 | 当前请求失败 | 后端实例状态、网络连通性 | ### Warn 级别日志 @@ -37,8 +55,9 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `Server {url} is not healthy` | 该 URL 对应的实例未通过健康检查 | Router 无法注册该实例,或将该实例从已注册列表中移除 | 健康状况 | | `Instance {url} role is unknown` | 实例角色无法识别 | 该实例不会被加入调度列表 | 注册参数 | | `cache-aware prefill: tokenizer failed, fallback to char tokens: {error}` | Tokenizer 服务调用失败,已自动回退至字符级分词 | cache_aware 策略仍然生效,使用字符级分词代替 Tokenizer 进行缓存匹配,不影响正常请求处理 | Tokenizer 服务状态 | -| `cache-aware prefill: tokenize failed, fallback to process_tokens: {error}` | 分词彻底失败(如输入为空),回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | 请求内容、Tokenizer 服务状态 | -| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. 
ts_ms={ts}` | 分词失败(新格式),回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | 请求内容、Tokenizer 服务状态 | +| `GetRemoteMetrics failed for {url}, falling back to local counter: {error}` | 获取远程 metrics 失败,已回退至本地计数器 | 调度精度可能下降,不影响正常请求处理 | 后端实例 metrics 端口、网络连通性 | +| `release worker: {url} skipped, counter already cleaned up` | 释放 Worker 计数器时发现已被清理 | 可能是 Worker 被健康检查移除后仍有在途请求完成 | 健康状况、请求时序 | +| `release worker: {url} skipped, counter already zero (possible double-release)` | 释放 Worker 计数器时发现已归零 | 可能存在计数器重复释放 | 请求处理逻辑 | ### Info 级别日志 @@ -49,7 +68,6 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `Successfully registered instance from index {index}` | 配置文件中的实例注册成功 | 正常启动日志 | | `No instances found in config file {path}` | 注册配置文件中未找到实例信息 | 请检查 register.yaml 内容是否为空 | | `Request completed successfully.` | 请求处理完成 | 正常运行日志 | -| `Request failed, retrying...` | 请求失败,正在进行重试 | Router 最多重试 3 次 | | `select worker (prefill): {url}, tokens: {tokens}` | Prefill 调度选中 Worker,显示当前 token 处理量 | 正常运行日志 | | `select worker ({type}): {url}, count: {count}` | Decode/Mixed 调度选中 Worker,显示当前请求并发数 | 正常运行日志 | | `release worker: {url}, count: {count}` | 请求结束,释放 Worker 计数器 | 正常运行日志 | @@ -58,7 +76,6 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `removed counters for {count} unhealthy workers: {urls}` | 批量清理不健康 Worker 的计数器 | 正常运行日志 | | `[stats] total_running={n}, workers: [{loads}], cache_hit_rate={rate}% (hits={hits}/total={total})` | 周期性统计:总请求数、各 Worker 负载、缓存命中率 | 正常运行日志,用于监控调优 | | `Parsing completed; starting worker selection.` | 请求解析完成,开始选择 Worker | 正常运行日志 | -| `Request completed with an error.` | 请求处理完成但发生错误 | 请排查后端实例状态 | | `[SelectWorkerPair] decode selection failed, releasing prefill counter url={url}` | PD 分离模式下 Decode 选择失败,释放 Prefill 计数器 | 异常处理日志 | | `[prefill] first chunk received, release counter url={url}` | Prefill 流式响应收到首个数据块,释放计数器 | 正常运行日志 | | `[prefill] non-stream prefill response done, release counter url={url}` | Prefill 非流式响应完成,释放计数器 | 正常运行日志 | @@ -72,6 +89,11 @@ Router 
的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `before SelectWorker prefill. ts_ms={ts}` | PD 分离模式下开始选择 Prefill Worker | 正常运行日志,用于性能追踪 | | `before SelectWorker decode, after prefill. ts_ms={ts}` | Prefill 选择完成后开始选择 Decode Worker | 正常运行日志,用于性能追踪 | | `after SelectWorker decode, before return. ts_ms={ts}` | Decode Worker 选择完成 | 正常运行日志,用于性能追踪 | +| `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. ts_ms={ts}` | 分词失败,回退至 process_tokens 策略 | Prefill 调度暂时不使用 cache_aware 策略,不影响正常请求处理 | +| `unhealthy worker counter preserved (inflight requests): {url}, count: {count}` | 不健康 Worker 仍有在途请求,计数器暂时保留 | 正常运行日志,待在途请求完成后自动清理 | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {tokens}` | 不健康 Worker 仍有在途 token 负载,token 计数器暂时保留 | 正常运行日志,待在途请求完成后自动清理 | +| `cleanup unhealthy worker token counter: {url}` | 清理不健康 Worker 的 token 计数器 | 正常运行日志 | +| `preserved counters for {count} workers with inflight requests: {urls}` | 批量保留仍有在途请求的 Worker 计数器 | 正常运行日志 | ### Debug 级别日志 @@ -100,6 +122,10 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `{"error": "Failed to build disaggregate_info"}` | 500 | 构建 PD 分离通信信息失败 | 注册参数(connector_port、device_ids 等) | | `{"error": "Invalid request body"}` | 400 | 请求体读取失败 | 请求格式 | | `{"error": "Invalid JSON format"}` | 400 | 请求体 JSON 解析失败 | 请求格式 | +| `{"error": "Failed to encode modified request: {error}"}` | 500 | 编码修改后的请求体失败 | 请求内容 | +| `{"code": 500, "msg": "Internal server error"}` | 500 | 请求处理过程中发生 panic 并被恢复 | 后端实例状态、请求内容 | + +> **说明**:在 PD 分离(splitwise)模式下,以上错误响应会额外包含 `request_id` 字段,如 `{"error": "...", "request_id": "xxx"}`。此外,`Invalid request body` 和 `Invalid JSON format` 的实际输出会包含具体的错误详情,如 `{"error": "Invalid request body: EOF"}`。 ### 注册请求错误(/register) @@ -112,6 +138,7 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `{"code": 400, "msg": "only MIXED instances are allowed"}` | 400 | 集中式模式下只允许注册 MIXED 实例 | 部署模式、实例角色 | | `{"code": 400, "msg": "invalid InstanceInfo format: {error}"}` | 400 | 
实例注册信息校验失败 | 注册参数 | | `{"code": 200, "msg": "Register success"}` | 200 | 注册成功 | — | +| `{"code": 400, "msg": "DefaultManager is nil"}` | 400 | Router 内部管理器未初始化 | Router 启动状态 | ### 常见注册参数校验错误 @@ -124,6 +151,10 @@ Router 的基本使用方式请参考 [负载均衡调度 Router](router.md)。 | `port is required` | 缺少 port 字段 | 添加 port 字段 | | `invalid port: {port}` | port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | | `invalid protocol: {protocol}` | 传输协议不合法 | 使用合法的协议值:ipc / rdma | +| `invalid connector_port: {port}` | connector_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `invalid engine_worker_queue_port: {port}` | engine_worker_queue_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `invalid metrics_port: {port}` | metrics_port 不是合法的端口号 | 填写 1-65535 范围内的端口号 | +| `rdma_ports[{index}] invalid port: {port}` | RDMA 端口列表中第 {index} 个端口号不合法 | 填写 1-65535 范围内的端口号 | ## 常见问题排查方式 @@ -236,7 +267,7 @@ PD 分离模式下建议完整配置以下参数,以确保 KV Cache 传输正 使用 `cache_aware` 调度策略时,Router 会调用 Tokenizer 服务对请求进行分词以计算缓存命中率。当 Tokenizer 服务不可用时,Router 内置了两级退化机制: 1. **回退至字符级分词**(常见情况):日志出现 `tokenizer failed, fallback to char tokens`。此时 cache_aware 策略仍然生效,只是使用字符级分词代替 Tokenizer 进行缓存匹配,缓存命中精度会有所下降,但不影响正常请求处理。 -2. **回退至 process_tokens 策略**(极端情况):当分词彻底失败(如请求内容为空)时,日志出现 `tokenize failed, fallback to process_tokens`。此时 cache_aware 策略暂时不生效,改为按 token 处理量进行调度,同样不影响正常请求处理。 +2. **回退至 process_tokens 策略**(极端情况):当分词彻底失败(如请求内容为空)时,日志出现 `cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: {error}. 
ts_ms={ts}`(Info 级别)。此时 cache_aware 策略暂时不生效,改为按 token 处理量进行调度,同样不影响正常请求处理。 如需恢复 cache_aware 策略的完整功能: diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md new file mode 100644 index 00000000000..097a10f8163 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/SKILL.md @@ -0,0 +1,124 @@ +--- +name: stat-cache-hitrate +description: > + 统计 FastDeploy Go Router 日志中的三层 cache 命中率指标,生成可视化报告。 + 三层指标:Prefix Hit Ratio(KV Cache 内容复用度)、Session Hit Rate(请求级路由粘性)、 + Per-Worker Cache Stats(各 prefill worker 的缓存利用排名)。支持全量统计、tail 快速查看、 + 指定时间段统计(--start/--end)。 + + 当用户提到以下内容时触发此 skill:统计/查看 cache 命中率、查看 cache-aware 调度效果、 + 查看缓存预热情况、统计 hitRatio、查看 prefix 命中率、session hit rate。 + 关键词:cache 命中率、hitRatio、cache-aware、prefix hit、session hit rate、 + 缓存预热、/stat-cache-hitrate。 + +IMPORTANT: 执行前阅读 references/log_formats.md 了解日志格式和解析规则。 +--- + +# Cache Hit Rate Statistics + +统计 FastDeploy Go Router 的三层 cache 命中率,生成可视化报告。 + +## 执行前交互 + +运行脚本前,Claude 必须先向用户确认以下参数: + +### 1. 日志文件路径 +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项(客户端会自动提供 Other 自定义输入): +- 选项 1: `logs/router.log`(默认) +- 选项 2: `fd-router.log`(golang_router 根目录常用文件名) + +**重要规则**: +- 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 +- 用户指定路径后不要质疑、推荐替代文件、或以任何理由尝试切换到其他文件 +- 支持绝对路径(如 `/home/user/logs/xxx.log`)和相对路径(如 `logs/fd-router (2).log`) + +如果用户直接确认或未指定路径,使用默认值 `logs/router.log`。 + +### 2. 
分析模式 +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号,避免客户端偶发不显示第 4 项): +- 选项 1: `全量统计(默认)` — 扫描完整日志 +- 选项 2: `快速查看尾部` — 只看最近的数据(支持 `2000`、`1k`、`1w` 等行数写法) +- 选项 3: `指定时间段` — 分析特定时间范围(如 `--start "16:00" --end "17:00"`) + +**若用户选择"快速查看尾部",必须再询问行数**,提供选项: +- 选项 1: `2000 行(默认)` +- 选项 2: `5000 行` +- 选项 3: `1万行` + +若用户选择"指定时间段",直接让用户填写: +- 从 `xxx` 开始,到 `xxx` 结束(`start/end` 可只填一个); +- 支持相对时间写法:`30m`、`2h`、`1d`、`最后30分钟` 等(换算为绝对时间) +- 然后映射为 `--start/--end` 参数执行。 + +如果用户未选择,默认使用全量统计。 + +`--start/--end` 与 `--tail` 互斥。`--start` 和 `--end` 可单独或同时指定。 +`--tail` 仅支持"行数"语义(如 `2000`,也兼容 `1k/1w` 自动换算),不再支持 `30m/2h/1d` 这类时间窗口;按时间请使用 `--start/--end`。 +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`、相对时间(`30m`、`2h`、`1d`、`最后30分钟`)。 +缺失部分自动从日志首末行推断。 + +### 3. 输出目录 +分析结果默认保存到 `skill_output/stat-cache-hitrate/<运行时间戳>/`(自动按运行时间创建子目录)。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `<运行时间戳>/summary` 与 `<运行时间戳>/detail`,避免覆盖历史明细。 + +## 使用方式 + +运行统计脚本(相对于 `fastdeploy/golang_router/` 目录): + +```bash +# 全量统计 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --output skill_output/stat-cache-hitrate/ + +# 快速查看尾部数据 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail # 默认最后 2000 行 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 5000 # 指定行数 +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --tail 1k # 行数缩写(自动换算) +# 指定时间段(需要按时间筛选时使用;--start 和 --end 可单独或同时使用) +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "16:00:00" --end "17:00:00" +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "2026/03/31 16:00:00" +python3 .claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py <日志文件> --start "03/31" --end "03/31 18:00" +``` + +默认日志路径:`logs/router.log`(相对于 `fastdeploy/golang_router/`)。常用备选:`fd-router.log`(根目录)。不传 `--output` 时自动输出到 `skill_output/stat-cache-hitrate/<运行时间戳>/`。 + +脚本会自动根据文件大小选择解析策略:小文件(<5000 
行)在内存中处理,大文件用 grep + 管道流式处理。 + +## 输出说明 + +### 三层指标 + +| 层级 | 指标 | 含义 | +|------|------|------| +| 第一层 | Prefix Hit Ratio | 被选中 worker 的 KV cache 命中率,反映内容级复用度 | +| 第二层 | Session Hit Rate | 带 session_id 的请求被路由到同一 worker 的比例 | +| 第三层 | Per-Worker Stats | 每个 prefill worker 被选中的次数和平均命中率排名 | + +### 输出文件位置 + +详细报告和图表输出到 `skill_output/stat-cache-hitrate//` 目录,每次运行自动创建带时间戳的子目录。 + +- `summary/cache_hitrate_report.md` — Per-Worker 统计 + Fallback 明细 + 详情链接 +- `detail/per_window_data.md` — 每5s窗口明细(连续空窗口自动合并为 3 行:起始/合并说明/结束) +- `detail/session_hit_details.md` — 每个 session(无 session_id 时回退 trace_id)的命中明细(Markdown 表格),包含 `id序号 / req_count / first_hit / avg-hit(=去首请求平均命中率) / max_hit / min_hit / all_hits / purl_cnt / prefill_urls`,并附「序号与会话ID映射」「切换 reqid 明细(含 session 时间段,可跳转)」。 + +### 交叉诊断矩阵 + +| Session HR | Prefix HR | 诊断 | +|------------|-----------|------| +| 高 | 高 | cache-aware 策略运行良好 | +| 高 | 低 | session 粘性好但 prompt 内容变化大,KV cache 实际复用低 | +| 低 | 高 | 换 worker 了但新 worker 也有类似前缀缓存 | +| 低 | 低 | 负载均衡强制分散或缓存未预热 | + +## 重要规则 + +1. **`[stats]` 计数器 per-interval**:每 5s `atomic.Swap(0)` 重置,必须 sum 所有行计算累计值 +2. **Session HR 只统计带 session_id 的请求** +3. **Prefix HR 取 selected worker 的值**:不在 hitRatios map 中则为 0 +4. **此 skill 只关注 cache 命中率**:延迟/错误/健康等排查由 troubleshoot skill 负责 +5. 
**与 troubleshoot-cache 互补**:本 skill 做数值统计,troubleshoot-cache 做调度策略诊断 + +## 参考文件 + +- `references/log_formats.md` — 日志格式和解析规则 +- `references/report_templates.md` — 终端报告和详细导出的模板 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json new file mode 100644 index 00000000000..23c7f6d86aa --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/evals/trigger_eval.json @@ -0,0 +1,18 @@ +[ + {"query": "帮我统计一下 router 的 cache 命中率,日志在 logs/fd-router.log", "should_trigger": true}, + {"query": "我想看看 cache-aware 调度的效果怎么样,hitRatio 数据是多少", "should_trigger": true}, + {"query": "prefix hit ratio 和 session hit rate 分别是多少?分析一下 logs/router.log", "should_trigger": true}, + {"query": "看一下最近30分钟的缓存预热情况,用 tail 模式快速扫一下", "should_trigger": true}, + {"query": "我刚部署了新的 cache-aware 策略,帮我跑一下 /stat-cache-hitrate 看看效果", "should_trigger": true}, + {"query": "每个 prefill worker 的缓存利用率排名是怎样的?哪个 worker 命中率最高", "should_trigger": true}, + {"query": "stat cache hitrate on our go router log, need to check the KV cache reuse rate", "should_trigger": true}, + {"query": "持续监控 cache 命中率变化趋势,我想看实时数据", "should_trigger": true}, + {"query": "router 最近老是返回 502,帮我排查一下什么问题", "should_trigger": false}, + {"query": "分析一下 router 的请求延迟,p99 是不是太高了", "should_trigger": false}, + {"query": "帮我 trace 一下这个请求 ID: abc-123-def,看看整个链路", "should_trigger": false}, + {"query": "Worker 健康状态怎么样?有没有频繁下线的", "should_trigger": false}, + {"query": "帮我写一个 Go 语言的 HTTP 路由框架", "should_trigger": false}, + {"query": "分析一下 nginx 的 access log,统计各个 URL 的访问量", "should_trigger": false}, + {"query": "router 负载不均衡,某些 worker 的 running 计数异常高", "should_trigger": false}, + {"query": "帮我看看 FastDeploy 的部署文档,我想部署一个新模型", "should_trigger": false} +] diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md 
b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md new file mode 100644 index 00000000000..bc29a4cbb25 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/log_formats.md @@ -0,0 +1,139 @@ +# 日志格式参考 + +本文件描述 FastDeploy Go Router 的日志格式和解析规则。统计 cache 命中率前必须阅读。 + +--- + +## 通用日志行格式 + +``` +[LEVEL] YYYY/MM/DD HH:MM:SS logger.go:<行号>: <消息内容> +``` + +- **Level**:`[INFO]`、`[ERROR]`、`[WARN]`、`[DEBUG]` +- **Timestamp**:`YYYY/MM/DD HH:MM:SS` +- **可选 context 前缀**:`[trace_id:...]`、`[req_id:...]`、`[session_id:...]`、`[request_id:...]` 可能出现在 `logger.go:XX:` 和实际消息之间,顺序固定(trace_id → req_id → session_id → request_id),但不一定全部出现 + +--- + +## 类别 A:Cache-Aware 策略行 + +### A1. cache_aware_scoring(正常走 cache-aware 路径) + +``` +[INFO] 2026/03/30 20:16:57 logger.go:79: [session_id:slimshetty/swebench-verified:sweb.eval.x86_64.psf__requests-1766] [request_id:565a594c-...] cache-aware prefill: final strategy: cache_aware_scoring, selected=http://10.52.95.17:9263, loads=map[http://10.52.95.146:9263:20 http://10.52.95.17:9263:20 ...], hitRatios=map[http://10.52.95.17:9263:100]. ts_ms=2026-03-30 20:16:57.021 +``` + +**提取字段**: +- `selected=<url>` — 被选中的 worker URL,格式 `http://IP:PORT` +- `hitRatios=map[...]` — Go map 格式,详见下方解析规则 +- `loads=map[...]` — 各 worker 的负载 + +### A2. process_tokens fallback(未走 cache-aware 路径) + +``` +cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads=map[...] 
+cache-aware prefill: final strategy: process_tokens, reason: tokenize failed: +cache-aware prefill: final strategy: process_tokens, reason: strategy not initialized +``` + +--- + +## 类别 B:Stats 行 + +``` +[INFO] 2026/03/30 20:14:38 logger.go:79: [stats] total_running=14, workers: [http://10.52.96.143:9867: running=0, http://10.52.95.26:9867: running=1, ...], cache_hit_rate=0.00% (hits=0/total=7) +``` + +**提取字段**: +- `total_running=` — 所有 worker 的运行请求总数 +- `workers: [...]` — 各 worker 的 `running=N` +- `cache_hit_rate=%` — 该窗口的命中率百分比 +- `(hits=/total=)` — 该 5s 窗口的命中次数和总次数 + +**关键**:`hits` 和 `total` 是 **per-interval** 的,代码使用 `atomic.Swap(0)` 每 5s 重置为 0。 + +--- + +## 类别 C:推理请求行 + +``` +[INFO] 2026/03/30 18:25:49 logger.go:79: [POST] /v1/chat/completions HTTP/1.1 200 2.798235ms 10.52.95.139 +``` + +格式:`[METHOD] /path HTTP/1.1 ` + +延迟单位可能是 `s`、`ms`、`µs`/`us`。 + +**注意**:仅 `POST /v1/chat/completions` 和 `POST /v1/completions` 为推理请求。其余路径(`/register`、`/registered_number`、`/registered`、`/health_generate`、`/metrics`)为管理/监控请求,统计推理吞吐量时应排除。 + +--- + +## Go Map 解析规则 + +Go 的 `fmt.Sprintf("%v", map)` 输出格式:`map[key1:val1 key2:val2 ...]` + +### hitRatios 的特殊挑战 + +Worker URL 包含 `:`(如 `http://10.52.95.17:9263`),而 Go map 的 key-value 分隔符也是 `:`。 +因此 `hitRatios=map[http://10.52.95.17:9263:100]` 中: +- URL = `http://10.52.95.17:9263` +- Ratio = `100` + +### 推荐解析方法 + +**方法 1:正则匹配**(推荐) + +提取 `hitRatios=map[` 和 `]` 之间的内容,然后用正则匹配每个 entry: + +``` +正则:(http://[^\s:]+:\d+):(\d+) +``` + +示例: +``` +输入:http://10.52.95.17:9263:100 http://10.52.96.143:9867:50 +匹配1:group1=http://10.52.95.17:9263, group2=100 +匹配2:group1=http://10.52.96.143:9867, group2=50 +``` + +**方法 2:从右分割** + +对 map 内容按空格分割每个 token,然后对每个 token 找最后一个 `:` 分割: +``` +token = "http://10.52.95.17:9263:100" +lastColon = 最后一个 ":" 的位置 +url = token[:lastColon] → "http://10.52.95.17:9263" +ratio = token[lastColon+1:] → "100" +``` + +### 空 map + +`hitRatios=map[]` 表示冷启动,没有任何 worker 有匹配的前缀缓存。 + +### loads map 解析 + +同样的规则适用于 `loads=map[...]`,value 
是负载数: +``` +loads=map[http://10.52.95.146:9263:20 http://10.52.95.17:9263:20] +``` + +### workers 列表解析(stats 行) + +`workers: [http://10.52.96.143:9867: running=0, ...]` 格式不同: +- 用 `,` 分割每个 entry +- 每个 entry 格式:`http://IP:PORT: running=N` +- 注意 URL 后面跟的是 `: running=`(带空格),不是 Go map 的 `:val` + +--- + +## 时间戳解析 + +日志时间戳格式:`YYYY/MM/DD HH:MM:SS` + +提取正则:`(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})` + +用于: +- 确定日志时间跨度 +- 按时间分窗口(5s、1min 等) +- 按 quartile 分段统计趋势 diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md new file mode 100644 index 00000000000..ebca39be2c4 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/references/report_templates.md @@ -0,0 +1,201 @@ +# 报告输出模板 + +本文件包含 cache 命中率分析报告的终端输出模板和详细数据导出模板。 + +--- + +## 终端概览报告模板 + +``` +## Cache Hit Rate Analysis Report +**File**: | **Lines**: N | **Span**: ~ () + +### 1. Prefix Hit Ratio (KV Cache 内容复用度) + 累计平均: XX.X% (被选中 worker) + 分布: + 0-20% ██░░░░░░░░░░░░░░░░░░ X% (N=xxx) + 20-40% ███░░░░░░░░░░░░░░░░░ X% (N=xxx) + 40-60% █████░░░░░░░░░░░░░░░ X% (N=xxx) + 60-80% ████████████░░░░░░░░ X% (N=xxx) + 80-100% ████████████████████ X% (N=xxx) + 冷启动率: X.X% + 趋势: Q1=X% → Q2=X% → Q3=X% → Q4=X% ↑/↓/→ + + Prefix Hit Ratio (5s 窗口): + 100%| ····················· + 80%| ····· ··· + 60%| ····· + 40%| ····· + 20%| ······ + 0%|···· + +---+---+---+---+---+---+---+---+---+---→ time + 18:25 18:26 18:27 18:28 18:29 18:30 + +### 2. Session Hit Rate (请求级路由粘性) + 累计: XX.X% (hits=N / total=N) + 覆盖率: X.X% 的推理请求带 session_id + 趋势: Q1=X% → Q2=X% → Q3=X% → Q4=X% + + Session Hit Rate (5s 窗口): + 100%| ···················· + 80%| ·········· + 60%| ··········· + 40%| + 20%| + 0%|······· + +---+---+---+---+---+---+---+---+---+---→ time + +### 3. 
Per-Worker Cache Stats + ┌───────────────────────────┬──────────┬──────────┬─────────────────┐ + │ Prefill Worker │ Selected │ Select % │ Avg Hit(Select) │ + ├───────────────────────────┼──────────┼──────────┼─────────────────┤ + │ http://10.52.95.17:9263 │ 1,234 │ 15.2% │ 82% │ + │ http://10.52.96.143:9867 │ 890 │ 11.0% │ 74% │ + │ ... │ ... │ ... │ ... │ + └───────────────────────────┴──────────┴──────────┴─────────────────┘ + +### 4. Scheduling Strategy + cache_aware_scoring: N (X%) | fallback: N (X%) + fallback reasons: load_imbalanced=N, tokenize_failed=N, not_initialized=N + 非最优命中选择: X% (负载均衡优先于命中率的比例) + +### 5. Diagnosis + ✅/⚠/❌ <综合诊断> + +### 图表说明(Legend) + - Unicode 柱状图:每个区间的请求占比,条越长占比越高 + - ASCII 折线图:横轴是时间窗口,纵轴是命中率(0-100%) + - Q1→Q4 趋势:按时间四等分后的均值变化(↑/↓/→) + +📄 详细数据见: + - 报告文件: /abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//cache_hitrate_report_.md + - 窗口明细: /abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/per_window_data.md + - Session 命中详情: /abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + URI: file:///abs/path/to/skill_output/stat-cache-hitrate//details/session_hit_details.md + (含 prefill_urls、worker 切换前后 request_id,以及命中率突降 request_id) +``` + +--- + +## 格式规则 + +### Unicode 柱状图 + +- 总宽度 20 个字符 +- `█` 表示已填充部分,`░` 表示空白部分 +- 后跟百分比和绝对数量 + +``` +计算方法: +filled = round(percentage / 100 * 20) +bar = "█" * filled + "░" * (20 - filled) +output = f"{bar} {percentage}% (N={count})" +``` + +示例: +``` +████████████░░░░░░░░ 60% (N=1200) +██████████████████░░ 90% (N=1800) +██░░░░░░░░░░░░░░░░░░ 10% (N=200) +``` + +### ASCII 折线图 + +- Y 轴:0-100% 范围,6 行(0%, 20%, 40%, 60%, 80%, 100%) +- X 轴:时间,标注关键时间点 +- 数据点用 `·` 绘制 +- 坐标轴用 `|` `+` `─` `→` + +``` +时间粒度自动调整: +- 日志跨度 <30min → 5s 原始粒度 +- 日志跨度 <3h → 1min 粒度 +- 日志跨度 >3h → 5min 粒度 +``` + +图表宽度约 60 列。数据点太多时自动聚合到更粗的粒度。 + +### 表格 + 
+使用 Unicode box-drawing 字符: + +``` +┌ ─ ┬ ─ ┐ 顶部 +│ │ │ 数据行 +├ ─ ┼ ─ ┤ 分隔行 +│ │ │ 数据行 +└ ─ ┴ ─ ┘ 底部 +``` + +### 趋势箭头 + +- `↑` — 上升趋势(Q4 > Q1 + 10%) +- `↓` — 下降趋势(Q4 < Q1 - 10%) +- `→` — 稳定(变化 < 10%) + +--- + +## 详细数据导出模板 + +主报告:`skill_output/stat-cache-hitrate//cache_hitrate_report_.md` +每窗口明细:`skill_output/stat-cache-hitrate//details/per_window_data.md` + +### 主报告 + +```markdown +# Cache Hit Rate Detailed Report + +**Generated**: +**Source**: + +## 1. Per-Worker 完整统计 + +| Worker | Selected | Select % | Avg Hit (Selected) | Avg Hit (All) | Max Hit | +|--------|----------|----------|--------------------|----- ---------|---------| +| http://10.52.95.17:9263 | 1,234 | 15.2% | 82% | 68% | 100% | +| ... | ... | ... | ... | ... | ... | + +## 2. Fallback 明细 + +### 3.1 load imbalanced (N 次) +| Time | Loads | +|------|-------| +| 20:15:03 | map[...] | + +### 3.2 tokenize failed (N 次) +| Time | Error | +|------|-------| +| ... | ... | + +## 4. 非最优命中选择明细 + +| Time | Selected | Selected HR | Best Worker | Best HR | Load Diff | +|------|----------|-------------|-------------|---------|-----------| +| 20:15:10 | w1:9263 | 60% | w2:9867 | 85% | w1=5, w2=18 | +| ... | ... | ... | ... | ... | ... 
| +``` + +--- + +## --tail 快速查看模板 + +`--tail` 模式下只输出核心指标: + +``` +## Cache Hit Rate (Recent) +**File**: | **tail lines** | **Span**: ~ + + Prefix Hit Ratio: XX.X% (avg) | Cold start: X.X% + Session Hit Rate: XX.X% (hits=N/total=N) | Coverage: X.X% + Strategy: scoring N (X%) | fallback N (X%) + + Recent trend (1min buckets): + 100%| ····· + 80%| ····· + 60%|····· + +---+---+---+---+---→ + -5m -4m -3m -2m -1m + +``` diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py new file mode 100644 index 00000000000..cc5534a757d --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/chart.py @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 +""" +Chart — 终端可视化渲染工具 + +提供 sparkline 折线图、Unicode 柱状图、Markdown 表格的渲染函数。 +所有函数返回字符串(不直接打印),方便组装到报告中。 + +Python 3 stdlib only,零依赖。 +""" + + +# ════════════════════════════════════════════════════════════════ +# Sparkline 折线图 +# ════════════════════════════════════════════════════════════════ + +BLOCK_CHARS = " ▁▂▃▄▅▆▇█" + + +def render_sparkline( + records, value_field="value", bucket_field="bucket", title=None, y_label=None, y_range=None, width=60 +): + """渲染 8 级 Unicode sparkline 折线图。 + + Args: + records: dict 列表,每个 dict 包含 bucket_field 和 value_field + value_field: 数值字段名 + bucket_field: 时间桶字段名 + title: 图表标题 + y_label: Y 轴标签(如 '%') + y_range: Y 轴范围 (min, max) 元组,None 则自动 + width: 图表宽度(字符数) + + Returns: + str: 渲染后的图表文本 + """ + if not records: + return " (no data)" + + all_values = [] + for r in records: + v = r.get(value_field) + if v is not None: + all_values.append(float(v)) + + if not all_values: + return " (no numeric data)" + + # Y 轴范围 + if y_range: + y_min, y_max = y_range + else: + y_min = min(all_values) + y_max = max(all_values) + if y_max == y_min: + y_min = 0 if y_max > 0 else y_max - 1 + y_max = max(y_max, 1) + + y_span = y_max - y_min if y_max != y_min else 1 + + # 降采样 + n = len(records) + if 
n > width: + step = n / width + sampled = [] + for i in range(width): + start_idx = int(i * step) + end_idx = int((i + 1) * step) + chunk = records[start_idx:end_idx] + vals = [float(r.get(value_field, 0)) for r in chunk if r.get(value_field) is not None] + avg_record = { + bucket_field: chunk[0].get(bucket_field, ""), + value_field: sum(vals) / len(vals) if vals else 0, + } + sampled.append(avg_record) + records = sampled + + lines = [] + + # 标题行 + def fmt_val(v): + if abs(v) >= 1000: + return f"{v:.0f}" + elif abs(v) >= 10: + return f"{v:.1f}" + return f"{v:.2f}" + + header_parts = [] + if title: + header_parts.append(title) + header_parts.append(f"min={fmt_val(min(all_values))}") + header_parts.append(f"max={fmt_val(max(all_values))}") + if y_label: + header_parts.append(f"({y_label})") + lines.append(" " + " ".join(header_parts)) + + # Sparkline 字符 + spark_chars = [] + for r in records: + v = r.get(value_field) + if v is None: + spark_chars.append(" ") + continue + v = float(v) + normalized = (v - y_min) / y_span + level = max(0, min(8, round(normalized * 8))) + spark_chars.append(BLOCK_CHARS[level]) + lines.append(" " + "".join(spark_chars)) + + # X 轴标签 + data_width = len(records) + if data_width > 0: + + def short_bucket(r): + b = str(r.get(bucket_field, "")) + if " " in b: + b = b.split(" ")[-1] + return b[:5] if len(b) >= 5 else b + + lbl_width = 6 + max_labels = max(1, data_width // lbl_width) + n_records = len(records) + + if n_records <= 2: + indices = list(range(n_records)) + elif n_records <= max_labels: + indices = [0, n_records - 1] + else: + n_labels = min(5, max(2, max_labels)) + indices = [int(i * (n_records - 1) / (n_labels - 1)) for i in range(n_labels)] + + label_line = [" "] * (data_width + lbl_width + 2) + last_end = -1 + for idx in indices: + lbl = short_bucket(records[idx]) + pos = idx + if pos < last_end: + continue + for ci, c in enumerate(lbl): + p = pos + ci + if p < len(label_line): + label_line[p] = c + last_end = pos + len(lbl) + 1 + 
lines.append(" " + "".join(label_line).rstrip()) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Unicode 柱状图 +# ════════════════════════════════════════════════════════════════ + + +def render_bar(data, bar_width=20, show_count=False): + """渲染 Unicode 柱状图。 + + Args: + data: dict 列表,每个 dict 包含 label, value(百分比 0-100), 可选 count + bar_width: 柱状图宽度(字符数) + show_count: 是否显示绝对数量 + + Returns: + str: 渲染后的图表文本 + """ + if not data: + return " (no data)" + + max_label_len = max(len(str(d.get("label", ""))) for d in data) + max_label_len = max(max_label_len, 4) + + lines = [] + for d in data: + label = str(d.get("label", "")) + value = float(d.get("value", 0)) + count = d.get("count") + + filled = round(value / 100 * bar_width) if value > 0 else 0 + filled = max(1, filled) if value > 0 else 0 + filled = min(bar_width, filled) + empty = bar_width - filled + bar = "█" * filled + "░" * empty + + line = f" {label:<{max_label_len}} {bar} {value:>5.1f}%" + if show_count and count is not None: + line += f" (N={count})" + lines.append(line) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Markdown 表格 +# ════════════════════════════════════════════════════════════════ + + +def render_table(data, columns=None, right_align=None): + """渲染 Markdown 表格。 + + Args: + data: dict 列表 + columns: 列名列表,None 则用第一条记录的所有 key + right_align: 右对齐的列名集合 + + Returns: + str: 渲染后的表格文本 + """ + if not data: + return " (no data)" + + if columns is None: + columns = list(data[0].keys()) + if right_align is None: + right_align = set() + + # 计算列宽 + col_widths = {} + for col in columns: + col_widths[col] = len(col) + for row in data: + val = str(row.get(col, "")) + col_widths[col] = max(col_widths[col], len(val)) + + # 表头 + header_parts = [] + sep_parts = [] + for col in columns: + w = col_widths[col] + if col in right_align: + header_parts.append(f" {col:>{w}} ") + else: + header_parts.append(f" {col:<{w}} ") 
+ sep_parts.append("-" * (w + 2)) + + lines = [] + lines.append("|" + "|".join(header_parts) + "|") + lines.append("|" + "|".join(sep_parts) + "|") + + # 数据行 + for row in data: + row_parts = [] + for col in columns: + val = str(row.get(col, "")) + w = col_widths[col] + if col in right_align: + row_parts.append(f" {val:>{w}} ") + else: + row_parts.append(f" {val:<{w}} ") + lines.append("|" + "|".join(row_parts) + "|") + + return "\n".join(lines) diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py new file mode 100644 index 00000000000..bb31235f3fa --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/log_parser.py @@ -0,0 +1,424 @@ +#!/usr/bin/env python3 +""" +Router Log Parser — FastDeploy Go Router 日志解析原语 + +支持两种调用方式: +1. 作为模块导入:from log_parser import parse_cache_strategy_line, parse_stats_line +2. 作为 CLI 工具:grep 'pattern' logfile | python3 log_parser.py parse-cache-strategy + +Python 3 stdlib only,零依赖。 +""" + +import argparse +import json +import re +import sys +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# 通用解析原语 +# ════════════════════════════════════════════════════════════════ + + +def parse_go_map(s): + """解析 Go fmt.Sprintf('%v', map) 输出:map[key1:val1 key2:val2 ...] + + 处理 URL 中冒号与 Go map key-value 分隔符的冲突(从最后一个冒号分割)。 + 空 map 'map[]' 返回空 dict。 + """ + inner_match = re.search(r"map\[(.*?)\]", s) + if not inner_match: + return {} + inner = inner_match.group(1).strip() + if not inner: + return {} + result = {} + for token in inner.split(): + idx = token.rfind(":") + if idx > 0: + key = token[:idx] + val_str = token[idx + 1 :] + try: + result[key] = int(val_str) if "." 
not in val_str else float(val_str) + except ValueError: + result[key] = val_str + return result + + +# 时间戳:YYYY/MM/DD HH:MM:SS +TS_RE = re.compile(r"(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})") + +# ts_ms:2025-01-15 18:25:33.123 +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + + +def extract_ts(line): + """从日志行提取 YYYY/MM/DD HH:MM:SS 时间戳。""" + m = TS_RE.search(line) + return m.group(1) if m else None + + +def parse_ts(ts_str): + """将 YYYY/MM/DD HH:MM:SS 时间戳解析为 datetime。""" + return datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + + +# ════════════════════════════════════════════════════════════════ +# 时间范围过滤 +# ════════════════════════════════════════════════════════════════ + +import os +import subprocess +import tempfile + +_FULL_DT_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})\s+(\d{1,2}):(\d{2})(?::(\d{2}))?$") +_DATE_ONLY_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$") +_SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$") +_TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$") + +# 相对时间正则:支持 30m、30分钟、2h、2小时、1d、1天、last 30m、最后30分钟 +_RELATIVE_TIME_RE = re.compile(r"^(?:last|最后)?\s*(\d+)\s*(m|分钟|mins?|h|小时|hours?|d|天|days?)$", re.IGNORECASE) + + +def _parse_relative_time(time_str): + """解析相对时间字符串,返回 timedelta。 + + 支持格式:30m、30分钟、2h、2小时、1d、1天、last 30m、最后30分钟 + """ + m = _RELATIVE_TIME_RE.match(time_str.strip()) + if not m: + return None + + value = int(m.group(1)) + unit = m.group(2).lower() + + if unit.startswith("m") and "in" not in unit: # m, min, mins + from datetime import timedelta + + return timedelta(minutes=value) + elif unit.startswith("h"): # h, hour, hours + from datetime import timedelta + + return timedelta(hours=value) + else: # d, day, days + from datetime import timedelta + + return timedelta(days=value) + + +def _relative_to_absolute(time_str, log_file, is_end=False): + """将相对时间转换为绝对时间,基于日志文件的时间边界。 + + - start: 从日志末行时间往前推 + - end: 直接使用日志末行时间(或当前时间) + """ + 
relative_delta = _parse_relative_time(time_str) + if not relative_delta: + return None + + # 获取日志文件末行时间作为基准 + boundary_ts = _get_log_boundary_ts(log_file, "last") + if not boundary_ts: + return None + + # 解析为 datetime + dt = datetime.strptime(boundary_ts, "%Y/%m/%d %H:%M:%S") + + if is_end: + # end 时间:直接使用日志末行时间 + return boundary_ts + else: + # start 时间:末行时间减去 duration + + abs_time = dt - relative_delta + return abs_time.strftime("%Y/%m/%d %H:%M:%S") + + +def _get_log_boundary_ts(log_file, which="first"): + """从日志文件首行或末行提取时间戳。""" + cmd = "head" if which == "first" else "tail" + try: + r = subprocess.run([cmd, "-1", log_file], capture_output=True, text=True, timeout=5) + return extract_ts(r.stdout) if r.returncode == 0 else None + except (subprocess.TimeoutExpired, FileNotFoundError): + return None + + +def complete_time_arg(time_str, log_file, is_end=False): + """解析灵活时间输入,补全缺失部分。 + + 支持格式: + 'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 'YYYY/MM/DD', + 'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM' + 相对时间:30m、2h、1d、最后30分钟 等(从日志末行时间算起) + + 补全规则: + - 缺年份:从日志首行取 + - 缺日期:从日志末行取 + - 缺时间:start→00:00:00, end→23:59:59 + - 相对时间:start 从日志末行往前推,end 直接用日志末行时间 + + Returns: 'YYYY/MM/DD HH:MM:SS' 格式字符串 + """ + if time_str is None: + return None + time_str = time_str.strip() + + # Case 0: 相对时间处理(如 "30m"、"最后30分钟"、"2h") + # 从日志文件末行时间开始算起 + relative_result = _relative_to_absolute(time_str, log_file, is_end) + if relative_result: + return relative_result + + # Case 1: 完整日期时间 + m = _FULL_DT_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + h, mi = m.group(4).zfill(2), m.group(5) + s = (m.group(6) or "00").zfill(2) + return f"{y}/{mo}/{d} {h}:{mi}:{s}" + + # Case 2: 仅日期 YYYY/MM/DD + m = _DATE_ONLY_RE.match(time_str) + if m: + y, mo, d = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2) + t = "23:59:59" if is_end else "00:00:00" + return f"{y}/{mo}/{d} {t}" + + # Case 3: 短日期 MM/DD 或 MM/DD HH:MM[:SS] + m = _SHORT_DATE_RE.match(time_str) + if m: 
+ mo, d = m.group(1).zfill(2), m.group(2).zfill(2) + ts = _get_log_boundary_ts(log_file, "first") + year = ts[:4] if ts else str(datetime.now().year) + if m.group(3): # 有时间部分 + h, mi = m.group(3).zfill(2), m.group(4) + s = (m.group(5) or "00").zfill(2) + return f"{year}/{mo}/{d} {h}:{mi}:{s}" + t = "23:59:59" if is_end else "00:00:00" + return f"{year}/{mo}/{d} {t}" + + # Case 4: 仅时间 HH:MM[:SS] + m = _TIME_ONLY_RE.match(time_str) + if m: + h, mi = m.group(1).zfill(2), m.group(2) + s = (m.group(3) or "00").zfill(2) + ts = _get_log_boundary_ts(log_file, "last") + date_part = ts[:10] if ts else f"{datetime.now().year}/01/01" + return f"{date_part} {h}:{mi}:{s}" + + # Fallback: 原样返回 + return time_str + + +def filter_file_by_time_range(log_file, start_str=None, end_str=None): + """用 awk 按时间范围预过滤日志文件。 + + 时间戳 YYYY/MM/DD HH:MM:SS 天然字典序可比,直接用 awk 字符串比较。 + 无时间戳的行(如 panic 堆栈续行)保留。 + + Args: + log_file: 原日志文件路径 + start_str: 起始时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + end_str: 结束时间 'YYYY/MM/DD HH:MM:SS'(含),或 None + + Returns: + tuple: (file_path, is_temp) — is_temp=True 时调用方负责删除 + """ + if not start_str and not end_str: + return (log_file, False) + + tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, prefix="router_filtered_") + tmp.close() + + awk_script = r"""{ + ts = "" + if (match($0, /[0-9]{4}\/[0-9]{2}\/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/)) { + ts = substr($0, RSTART, RLENGTH) + } + if (ts == "") { print; next } + if ((start == "" || ts >= start) && (end == "" || ts <= end)) print + }""" + + cmd = ["awk", "-v", f'start={start_str or ""}', "-v", f'end={end_str or ""}', awk_script, log_file] + + try: + with open(tmp.name, "w") as outf: + result = subprocess.run(cmd, stdout=outf, stderr=subprocess.PIPE, text=True, timeout=120) + if result.returncode != 0: + os.unlink(tmp.name) + return (log_file, False) + except (subprocess.TimeoutExpired, OSError): + if os.path.exists(tmp.name): + os.unlink(tmp.name) + return (log_file, False) + + return (tmp.name, True) 
+ + +# Context tag:[session_id:...], [request_id:...], [trace_id:...], [req_id:...] +TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]") + + +def extract_tags(line): + """从日志行提取 context tag。""" + return {m.group(1): m.group(2) for m in TAG_RE.finditer(line)} + + +# ════════════════════════════════════════════════════════════════ +# Cache-Aware 策略行解析(类别 A) +# ════════════════════════════════════════════════════════════════ + +URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?" +STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") + + +def parse_cache_strategy_line(line): + """解析 cache-aware prefill 策略行。 + + 输入示例: + [INFO] 2026/03/30 20:16:57 logger.go:79: ... cache-aware prefill: final strategy: + cache_aware_scoring, selected=http://10.52.95.17:9263, loads=map[...], hitRatios=map[...] + + 返回 dict 或 None(如果不是策略行)。 + """ + sm = STRATEGY_RE.search(line) + if not sm: + return None + + ts = extract_ts(line) + strategy = sm.group(1) + record = {"ts": ts or "", "strategy": strategy} + + # selected worker URL + sel_m = SELECTED_RE.search(line) + if sel_m: + record["selected"] = sel_m.group(1) + + # reason(仅 process_tokens fallback) + reason_m = REASON_RE.search(line) + if reason_m and strategy == "process_tokens": + record["reason"] = reason_m.group(1).strip() + + # hitRatios map + hr_match = re.search(r"hitRatios=(map\[.*?\])", line) + if hr_match: + hit_ratios = parse_go_map(hr_match.group(1)) + record["hitRatios"] = hit_ratios + if "selected" in record: + record["selected_hitRatio"] = hit_ratios.get(record["selected"], 0) + else: + record["hitRatios"] = {} + if "selected" in record: + record["selected_hitRatio"] = 0 + + # loads map + loads_match = re.search(r"loads=(map\[.*?\])", line) + if loads_match: + record["loads"] = parse_go_map(loads_match.group(1)) + + # ts_ms(精确到毫秒的调度时间戳) + ts_ms_m = TS_MS_RE.search(line) 
+ if ts_ms_m: + record["ts_ms"] = ts_ms_m.group(1) + + # context tags + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# ════════════════════════════════════════════════════════════════ +# Stats 行解析(类别 B) +# ════════════════════════════════════════════════════════════════ + +TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") +CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") + + +def parse_stats_line(line): + """解析 [stats] 统计行。 + + 输入示例: + [INFO] 2026/03/30 20:14:38 logger.go:79: [stats] total_running=14, + workers: [...], cache_hit_rate=0.00% (hits=0/total=7) + + 注意:hits 和 total 是 per-interval 的(每 5s 重置),累计值必须 sum 所有行。 + + 返回 dict 或 None(如果不是 stats 行)。 + """ + if "[stats]" not in line: + return None + + ts = extract_ts(line) + record = {"ts": ts or ""} + + # total_running + tr_m = TOTAL_RUNNING_RE.search(line) + if tr_m: + record["total_running"] = int(tr_m.group(1)) + + # per-worker running + workers = {} + for wm in WORKER_RUNNING_RE.finditer(line): + workers[wm.group(1)] = int(wm.group(2)) + record["workers"] = workers + + # cache_hit_rate + hits/total + chr_m = CACHE_HR_RE.search(line) + if chr_m: + record["cache_hit_rate"] = float(chr_m.group(1)) + record["hits"] = int(chr_m.group(2)) + record["total"] = int(chr_m.group(3)) + + return record + + +# ════════════════════════════════════════════════════════════════ +# CLI 入口 +# ════════════════════════════════════════════════════════════════ + + +def _cli_parse_stream(parse_fn): + """通用 CLI 流式解析:从 stdin 读入日志行,输出 JSON Lines 到 stdout。""" + parsed = 0 + skipped = 0 + for line in sys.stdin: + line = line.rstrip("\n") + record = parse_fn(line) + if record: + print(json.dumps(record, ensure_ascii=False)) + parsed += 1 + else: + skipped += 1 + print(f"Parsed {parsed} lines, skipped {skipped}", file=sys.stderr) + + +def main(): + parser = argparse.ArgumentParser( + 
description="FastDeploy Go Router Log Parser", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + sub = parser.add_subparsers(dest="command") + + sub.add_parser("parse-cache-strategy", help="解析 cache-aware 策略行 → JSON Lines") + sub.add_parser("parse-stats", help="解析 [stats] 统计行 → JSON Lines") + + args = parser.parse_args() + + if args.command == "parse-cache-strategy": + _cli_parse_stream(parse_cache_strategy_line) + elif args.command == "parse-stats": + _cli_parse_stream(parse_stats_line) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py new file mode 100644 index 00000000000..7de5b7f6042 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/session_analysis.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +""" +Session 维度分析:聚合每个 session 的命中率、worker 切换与突降请求。 +""" + +from collections import defaultdict + + +def compute_session_details(strategies, strip_scheme): + """按 session_id(优先)或 trace_id(兜底)统计命中详情。""" + + def _req_id_from_tags(tags, fallback): + return tags.get("request_id") or tags.get("req_id") or tags.get("trace_id") or fallback + + session_records = defaultdict(list) + for idx, rec in enumerate(strategies): + if rec.get("strategy") != "cache_aware_scoring": + continue + tags = rec.get("tags", {}) or {} + session_id = tags.get("session_id") + trace_id = tags.get("trace_id") + identity = session_id or trace_id + if not identity: + continue + session_records[identity].append((idx, rec)) + + rows = [] + for identity, items in session_records.items(): + items.sort(key=lambda x: (x[1].get("ts_ms", ""), x[1].get("ts", ""), x[0])) + recs = [r for _, r in items] + hits = [int(r.get("selected_hitRatio", 0)) for r in recs] + if not hits: + continue + + non_first = hits[1:] + avg_excl_first = 
round(sum(non_first) / len(non_first), 1) if non_first else "-" + workers = {r.get("selected", "") for r in recs if r.get("selected")} + + prefill_urls = [] + for r in recs: + u = r.get("selected", "") + if u and u not in prefill_urls: + prefill_urls.append(u) + + switch_events = [] + sharp_drop_req_ids = [] + for i in range(1, len(recs)): + prev_r = recs[i - 1] + curr_r = recs[i] + prev_url = prev_r.get("selected", "") + curr_url = curr_r.get("selected", "") + prev_tags = prev_r.get("tags", {}) or {} + curr_tags = curr_r.get("tags", {}) or {} + prev_req = _req_id_from_tags(prev_tags, f"idx#{i}") + curr_req = _req_id_from_tags(curr_tags, f"idx#{i+1}") + + if prev_url and curr_url and prev_url != curr_url: + switch_events.append(f"{prev_req}->{curr_req} ({strip_scheme(prev_url)}→{strip_scheme(curr_url)})") + + prev_hit = int(prev_r.get("selected_hitRatio", 0)) + curr_hit = int(curr_r.get("selected_hitRatio", 0)) + if curr_hit - prev_hit <= -30: + sharp_drop_req_ids.append(f"{curr_req} ({prev_hit}%→{curr_hit}%)") + + rows.append( + { + "session": identity, + "id_type": "session_id" if recs[0].get("tags", {}).get("session_id") else "trace_id", + "first_ts": recs[0].get("ts", "-"), + "last_ts": recs[-1].get("ts", "-"), + "req_count": len(hits), + "first_hit": f"{hits[0]}%", + "avg_hit(excl_first)": f"{avg_excl_first}%" if avg_excl_first != "-" else "-", + "max_hit": f"{max(hits)}%", + "min_hit": f"{min(hits)}%", + "all_hits": ", ".join(f"{h}%" for h in hits), + "sticky": "yes" if len(workers) <= 1 else "no", + "unique_workers": len(workers), + "prefill_url_count": len(prefill_urls), + "prefill_urls": " | ".join(strip_scheme(u) for u in prefill_urls), + "switch_req_pairs": " ; ".join(switch_events) if switch_events else "-", + "sharp_drop_request_ids": " ; ".join(sharp_drop_req_ids) if sharp_drop_req_ids else "-", + } + ) + + rows.sort(key=lambda r: (r["req_count"], r["session"]), reverse=True) + return rows + + +def summarize_session_details(rows): + """生成 session 
级摘要指标。""" + if not rows: + return { + "total_sessions": 0, + "multi_req": 0, + "single_req": 0, + "sticky_multi": 0, + "non_sticky_multi": 0, + "non_first_avg": 0, + "non_first_total": 0, + } + + multi_req_rows = [r for r in rows if r["req_count"] > 1] + sticky_multi = [r for r in multi_req_rows if r["sticky"] == "yes"] + non_sticky_multi = [r for r in multi_req_rows if r["sticky"] == "no"] + + non_first_vals = [] + for r in rows: + hit_tokens = [h.strip().rstrip("%") for h in r["all_hits"].split(",") if h.strip()] + nums = [int(x) for x in hit_tokens if x.isdigit()] + if len(nums) > 1: + non_first_vals.extend(nums[1:]) + + return { + "total_sessions": len(rows), + "multi_req": len(multi_req_rows), + "single_req": len(rows) - len(multi_req_rows), + "sticky_multi": len(sticky_multi), + "non_sticky_multi": len(non_sticky_multi), + "non_first_avg": round(sum(non_first_vals) / len(non_first_vals), 2) if non_first_vals else 0, + "non_first_total": len(non_first_vals), + } diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py new file mode 100644 index 00000000000..7c6e0d40ecf --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stat_cache_hitrate.py @@ -0,0 +1,1111 @@ +#!/usr/bin/env python3 +""" +stat_cache_hitrate — FastDeploy Go Router Cache 命中率统计工具 + +统计三层 cache 命中率指标: + 1. Prefix Hit Ratio — KV Cache 内容复用度 + 2. Session Hit Rate — 请求级路由粘性 + 3. 
Per-Worker Stats — 各 worker 缓存利用排名 + +用法: + python3 stat_cache_hitrate.py [--tail N|Nk|Nw] [--output DIR] +""" + +import argparse +import json +import os +import re +import subprocess +import sys +from collections import defaultdict +from datetime import datetime +from pathlib import Path +from urllib.parse import quote + +# 同目录模块导入 +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from chart import render_bar, render_sparkline, render_table +from log_parser import ( + complete_time_arg, + filter_file_by_time_range, + parse_cache_strategy_line, + parse_stats_line, + parse_ts, +) +from session_analysis import compute_session_details, summarize_session_details +from stats import compute_statistics, count_by, time_bucket +from window_utils import merge_blank_window_rows + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + +def _build_path_links(path): + """返回绝对路径与 file URI,兼容空格/中文路径。""" + abs_path = str(Path(path).resolve()) + file_uri = "file://" + quote(abs_path, safe="/:-._~") + return abs_path, file_uri + + +def _format_half_running(total_running): + """将 stats.total_running 归一化为 prefill 口径(decode+prefill 合计 / 2)。""" + normalized = total_running / 2 + if float(normalized).is_integer(): + return str(int(normalized)) + return f"{normalized:.1f}" + + +def _render_scrollable_tsv(data, columns): + """渲染单行 TSV 文本,适合在 Markdown 查看器里横向滚动。""" + if not data: + return "```tsv\n(no data)\n```" + + def _escape(v): + return str(v).replace("\t", " ").replace("\n", "\\n") + + lines = ["\t".join(columns)] + for row in data: + lines.append("\t".join(_escape(row.get(col, "")) for col in columns)) + return "```tsv\n" + "\n".join(lines) + "\n```" + + +def _render_markdown_table(data, columns, align_right=None): + """渲染 Markdown 表格,便于在终端/文档中直接阅读。""" + if not data: + return "_(no data)_" + + align_right = align_right or set() + + def _escape_md(v): + return str(v).replace("\n", "
").replace("|", "\\|") + + matrix = [] + for row in data: + matrix.append([_escape_md(row.get(c, "")) for c in columns]) + + widths = [] + for i, col in enumerate(columns): + max_cell = max((len(r[i]) for r in matrix), default=0) + widths.append(max(len(col), max_cell)) + + def _format_cell(text, width, right=False): + return text.rjust(width) if right else text.ljust(width) + + header_cells = [_format_cell(c, widths[i]) for i, c in enumerate(columns)] + header = "| " + " | ".join(header_cells) + " |" + + align_cells = [] + for i, c in enumerate(columns): + w = max(widths[i], 3) + if c in align_right: + align_cells.append("-" * (w - 1) + ":") + else: + align_cells.append(":" + "-" * (w - 1)) + align = "| " + " | ".join(align_cells) + " |" + + rows = [] + for row_cells in matrix: + padded = [_format_cell(cell, widths[i], right=(columns[i] in align_right)) for i, cell in enumerate(row_cells)] + rows.append("| " + " | ".join(padded) + " |") + return "\n".join([header, align] + rows) + + +def _truncate_text(v, limit=72): + s = str(v) + return s if len(s) <= limit else s[: limit - 1] + "…" + + +def _seq_label(n): + return f"S{n:03d}" + + +def _extract_seq_num(seq_id): + return int(str(seq_id).lstrip("S") or 0) + + +def _summarize_id_type_ranges(rows_with_seq): + """基于序号连续区间汇总 id_type,便于在报告开头快速识别口径。""" + if not rows_with_seq: + return [] + + ranges = [] + current_type = rows_with_seq[0].get("id_type", "session_id") + start_id = rows_with_seq[0]["id"] + end_id = start_id + start_ts = rows_with_seq[0].get("first_ts", "-") + end_ts = rows_with_seq[0].get("last_ts", "-") + + for row in rows_with_seq[1:]: + row_type = row.get("id_type", "session_id") + row_id = row["id"] + if row_type == current_type and _extract_seq_num(row_id) == _extract_seq_num(end_id) + 1: + end_id = row_id + end_ts = row.get("last_ts", end_ts) + continue + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + current_type = row_type + start_id = row_id + end_id = row_id + start_ts = 
row.get("first_ts", "-") + end_ts = row.get("last_ts", "-") + + ranges.append((start_id, end_id, current_type, start_ts, end_ts)) + return ranges + + +# ════════════════════════════════════════════════════════════════ +# Phase 1: 日志读取 +# ════════════════════════════════════════════════════════════════ + + +def count_lines(filepath): + """快速统计文件行数。""" + result = subprocess.run(["wc", "-l", filepath], capture_output=True, text=True) + if result.returncode == 0: + return int(result.stdout.strip().split()[0]) + return 0 + + +def read_lines(filepath, tail=None): + """读取日志文件,支持 tail 模式。""" + if tail is not None: + # 按行数 tail + n = int(tail) + result = subprocess.run(["tail", "-n", str(n), filepath], capture_output=True, text=True) + return result.stdout.splitlines() if result.returncode == 0 else [] + return _read_file_lines(filepath) + + +def _read_file_lines(filepath): + with open(filepath, "r", errors="replace") as f: + return f.readlines() + + +# ════════════════════════════════════════════════════════════════ +# Phase 2: 日志提取与解析 +# ════════════════════════════════════════════════════════════════ + +STRATEGY_PATTERN = "cache-aware prefill: final strategy:" +STATS_PATTERN = "[stats]" +INFERENCE_PATTERNS = ["] [POST] /v1/chat/completions ", "] [POST] /v1/completions "] + + +def _shell_quote(s): + """Shell 引号转义,安全处理含空格、括号、单引号的路径。""" + return "'" + s.replace("'", "'\\''") + "'" + + +def grep_and_parse(filepath, grep_pattern, parse_cmd, tail=None): + """大文件模式:grep 过滤 + log_parser.py CLI 管道解析。""" + parser_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "log_parser.py") + + if tail: + grep_cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -F {_shell_quote(grep_pattern)} | python3 {_shell_quote(parser_path)} {parse_cmd}" + else: + grep_cmd = f"grep -F {_shell_quote(grep_pattern)} {_shell_quote(filepath)} | python3 {_shell_quote(parser_path)} {parse_cmd}" + + result = subprocess.run(grep_cmd, shell=True, capture_output=True, text=True) + records = [] + 
for line in result.stdout.strip().splitlines(): + if line: + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + pass + return records + + +def grep_count(filepath, grep_pattern, tail=None): + """大文件模式:grep 计数。""" + if tail: + cmd = f"tail -n {tail} {_shell_quote(filepath)} | grep -cE {_shell_quote(grep_pattern)}" + else: + cmd = f"grep -cE {_shell_quote(grep_pattern)} {_shell_quote(filepath)}" + + result = subprocess.run(cmd, shell=True, capture_output=True, text=True) + try: + return int(result.stdout.strip()) + except ValueError: + return 0 + + +def extract_data(filepath, tail=None): + """提取并解析日志数据,根据文件大小自动选择策略。""" + total = count_lines(filepath) + + if total < 5000: + # 小文件:内存中处理 + lines = read_lines(filepath, tail) + strategy_recs = [r for l in lines if (r := parse_cache_strategy_line(l)) is not None] + stats_recs = [r for l in lines if (r := parse_stats_line(l)) is not None] + inference_count = sum(1 for l in lines if any(p in l for p in INFERENCE_PATTERNS)) + return strategy_recs, stats_recs, inference_count, len(lines) + else: + # 大文件:grep + subprocess + strategy_recs = grep_and_parse(filepath, STRATEGY_PATTERN, "parse-cache-strategy", tail) + stats_recs = grep_and_parse(filepath, STATS_PATTERN, "parse-stats", tail) + inference_count = grep_count(filepath, r"\] \[POST\] /v1/chat/completions |\] \[POST\] /v1/completions ", tail) + line_count = int(tail) if tail is not None else total + return strategy_recs, stats_recs, inference_count, line_count + + +# ════════════════════════════════════════════════════════════════ +# Phase 3: 三层指标计算 +# ════════════════════════════════════════════════════════════════ + + +def compute_prefix_hitrate(strategies): + """计算第一层:Prefix Hit Ratio。""" + scoring_recs = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"] + if not scoring_recs: + return {"mean": 0, "stats": None, "distribution": [], "cold_start_rate": 0, "trend": [], "count": 0} + + hit_ratios = [r.get("selected_hitRatio", 0) for 
r in scoring_recs] + cold_starts = sum(1 for r in scoring_recs if not r.get("hitRatios")) + + stats = compute_statistics(hit_ratios, distribution_spec="0-20,20-40,40-60,60-80,80-100") + trend = time_bucket(scoring_recs, "auto", [("selected_hitRatio", "mean")]) + + return { + "mean": stats["mean"], + "stats": stats, + "distribution": stats.get("distribution", []), + "cold_start_rate": round(cold_starts / len(scoring_recs) * 100, 1) if scoring_recs else 0, + "trend": trend, + "count": len(scoring_recs), + } + + +def compute_session_hitrate(stats_recs, inference_count): + """计算第二层:Session Hit Rate。""" + total_hits = sum(r.get("hits", 0) for r in stats_recs) + total_total = sum(r.get("total", 0) for r in stats_recs) + + session_hr = round(total_hits / total_total * 100, 1) if total_total else 0 + + # 趋势:每个窗口的 hits/total + trend = time_bucket(stats_recs, "auto", [("hits", "sum"), ("total", "sum")]) + for t in trend: + h = t.get("hits_sum", 0) + tot = t.get("total_sum", 0) + t["value"] = round(h / tot * 100, 1) if tot else 0 + + return { + "rate": session_hr, + "hits": total_hits, + "total": total_total, + "inference_count": inference_count, + "trend": trend, + } + + +def compute_per_worker_stats(strategies): + """计算第三层:Per-Worker Cache Stats。""" + scoring_recs = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"] + if not scoring_recs: + return [] + + worker_data = defaultdict(lambda: {"selected_count": 0, "hit_ratios": []}) + total_scoring = len(scoring_recs) + + for r in scoring_recs: + selected = r.get("selected", "") + if selected: + worker_data[selected]["selected_count"] += 1 + worker_data[selected]["hit_ratios"].append(r.get("selected_hitRatio", 0)) + + result = [] + for worker, data in worker_data.items(): + avg_hr = round(sum(data["hit_ratios"]) / len(data["hit_ratios"]), 1) if data["hit_ratios"] else 0 + result.append( + { + "Worker": _strip_scheme(worker), + "Selected": data["selected_count"], + "Select%": f"{round(data['selected_count'] / 
total_scoring * 100, 1)}%", + "AvgHitRatio": f"{avg_hr}%", + } + ) + + result.sort(key=lambda x: x["Selected"], reverse=True) + return result + + +def compute_scheduling_stats(strategies): + """计算调度策略概况。""" + if not strategies: + return {"scoring_count": 0, "fallback_count": 0, "scoring_pct": 0, "fallback_reasons": [], "suboptimal_pct": 0} + + scoring = [r for r in strategies if r.get("strategy") == "cache_aware_scoring"] + fallback = [r for r in strategies if r.get("strategy") == "process_tokens"] + + # Fallback 原因分类 + fallback_reasons = count_by(fallback, "reason") if fallback else [] + + # 非最优命中选择比例 + suboptimal = 0 + for r in scoring: + hit_ratios = r.get("hitRatios", {}) + if not hit_ratios: + continue + selected_hr = r.get("selected_hitRatio", 0) + max_hr = max(hit_ratios.values()) if hit_ratios else 0 + if selected_hr < max_hr: + suboptimal += 1 + + total = len(strategies) + return { + "scoring_count": len(scoring), + "fallback_count": len(fallback), + "scoring_pct": round(len(scoring) / total * 100, 1) if total else 0, + "fallback_reasons": fallback_reasons, + "suboptimal_count": suboptimal, + "suboptimal_pct": round(suboptimal / len(scoring) * 100, 1) if scoring else 0, + } + + +def cross_diagnose(prefix_hr, session_hr): + """交叉诊断矩阵。""" + p_high = prefix_hr["mean"] >= 60 + s_high = session_hr["rate"] >= 60 + + if s_high and p_high: + return { + "icon": "\u2705", + "summary": "cache-aware 策略运行良好", + "detail": "Session 粘性好,KV cache 实际复用度高", + } + elif s_high and not p_high: + return { + "icon": "\u26a0\ufe0f", + "summary": "Session 粘性好但 Prefix HR 低", + "detail": "prompt 内容变化大,同 worker 的 KV cache 实际复用低", + } + elif not s_high and p_high: + return { + "icon": "\u26a0\ufe0f", + "summary": "换 worker 频繁但 Prefix HR 尚可", + "detail": "负载均衡分散了请求,但新 worker 也有类似前缀缓存", + } + else: + return { + "icon": "\u274c", + "summary": "命中率全面偏低", + "detail": "负载均衡强制分散或缓存未预热,建议检查 worker 数量和 session 分配策略", + } + + +# ════════════════════════════════════════════════════════════════ +# 
Phase 4: 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def _quartile_trend(trend, value_field): + """将趋势数据分为 4 个 quartile,计算每段均值。""" + if not trend: + return "" + n = len(trend) + if n < 4: + values = [t.get(value_field, 0) for t in trend] + avg = round(sum(values) / len(values), 1) if values else 0 + return f"{avg}%" + + q_size = n // 4 + quartiles = [] + for i in range(4): + start = i * q_size + end = start + q_size if i < 3 else n + vals = [t.get(value_field, 0) for t in trend[start:end]] + quartiles.append(round(sum(vals) / len(vals), 1) if vals else 0) + + arrow = ( + "\u2191" if quartiles[3] > quartiles[0] + 10 else "\u2193" if quartiles[3] < quartiles[0] - 10 else "\u2192" + ) + return f"Q1={quartiles[0]}% \u2192 Q2={quartiles[1]}% \u2192 Q3={quartiles[2]}% \u2192 Q4={quartiles[3]}% {arrow}" + + +def format_full_report( + filepath, line_count, prefix_hr, session_hr, per_worker, scheduling, diagnosis, time_span=None, window_rows=None +): + """格式化完整终端报告。""" + parts = [] + + # 标题 + span_str = time_span or "" + parts.append("## Cache Hit Rate Report") + parts.append(f"**File**: {filepath} | **Lines**: {line_count:,}") + if span_str: + parts.append(f"**Span**: {span_str}") + parts.append("") + + # 图表说明 + parts.append("### 图表说明(如何解读)") + parts.append(" - Unicode 柱状图:每行代表一个 Prefix HR 区间(如 60-80%),条越长表示该区间请求占比越高。") + parts.append(" - ASCII 折线图:横轴是时间窗口,纵轴是命中率(0-100%);越靠上表示命中率越高。") + parts.append(" - 趋势 Q1→Q4:把时间均分为四段,比较首尾;↑ 上升,↓ 下降,→ 基本稳定。") + parts.append("") + + # 1. Prefix Hit Ratio + parts.append("### 1. 
Prefix Hit Ratio (KV Cache 内容复用度)") + if prefix_hr["stats"]: + _ = prefix_hr["stats"] + parts.append(f' 累计平均: {prefix_hr["mean"]}% (被选中 worker, N={prefix_hr["count"]})') + parts.append(" 分布:") + + dist_data = [ + {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] + ] + parts.append(" Unicode 柱状图(Prefix HR 分布):") + parts.append(render_bar(dist_data, show_count=True)) + + parts.append(f' 冷启动率: {prefix_hr["cold_start_rate"]}%') + + trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") + if trend_str: + parts.append(f" 趋势: {trend_str}") + + # Sparkline + if prefix_hr["trend"]: + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] + parts.append("") + parts.append(" ASCII 折线图(Prefix HR 趋势):") + parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) + else: + parts.append(" (无 cache_aware_scoring 数据)") + parts.append("") + + # 2. Session Hit Rate + parts.append("### 2. Session Hit Rate (请求级路由粘性)") + parts.append(f' 累计: {session_hr["rate"]}% (hits={session_hr["hits"]} / total={session_hr["total"]})') + trend_str = _quartile_trend(session_hr["trend"], "value") + if trend_str: + parts.append(f" 趋势: {trend_str}") + + if session_hr["trend"]: + parts.append("") + parts.append(" ASCII 折线图(Session HR 趋势):") + parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) + parts.append("") + + # 3. Per-Worker + parts.append("### 3. Per-Worker Cache Stats") + if per_worker: + parts.append( + render_table( + per_worker, + columns=["Worker", "Selected", "Select%", "AvgHitRatio"], + right_align={"Selected", "Select%", "AvgHitRatio"}, + ) + ) + else: + parts.append(" (无数据)") + parts.append("") + + # 4. Scheduling Strategy + parts.append("### 4. 
Scheduling Strategy") + parts.append( + f' cache_aware_scoring: {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' + f' | fallback: {scheduling["fallback_count"]}' + ) + if scheduling["fallback_reasons"]: + reasons = ", ".join(f'{r["value"]}={r["count"]}' for r in scheduling["fallback_reasons"]) + parts.append(f" fallback reasons: {reasons}") + parts.append( + f' 非最优命中选择: {scheduling["suboptimal_pct"]}%' + f' ({scheduling.get("suboptimal_count", 0)} 次, 负载均衡优先于命中率)' + ) + parts.append("") + + # 5. Diagnosis + parts.append("### 5. Diagnosis") + parts.append(f' {diagnosis["icon"]} {diagnosis["summary"]}') + parts.append(f' {diagnosis["detail"]}') + + # 6. 每窗口明细预览 + if window_rows: + parts.append("") + parts.append("### 6. 每5s窗口明细预览(前10行)") + parts.append( + render_table( + window_rows[:10], + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running (prefill≈stats/2)"], + right_align={"Scoring", "Fallback", "Total Running (prefill≈stats/2)"}, + ) + ) + + return "\n".join(parts) + + +def format_tail_report(filepath, line_count, prefix_hr, session_hr, scheduling): + """格式化 --tail 精简报告。""" + parts = [] + parts.append("## Cache Hit Rate (Recent)") + parts.append(f"**File**: {filepath} | **tail {line_count} lines**") + parts.append("") + parts.append(f' Prefix Hit Ratio: {prefix_hr["mean"]}% (avg) | Cold start: {prefix_hr["cold_start_rate"]}%') + parts.append(f' Session Hit Rate: {session_hr["rate"]}% (hits={session_hr["hits"]}/total={session_hr["total"]})') + parts.append( + f' Strategy: scoring {scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)' + f' | fallback {scheduling["fallback_count"]}' + ) + + # Sparkline + if prefix_hr["trend"]: + parts.append("") + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] + parts.append(render_sparkline(sparkline_data, title="Recent Prefix HR", y_label="%", y_range=(0, 100))) + parts.append(" 说明: 折线越靠上表示对应时间窗口 Prefix HR 
越高。") + + return "\n".join(parts) + + +def build_per_window_rows(strategies, stats_recs): + """构建每窗口明细行,用于终端预览和 details 导出。""" + time_data = defaultdict( + lambda: { + "prefix_vals": [], + "hits": 0, + "total": 0, + "scoring": 0, + "fallback": 0, + "running": 0, + "has_running": False, + } + ) + for r in strategies: + ts = r.get("ts", "") + if r.get("strategy") == "cache_aware_scoring": + time_data[ts]["scoring"] += 1 + time_data[ts]["prefix_vals"].append(r.get("selected_hitRatio", 0)) + else: + time_data[ts]["fallback"] += 1 + + for r in stats_recs: + ts = r.get("ts", "") + time_data[ts]["hits"] += r.get("hits", 0) + time_data[ts]["total"] += r.get("total", 0) + if "total_running" in r: + time_data[ts]["running"] += r.get("total_running", 0) + time_data[ts]["has_running"] = True + + rows = [] + for ts in sorted(time_data.keys()): + d = time_data[ts] + short_ts = ts.split(" ")[-1] if " " in ts else ts + if d["prefix_vals"]: + prefix_mean = round(sum(d["prefix_vals"]) / len(d["prefix_vals"]), 1) + prefix_hr = f"{prefix_mean}%" + else: + prefix_hr = "-" + + if d["total"] > 0: + session_val = round(d["hits"] / d["total"] * 100, 1) + session_hr = f'{session_val}% ({d["hits"]}/{d["total"]})' + else: + session_hr = "-" + + running = _format_half_running(d["running"]) if d["has_running"] else "-" + rows.append( + { + "Time": short_ts, + "Prefix HR": prefix_hr, + "Session HR": session_hr, + "Scoring": str(d["scoring"]), + "Fallback": str(d["fallback"]), + "Total Running (prefill≈stats/2)": running, + } + ) + return rows + + +def save_detailed_report( + filepath, + strategies, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=None, +): + """导出详细数据 Markdown 文件。 + + 主报告包含 Per-Worker 统计和 Fallback 明细。 + 每窗口明细数据拆分到 details/per_window_data.md。 + """ + summary_dir = os.path.join(output_dir, "summary") + details_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(details_dir, 
exist_ok=True) + output_path = os.path.join(summary_dir, "cache_hitrate_report.md") + + parts = [] + parts.append("# Cache Hit Rate Detailed Report") + parts.append(f'**Generated**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}') + parts.append(f"**Source**: {filepath}") + if time_span: + parts.append(f"**Span**: {time_span}") + parts.append("") + + parts.append("## 图表说明(Legend)") + parts.append("- **Unicode 柱状图**: 展示 Prefix HR 分布,`█` 越多说明该命中率区间占比越高。") + parts.append("- **ASCII 折线图**: 展示命中率随时间变化,横轴为时间窗口,纵轴为命中率(0-100%)。") + parts.append("- **Q1~Q4 趋势**: 将观察区间均分四段,反映整体走向(↑/↓/→)。") + parts.append("") + + # 1) 主指标摘要(与终端一致,避免“只在终端可见”) + parts.append("## 1. Key Metrics Summary") + parts.append("") + parts.append("### Prefix Hit Ratio") + if prefix_hr["stats"]: + parts.append(f'- 累计平均: **{prefix_hr["mean"]}%** (N={prefix_hr["count"]})') + parts.append(f'- 冷启动率: **{prefix_hr["cold_start_rate"]}%**') + trend_str = _quartile_trend(prefix_hr["trend"], "selected_hitRatio_mean") + if trend_str: + parts.append(f"- 趋势: {trend_str}") + dist_data = [ + {"label": d["range"] + "%", "value": d["pct"], "count": d["count"]} for d in prefix_hr["distribution"] + ] + parts.append("") + parts.append("```text") + parts.append("Unicode 柱状图(Prefix HR 分布)") + parts.append(render_bar(dist_data, show_count=True)) + if prefix_hr["trend"]: + sparkline_data = [ + {"bucket": t["bucket"], "value": t.get("selected_hitRatio_mean", 0)} for t in prefix_hr["trend"] + ] + parts.append("") + parts.append("ASCII 折线图(Prefix HR 趋势)") + parts.append(render_sparkline(sparkline_data, title="Prefix HR Trend", y_label="%", y_range=(0, 100))) + parts.append("```") + else: + parts.append("- (无 cache_aware_scoring 数据)") + parts.append("") + + parts.append("### Session Hit Rate") + parts.append(f'- 累计: **{session_hr["rate"]}%** (hits={session_hr["hits"]}/total={session_hr["total"]})') + trend_str = _quartile_trend(session_hr["trend"], "value") + if trend_str: + parts.append(f"- 趋势: {trend_str}") + if 
session_hr["trend"]: + parts.append("") + parts.append("```text") + parts.append("ASCII 折线图(Session HR 趋势)") + parts.append(render_sparkline(session_hr["trend"], title="Session HR Trend", y_label="%", y_range=(0, 100))) + parts.append("```") + parts.append("") + + parts.append("### Scheduling Strategy") + parts.append( + f'- cache_aware_scoring: **{scheduling["scoring_count"]} ({scheduling["scoring_pct"]}%)**' + f' | fallback: **{scheduling["fallback_count"]}**' + ) + parts.append( + f'- 非最优命中选择: **{scheduling["suboptimal_pct"]}%**' + f' ({scheduling.get("suboptimal_count", 0)} 次, 负载均衡优先于命中率)' + ) + parts.append(f'- Diagnosis: {diagnosis["icon"]} {diagnosis["summary"]};{diagnosis["detail"]}') + parts.append("") + + # 2) Per-Worker 完整统计 + parts.append("## 2. Per-Worker 完整统计") + parts.append("") + if per_worker: + parts.append( + render_table( + per_worker, + columns=["Worker", "Selected", "Select%", "AvgHitRatio"], + right_align={"Selected", "Select%", "AvgHitRatio"}, + ) + ) + parts.append("") + + # 3) Fallback 明细 + if scheduling["fallback_reasons"]: + parts.append("## 3. 
Fallback 明细") + for reason in scheduling["fallback_reasons"]: + parts.append(f'- **{reason["value"]}**: {reason["count"]} 次 ({reason["pct"]}%)') + parts.append("") + + # 每窗口明细 → 拆分到 details/ + window_rows = build_per_window_rows(strategies, stats_recs) + window_rows_merged = merge_blank_window_rows(window_rows) + session_rows = compute_session_details(strategies, _strip_scheme) + session_summary = summarize_session_details(session_rows) + + if window_rows: + # 主报告中添加引用 + parts.append( + f"> 每5s窗口明细数据(原始 {len(window_rows)} 条,合并后 {len(window_rows_merged)} 条):" + " [../detail/per_window_data.md](../detail/per_window_data.md)" + ) + parts.append("") + + # 写入 details 子目录 + detail_parts = ["# 每5s窗口明细数据", ""] + detail_parts.append( + "> 注:连续空窗口(Prefix/Session 都为空、且 Scoring/Fallback=0)已按 3 行格式合并展示(起始/合并说明/结束)。" + ) + detail_parts.append("") + detail_parts.append( + render_table( + window_rows_merged, + columns=["Time", "Prefix HR", "Session HR", "Scoring", "Fallback", "Total Running (prefill≈stats/2)"], + right_align={"Scoring", "Fallback", "Total Running (prefill≈stats/2)"}, + ) + ) + detail_parts.append("") + + detail_path = os.path.join(details_dir, "per_window_data.md") + with open(detail_path, "w") as f: + f.write("\n".join(detail_parts)) + + if session_rows: + parts.append( + f"> Session 命中详情 ({len(session_rows)} sessions): [../detail/session_hit_details.md](../detail/session_hit_details.md)" + ) + parts.append("") + + all_rows_with_seq = [] + for i, r in enumerate(session_rows, start=1): + all_rows_with_seq.append({**r, "id": _seq_label(i)}) + id_type_ranges = _summarize_id_type_ranges(all_rows_with_seq) + seq_map = {r["session"]: r["id"] for r in all_rows_with_seq} + ts_starts = [r.get("first_ts", "-") for r in all_rows_with_seq if r.get("first_ts", "-") != "-"] + ts_ends = [r.get("last_ts", "-") for r in all_rows_with_seq if r.get("last_ts", "-") != "-"] + + session_parts = ["# Session 命中详情", ""] + overall_start_ts = min(ts_starts) if ts_starts else "-" + 
overall_end_ts = max(ts_ends) if ts_ends else "-" + session_parts.append("## 时间范围") + session_parts.append(f"- 分析覆盖时间段: `{overall_start_ts} ~ {overall_end_ts}`") + session_parts.append("") + session_parts.append("## id_type 摘要") + if len(id_type_ranges) == 1: + start_id, end_id, id_type, range_start_ts, range_end_ts = id_type_ranges[0] + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + session_parts.append(f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + for start_id, end_id, id_type, range_start_ts, range_end_ts in id_type_ranges: + if start_id == end_id: + session_parts.append(f"- `{start_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)") + else: + session_parts.append( + f"- `{start_id}~{end_id}`: `{id_type}` (`{range_start_ts} ~ {range_end_ts}`)" + ) + session_parts.append("") + session_parts.append("## 概览") + session_parts.append("- 字段说明:`avg-hit` = `avg_hit(excl_first)`(去除首请求后的平均命中率)") + session_parts.append(f'- Total sessions: **{session_summary["total_sessions"]}**') + session_parts.append( + f'- Sessions with >1 request: **{session_summary["multi_req"]}**' + f' | single request: **{session_summary["single_req"]}**' + ) + if session_summary["multi_req"] > 0: + sticky_pct = round(session_summary["sticky_multi"] / session_summary["multi_req"] * 100, 1) + session_parts.append( + f'- Sticky (multi-request): **{session_summary["sticky_multi"]} ({sticky_pct}%)**' + f' | non-sticky: **{session_summary["non_sticky_multi"]}**' + ) + session_parts.append( + f'- Non-first request avg hit: **{session_summary["non_first_avg"]}%**' + f' (N={session_summary["non_first_total"]})' + ) + session_parts.append("") + focus_columns = [ + "id", + "req_count", + "sticky", + "purl_cnt", + "avg-hit", + "max_hit", + "min_hit", + "switch_reqids", + ] + session_parts.append("## 优先排查 Session(Top 20)") + prioritized_rows = sorted( + session_rows, + key=lambda r: ( + 
0 if r.get("sticky") == "no" else 1, + int(str(r.get("min_hit", "0")).rstrip("%") or 0), + -int(r.get("req_count", 0)), + ), + )[:20] + compact_rows = [] + + for r in prioritized_rows: + sid = seq_map.get(r["session"], "-") + compact_rows.append( + { + "id": sid, + "req_count": r["req_count"], + "sticky": r["sticky"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], + "max_hit": r["max_hit"], + "min_hit": r["min_hit"], + "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", + } + ) + session_parts.append( + _render_markdown_table(compact_rows, focus_columns, align_right={"req_count", "purl_cnt"}) + ) + session_parts.append("") + + session_columns = focus_columns + all_rows_for_table = [] + for r in all_rows_with_seq: + sid = r["id"] + all_rows_for_table.append( + { + "id": sid, + "req_count": r["req_count"], + "sticky": r["sticky"], + "purl_cnt": r.get("prefill_url_count", 0), + "avg-hit": r["avg_hit(excl_first)"], + "max_hit": r["max_hit"], + "min_hit": r["min_hit"], + "switch_reqids": f"[查看](#switch-{sid.lower()})" if r["switch_req_pairs"] != "-" else "-", + } + ) + session_parts.append("## 全量明细(Markdown 表格)") + session_parts.append( + _render_markdown_table( + all_rows_for_table, + session_columns, + align_right={"req_count", "purl_cnt"}, + ) + ) + session_parts.append("") + + session_parts.append("## 序号与会话ID映射") + map_rows = [ + { + "id": r["id"], + "session_or_trace_id": r["session"], + } + for r in all_rows_with_seq + ] + session_parts.append(_render_markdown_table(map_rows, ["id", "session_or_trace_id"])) + session_parts.append("") + + session_parts.append("## 切换 reqid 明细(可跳转)") + for r in all_rows_with_seq: + session_parts.append(f'### switch-{r["id"].lower()}') + session_parts.append(f'- ID: **{r["id"]}**') + session_parts.append(f'- 会话标识: `{r["session"]}` ({r.get("id_type", "session_id")})') + session_parts.append(f'- 时间段: `{r.get("first_ts", "-")} ~ {r.get("last_ts", "-")}`') + 
session_parts.append(f'- switch_req_pairs: {r["switch_req_pairs"]}') + session_parts.append(f'- sharp_drop_request_ids: {r["sharp_drop_request_ids"]}') + session_parts.append("") + + session_path = os.path.join(details_dir, "session_hit_details.md") + with open(session_path, "w") as f: + f.write("\n".join(session_parts)) + + with open(output_path, "w") as f: + f.write("\n".join(parts)) + + return output_path + + +# ════════════════════════════════════════════════════════════════ +# 时间跨度计算 +# ════════════════════════════════════════════════════════════════ + + +def compute_time_span(strategies, stats_recs): + """从数据中计算时间跨度字符串。""" + all_ts = [] + for r in strategies + stats_recs: + ts = r.get("ts", "") + if ts: + try: + all_ts.append(parse_ts(ts)) + except ValueError: + pass + if len(all_ts) < 2: + return None + t_min = min(all_ts) + t_max = max(all_ts) + duration = t_max - t_min + hours = int(duration.total_seconds() // 3600) + minutes = int((duration.total_seconds() % 3600) // 60) + start = t_min.strftime("%Y-%m-%d %H:%M:%S") + end = t_max.strftime("%Y-%m-%d %H:%M:%S") + if hours > 0: + return f"{start} ~ {end} ({hours}h{minutes}m)" + return f"{start} ~ {end} ({minutes}m)" + + +# ════════════════════════════════════════════════════════════════ +# CLI 入口 +# ════════════════════════════════════════════════════════════════ + + +def parse_args(): + parser = argparse.ArgumentParser( + description="FastDeploy Go Router Cache 命中率统计", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("log_file", help="日志文件路径") + parser.add_argument( + "--tail", + nargs="?", + const="2000", + help="只分析尾部数据(支持 2000、1k、1w 等行数写法)。按时间请使用 --start/--end", + ) + parser.add_argument( + "--output", default=None, help="详细报告输出目录(默认:skill_output/stat-cache-hitrate//)" + ) + parser.add_argument( + "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' + ) + parser.add_argument("--end", default=None, help='结束时间(如 
"17:00:00"、"03/31 17:00"、"2026/03/31 17:00:00")') + return parser.parse_args() + + +def parse_tail_arg(tail_str): + """解析 --tail 参数,返回行数 int。支持数字及 k/w 缩写。""" + if tail_str is None: + return None + + s = str(tail_str).strip().lower() + if not s: + raise ValueError("--tail 不能为空") + + m = re.fullmatch(r"(\d+)([kw])?", s) + if not m: + raise ValueError("不支持的 --tail 格式:请使用 2000、1k、1w 等行数写法。按时间请改用 --start/--end") + + value = int(m.group(1)) + unit = m.group(2) + if unit == "k": + value *= 1000 + elif unit == "w": + value *= 10000 + + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return value + + +def main(): + args = parse_args() + + # 验证文件存在 + if not os.path.isfile(args.log_file): + print(f"Error: 文件不存在: {args.log_file}", file=sys.stderr) + sys.exit(1) + + # --tail 与 --start/--end 不能混用(两者是不同的范围选择方式) + if args.tail and (args.start or args.end): + print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr) + sys.exit(1) + + try: + tail = parse_tail_arg(args.tail) + except ValueError as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + # 时间范围预过滤(--start 和 --end 可单独或同时指定) + import atexit + + log_file = args.log_file + if args.start or args.end: + start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None + end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None + filtered_path, is_temp = filter_file_by_time_range(log_file, start_ts, end_ts) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + + # Phase 2: 提取 + 解析 + strategy_recs, stats_recs, inference_count, line_count = extract_data(log_file, tail) + + if not strategy_recs and not stats_recs: + print( + "Warning: 未找到 cache-aware 策略行或 [stats] 行。" "请确认日志文件包含 Go Router 日志。", file=sys.stderr + ) + sys.exit(0) + + # Phase 3: 计算三层指标 + prefix_hr = compute_prefix_hitrate(strategy_recs) + 
session_hr = compute_session_hitrate(stats_recs, inference_count) + per_worker = compute_per_worker_stats(strategy_recs) + scheduling = compute_scheduling_stats(strategy_recs) + diagnosis = cross_diagnose(prefix_hr, session_hr) + + # Phase 4: 输出 + # 无论 tail 还是全量模式,都生成详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + if args.output: + output_base = args.output + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) + output_base = os.path.join(golang_router_root, "skill_output", "stat-cache-hitrate") + output_dir = os.path.join(output_base, run_timestamp) + + time_span = compute_time_span(strategy_recs, stats_recs) + window_rows = build_per_window_rows(strategy_recs, stats_recs) + + if tail is not None: + # tail 精简模式:打印摘要 + 生成详细报告 + print(format_tail_report(args.log_file, line_count, prefix_hr, session_hr, scheduling)) + else: + # 全量模式:打印完整报告 + print( + format_full_report( + args.log_file, + line_count, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + time_span, + window_rows=window_rows, + ) + ) + + # 导出详细报告(tail 和全量都生成) + report_path = save_detailed_report( + args.log_file, + strategy_recs, + stats_recs, + prefix_hr, + session_hr, + per_worker, + scheduling, + diagnosis, + output_dir, + time_span=time_span, + ) + print("\n\U0001f4c4 详细数据见:") + report_abs, report_uri = _build_path_links(report_path) + print(f" - 报告文件: [{report_abs}]({report_uri})") + details_path = os.path.join(output_dir, "detail", "per_window_data.md") + if os.path.exists(details_path): + details_abs, details_uri = _build_path_links(details_path) + print(f" - 窗口明细: [{details_abs}]({details_uri})") + session_detail_path = os.path.join(output_dir, "detail", "session_hit_details.md") + if os.path.exists(session_detail_path): + session_abs, session_uri = _build_path_links(session_detail_path) + print(f" - Session 明细: [{session_abs}]({session_uri})") + + +if __name__ 
== "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py new file mode 100644 index 00000000000..a197ee7aff0 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/stats.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Stats — 通用统计计算工具 + +提供百分位数、分布、时间窗口聚合、分组计数等通用统计函数。 +不含任何业务逻辑或日志格式依赖。 + +Python 3 stdlib only,零依赖。 +""" + +import math +from collections import defaultdict +from datetime import datetime, timedelta + +# ════════════════════════════════════════════════════════════════ +# 百分位数与基础统计 +# ════════════════════════════════════════════════════════════════ + + +def percentile(sorted_vals, p): + """从已排序列表计算第 p 百分位数(线性插值)。""" + if not sorted_vals: + return 0.0 + n = len(sorted_vals) + k = (p / 100.0) * (n - 1) + f = math.floor(k) + c = math.ceil(k) + if f == c: + return sorted_vals[int(k)] + return sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f) + + +def compute_statistics(values, percentiles_list=None, distribution_spec=None): + """计算一组数值的统计量。 + + Args: + values: 数值列表 + percentiles_list: 要计算的百分位数列表,默认 [50, 90, 95, 99] + distribution_spec: 分布区间规格字符串,如 '0-20,20-40,40-60,60-80,80-100' + + Returns: + dict: {count, min, max, mean, sum, stddev, p50, p90, ..., distribution} + """ + if percentiles_list is None: + percentiles_list = [50, 90, 95, 99] + + if not values: + result = {"count": 0, "min": 0, "max": 0, "mean": 0, "sum": 0, "stddev": 0} + for p in percentiles_list: + result[f"p{p}"] = 0 + if distribution_spec is not None: + result["distribution"] = [] + return result + + sorted_vals = sorted(values) + n = len(sorted_vals) + total = sum(sorted_vals) + mean = total / n + variance = sum((x - mean) ** 2 for x in sorted_vals) / n + stddev = math.sqrt(variance) + + result = { + "count": n, + "min": round(sorted_vals[0], 3), + "max": round(sorted_vals[-1], 3), + "mean": round(mean, 3), + "sum": 
round(total, 3), + "stddev": round(stddev, 3), + } + + for p in percentiles_list: + result[f"p{p}"] = round(percentile(sorted_vals, p), 3) + + if distribution_spec is not None: + result["distribution"] = compute_distribution(sorted_vals, distribution_spec) + + return result + + +def compute_distribution(sorted_vals, spec_str): + """根据区间规格计算分布直方图。 + + spec_str 示例:'0-20,20-40,40-60,60-80,80-100' + 每个区间是左闭右开 [lo, hi)。 + """ + buckets = _parse_distribution_spec(spec_str) + n = len(sorted_vals) + result = [] + for b in buckets: + if b[0] == "lt": + count = sum(1 for v in sorted_vals if v < b[1]) + label = b[2] + elif b[0] == "gt": + count = sum(1 for v in sorted_vals if v > b[1]) + label = b[2] + elif b[0] == "range": + count = sum(1 for v in sorted_vals if b[1] <= v < b[2]) + label = b[3] + else: + continue + result.append({"range": label, "count": count, "pct": round(count / n * 100, 1) if n else 0}) + return result + + +def _parse_distribution_spec(spec_str): + """解析分布区间规格:'<100,100-500,>1000' → bucket 定义列表。""" + buckets = [] + for part in spec_str.split(","): + part = part.strip() + if part.startswith("<"): + buckets.append(("lt", float(part[1:]), part)) + elif part.startswith(">"): + buckets.append(("gt", float(part[1:]), part)) + elif "-" in part: + lo, hi = part.split("-", 1) + buckets.append(("range", float(lo), float(hi), part)) + return buckets + + +# ════════════════════════════════════════════════════════════════ +# 时间窗口聚合 +# ════════════════════════════════════════════════════════════════ + + +def time_bucket(records, window="auto", agg_specs=None, ts_field="ts"): + """按时间窗口聚合记录。 + + Args: + records: dict 列表,每个 dict 必须有 ts_field 字段 + window: 窗口大小 '5s'/'1m'/'5m'/'auto' + agg_specs: 聚合规格列表 [(field, func), ...],如 [('selected_hitRatio', 'mean')] + func 支持:count, sum, mean, min, max, pNN + ts_field: 时间戳字段名 + + Returns: + list[dict]: 每个窗口一条记录 {bucket, count, field_func, ...} + """ + if agg_specs is None: + agg_specs = [("_", "count")] + + if not records: + return 
[] + + window_td = _parse_window(window, records, ts_field) + + # 按窗口分组 + buckets = defaultdict(list) + for r in records: + ts_str = r.get(ts_field, "") + if not ts_str: + continue + try: + dt = datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + except ValueError: + continue + bucket_dt = _align_to_bucket(dt, window_td) + bucket_key = bucket_dt.strftime("%Y/%m/%d %H:%M:%S") + buckets[bucket_key].append(r) + + # 按时间排序并聚合 + result = [] + for bucket_key in sorted(buckets.keys()): + bucket_records = buckets[bucket_key] + entry = {"bucket": bucket_key, "count": len(bucket_records)} + + for field, func in agg_specs: + if field == "_": + if func == "count": + entry["count"] = len(bucket_records) + continue + + values = [] + for r in bucket_records: + v = r.get(field) + if v is not None: + try: + values.append(float(v)) + except (ValueError, TypeError): + pass + + out_key = f"{field}_{func}" + entry[out_key] = _aggregate_values(values, func) + + result.append(entry) + + return result + + +def _parse_window(window_str, records, ts_field): + """解析窗口字符串为 timedelta。'auto' 根据数据跨度自动选择。""" + if window_str == "auto": + timestamps = [] + for r in records: + ts_str = r.get(ts_field, "") + if ts_str: + try: + timestamps.append(datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S")) + except ValueError: + pass + if len(timestamps) < 2: + return timedelta(minutes=1) + span = max(timestamps) - min(timestamps) + if span < timedelta(minutes=30): + return timedelta(seconds=5) + elif span < timedelta(hours=3): + return timedelta(minutes=1) + else: + return timedelta(minutes=5) + elif window_str.endswith("s"): + return timedelta(seconds=int(window_str[:-1])) + elif window_str.endswith("m"): + return timedelta(minutes=int(window_str[:-1])) + elif window_str.endswith("h"): + return timedelta(hours=int(window_str[:-1])) + return timedelta(minutes=1) + + +def _align_to_bucket(dt, window_td): + """将 datetime 对齐到窗口边界。""" + secs = max(1, int(window_td.total_seconds())) + epoch = datetime(dt.year, dt.month, 
dt.day) + offset = int((dt - epoch).total_seconds()) + aligned = (offset // secs) * secs + return epoch + timedelta(seconds=aligned) + + +def _aggregate_values(values, func): + """用指定函数聚合一组数值。""" + if not values: + return 0 + if func == "count": + return len(values) + elif func == "sum": + return round(sum(values), 3) + elif func == "mean": + return round(sum(values) / len(values), 3) + elif func == "min": + return round(min(values), 3) + elif func == "max": + return round(max(values), 3) + elif func.startswith("p"): + p = int(func[1:]) + return round(percentile(sorted(values), p), 3) + return 0 + + +# ════════════════════════════════════════════════════════════════ +# 分组计数 +# ════════════════════════════════════════════════════════════════ + + +def count_by(records, field, top_n=None): + """按指定字段分组计数。 + + Args: + records: dict 列表 + field: 分组字段名 + top_n: 只返回前 N 个(按计数降序) + + Returns: + list[dict]: [{value, count, pct}],按计数降序排列 + """ + counts = defaultdict(int) + total = 0 + for r in records: + val = r.get(field) + if val is not None: + counts[str(val)] += 1 + total += 1 + + result = [] + for val, count in sorted(counts.items(), key=lambda x: -x[1]): + result.append({"value": val, "count": count, "pct": round(count / total * 100, 1) if total else 0}) + + if top_n: + result = result[:top_n] + + return result diff --git a/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py new file mode 100644 index 00000000000..4e09710f6f9 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/stat-cache-hitrate/scripts/window_utils.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +""" +窗口明细压缩工具:合并连续空窗口,降低 per_window_data.md 噪声。 +""" + +RUNNING_COL = "Total Running (prefill≈stats/2)" + + +def _is_blank_window_row(row): + """判断是否为空窗口(无 Prefix/Session 明细值)。""" + return ( + row.get("Prefix HR") == "-" + and row.get("Session HR") == "-" + and row.get("Scoring") in {"0", 0} + 
and row.get("Fallback") in {"0", 0} + ) + + +def merge_blank_window_rows(rows, min_merge_len=5): + """合并连续空窗口,避免明细表被大量 '-' 行淹没。 + + 对于连续空窗口段(长度 >= min_merge_len),压缩成 3 行: + 1) 起始时间行 + 2) 合并说明行(含窗口数量) + 3) 结束时间行 + """ + if not rows: + return rows + + merged = [] + i = 0 + while i < len(rows): + if not _is_blank_window_row(rows[i]): + merged.append(rows[i]) + i += 1 + continue + + j = i + while j < len(rows) and _is_blank_window_row(rows[j]): + j += 1 + + seg_len = j - i + if seg_len < min_merge_len: + merged.extend(rows[i:j]) + i = j + continue + + start_t = rows[i]["Time"] + end_t = rows[j - 1]["Time"] + merged.append( + { + "Time": start_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + RUNNING_COL: rows[i].get(RUNNING_COL, "-"), + } + ) + merged.append( + { + "Time": "|", + "Prefix HR": "-", + "Session HR": f"merged {seg_len} windows", + "Scoring": "0", + "Fallback": "0", + RUNNING_COL: "-", + } + ) + merged.append( + { + "Time": end_t, + "Prefix HR": "-", + "Session HR": "-", + "Scoring": "0", + "Fallback": "0", + RUNNING_COL: rows[j - 1].get(RUNNING_COL, "-"), + } + ) + i = j + + return merged diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md new file mode 100644 index 00000000000..ecb27c1436a --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/SKILL.md @@ -0,0 +1,164 @@ +--- +name: troubleshoot +description: > + FastDeploy Go Router 综合问题排查 skill。覆盖错误分类、延迟分析、请求追踪、Worker 健康时间线、 + Cache 调度诊断、负载与计数器分析六个维度。输出按三层问题来源分类:Router 自身、FastDeploy 后端、客户端。 + + 当用户要求以下操作时触发此 skill:排查 router 问题、分析 router 日志、router 排查、 + 查看 router 状态、综合排查、全量扫描、troubleshoot router、/troubleshoot、 + 分析错误日志、502/503 排查、延迟分析、Worker 健康、负载分析、cache 调度诊断、 + 请求追踪、trace 请求。 + 关键词:troubleshoot、排查、router 问题、全量扫描、综合分析、error、502、latency、 + health、load、cache、trace、/troubleshoot。 + +--- + +# Router Troubleshooting + +综合排查 FastDeploy Go Router 问题,输出完整诊断报告。 + +> IMPORTANT: 
执行前务必先读取 `references/log_patterns.md` 了解日志格式和提取规则。错误分类时参考 `references/error_catalog.md`。涉及后端问题时参考 `references/fastdeploy_cross_reference.md`。 + +## 执行前交互 + +运行脚本前,Claude 必须按以下顺序向用户确认参数: + +### 1. 日志文件路径 +使用 AskUserQuestion 工具向用户询问日志文件路径。提供两个常用快捷选项(客户端会自动提供 Other 自定义输入): +- 选项 1: `logs/router.log`(默认) +- 选项 2: `fd-router.log`(golang_router 根目录) + +**重要规则**: +- 如果用户已经在消息中明确指定了日志路径,直接使用该路径,跳过询问步骤 +- 用户指定路径后不要质疑、推荐替代文件、或以任何理由尝试切换到其他文件 +- 支持绝对路径(如 `/home/user/logs/xxx.log`)和相对路径(如 `logs/fd-router (2).log`) + +如果用户直接确认或未指定路径,使用脚本的自动发现逻辑。 + +### 2. 分析范围 +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): +- 选项 1: `全量分析(默认)` — 分析整个日志文件 +- 选项 2: `尾部分析` — 只分析最近数据(仅支持行数,如 `--tail 5000`) +- 选项 3: `指定时间段` — 分析特定时间范围内的日志 + +如果用户未选择,默认使用全量分析。 + +#### 指定时间段的处理 + +脚本原生支持 `--start` 和 `--end` 参数,无需手动预过滤。两者可单独或同时指定。 + +时间格式灵活:支持 `YYYY/MM/DD HH:MM:SS`、`HH:MM:SS`、`HH:MM`、`MM/DD`、`MM/DD HH:MM`。 +缺失部分自动从日志首末行推断(缺年份取首行,缺日期取末行)。 +`--start/--end` 与 `--tail` 互斥。 +`--tail` 仅支持“行数”语义(如 `5000`,也兼容 `1k/1w` 自动换算),不再支持 `30m` 这类时间写法;凡是按时间筛选都使用 `--start/--end`。 + +当用户选择“指定时间段”时,必须再发起一次 **AskUserQuestion**(离散选项)引导时间输入: +- 选项 1: `当天(00:00:00 到当前)`(推荐) +- 选项 2: `自定义时间段`(由用户直接输入起止时间) + +用户若通过客户端默认 `Other` 输入时间,则将该输入直接作为时间范围参数解析。 +可补充一条简短示例引导: +- 示例 1:`16:00-16:30` +- 示例 2:`03/31 16:00 ~ 03/31 18:00` +- 示例 3:`2026/03/31 16:00:00`(仅起始) + +### 3. 分析模式 +必须使用 **AskUserQuestion 的离散选项**(不要只发纯文本编号): +- 选项 1: `完整分析(默认)` — 运行所有维度(errors + latency + health + cache + load) +- 选项 2: `单维度/多维度分析` — 选择特定维度(errors / latency / health / cache / load),可选多个 +- 选项 3: `请求追踪` — 追踪特定请求 ID + +如果用户未选择,默认使用完整分析。 + +当用户选择“请求追踪”后,**不要再发 AskUserQuestion** 收集 trace ID。 +直接发一条提示并等待用户输入完成后再继续执行即可。 + +提示文案建议: +- `请输入要追踪的 ID(支持 trace_id / request_id / session_id,多个用逗号分隔;输入 all 可全量追踪)` +- 示例:`a1b2c3d4` / `trace-001,trace-002` / `session-abc-123` / `all` + +### 4. 
输出目录 +诊断报告默认保存到 `skill_output/troubleshoot//`(自动按运行时间创建子目录)。 +用户可通过 `--output` 指定**基目录**,脚本会继续在其下创建 `/summary` 与 `/detail`,避免覆盖历史明细。 + +## 用法 + +脚本路径(相对于 `fastdeploy/golang_router/`):`.claude/skills/troubleshoot/scripts/` + +```bash +SCRIPTS=.claude/skills/troubleshoot/scripts + +# 全量扫描(errors + latency + health + cache + load) +python3 $SCRIPTS/troubleshoot.py + +# 单维度分析 +python3 $SCRIPTS/troubleshoot.py --errors +python3 $SCRIPTS/troubleshoot.py --latency +python3 $SCRIPTS/troubleshoot.py --health +python3 $SCRIPTS/troubleshoot.py --cache +python3 $SCRIPTS/troubleshoot.py --load + +# 请求追踪(需指定 ID,支持逗号分隔多 ID) +python3 $SCRIPTS/troubleshoot.py --trace +python3 $SCRIPTS/troubleshoot.py --trace "id1,id2" +python3 $SCRIPTS/troubleshoot.py --trace all + +# 尾部分析 +python3 $SCRIPTS/troubleshoot.py --tail 5000 +# 指定时间段(需要按时间筛选时使用;--start 和 --end 可单独或同时使用) +python3 $SCRIPTS/troubleshoot.py --start "16:00:00" --end "17:00:00" +python3 $SCRIPTS/troubleshoot.py --start "2026/03/31 16:00:00" +python3 $SCRIPTS/troubleshoot.py --start "03/31" --end "03/31 18:00" + +# 组合模式 +python3 $SCRIPTS/troubleshoot.py --errors --latency +python3 $SCRIPTS/troubleshoot.py --errors --tail 5000 +python3 $SCRIPTS/troubleshoot.py --start "16:00" --end "17:00" --errors --latency +``` + +默认日志路径:`logs/router.log` → `fd-router.log` + +## 输出 + +- **终端**:简洁三层汇总(Router / FD 后端 / 客户端),含状态码分布、错误 Top N、趋势图 +- **文件**:详细报告导出到 `skill_output/troubleshoot//summary/troubleshoot_report.md` + - 逐分钟事件详情拆分到 `detail/health_events.md` + - 请求追踪事件链拆分到 `detail/trace/trace_.md` +- **Cache 明细要求**:`cache_session_stickiness.md` / `cache_suboptimal.md` / `cache_eviction.md` / `cache_fallback.md` / `cache_cross.md` + 必须始终生成(即使无异常也写“未发现/样本不足”总结,避免链接缺失) +- **状态行**:`STATUS: HEALTHY / DEGRADED / CRITICAL` + +## 三层诊断框架 + +| 层 | 典型问题 | 日志特征 | +|----|---------|---------| +| Router | Panic、500、Counter 异常、调度瓶颈、Cache 策略不优 | `Panic recovered`、`Failed to encode`、`double-release` | +| FD 后端 | 502、Worker 下线、高推理延迟、请求卡住 | `Failed to 
connect`、`Removed unhealthy`、p99 高 | +| 客户端 | 断连、请求格式错误 | `context canceled`、400 | + +## 脚本架构 + +``` +scripts/ + log_parser.py — 日志解析原语(HTTP/Cache/Stats/错误归一化/事件匹配) + stats.py — 通用统计计算(百分位数/时间窗口/分组) + chart.py — 终端可视化(sparkline/柱状图/表格/时间线) + troubleshoot.py — 主编排器 + analyzers/ + errors.py — 错误分类分析 + latency.py — 延迟分析 + health.py — Worker 健康时间线 + cache.py — Cache 调度诊断 + load.py — 负载与计数器分析 + trace.py — 请求追踪 +``` + +## 重要规则 + +1. 大文件 (>5000 行) 用 grep 分类提取,不一次性读取 +2. 每个问题标注来源层(Router / FD 后端 / 客户端) +3. Cache 命中率数值分析用 `/stat-cache-hitrate`,本 skill 做策略诊断 +4. 分析前读取 `references/log_patterns.md` +5. 错误查询参考 `references/error_catalog.md` +6. 后端问题排查参考 `references/fastdeploy_cross_reference.md` +7. 输出格式参考 `references/report_templates.md` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json b/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json new file mode 100644 index 00000000000..4b961e85b36 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/evals/trigger_eval.json @@ -0,0 +1,18 @@ +[ + {"query": "router 最近频繁 502 和 503,帮我全面排查一下问题", "should_trigger": true}, + {"query": "帮我 troubleshoot 一下 Go Router,感觉有些请求延迟特别高", "should_trigger": true}, + {"query": "分析 logs/fd-router.log 里面的错误日志,看看哪些错误最多", "should_trigger": true}, + {"query": "有几个 Worker 好像不太健康,帮我看看 Worker 健康时间线", "should_trigger": true}, + {"query": "cache 调度策略最近好像有问题,fallback 比例太高了,诊断一下", "should_trigger": true}, + {"query": "帮我追踪请求 trace-id-12345,看看这个请求在 router 里经历了什么", "should_trigger": true}, + {"query": "/troubleshoot 全量扫描 router 日志,给我一份完整的诊断报告", "should_trigger": true}, + {"query": "router 负载分析一下,有没有 counter 异常或者 double-release 的情况", "should_trigger": true}, + {"query": "统计一下 cache 命中率是多少,prefix hit ratio 和 session hit rate 各是多少", "should_trigger": false}, + {"query": "帮我看看 hitRatio 数据,想了解 KV cache 的复用度", "should_trigger": false}, + {"query": "帮我写一个 Go 的 reverse proxy,要支持负载均衡", "should_trigger": false}, + {"query": "分析 Kubernetes pod 
的日志,看看为什么 OOMKilled", "should_trigger": false}, + {"query": "FastDeploy 模型部署失败了,帮我看看怎么回事", "should_trigger": false}, + {"query": "帮我优化一下 Python 代码的性能,跑得太慢了", "should_trigger": false}, + {"query": "nginx 返回 504 Gateway Timeout,帮我排查原因", "should_trigger": false}, + {"query": "帮我监控 cache 命中率的实时变化趋势", "should_trigger": false} +] diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md new file mode 100644 index 00000000000..60b4931b546 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/error_catalog.md @@ -0,0 +1,123 @@ +# Router 错误目录 + +按 HTTP 状态码和日志级别分类的 Router 错误快速索引。每条含严重程度、根因、影响、排查命令、问题来源层。 + +--- + +## 按 HTTP 状态码索引 + +注意:HTTP 响应体中的错误消息与 logger 输出的 ERROR 消息**可能不同**。 +例如:HTTP 502 响应 `Failed to select worker: {err}` 对应的日志 ERROR 是 `Failed to select mixed worker: {err}`。 +分析时需将两者关联而非简单去重。 + +### 400 Bad Request + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Invalid request body: {err}` | 请求体读取失败 | 客户端 | 检查客户端请求格式 | +| `Invalid JSON format: {err}` | JSON 解析失败 | 客户端 | 检查 JSON 格式 | +| `DefaultManager is nil` | Manager 未初始化 | Router | 检查 Router 启动日志 | + +### 500 Internal Server Error + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Failed to build disaggregate_info: {err}` | PD 模式配置错误 | Router | 检查 register.yaml 参数 | +| `Failed to encode modified request: {err}` | 请求编码失败 | Router | 检查请求参数特殊字符 | +| `Internal server error` (Panic) | Router 代码 bug | Router | 检查 Panic recovered 日志 | + +### 502 Bad Gateway + +| 错误消息 | 根因 | 来源层 | 排查 | +|---------|------|-------|------| +| `Failed to select worker: {err}` | 无可用 Mixed Worker | FD 后端 | `curl /health` 检查后端 | +| `Failed to select worker pair: {err}` | 无可用 PD Worker | FD 后端 | 检查 prefill/decode 注册状态 | +| `Failed to connect to backend service: {err}` | 后端不可达 | FD 后端 | `curl {worker_url}/health` | + +### 503 Service Unavailable + +| 错误消息 | 根因 | 来源层 | 
排查 | +|---------|------|-------|------| +| `No available prefill/decode workers` | 全部 Worker 不健康 | FD 后端 | 检查部署状态 | + +--- + +## 按日志级别索引 + +### ERROR 级别 + +| 消息模板 | 严重程度 | 来源层 | 影响 | +|---------|---------|-------|------| +| `Failed to select mixed worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to select prefill worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to read register request body: {err}` | MEDIUM | Router | 注册失败 | +| `Failed to unmarshal register request JSON: {err}` | MEDIUM | Router | 注册失败 | +| `Failed to create decode request for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Failed to create prefill request for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Decode request failed for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Prefill request failed for {url}: {err}` | HIGH | FD 后端 | PD 请求失败 | +| `Failed to read request body: {err}` | LOW | 客户端 | 单请求失败 | +| `Failed to unmarshal request JSON: {err}` | LOW | 客户端 | 单请求失败 | +| `Failed to select worker pair: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to build disaggregate_info: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to encode modified request: {err}` | HIGH | Router | 请求返回 500 | +| `Failed to read YAML file config/register.yaml: {err}` | LOW | Router | 启动时未找到可选配置文件(若未使用 register.yaml 可忽略) | +| `Failed to select worker: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Failed to connect to backend service: {err}` | HIGH | FD 后端 | 请求返回 502 | +| `Request failed (attempt {n}/{max}): {err}` | MEDIUM | FD 后端 | 重试中 | +| `Failed to create backend request for {url}: {err}` | HIGH | FD 后端 | 请求失败 | +| `Backend request failed for {url}: {err}` | HIGH | FD 后端 | 请求失败 | +| `scanner error: {err}` | MEDIUM | FD 后端/客户端 | 流式响应中断(gateway redirect 函数) | +| `[prefill] scanner error: {err}, message={msg}` | MEDIUM | FD 后端/客户端 | PD 模式 prefill 流式错误 | +| `copy error: {err}` | MEDIUM | FD 后端/客户端 | 非流式响应中断 | +| `[prefill] copy error: {err}, message={msg}` | MEDIUM | FD 后端/客户端 | PD 模式 prefill 非流式错误 | +| `Removed unhealthy 
prefill/decode/mixed instance: {url}` | HIGH | FD 后端 | Worker 被移除(注意:这是 ERROR 级别) | + +### WARN 级别 + +| 消息模板 | 严重程度 | 来源层 | 影响 | +|---------|---------|-------|------| +| `GetRemoteMetrics failed for {url}, falling back to local counter` | LOW | FD 后端 | 调度精度降低 | +| `release worker: {url} skipped, counter already cleaned up` | LOW | Router | 计数器异常 | +| `release worker: {url} skipped, counter already zero (possible double-release)` | MEDIUM | Router | 计数器逻辑 bug | +| `cache-aware prefill: tokenizer failed, fallback to char tokens: {err}` | LOW | Router | cache-aware 精度降低 | +| `Instance {url} role is unknown` | LOW | Router | 注册角色不识别 | + +### INFO 级别(异常相关) + +| 消息模板 | 含义 | 关注场景 | +|---------|------|---------| +| `unhealthy worker counter preserved (inflight requests): {url}, count: {N}` | 不健康 Worker 仍有 inflight 请求 | 频繁出现说明 Worker 不稳定 | +| `unhealthy worker token counter preserved (inflight requests): {url}, tokens: {N}` | 不健康 Worker 仍有 token 计数 | 同上 | +| `cleanup unhealthy worker counter: {url}` | 清理不健康 Worker 的请求计数 | 正常清理 | +| `cleanup unhealthy worker token counter: {url}` | 清理不健康 Worker 的 token 计数 | 正常清理 | +| `preserved counters for {N} workers with inflight requests: [...]` | 保留了 N 个 Worker 的计数器 | N 大说明多 Worker 不稳定 | +| `removed counters for {N} unhealthy workers: [...]` | 移除了 N 个 Worker 的计数器 | 正常清理 | +| `Server {url} is healthy` | 健康检查恢复 | Worker 恢复(来自 HealthGenerate 端点) | + +注意:以下事件是 **ERROR 级别**,不是 INFO: +- `Removed unhealthy prefill/decode/mixed instance: {url}` — Worker 被移除 + +注意:以下内容是 **HTTP 响应体**,不是 logger 输出(不会出现在日志行中): +- `Register success` — 注册成功的 HTTP 200 响应体 +- Worker 注册检测应通过 H1 行的 `POST /register 200` 判断 + +--- + +## 注册参数校验错误 + +| 错误消息 | 根因 | 排查 | +|---------|------|------| +| `invalid connector_port: {value}` | connector_port 非数字或范围错误 | 检查 register.yaml | +| `invalid engine_worker_queue_port: {value}` | engine_worker_queue_port 非数字或范围错误 | 检查 register.yaml | +| `invalid metrics_port: {value}` | metrics_port 非数字或范围错误 | 检查 register.yaml | +| 
`rdma_ports[{i}] invalid port: {value}` | RDMA 端口配置错误 | 检查 register.yaml | + +--- + +## scanner error / copy error 区分 + +| error 内容 | 来源层 | 含义 | +|-----------|-------|------| +| `context canceled` | 客户端 | 客户端主动断连(超时或取消) | +| 其他 | FD 后端 | 后端流式响应异常 | diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md new file mode 100644 index 00000000000..f35cbcb303a --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/fastdeploy_cross_reference.md @@ -0,0 +1,102 @@ +# FastDeploy 后端交叉引用 + +从 Router 日志推断 FastDeploy 后端问题时的排查指引。 + +--- + +## 症状 → 后端排查 + +### 1. 后端不可达 (502) + +**Router 日志特征**: +``` +[ERROR] Failed to connect to backend service: dial tcp {ip}:{port}: connect: connection refused +``` + +**排查步骤**: +1. `curl http://{worker_url}/health` — 确认后端是否存活 +2. `curl http://{worker_url}/v1/models` — 确认模型是否加载完成 +3. 检查后端日志 `logs/workerlog.0` +4. `netstat -tlnp | grep {port}` — 确认端口监听 +5. 检查网络连通性(防火墙、安全组) + +### 2. 后端 OOM / 频繁重启 + +**Router 日志特征**: +- Worker 频繁 REMOVED → RE-REGISTERED(短周期内多次) +- 健康检查间歇性失败 + +**排查步骤**: +1. `dmesg | grep -i oom` — 检查 OOM killer +2. `nvidia-smi` — 检查 GPU 内存 +3. 后端日志搜索 `CUDA out of memory` +4. 检查 `max_num_seqs`、`max_model_len` 配置 + +### 3. 高推理延迟 + +**Router 日志特征**: +- 请求 p99 高(>10s)但调度耗时仅 ms 级 +- 确认延迟不在 Router 层(调度耗时 << 总延迟) + +**排查步骤**: +1. 检查后端 Prometheus metrics:`http://{worker_url}:{metrics_port}/metrics` + - `fastdeploy_llm_running_queue_size` — 推理队列 + - `fastdeploy_llm_waiting_queue_size` — 等待队列 + - `fastdeploy_llm_generation_tokens_per_second` — 吞吐量 +2. 确认 GPU 利用率:`nvidia-smi --query-gpu=utilization.gpu --format=csv` +3. 检查是否有长 prompt 请求拖慢整体 + +### 4. 流式响应异常 + +**Router 日志特征**: +``` +[ERROR] scanner error: {err} (非 context canceled) +[ERROR] copy error: {err} (非 context canceled) +``` + +**排查步骤**: +1. 后端日志搜索对应 request_id +2. 检查后端是否产生格式错误的 SSE +3. 检查网络是否有中间代理超时切断 + +### 5. 
请求超时/卡住 + +**Router 日志特征**: +- 有 select worker 但长时间无 release/completed +- [stats] 中 running 持续不降 + +**根因**:Router 的 `http.Client{}` 没有设置超时,后端不响应则阻塞到客户端断连或 TCP 超时。 + +**排查步骤**: +1. 检查后端是否还在处理请求 +2. 检查后端是否出现死锁 +3. `ss -tnp | grep {port}` — 检查 TCP 连接状态 + +--- + +## 通用 FastDeploy 排查工具 + +### collect-env + +收集环境信息: +```bash +python -m fastdeploy.utils.collect_env +``` + +### 后端日志位置 + +- 默认:`logs/workerlog.0` +- 多 Worker:`logs/workerlog.{N}` + +### Prometheus Metrics + +后端 metrics 端口(从注册信息获取 `metrics_port`): +``` +http://{worker_ip}:{metrics_port}/metrics +``` + +关键指标: +- `fastdeploy_llm_running_queue_size` — 当前推理中的请求数 +- `fastdeploy_llm_waiting_queue_size` — 等待队列长度 +- `fastdeploy_llm_generation_tokens_per_second` — 生成吞吐 +- `fastdeploy_llm_request_total` — 总请求数 diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md new file mode 100644 index 00000000000..4322909c01d --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/log_patterns.md @@ -0,0 +1,293 @@ +# 日志格式与提取规则 + +本文档定义 Router 日志的所有类别、Grep 匹配模式、精确正则,供各子 skill 参考。 + +--- + +## 日志基本格式 + +``` +[LEVEL] YYYY/MM/DD HH:MM:SS logger.go:: [context_tags] message +``` + +### Context Tags(可选,顺序固定) + +- `[trace_id:]` +- `[req_id:]` +- `[session_id:]` +- `[request_id:]` + +所有 tag 可能同时出现,也可能只有部分或没有。顺序固定为:`trace_id → req_id → session_id → request_id`。 + +### ID 匹配正则 + +搜索某个 ID 时,同时匹配四种 tag: +``` +session_id:|trace_id:|request_id:|req_id: +``` + +--- + +## 日志分类提取 + +| 类别 | Grep 模式 | 用途 | 典型内容 | +|------|----------|------|---------| +| E1 — ERROR | `\[ERROR\]` | 错误分类 | 各类 Failed to ... 
错误 | +| E2 — WARN | `\[WARN\]` | 警告分类 | counter 异常、tokenizer 退化 | +| H1 — HTTP 请求 | `\] \[(POST\|GET)\] /` | 延迟/状态码/吞吐量 | HTTP middleware 日志行 | +| H2 — 健康事件 | `Removed unhealthy\|is not healthy\|is healthy` | Worker 健康时间线 | 上下线事件 | +| H2b — 注册事件 | `\] \[POST\] /register.*200` | Worker 注册 | 从 H1 HTTP 行中匹配 POST /register 返回 200 | +| H3 — 调度事件 | `select worker\|release worker\|Failed to select\|SelectWorkerPair` | 调度/计数器分析 | Worker 选择和释放 | +| H4 — 后端问题 | `Failed to connect\|request failed\|scanner error\|copy error\|Panic recovered` | 后端问题 | 连接/流式/Panic(注意:`scanner error`/`copy error` 与 H9 有重叠,带 `[prefill]` 前缀的行同时属于 H9) | +| H5 — Counter | `counter preserved\|cleanup unhealthy\|removed counters\|counter already\|double-release\|preserved counters` | 计数器异常 | 计数器生命周期 | +| H6 — Cache-aware | `cache-aware prefill: final strategy:` | Cache 调度诊断 | 策略选择 + hitRatios | +| H7 — Stats | `\[stats\]` | 负载/命中率 | 周期性统计行 | +| H8 — ts_ms | `ts_ms=` | 调度耗时 | 调度开始结束时间戳 | +| H9 — Prefill 生命周期 | `\[prefill\]` | PD 模式 prefill 追踪 | 首包/释放/错误 | +| H10 — 请求标记 | `Parsing completed\|Request completed successfully` | 请求生命周期 | 调度开始/请求结束标记 | +| H11 — Token 释放 | `release prefill tokens` | Token 计数器生命周期 | Token 释放事件 | + +--- + +## H1 — HTTP 请求行格式 + +``` +[INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 1.234567s 10.0.0.1 +``` + +字段:`[METHOD] /path HTTP/1.1 STATUS LATENCY CLIENT_IP` + +### 延迟单位归一化 + +Go `time.Duration.String()` 输出格式不固定,需归一化为毫秒: + +| 原始格式 | 含义 | 转换为 ms | +|---------|------|----------| +| `1.5s` | 秒 | × 1000 | +| `150ms` | 毫秒 | 直接使用 | +| `150.5ms` | 毫秒 | 直接使用 | +| `500µs` | 微秒 | ÷ 1000 | +| `500us` | 微秒(ASCII) | ÷ 1000 | +| `500ns` | 纳秒 | ÷ 1000000 | +| `1m30s` | 分+秒 | 分×60000 + 秒×1000 | +| `1h2m3s` | 时+分+秒 | 时×3600000 + 分×60000 + 秒×1000 | + +正则提取延迟值:`(\d+(?:\.\d+)?(?:h|m(?!s)|s|ms|µs|us|ns))+` + +### 仅推理请求 + +延迟分析只统计推理请求路径: +- `/v1/chat/completions` +- `/v1/completions` + +排除健康检查 `/health`、注册 `/register` 等管理路径。 + +--- + +## H6 — Cache-aware 策略行格式 + +``` 
+[INFO] 2025/01/15 18:25:33 logger.go:87: [trace_id:xxx] [session_id:xxx] cache-aware prefill: final strategy: cache_aware_scoring, selected=http://10.0.0.1:9965, loads=map[http://10.0.0.1:9965:2 http://10.0.0.2:9965:5], hitRatios=map[http://10.0.0.1:9965:0.85 http://10.0.0.2:9965:0.42]. ts_ms=2025-01-15 18:25:33.123 +``` + +``` +[INFO] ... cache-aware prefill: final strategy: process_tokens, reason: load imbalanced, loads=map[...]. ts_ms=2025-01-15 18:25:33.123 +``` + +注意:日志中**没有** `scores=map[...]` 字段。scores 仅在 DEBUG 级别的 `chooseByScore` 中逐条打印。 +如需分析非最优选择,需从 hitRatios + loads 使用公式重新计算: +`score = (100-hitRatio)/100 * hitRatioWeight + loadRatio * loadBalanceWeight` + +### Go map 解析 + +`hitRatios=map[key1:val1 key2:val2]` + +- 空 map:`hitRatios=map[]` — 表示冷启动 +- 正则提取 map 内容:`map\[(.*?)\]` +- 每对 key:value 用空格分隔:`(\S+):(\S+)` +- key 是 worker URL,value 是 float64 + +### selected worker 的 hitRatio + +从 hitRatios map 中查找 selected URL 的值: +- 在 map 中找到 → 使用该值 +- 不在 map 中 → hitRatio = 0 +- map 为空 → 冷启动,hitRatio = 0 + +### ts_ms 格式 + +`ts_ms=2025-01-15 18:25:33.123` + +格式:`2006-01-02 15:04:05.000`(Go reference time) + +用于计算调度耗时(两个 ts_ms 之间的差值)。 + +--- + +## H7 — Stats 行格式 + +``` +[INFO] 2025/01/15 18:25:33 logger.go:87: [stats] total_running=5, workers: [http://10.0.0.1:9965: running=2, http://10.0.0.2:9965: running=3], cache_hit_rate=85.71% (hits=6/total=7) +``` + +注意:由于 Go `log.Lshortfile` 打印的是 `Printf` 调用处,stats 行的源文件始终为 `logger.go:NN:`(行号随编译变化),而非 `handler.go`。 + +注意:stats 行**不包含**任何 context tag(trace_id 等),因为由后台 goroutine 周期输出。 + +### 关键:per-interval 计数器 + +`hits` 和 `total` 是 **per-interval** 的值(每 5s 通过 `atomic.Swap(0)` 重置为 0)。 + +计算累计值必须 **sum 所有行**: +- 累计 Session Hit Rate = `sum(hits) / sum(total) * 100` + +### Worker 负载提取 + +`workers: [url1: running=N, url2: running=N]` + +- 注意格式:`workers:` 带冒号+空格,每个 worker 格式为 `url: running=N`,逗号+空格分隔 +- **不包含 token 数据**(reportStats 只读取 running 计数) + +正则:`(http://[^:]+:\d+): running=(\d+)` + +### cache_hit_rate 提取 + 
+`cache_hit_rate=85.71% (hits=6/total=7)` + +正则:`cache_hit_rate=([\d.]+)% \(hits=(\d+)/total=(\d+)\)` + +--- + +## 模板归一化 + +ERROR/WARN 消息分组时,需将变量替换为占位符: + +| 变量类型 | 正则 | 替换为 | +|---------|------|-------| +| URL | `https?://[\w.:]+` | `{url}` | +| UUID | `[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}` | `{uuid}` | +| 数字 | `\d+` (仅在特定位置) | `{N}` | +| IP:Port | `\d+\.\d+\.\d+\.\d+:\d+` | `{ip:port}` | + +--- + +## Fallback 策略行识别 + +| final strategy | reason 关键词 | 含义 | +|---------------|--------------|------| +| `cache_aware_scoring` | (无 reason) | 正常 cache-aware 调度 | +| `process_tokens` | `tokenize failed` | 退化 B:字符级 tokenize 也失败 | +| `process_tokens` | `load imbalanced` | 退化 C:负载不均衡 | +| `process_tokens` | (其他) | 退化 D:策略未初始化等 | + +退化 A(Tokenizer 服务→字符级)在 WARN 行识别: +``` +[WARN] ... cache-aware prefill: tokenizer failed, fallback to char tokens: {err} +``` +注意完整前缀 `cache-aware prefill: tokenizer failed`。 +退化 A 后仍可走 cache_aware_scoring(精度降低),与 B/C/D 不互斥。 + +--- + +## H4 — 后端问题匹配说明 + +H4 的 `request failed` 模式会匹配多个消息模板: +- `Request failed (attempt {n}/{max}): {err}` — 重试日志 +- `Decode request failed for {url}: {err}` — PD 模式 decode 失败 +- `Prefill request failed for {url}: {err}` — PD 模式 prefill 失败 +- `Backend request failed for {url}: {err}` — 后端请求失败 + +分析时需通过模板归一化去重。 + +--- + +## H9 — Prefill 生命周期事件 + +PD(Prefill/Decode 分离)模式下,`completions.go` 产生的 `[prefill]` 前缀日志: + +| 消息模板 | 含义 | +|---------|------| +| `[prefill] first chunk received, release counter url=%s` | Prefill 首包到达,释放计数器 | +| `[prefill] non-stream prefill response done, release counter url=%s` | 非流式 prefill 完成 | +| `[prefill] release in defer (fallback) url=%s, isStream=%v` | defer 兜底释放 | +| `[prefill] release in CommonCompletions defer (error path) url=%s` | 错误路径释放 | +| `[prefill] backendResp is nil or backendResp.Body is nil, url=%s` | 后端响应异常 | +| `[prefill] scanner error: %v, message=%s` | 流式读取错误(ERROR 级别) | +| `[prefill] copy error: %v, message=%s` | 非流式复制错误(ERROR 级别) | + +--- + +## H10 — 
请求生命周期标记 + +| 消息 | 含义 | 级别 | +|------|------|------| +| `Parsing completed; starting worker selection.` | 请求解析完成,开始调度 | INFO | +| `Request completed successfully.` | 请求成功完成 | INFO | + +--- + +## H11 — Token 释放 + +`release prefill tokens: %s, tokens: %d` — 释放 prefill token 计数。 +数据源:`handler.go:333`。用于 troubleshoot-load 的 token 计数器分析。 + +--- + +## Select/Release 日志细节(与代码一致) + +- `select worker (prefill): , tokens: ` +- `select worker (decode|mixed): , count: ` +- `release worker: , count: `(request counter 释放) +- `release prefill tokens: , tokens: `(token counter 释放;可能来自 prefill 或 mixed 请求路径) + +重点:release 只有上面这两种。`release worker` 不带 worker type,`release prefill tokens` 的文本也不能直接断定是 prefill(mixed 也可能调用)。因此按 `prefill/decode/mixed` 统计时,需要从 select 侧做归类;确实无法归类时才记为 `unknown`。 + +--- + +## 使用脚本工具 + +各 skill 的脚本位于各自的 `scripts/` 目录下,自动处理上述所有日志解析和计算。 + +### 快速参考 + +| 任务 | 脚本 | +|------|------| +| 解析 H1 HTTP 行 | `log_parser.py parse-http [--inference-only]` | +| 解析 H6 cache 策略行 | `log_parser.py parse-cache-strategy` | +| 解析 H7 stats 行 | `log_parser.py parse-stats` | +| 检测非支持请求 | `log_parser.py unsupported-requests [--summary-only]` | +| ASCII 折线图 | `chart.py` | +| Unicode 柱状图 | `chart.py` | +| Markdown 表格 | `chart.py` | +| Worker 时间线 | `chart.py` | + +所有工具从 stdin 读取,输出到 stdout。中间数据使用 JSON Lines 格式。 + +--- + +## 已知路由列表 + +Router 支持的全部路由(来自 `internal/router/router.go`): + +| Method | Path | 类型 | +|--------|------|------| +| POST | `/v1/chat/completions` | 推理 | +| POST | `/v1/completions` | 推理 | +| POST | `/register` | 实例注册 | +| GET | `/registered_number` | 注册数量查询 | +| GET | `/registered` | 注册列表查询 | +| GET | `/health_generate` | 健康检查 | +| GET | `/metrics` | Prometheus 指标 | + +### 非支持请求排查 + +客户端可能发送不属于已知路由的请求(如 `/v1/models`),会收到 404 但仍记录在 H1 HTTP 日志中。 + +使用 `log_parser.py unsupported-requests` 子命令检测: +```bash +# 完整输出(详细列表 + 汇总) +grep -E '\] \[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\] /' logfile | python3 log_parser.py unsupported-requests + +# 仅汇总 +grep -E '\] 
\[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\] /' logfile | python3 log_parser.py unsupported-requests --summary-only +``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md new file mode 100644 index 00000000000..61db59ec7e6 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/references/report_templates.md @@ -0,0 +1,131 @@ +# 报告输出规范 + +所有 troubleshoot 分析维度共享的可视化和格式规范。 + +--- + +## 通用可视化组件 + +### Unicode 柱状图 +- 填充块:`█`(U+2588),空块:`░`(U+2591) +- 总宽度:20 字符,右侧标注百分比和计数 +- 块数 = round(percentage / 100 * 20),最小 1 块(>0% 时) + +### Sparkline 折线图 +- 字符集:`▁▂▃▄▅▆▇█`(8 级高度) +- 图表宽度:60 字符,自动降采样 +- X 轴标注时间(首/尾 + 中间 2-3 个刻度) +- Y 轴自适应:百分比类 0-100%,计数类 0-max + +### Markdown 表格 +- 标准 Markdown 表格格式 +- 数值列右对齐 + +### Worker 可用性时间线 +- `█` = 在线,`░` = 下线 +- 右侧标注在线率百分比 + +--- + +## 严重程度标记 + +| 标记 | 含义 | 使用场景 | +|------|------|---------| +| CRITICAL | 服务不可用 | Panic、全部 Worker 不健康、错误率 >20% | +| HIGH | 部分请求失败 | 502/503、Worker 频繁下线 | +| MEDIUM | 性能下降 | 高延迟、cache 命中率低 | +| LOW | 需关注 | 计数器异常、tokenizer 退化 | +| INFO | 正常 | 统计信息 | + +--- + +## 报告格式 + +### 简洁版(终端输出) + +- 第一行:`STATUS: HEALTHY / DEGRADED / CRITICAL — 简要说明` +- 状态定义:`HEALTHY`=无明显异常;`DEGRADED`=服务可用但性能/稳定性下降(需关注);`CRITICAL`=服务不可用或高风险故障 +- 按三层分类(Router / FD 后端 / 客户端) +- 每个问题一行摘要 + 关键指标 +- 末尾提示详细版文件路径 + +### 详细版(文件导出) + +- 路径:`skill_output/troubleshoot//troubleshoot_report_.md` +- 主报告包含各维度总结 + 可视化图表(sparkline/柱状图/时间线等) +- 详情拆分到 `detail/` 子目录: + - `detail/health_events.md` — Worker 逐分钟健康事件 + 健康诊断 + - `detail/errors_topn.md` — ERROR/WARN 模板明细(数量/级别/来源层/影响 + URLs) + - `detail/load_select_release.md` — 负载诊断 + select/release 明细 + - `detail/load_diagnoses.md` — load 诊断列表 + - `detail/load_counter_state.md` — request/token counter 末状态 + - `detail/latency_diagnoses.md` — 延迟诊断详情 + - `detail/cache_diagnosis.md` — cache 六维诊断详情(session 粘性/非最优/驱逐/Fallback/冷启动/交叉诊断) + - `detail/cache_session_stickiness.md` / 
`detail/cache_suboptimal.md` / `detail/cache_eviction.md` / `detail/cache_fallback.md` / `detail/cache_cross.md` — cache 分职责拆分明细 + - `detail/trace/trace_.md` — 请求追踪事件链 + +--- + +## 状态判定规则 + +- **CRITICAL**:存在 Panic、全部 Worker 不健康、或错误率 >20% +- **DEGRADED**:存在 502/503、Worker 不稳定、或错误率 >5% +- **HEALTHY**:无严重问题 + +--- + +## 各维度报告结构 + +### Errors(错误分析) + +``` +HTTP 状态码分布(柱状图) +错误率趋势(折线图) +ERROR/WARN Top N(柱状图 + 表格,标注来源层) +Panic 列表 +``` + +### Latency(延迟分析)— 待实现 + +``` +延迟百分位数 (p50/p90/p95/p99) +延迟分布(柱状图) +吞吐量趋势(折线图) +慢请求 Top 10 +``` + +### Health(Worker 健康)— 待实现 + +``` +Worker 可用性时间线 +健康事件汇总表 +可用性统计 +``` + +### Cache(调度诊断) + +``` +调度策略分布 +Session 粘性分析 +非最优选择分析 +Fallback 原因分类 +驱逐影响与交叉诊断 +``` + +要求:即使某项计数为 0(例如“非最优选择”),也要输出该小节并给出“未发现/样本不足”总结,保证 detail 链接稳定存在。 + +### Load(负载分析)— 待实现 + +``` +Worker 负载分布 +计数器异常检测 +Token 计数器统计 +``` + +### Trace(请求追踪)— 待实现 + +``` +单请求事件链 +生命周期完整性检查 +Session 多请求汇总 +``` diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py new file mode 100644 index 00000000000..e7bb50660a8 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/__init__.py @@ -0,0 +1 @@ +# Analyzers package diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py new file mode 100644 index 00000000000..a12341967a0 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/cache.py @@ -0,0 +1,607 @@ +#!/usr/bin/env python3 +""" +Cache Analyzer — Cache 调度诊断 + +分析 cache-aware 调度策略:session 粘性、非最优选择评分、驱逐影响、 +fallback 原因、冷启动识别、交叉诊断。 +注意:cache 命中率数值分析由 stat-cache-hitrate skill 负责,本模块做策略诊断。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart 
import render_bar, render_table +from log_parser import parse_cache_strategy_line, parse_ts +from stats import compute_statistics, count_by + +# ════════════════════════════════════════════════════════════════ +# Fallback 分类 +# ════════════════════════════════════════════════════════════════ + +TOKENIZER_WARN_RE = re.compile(r"tokenizer failed, fallback to char tokens") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + +def classify_fallback(record, tokenizer_degraded_ts=None): + """对 process_tokens 策略行分类 fallback 原因。 + + Returns: 'A-Tokenizer退化' | 'B-char tokenize失败' | 'C-负载不均衡' | 'D-其他' + """ + reason = record.get("reason", "") + if "load imbalanced" in reason: + return "C-负载不均衡" + if "tokenize failed" in reason: + return "B-char tokenize失败" + return "D-其他" + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_cache(log_file, tail=None, eviction_duration_mins=30, hit_ratio_weight=1.0, load_balance_weight=1.0): + """分析 cache-aware 调度策略。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制 + eviction_duration_mins: 驱逐时间(分钟,默认 30) + hit_ratio_weight: hitRatio 权重(默认 1.0) + load_balance_weight: loadBalance 权重(默认 1.0) + + Returns: + dict: {strategy_dist, fallback_reasons, session_stickiness, suboptimal_selections, + eviction_impact, cold_starts, hitratio_stats, diagnoses, summary} + """ + h6_lines = _grep_lines(log_file, r"cache-aware prefill: final strategy:", tail) + tokenizer_warn_lines = _grep_lines(log_file, r"tokenizer failed, fallback to char tokens", tail) + + # 解析策略行 + strategy_records = [r for line in h6_lines for r in [parse_cache_strategy_line(line)] if r] + + if not strategy_records: + return { + "strategy_dist": [], + "fallback_reasons": [], + "session_stickiness": {}, + "suboptimal_selections": [], + "eviction_impact": [], + "cold_starts": 0, + "hitratio_stats": {}, + "diagnoses": [], + "summary": "未检测到 cache-aware 策略日志", + } + + 
# Tokenizer 退化次数 + tokenizer_degraded_count = len(tokenizer_warn_lines) + + # 策略分布 + strategy_dist = count_by(strategy_records, "strategy") + + # Fallback 原因 + fallback_records = [r for r in strategy_records if r.get("strategy") == "process_tokens"] + fallback_reasons = [] + if fallback_records: + for r in fallback_records: + r["fallback_type"] = classify_fallback(r) + fallback_reasons = count_by(fallback_records, "fallback_type") + + # hitRatio 统计 + hr_vals = [r.get("selected_hitRatio", 0) for r in strategy_records if "selected_hitRatio" in r] + hitratio_stats = compute_statistics(hr_vals) if hr_vals else {} + + # Session 粘性分析 + session_stickiness = _analyze_session_stickiness(strategy_records) + + # 非最优选择分析 + suboptimal = _analyze_suboptimal(strategy_records, hit_ratio_weight, load_balance_weight) + + # 驱逐影响 + eviction_impact = _analyze_eviction(strategy_records, eviction_duration_mins) + + # 冷启动 + cold_starts = sum(1 for r in strategy_records if r.get("hitRatios") == {}) + + total = len(strategy_records) + cache_aware_count = sum(1 for r in strategy_records if r["strategy"] == "cache_aware_scoring") + fallback_count = len(fallback_records) + + diagnoses = _diagnose( + strategy_dist, + fallback_reasons, + session_stickiness, + suboptimal, + eviction_impact, + cold_starts, + total, + tokenizer_degraded_count, + hitratio_stats, + ) + + return { + "strategy_dist": strategy_dist, + "fallback_reasons": fallback_reasons, + "session_stickiness": session_stickiness, + "suboptimal_selections": suboptimal, + "eviction_impact": eviction_impact, + "cold_starts": cold_starts, + "hitratio_stats": hitratio_stats, + "tokenizer_degraded_count": tokenizer_degraded_count, + "cross_diagnosis": _analyze_cross_diagnosis( + session_stickiness=session_stickiness, + hitratio_stats=hitratio_stats, + strategy_dist=strategy_dist, + eviction_impact=eviction_impact, + ), + "diagnoses": diagnoses, + "summary": f"{total} 策略决策, cache_aware {cache_aware_count}, fallback {fallback_count}, " + 
f"冷启动 {cold_starts}", + } + + +def _analyze_session_stickiness(records): + """Session 粘性分析。""" + sessions = defaultdict(list) + for r in records: + sid = (r.get("tags") or {}).get("session_id") + if sid and "selected" in r: + sessions[sid].append(r["selected"]) + + result = {} + for sid, workers in sessions.items(): + if len(workers) < 2: + continue + same_count = sum(1 for i in range(1, len(workers)) if workers[i] == workers[i - 1]) + stickiness = round(same_count / (len(workers) - 1) * 100, 1) + switches = [(i, workers[i - 1], workers[i]) for i in range(1, len(workers)) if workers[i] != workers[i - 1]] + result[sid] = { + "total_requests": len(workers), + "stickiness_pct": stickiness, + "switches": len(switches), + } + + return result + + +def _analyze_suboptimal(records, hr_weight, lb_weight): + """非最优选择分析:selected 的 hitRatio 不是最高时,重新计算 score 对比。""" + suboptimal = [] + for r in records: + if r.get("strategy") != "cache_aware_scoring": + continue + hit_ratios = r.get("hitRatios", {}) + loads = r.get("loads", {}) + selected = r.get("selected") + if not hit_ratios or not selected or selected not in hit_ratios: + continue + + max_hr = max(hit_ratios.values()) if hit_ratios else 0 + sel_hr = hit_ratios.get(selected, 0) + + if sel_hr >= max_hr: + continue + + # 计算 scores: score = (100-hitRatio)/100 * hrWeight + loadRatio * lbWeight + # Go 源码使用 maxLoad 做归一化: loadRatio = load / maxLoad + max_load = max(loads.values()) if loads else 1 + max_load = max(max_load, 1) + scores = {} + for w_url in hit_ratios: + hr = hit_ratios.get(w_url, 0) + load = loads.get(w_url, 0) + load_ratio = load / max_load + score = (100 - hr) / 100 * hr_weight + load_ratio * lb_weight + scores[w_url] = round(score, 4) + + best_by_hr = min(hit_ratios, key=lambda w: -hit_ratios[w]) + sel_score = scores.get(selected, 0) + best_hr_score = scores.get(best_by_hr, 0) + + # 分类原因 + load_diff = abs(loads.get(selected, 0) - loads.get(best_by_hr, 0)) + if load_diff > 5: + reason = "负载主导" + elif max_hr < 10: + 
reason = "区分度不够" + elif abs(sel_score - best_hr_score) < 0.05: + reason = "正常竞争" + else: + reason = "综合权衡" + + suboptimal.append( + { + "ts": r.get("ts", ""), + "selected": _strip_scheme(selected), + "selected_hr": sel_hr, + "best_hr_worker": _strip_scheme(best_by_hr), + "best_hr": max_hr, + "reason": reason, + } + ) + + return suboptimal + + +def _analyze_eviction(records, eviction_mins): + """驱逐影响分析:同 session 连续请求间隔 > eviction_duration。""" + sessions = defaultdict(list) + for r in records: + sid = (r.get("tags") or {}).get("session_id") + ts = r.get("ts") + if sid and ts: + sessions[sid].append(r) + + impacts = [] + for sid, reqs in sessions.items(): + reqs.sort(key=lambda x: x.get("ts", "")) + for i in range(1, len(reqs)): + try: + prev_dt = parse_ts(reqs[i - 1]["ts"]) + curr_dt = parse_ts(reqs[i]["ts"]) + interval_mins = (curr_dt - prev_dt).total_seconds() / 60 + if interval_mins > eviction_mins: + curr_hr = reqs[i].get("selected_hitRatio", -1) + impacts.append( + { + "session_id": sid, + "interval_mins": round(interval_mins, 1), + "hitRatio_after": curr_hr, + "evicted": curr_hr == 0, + } + ) + except (ValueError, KeyError): + pass + + return impacts + + +def _diagnose( + strategy_dist, + fallback_reasons, + session_stickiness, + suboptimal, + eviction_impact, + cold_starts, + total, + tokenizer_degraded_count, + hitratio_stats, +): + """生成 cache 调度诊断。""" + diagnoses = [] + + # Tokenizer 退化 + if tokenizer_degraded_count > 0: + pct = round(tokenizer_degraded_count / max(total, 1) * 100, 1) + sev = "HIGH" if pct > 10 else "MEDIUM" + diagnoses.append( + { + "severity": sev, + "message": f"Tokenizer 退化 {tokenizer_degraded_count} 次 ({pct}%),精度降低", + "source_layer": "Router", + } + ) + + # Fallback 比例 + for s in strategy_dist: + if s["value"] == "process_tokens" and s["pct"] > 20: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'Fallback 到 process_tokens {s["pct"]}%,cache-aware 策略未生效', + "source_layer": "Router", + } + ) + + # 非最优选择 + if suboptimal and 
total > 0: + pct = round(len(suboptimal) / total * 100, 1) + if pct > 20: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"非最优选择 {pct}%({len(suboptimal)}/{total})", + "source_layer": "Router", + } + ) + + # 冷启动 + if cold_starts > 0 and total > 0: + pct = round(cold_starts / total * 100, 1) + if pct > 10: + diagnoses.append( + {"severity": "LOW", "message": f"冷启动 {pct}%(hitRatios=map[])", "source_layer": "Router"} + ) + + # 驱逐影响 + evicted = [e for e in eviction_impact if e["evicted"]] + if evicted: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"{len(evicted)} 次驱逐后 hitRatio=0,考虑增大 eviction-duration-mins", + "source_layer": "Router", + } + ) + + # hitRatio 整体偏低 + if hitratio_stats.get("mean", 100) < 20: + diagnoses.append( + { + "severity": "LOW", + "message": f'平均 hitRatio {hitratio_stats["mean"]}%,缓存效果较差', + "source_layer": "Router", + } + ) + + return diagnoses + + +def _analyze_cross_diagnosis(session_stickiness, hitratio_stats, strategy_dist, eviction_impact): + """交叉诊断:基于粘性/命中率/fallback/驱逐给出简表。""" + if not session_stickiness: + return [] + avg_stickiness = sum(v["stickiness_pct"] for v in session_stickiness.values()) / max(len(session_stickiness), 1) + mean_hr = hitratio_stats.get("mean", 0) + fallback_pct = 0 + for s in strategy_dist: + if s.get("value") == "process_tokens": + fallback_pct = s.get("pct", 0) + break + evicted_cnt = sum(1 for e in eviction_impact if e.get("evicted")) + + diagnosis = "运行良好" + action = "-" + if avg_stickiness >= 70 and mean_hr >= 40 and fallback_pct < 10: + diagnosis = "运行良好" + elif avg_stickiness >= 70 and mean_hr < 20 and evicted_cnt > 0: + diagnosis = "疑似驱逐导致命中率低" + action = "考虑增大 eviction-duration-mins" + elif avg_stickiness < 40 and fallback_pct >= 20: + diagnosis = "低粘性 + 高 fallback" + action = "检查负载阈值与 cache-aware 参数" + elif avg_stickiness < 40 and mean_hr < 20: + diagnosis = "低粘性 + 低命中" + action = "检查缓存预热与 prompt 稳定性" + + return [ + { + "avg_stickiness_pct": round(avg_stickiness, 1), + 
"mean_hitRatio_pct": round(mean_hr, 1), + "fallback_pct": round(fallback_pct, 1), + "evicted_after_timeout": evicted_cnt, + "diagnosis": diagnosis, + "action": action, + } + ] + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_cache_report(result): + """将分析结果格式化为终端报告。""" + sections = ["## Cache 调度诊断", ""] + sections.append(f' {result["summary"]}') + sections.append("") + detail_sections = ["# Cache 调度详情", "", f'总结: {result["summary"]}', ""] + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + sections.append(" 诊断见详情: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md)") + sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") + + # 策略分布 + if result["strategy_dist"]: + sections.append("### 策略分布") + sections.append("") + bar_data = [{"label": s["value"], "value": s["pct"], "count": s["count"]} for s in result["strategy_dist"]] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + detail_sections.append("## 策略分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") + + # hitRatio 统计 + hs = result.get("hitratio_stats", {}) + if hs: + sections.append("### hitRatio 统计") + sections.append("") + sections.append( + f' mean={hs.get("mean",0)}% p50={hs.get("p50",0)}% p90={hs.get("p90",0)}% ' + f'p99={hs.get("p99",0)}% max={hs.get("max",0)}%' + ) + sections.append("") + + # Fallback 原因 + if result["fallback_reasons"]: + sections.append("### Fallback 原因分布") + sections.append("") + bar_data = [{"label": f["value"], "value": f["pct"], "count": f["count"]} for f in result["fallback_reasons"]] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + 
detail_sections.append("## Fallback 原因分布") + detail_sections.append("") + detail_sections.append(render_bar(bar_data, show_count=True)) + detail_sections.append("") + + # Tokenizer 退化 + if result.get("tokenizer_degraded_count", 0) > 0: + sections.append(f' Tokenizer 退化: {result["tokenizer_degraded_count"]} 次') + sections.append("") + + # Session 粘性 + stickiness = result.get("session_stickiness", {}) + sections.append("### Session 粘性") + sections.append("") + sections.append(" Session 粘性详情见: [detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md)") + sections.append("") + if stickiness: + table_data = [ + { + "Session": sid, + "请求数": str(s["total_requests"]), + "粘性率": f'{s["stickiness_pct"]}%', + "切换次数": str(s["switches"]), + } + for sid, s in sorted(stickiness.items(), key=lambda x: x[1]["stickiness_pct"]) + ] + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append( + render_table( + table_data, + columns=["Session", "请求数", "粘性率", "切换次数"], + right_align={"请求数", "粘性率", "切换次数"}, + ) + ) + detail_sections.append("") + else: + sections.append(" 未检测到可计算粘性的多请求 Session。") + sections.append("") + detail_sections.append("## Session 粘性") + detail_sections.append("") + detail_sections.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + detail_sections.append("") + + # 非最优选择 + subs = result.get("suboptimal_selections") or [] + sections.append(f"### 非最优选择 ({len(subs)} 次)") + sections.append("") + sections.append(" 详情见: [detail/cache_suboptimal.md](../detail/cache_suboptimal.md)") + sections.append("") + if subs: + reason_counts = defaultdict(int) + for s in subs: + reason_counts[s["reason"]] += 1 + for reason, count in sorted(reason_counts.items(), key=lambda x: -x[1]): + sections.append(f" {reason}: {count} 次") + sections.append("") + detail_sections.append("## 非最优选择(Top 20)") + detail_sections.append("") + for s in subs[:20]: + detail_sections.append( + f'- [{s.get("ts","")}] 
selected={s.get("selected","")}({s.get("selected_hr",0)}), best={s.get("best_hr_worker","")}({s.get("best_hr",0)}), reason={s.get("reason","")}' + ) + detail_sections.append("") + else: + sections.append(" 未发现非最优选择(selected_hitRatio 始终为当次最高)。") + sections.append("") + detail_sections.append("## 非最优选择") + detail_sections.append("") + detail_sections.append("- 未发现非最优选择。") + detail_sections.append("") + + # 驱逐影响 + evictions = result.get("eviction_impact") or [] + evicted = [e for e in evictions if e["evicted"]] + sections.append(f"### 驱逐影响 ({len(evictions)} 次超时, {len(evicted)} 次缓存失效)") + sections.append("") + sections.append(" 详情见: [detail/cache_eviction.md](../detail/cache_eviction.md)") + sections.append("") + if evictions: + detail_sections.append("## 驱逐影响") + detail_sections.append("") + for e in evictions[:50]: + detail_sections.append( + f'- session={e.get("session_id","")[:24]} interval={e.get("interval_mins",0)}m hitRatio_after={e.get("hitRatio_after",0)} evicted={e.get("evicted",False)}' + ) + detail_sections.append("") + else: + sections.append(" 未检测到超时导致的潜在驱逐影响。") + sections.append("") + detail_sections.append("## 驱逐影响") + detail_sections.append("") + detail_sections.append("- 未检测到超时驱逐样本。") + detail_sections.append("") + + # 冷启动 + if result.get("cold_starts", 0) > 0: + sections.append(f' 冷启动: {result["cold_starts"]} 次(hitRatios=map[])') + sections.append("") + detail_sections.append("## 冷启动识别") + detail_sections.append("") + detail_sections.append(f'- 冷启动次数: {result["cold_starts"]}') + detail_sections.append("") + + sections.append("### 交叉诊断") + sections.append("") + sections.append(" 详情见: [detail/cache_cross.md](../detail/cache_cross.md)") + sections.append("") + if result.get("cross_diagnosis"): + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append( + render_table( + result["cross_diagnosis"], + columns=[ + "avg_stickiness_pct", + "mean_hitRatio_pct", + "fallback_pct", + "evicted_after_timeout", + "diagnosis", + 
"action", + ], + right_align={"avg_stickiness_pct", "mean_hitRatio_pct", "fallback_pct", "evicted_after_timeout"}, + ) + ) + detail_sections.append("") + else: + sections.append(" 样本不足,未生成交叉诊断。") + sections.append("") + detail_sections.append("## 交叉诊断") + detail_sections.append("") + detail_sections.append("- 样本不足,未生成交叉诊断。") + detail_sections.append("") + + sections.append( + "> 详细诊断: [detail/cache_diagnosis.md](../detail/cache_diagnosis.md) | " + "[detail/cache_session_stickiness.md](../detail/cache_session_stickiness.md) | " + "[detail/cache_suboptimal.md](../detail/cache_suboptimal.md) | " + "[detail/cache_eviction.md](../detail/cache_eviction.md) | " + "[detail/cache_fallback.md](../detail/cache_fallback.md) | " + "[detail/cache_cross.md](../detail/cache_cross.md)" + ) + sections.append("") + + return "\n".join(sections), "\n".join(detail_sections) + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py new file mode 100644 index 00000000000..f0e4c352b6c --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/errors.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +""" +Errors Analyzer — 错误分类分析 + +分析 Router 
日志中的 ERROR/WARN 日志、HTTP 状态码分布、Panic 事件。 +按问题来源层(Router / FastDeploy 后端 / 客户端)标注每类错误。 +""" + +import os +import subprocess +import sys + +# 让 analyzers 能 import 同级 scripts 下的模块 +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_bar, render_sparkline, render_table +from log_parser import extract_ts, parse_error_line, parse_http_line +from stats import count_by, time_bucket + +# ════════════════════════════════════════════════════════════════ +# 错误来源层映射(从 error_catalog.md 提取的核心规则) +# ════════════════════════════════════════════════════════════════ + +# 模板 → 来源层 映射(归一化后的模板匹配) +SOURCE_LAYER_RULES = [ + # Router 自身 + ("Failed to build disaggregate_info", "Router"), + ("Failed to encode modified request", "Router"), + ("Panic recovered", "Router"), + ("DefaultManager is nil", "Router"), + ("double-release", "Router"), + ("counter already cleaned up", "Router"), + ("counter already zero", "Router"), + ("tokenizer failed", "Router"), + ("Instance {url} role is unknown", "Router"), + ("Failed to read YAML file config/register.yaml", "Router"), + # 客户端 + ("Invalid request body", "客户端"), + ("Invalid JSON format", "客户端"), + ("Failed to read request body", "客户端"), + ("Failed to unmarshal request JSON", "客户端"), + # FD 后端(默认多数 ERROR 来自后端) + ("Failed to select", "FD 后端"), + ("Failed to connect to backend", "FD 后端"), + ("No available", "FD 后端"), + ("request failed", "FD 后端"), + ("Removed unhealthy", "FD 后端"), + ("is not healthy", "FD 后端"), + ("is healthy", "FD 后端"), + ("Backend request failed", "FD 后端"), + ("Decode request failed", "FD 后端"), + ("Prefill request failed", "FD 后端"), + ("Failed to create decode request", "FD 后端"), + ("Failed to create prefill request", "FD 后端"), + ("Failed to create backend request", "FD 后端"), + ("GetRemoteMetrics failed", "FD 后端"), +] + +IMPACT_RULES = [ + ("Failed to select", "请求可能返回 502/503"), + ("Failed to connect to backend", "后端不可达,请求失败"), + ("Panic recovered", "Router 代码异常,可能影响稳定性"), + 
("scanner error", "流式响应中断"), + ("copy error", "非流式响应中断"), + ("Failed to read YAML file config/register.yaml", "可选配置未加载(若未启用可忽略)"), +] + +# scanner error / copy error 特殊处理:context canceled → 客户端,其他 → FD 后端 +SCANNER_COPY_PATTERNS = ("scanner error", "copy error") + + +def classify_source_layer(template, original=""): + """根据错误模板判断来源层。""" + # scanner error / copy error 特殊判断 + for pat in SCANNER_COPY_PATTERNS: + if pat in template or pat in original: + if "context canceled" in original: + return "客户端" + return "FD 后端" + + for pattern, layer in SOURCE_LAYER_RULES: + if pattern in template: + return layer + + return "未知" + + +def classify_impact(template): + for pattern, impact in IMPACT_RULES: + if pattern in template: + return impact + return "-" + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_errors(log_file, tail=None, top_n=20): + """分析日志中的错误。 + + Args: + log_file: 日志文件路径 + tail: 尾部行数限制(None 则全量) + top_n: 错误 Top N + + Returns: + dict: { + error_top_n: [{template, count, pct, source_layer, level, urls}], + status_code_dist: [{value, count, pct}], + panic_list: [{ts, context}], + error_rate: float, + error_trend: [{bucket, count}], + total_errors: int, + total_warns: int, + total_requests: int, + summary: str, + } + """ + # Phase 1: Grep 提取各类日志 + error_lines = _grep_lines(log_file, r"\[ERROR\]", tail) + warn_lines = _grep_lines(log_file, r"\[WARN\]", tail) + http_lines = _grep_lines(log_file, r"\[(POST|GET)\] /", tail) + panic_lines = _grep_lines(log_file, "Panic recovered", tail) + + # Phase 2: 解析 + # 2.1 ERROR + WARN 归一化 + error_records = [parse_error_line(line) for line in error_lines] + warn_records = [parse_error_line(line) for line in warn_lines] + all_error_records = error_records + warn_records + + # 2.2 HTTP 请求解析 + http_records = [] + for line in http_lines: + r = parse_http_line(line) + if r: + http_records.append(r) + + # 2.3 Panic 提取 + 
panic_list = [] + for line in panic_lines: + ts = extract_ts(line) + panic_list.append({"ts": ts or "", "context": line.strip()}) + + # Phase 3: 分析 + # 3.1 按模板分组 Top N + error_top = _compute_error_top_n(all_error_records, top_n) + + # 3.2 HTTP 状态码分布 + status_dist = count_by(http_records, "status") + + # 3.3 错误率 + total_requests = len(http_records) + non_200 = sum(1 for r in http_records if r["status"] != 200) + error_rate = round(non_200 / total_requests * 100, 2) if total_requests else 0 + + # 3.4 错误趋势(按时间窗口统计非 200 请求数) + non_200_records = [r for r in http_records if r["status"] != 200] + error_trend = time_bucket(non_200_records, window="auto") + + return { + "error_top_n": error_top, + "status_code_dist": status_dist, + "panic_list": panic_list, + "error_rate": error_rate, + "error_trend": error_trend, + "total_errors": len(error_records), + "total_warns": len(warn_records), + "total_requests": total_requests, + } + + +def _compute_error_top_n(records, top_n): + """按模板分组并标注来源层。""" + # 分组 + groups = {} + for r in records: + tpl = r["template"] + if tpl not in groups: + groups[tpl] = { + "template": tpl, + "count": 0, + "level": r["level"], + "originals": [], + } + groups[tpl]["count"] += 1 + # 保留最多 5 个原始消息用于详细报告中提取 URL + if len(groups[tpl]["originals"]) < 5: + groups[tpl]["originals"].append(r["original"]) + + total = len(records) + result = [] + for g in sorted(groups.values(), key=lambda x: -x["count"]): + source_layer = classify_source_layer(g["template"], g["originals"][0] if g["originals"] else "") + result.append( + { + "template": g["template"], + "count": g["count"], + "pct": round(g["count"] / total * 100, 1) if total else 0, + "source_layer": source_layer, + "impact": classify_impact(g["template"]), + "level": g["level"], + "urls": _extract_urls(g["originals"]), + "sample_originals": g["originals"], + } + ) + if len(result) >= top_n: + break + + return result + + +def _extract_urls(originals): + import re + + urls = set() + for line in originals: + for 
m in re.findall(r"https?://[A-Za-z0-9_.:-]+", line): + urls.add(m) + return sorted(urls) + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + # 先 tail 再 grep + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + """简单 shell 引号转义。""" + return "'" + s.replace("'", "'\\''") + "'" + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_errors_report(result): + """将分析结果格式化为终端报告。 + + Args: + result: analyze_errors 返回的 dict + + Returns: + str: 格式化后的报告文本 + """ + sections = [] + + # 标题 + sections.append("## 错误分析") + sections.append("") + + # 概览 + sections.append( + f' ERROR: {result["total_errors"]} | ' + f'WARN: {result["total_warns"]} | ' + f'请求总数: {result["total_requests"]} | ' + f'错误率: {result["error_rate"]}%' + ) + sections.append(" 指标口径: ERROR/WARN=日志级别计数;请求总数=HTTP 请求行数;错误率=非200请求数/请求总数×100%。") + if result["error_rate"] == 0 and (result["total_errors"] > 0 or result["total_warns"] > 0): + sections.append(" ℹ 错误率为 0.0% 仅表示 HTTP 状态码均为 200;并不代表没有 ERROR/WARN 日志。") + sections.append("") + + # Panic + if result["panic_list"]: + sections.append(f' ⚠ Panic 事件: {len(result["panic_list"])} 次') + for p in result["panic_list"][:5]: + sections.append(f' [{p["ts"]}] {p["context"][:100]}') + sections.append("") + + # 错误 Top N + if result["error_top_n"]: + sections.append("### ERROR/WARN Top 分类") + sections.append("") + bar_data = [] + for e in result["error_top_n"][:10]: + label = e["template"][:50] + bar_data.append( + { + "label": 
label, + "value": e["pct"], + "count": e["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + sections.append(" 具体模板表见: [../detail/errors_topn.md](../detail/errors_topn.md)") + sections.append("") + yaml_missing_count = sum( + e["count"] for e in result["error_top_n"] if "Failed to read YAML file config/register.yaml" in e["template"] + ) + if yaml_missing_count > 0: + sections.append( + f" ℹ `Failed to read YAML file config/register.yaml` 出现 {yaml_missing_count} 次:若未启用该配置文件,可忽略。" + ) + sections.append("") + + # 状态码分布 + if result["status_code_dist"]: + sections.append("### HTTP 状态码分布") + sections.append("") + bar_data = [] + for s in result["status_code_dist"]: + bar_data.append( + { + "label": str(s["value"]), + "value": s["pct"], + "count": s["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 错误趋势 + if result["error_trend"] and len(result["error_trend"]) > 1: + sections.append("### 非 200 请求趋势") + sections.append("") + sections.append( + render_sparkline( + result["error_trend"], + value_field="count", + title="Error Count", + y_label="req", + ) + ) + sections.append("") + + return "\n".join(sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py new file mode 100644 index 00000000000..5d1994d9405 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/health.py @@ -0,0 +1,454 @@ +#!/usr/bin/env python3 +""" +Health Analyzer — Worker 健康时间线分析 + +追踪 Worker 上下线事件、恢复检测、可用性统计。 +按 Worker URL 聚合事件,构建状态时间线。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from chart import render_table, render_timeline +from log_parser import extract_ts, parse_http_line, parse_ts + +# 
════════════════════════════════════════════════════════════════ +# 健康事件解析 +# ════════════════════════════════════════════════════════════════ + +WORKER_URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +NOT_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is not healthy") +REMOVED_RE = re.compile(rf"Removed unhealthy \w+ instance:\s*{WORKER_URL_RE}") +IS_HEALTHY_RE = re.compile(rf"{WORKER_URL_RE}\s+is healthy") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{WORKER_URL_RE}") +CLEANUP_UNHEALTHY_RE = re.compile(rf"cleanup unhealthy.*?{WORKER_URL_RE}") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + +def parse_health_event(line): + """解析 H2 健康事件行。返回 {ts, worker, event_type} 或 None。""" + ts = extract_ts(line) + m = REMOVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "REMOVED"} + m = NOT_HEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "NOT_HEALTHY"} + m = IS_HEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "HEALTHY"} + return None + + +def parse_counter_preserved(line): + """解析 H5 counter preserved / cleanup 事件。""" + ts = extract_ts(line) + m = COUNTER_PRESERVED_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "COUNTER_PRESERVED"} + m = CLEANUP_UNHEALTHY_RE.search(line) + if m: + return {"ts": ts, "worker": m.group(1), "event_type": "CLEANUP_UNHEALTHY"} + return None + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_health(log_file, tail=None): + """分析 Worker 健康状态。 + + Returns: + dict: {workers, diagnoses, time_range, summary} + """ + h2_lines = _grep_lines(log_file, r"Removed unhealthy|is not healthy|is healthy", tail) + h5_lines = _grep_lines(log_file, r"counter preserved|cleanup unhealthy", tail) + register_lines = _grep_lines(log_file, r"\[POST\] /register", tail) + + 
health_events = [e for line in h2_lines for e in [parse_health_event(line)] if e] + counter_events = [e for line in h5_lines for e in [parse_counter_preserved(line)] if e] + + register_events = [] + for line in register_lines: + r = parse_http_line(line) + if r and r["method"] == "POST" and r["path"] == "/register" and r["status"] == 200: + register_events.append({"ts": r["ts"], "client_ip": r["client_ip"]}) + + if not health_events and not register_events: + return { + "workers": {}, + "diagnoses": [], + "time_range": {"start": "", "end": ""}, + "summary": "未检测到 Worker 健康事件", + } + + workers = _build_worker_timelines(health_events, counter_events, register_events) + + all_ts = sorted([e["ts"] for e in health_events + register_events if e.get("ts")]) + time_range = {"start": all_ts[0] if all_ts else "", "end": all_ts[-1] if all_ts else ""} + + diagnoses = _diagnose(workers) + down_workers = sum(1 for w in workers.values() if w["down_count"] > 0) + + return { + "workers": workers, + "diagnoses": diagnoses, + "time_range": time_range, + "summary": f"{len(workers)} Worker(s), {down_workers} 有下线事件", + } + + +def _build_worker_timelines(health_events, counter_events, register_events): + """构建每个 Worker 的状态时间线。""" + worker_urls = {evt["worker"] for evt in health_events} + + # IP → worker URL 映射 + ip_to_urls = defaultdict(set) + for url in worker_urls: + ip_m = re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) + if ip_m: + ip_to_urls[ip_m.group(1)].add(url) + + worker_events = defaultdict(list) + for evt in health_events: + worker_events[evt["worker"]].append(evt) + + counter_counts = defaultdict(int) + for evt in counter_events: + if evt["event_type"] == "COUNTER_PRESERVED": + counter_counts[evt["worker"]] += 1 + + register_by_ip = defaultdict(list) + for evt in register_events: + register_by_ip[evt["client_ip"]].append(evt) + + workers = {} + for url in sorted(worker_urls): + events = sorted(worker_events[url], key=lambda e: e["ts"] or "") + ip_m = 
re.search(r"(?:https?://)?(\d+\.\d+\.\d+\.\d+)", url) + worker_ip = ip_m.group(1) if ip_m else "" + + # 恢复检测:REMOVED 后有 register + recovered = False + recovery_events = [] + for evt in events: + if evt["event_type"] == "REMOVED" and worker_ip: + for reg in register_by_ip.get(worker_ip, []): + if reg["ts"] and evt["ts"] and reg["ts"] > evt["ts"]: + recovered = True + recovery_events.append({"ts": reg["ts"], "type": "RE-REGISTERED"}) + break + + all_events = [{"ts": e["ts"], "type": e["event_type"]} for e in events] + for reg in register_by_ip.get(worker_ip, []): + all_events.append({"ts": reg["ts"], "type": "REGISTERED"}) + all_events.extend(recovery_events) + all_events.sort(key=lambda e: e["ts"] or "") + + down_periods = _compute_down_periods(all_events) + down_count = len(down_periods) + avg_down_s = (sum(p["duration_s"] for p in down_periods) / len(down_periods)) if down_periods else 0.0 + detect_latency = _compute_detect_latency(all_events) + + workers[url] = { + "events": all_events, + "uptime_pct": _compute_uptime_pct(all_events), + "down_count": down_count, + "avg_down_duration_s": round(avg_down_s, 1), + "recovered": recovered, + "inflight_preserved": counter_counts.get(url, 0), + "down_periods": down_periods, + "avg_detect_latency_s": detect_latency, + } + + return workers + + +def _compute_down_periods(events): + """从事件列表计算下线时段。""" + down_periods = [] + down_start = None + for evt in events: + if evt["type"] in ("NOT_HEALTHY", "REMOVED"): + if down_start is None and evt["ts"]: + down_start = evt["ts"] + elif evt["type"] in ("HEALTHY", "RE-REGISTERED"): + if down_start is not None and evt["ts"]: + try: + duration_s = (parse_ts(evt["ts"]) - parse_ts(down_start)).total_seconds() + down_periods.append({"start": down_start, "end": evt["ts"], "duration_s": max(0, duration_s)}) + except ValueError: + pass + down_start = None + if down_start is not None: + down_periods.append({"start": down_start, "end": None, "duration_s": 0}) + return down_periods + + +def 
_compute_detect_latency(events): + """计算 NOT_HEALTHY -> REMOVED 平均检测延迟(秒)。""" + last_unhealthy = None + latencies = [] + for evt in events: + if evt["type"] == "NOT_HEALTHY" and evt.get("ts"): + last_unhealthy = evt["ts"] + elif evt["type"] == "REMOVED" and last_unhealthy and evt.get("ts"): + try: + latencies.append((parse_ts(evt["ts"]) - parse_ts(last_unhealthy)).total_seconds()) + except ValueError: + pass + last_unhealthy = None + if not latencies: + return "-" + return round(sum(latencies) / len(latencies), 1) + + +def _compute_uptime_pct(events): + """计算 Worker 可用性百分比。""" + if not events: + return 100.0 + ts_list = [e["ts"] for e in events if e["ts"]] + if len(ts_list) < 2: + return 0.0 if events[0]["type"] in ("NOT_HEALTHY", "REMOVED") else 100.0 + try: + first_dt, last_dt = parse_ts(ts_list[0]), parse_ts(ts_list[-1]) + total_s = (last_dt - first_dt).total_seconds() + if total_s <= 0: + return 100.0 + except ValueError: + return 100.0 + + down_s, down_start = 0.0, None + for evt in events: + if evt["type"] in ("NOT_HEALTHY", "REMOVED") and down_start is None and evt["ts"]: + try: + down_start = parse_ts(evt["ts"]) + except ValueError: + pass + elif evt["type"] in ("HEALTHY", "RE-REGISTERED") and down_start is not None and evt["ts"]: + try: + down_s += (parse_ts(evt["ts"]) - down_start).total_seconds() + except ValueError: + pass + down_start = None + if down_start is not None: + down_s += (last_dt - down_start).total_seconds() + + return round(max(0, total_s - down_s) / total_s * 100, 1) + + +def _diagnose(workers): + """根据 Worker 健康数据生成诊断。""" + diagnoses = [] + if not workers: + return diagnoses + + all_down = all(w["events"] and w["events"][-1]["type"] in ("NOT_HEALTHY", "REMOVED") for w in workers.values()) + if all_down: + diagnoses.append( + { + "severity": "CRITICAL", + "message": f"所有 Worker ({len(workers)}) 当前均不可用", + "source_layer": "FD 后端", + } + ) + + for url, w in workers.items(): + s = _strip_scheme(url) + if w["down_count"] > 3: + 
diagnoses.append( + { + "severity": "HIGH", + "message": f'{s} 下线 {w["down_count"]} 次,Worker 不稳定', + "source_layer": "FD 后端", + } + ) + for p in w.get("down_periods", []): + if p["duration_s"] > 300: + diagnoses.append( + { + "severity": "HIGH", + "message": f'{s} 下线 {p["duration_s"]/60:.1f}min({p["start"]} ~ {p["end"] or "未恢复"})', + "source_layer": "FD 后端", + } + ) + if len(w["events"]) >= 3: + ts_list = [e["ts"] for e in w["events"] if e["ts"]] + if len(ts_list) >= 2: + try: + hours = (parse_ts(ts_list[-1]) - parse_ts(ts_list[0])).total_seconds() / 3600 + if hours > 0 and len(w["events"]) / hours > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{s} 状态变更频繁 ({len(w["events"])/hours:.1f} 次/小时)', + "source_layer": "FD 后端", + } + ) + except ValueError: + pass + if w["inflight_preserved"] > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{s} counter preserved {w["inflight_preserved"]} 次(下线时仍有 inflight 请求)', + "source_layer": "FD 后端", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_health_report(result): + """将分析结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_text) + summary_text: 总结部分(诊断 + 可用性表格 + 时间线) + detail_text: 事件详情(逐条事件记录,可能很长) + """ + sections = ["## Worker 健康分析", ""] + if not result["workers"]: + sections.append(" 未检测到 Worker 健康事件(所有 Worker 状态正常或无健康日志)") + return "\n".join(sections), "" + + sections.append(f' {result["summary"]}') + if result["time_range"]["start"]: + sections.append(f' 时间范围: {result["time_range"]["start"]} ~ {result["time_range"]["end"]}') + sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + sections.append(" 诊断见详情: [detail/health_events.md](../detail/health_events.md)") + sections.append("") + + # Worker 可用性表格 + sections.append("### Worker 可用性") + sections.append("") + table_data = [] + for url, w in 
sorted(result["workers"].items()): + avg_down = "" + if w["avg_down_duration_s"] > 0: + avg_down = ( + f'{w["avg_down_duration_s"]/60:.1f}min' + if w["avg_down_duration_s"] >= 60 + else f'{w["avg_down_duration_s"]:.0f}s' + ) + table_data.append( + { + "Worker": _strip_scheme(url), + "在线率": f'{w["uptime_pct"]}%', + "下线次数": str(w["down_count"]), + "平均下线时长": avg_down or "-", + "检测延迟": (f'{w["avg_detect_latency_s"]}s' if w["avg_detect_latency_s"] != "-" else "-"), + "恢复": "是" if w["recovered"] else ("否" if w["down_count"] > 0 else "-"), + "inflight保留": str(w["inflight_preserved"]) if w["inflight_preserved"] > 0 else "-", + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "在线率", "下线次数", "平均下线时长", "检测延迟", "恢复", "inflight保留"], + right_align={"在线率", "下线次数", "平均下线时长", "检测延迟", "inflight保留"}, + ) + ) + sections.append("") + + # 时间线 + if result["time_range"]["start"] and result["time_range"]["end"]: + sections.append("### Worker 时间线") + sections.append("") + timeline_data = _build_timeline_data(result) + if timeline_data: + sections.append(render_timeline(timeline_data, width=40)) + sections.append("") + + # 事件详情 → 拆分到 detail_text + detail_parts = ["# Worker 健康事件详情", ""] + has_events = False + if result.get("diagnoses"): + detail_parts.append("## 诊断") + detail_parts.append("") + for d in result["diagnoses"]: + detail_parts.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_parts.append("") + for url, w in sorted(result["workers"].items()): + if w["events"]: + has_events = True + detail_parts.append(f"## {_strip_scheme(url)}") + detail_parts.append("") + for evt in w["events"]: + detail_parts.append(f' [{evt["ts"]}] {evt["type"]}') + detail_parts.append("") + + detail_text = "\n".join(detail_parts) if has_events else "" + + # 主报告中添加引用 + if has_events: + sections.append("> 完整事件详情: [detail/health_events.md](../detail/health_events.md)") + sections.append("") + + return "\n".join(sections), detail_text + + +def 
_build_timeline_data(result): + """构建 render_timeline 需要的数据格式。""" + tr = result["time_range"] + if not tr["start"] or not tr["end"]: + return None + workers_data = {} + for url, w in result["workers"].items(): + periods = [] + status, start = "up", tr["start"] + for evt in w["events"]: + if not evt["ts"]: + continue + if evt["type"] in ("NOT_HEALTHY", "REMOVED") and status == "up": + periods.append({"from": start, "to": evt["ts"], "status": "up"}) + status, start = "down", evt["ts"] + elif evt["type"] in ("HEALTHY", "RE-REGISTERED") and status == "down": + periods.append({"from": start, "to": evt["ts"], "status": "down"}) + status, start = "up", evt["ts"] + periods.append({"from": start, "to": tr["end"], "status": status}) + workers_data[url] = periods + return {"start": tr["start"], "end": tr["end"], "workers": workers_data} + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py new file mode 100644 index 00000000000..508cf3824d9 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/latency.py @@ -0,0 +1,353 @@ +#!/usr/bin/env python3 +""" +Latency Analyzer — 延迟分析 + +分析 
#!/usr/bin/env python3
"""Latency analyzer.

Computes request-latency percentiles, latency distribution, throughput
trend, scheduling overhead, and slow requests from Router logs. Only
inference paths (/v1/chat/completions, /v1/completions) are counted.
"""

import os
import subprocess
import sys
from collections import defaultdict

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from chart import render_bar, render_sparkline, render_table
from log_parser import TS_MS_RE, extract_tags, parse_http_line
from stats import compute_statistics, time_bucket

# ════════════════════════════════════════════════════════════════
# Scheduling-latency parsing
# ════════════════════════════════════════════════════════════════


def _parse_scheduling_ms(ts_ms_lines):
    """Derive per-request scheduling latency from ts_ms log lines.

    Two or more ts_ms lines sharing a request_id bracket the scheduling
    phase; the max-min timestamp spread for that request_id is its
    scheduling duration. Returns a list of durations in milliseconds.
    """
    from datetime import datetime

    stamps_by_rid = defaultdict(list)
    for line in ts_ms_lines:
        m = TS_MS_RE.search(line)
        if not m:
            continue
        rid = extract_tags(line).get("request_id", "")
        if not rid:
            continue
        try:
            stamps_by_rid[rid].append(datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S.%f"))
        except ValueError:
            pass

    durations = []
    for stamps in stamps_by_rid.values():
        if len(stamps) >= 2:
            spread_ms = (max(stamps) - min(stamps)).total_seconds() * 1000
            durations.append(round(spread_ms, 3))
    return durations


# ════════════════════════════════════════════════════════════════
# Main analysis entry point
# ════════════════════════════════════════════════════════════════

LATENCY_DIST_SPEC = "<100,100-500,500-1000,1000-5000,5000-10000,>10000"


def analyze_latency(log_file, tail=None):
    """Analyze request latency in the given log file.

    Args:
        log_file: path to the router log.
        tail: optional limit to the last N lines.

    Returns:
        dict with keys: stats, latency_trend, throughput_trend,
        slow_top10, scheduling_stats (or None), diagnoses.
    """
    # Phase 1: grep extraction.
    http_lines = _grep_lines(log_file, r"\[(POST|GET)\] /", tail)
    ts_ms_lines = _grep_lines(log_file, "ts_ms=", tail)

    # Phase 2: parse HTTP lines, keeping inference paths only.
    http_records = [rec for rec in (parse_http_line(line, inference_only=True) for line in http_lines) if rec]

    # Phase 3.1: latency percentiles and distribution.
    latency_values = [rec["latency_ms"] for rec in http_records]
    stats = compute_statistics(
        latency_values,
        percentiles_list=[50, 90, 95, 99],
        distribution_spec=LATENCY_DIST_SPEC,
    )

    # 3.2: p50 latency trend over time buckets.
    latency_trend = time_bucket(http_records, window="auto", agg_specs=[("latency_ms", "p50")])

    # 3.3: throughput trend (request count per bucket).
    throughput_trend = time_bucket(http_records, window="auto")

    # 3.4: slowest 10 requests.
    slow_top10 = [
        {
            "ts": rec["ts"],
            "path": rec["path"],
            "status": rec["status"],
            "latency_ms": rec["latency_ms"],
            "client_ip": rec["client_ip"],
        }
        for rec in sorted(http_records, key=lambda r: -r["latency_ms"])[:10]
    ]

    # 3.5: scheduling-latency percentiles, when ts_ms lines exist.
    scheduling_stats = None
    if ts_ms_lines:
        durations = _parse_scheduling_ms(ts_ms_lines)
        if durations:
            raw = compute_statistics(durations, percentiles_list=[50, 90, 99])
            scheduling_stats = {
                "p50": raw["p50"],
                "p90": raw["p90"],
                "p99": raw["p99"],
                "count": raw["count"],
            }

    # 3.6: diagnostic rules.
    diagnoses = _run_diagnostics(stats, scheduling_stats)

    return {
        "stats": stats,
        "latency_trend": latency_trend,
        "throughput_trend": throughput_trend,
        "slow_top10": slow_top10,
        "scheduling_stats": scheduling_stats,
        "diagnoses": diagnoses,
    }
+ if scheduling_stats and scheduling_stats["p99"] < 100: + diagnoses.append( + { + "message": f'p99={p99:.0f}ms 但调度仅 {scheduling_stats["p99"]:.0f}ms → 延迟在后端推理层', + "severity": "HIGH", + } + ) + elif scheduling_stats and scheduling_stats["p99"] >= 100: + diagnoses.append( + { + "message": f'p99={p99:.0f}ms 且调度 p99={scheduling_stats["p99"]:.0f}ms → 调度层瓶颈', + "severity": "CRITICAL", + } + ) + else: + diagnoses.append( + { + "message": f"p99={p99:.0f}ms (>10s),后端推理延迟高", + "severity": "HIGH", + } + ) + + # 尾延迟 + if p50 > 0 and p99 / p50 > 10: + diagnoses.append( + { + "message": f"p99/p50={p99/p50:.1f}x → 尾延迟严重", + "severity": "MEDIUM", + } + ) + + if not diagnoses: + diagnoses.append( + { + "message": f"延迟正常 (p50={p50:.0f}ms, p99={p99:.0f}ms)", + "severity": "INFO", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + """用 grep 从日志文件提取匹配行。""" + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_latency_report(result): + """将分析结果格式化为终端报告。""" + sections = [] + stats = result["stats"] + + sections.append("## 延迟分析") + sections.append("") + + if stats["count"] == 0: + sections.append(" 未找到推理请求 (/v1/chat/completions, /v1/completions)") + return "\n".join(sections) + + # 百分位数概览 + 
sections.append( + f' 推理请求: {stats["count"]} | ' + f'p50={_fmt_ms(stats["p50"])} p90={_fmt_ms(stats["p90"])} ' + f'p95={_fmt_ms(stats["p95"])} p99={_fmt_ms(stats["p99"])} ' + f'max={_fmt_ms(stats["max"])}' + ) + sections.append(" 指标口径: pXX=延迟分位数;吞吐量=每个时间桶内请求数(count);调度耗时=同 request_id 的 ts_ms(max-min)。") + sections.append("") + + # 延迟分布 + if stats.get("distribution"): + sections.append("### 延迟分布") + sections.append("") + bar_data = [] + for d in stats["distribution"]: + bar_data.append( + { + "label": d["range"], + "value": d["pct"], + "count": d["count"], + } + ) + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 延迟趋势 + if result["latency_trend"] and len(result["latency_trend"]) > 1: + sections.append("### 延迟趋势 (p50)") + sections.append("") + sections.append( + render_sparkline( + result["latency_trend"], + value_field="latency_ms_p50", + title="p50 Latency", + y_label="ms", + ) + ) + sections.append("") + + # 吞吐量趋势 + if result["throughput_trend"] and len(result["throughput_trend"]) > 1: + sections.append("### 吞吐量趋势") + sections.append("") + sections.append( + render_sparkline( + result["throughput_trend"], + value_field="count", + title="Throughput", + y_label="req", + ) + ) + sections.append("") + + # 调度耗时 + if result["scheduling_stats"]: + ss = result["scheduling_stats"] + sections.append(f'### 调度耗时 ({ss["count"]} samples)') + sections.append(f' p50={_fmt_ms(ss["p50"])} p90={_fmt_ms(ss["p90"])} p99={_fmt_ms(ss["p99"])}') + sections.append("") + + # 慢请求 Top 10 + if result["slow_top10"]: + sections.append("### 慢请求 Top 10") + sections.append("") + table_data = [] + for r in result["slow_top10"]: + table_data.append( + { + "时间": r["ts"][-8:] if len(r["ts"]) > 8 else r["ts"], + "延迟": _fmt_ms(r["latency_ms"]), + "状态": str(r["status"]), + "路径": r["path"], + "Client": r["client_ip"], + } + ) + sections.append( + render_table( + table_data, + columns=["时间", "延迟", "状态", "路径", "Client"], + ) + ) + sections.append("") + + # 诊断(仅在 detail 输出) 
+ if result["diagnoses"]: + sections.append("### 诊断") + sections.append(" 诊断见详情: [detail/latency_diagnoses.md](../detail/latency_diagnoses.md)") + sections.append("") + + return "\n".join(sections) + + +def _fmt_ms(ms): + """格式化毫秒值为人类可读字符串。""" + if ms >= 60000: + return f"{ms/60000:.1f}min" + elif ms >= 1000: + return f"{ms/1000:.2f}s" + elif ms >= 1: + return f"{ms:.1f}ms" + else: + return f"{ms*1000:.0f}µs" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py new file mode 100644 index 00000000000..83b9c8a05e1 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load.py @@ -0,0 +1,367 @@ +#!/usr/bin/env python3 +""" +Load Analyzer — 负载与计数器分析 + +分析 Worker 负载分布、计数器异常、请求堆积检测、token 计数器。 +""" + +import os +import re +import subprocess +import sys +from collections import defaultdict + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from log_parser import extract_ts, match_select_release, parse_stats_line +from stats import compute_statistics, time_bucket + +# ════════════════════════════════════════════════════════════════ +# Counter 异常检测正则 +# ════════════════════════════════════════════════════════════════ + +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +DOUBLE_RELEASE_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?double-release") +COUNTER_CLEANED_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?counter already cleaned up") +COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{URL_RE}") +TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}") + +# Token 事件 +SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +SELECT_REQ_COUNT_RE = re.compile(rf"select worker 
#!/usr/bin/env python3
"""Load analyzer.

Analyzes worker load distribution, counter anomalies, request pile-up
detection, and token counters from Router logs.
"""

import os
import re
import subprocess
import sys
from collections import defaultdict

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from log_parser import extract_ts, match_select_release, parse_stats_line
from stats import compute_statistics, time_bucket

# ════════════════════════════════════════════════════════════════
# Counter-anomaly detection patterns
# ════════════════════════════════════════════════════════════════

URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)"
DOUBLE_RELEASE_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?double-release")
COUNTER_CLEANED_RE = re.compile(rf"release worker:\s*{URL_RE}\s+skipped.*?counter already cleaned up")
COUNTER_PRESERVED_RE = re.compile(rf"counter preserved.*?{URL_RE}")
TOKEN_PRESERVED_RE = re.compile(rf"token counter preserved.*?{URL_RE}")

# Token / request-count events
SELECT_TOKENS_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*tokens:\s*(\d+)")
RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*{URL_RE},\s*tokens:\s*(\d+)")
SELECT_REQ_COUNT_RE = re.compile(rf"select worker \((\w+)\):\s*{URL_RE},\s*count:\s*(\d+)")
RELEASE_REQ_COUNT_RE = re.compile(rf"release worker:\s*{URL_RE},\s*count:\s*(\d+)")


def _strip_scheme(url):
    """Drop a leading http:// or https:// scheme."""
    return re.sub(r"^https?://", "", url)


def _normalize_worker_type(worker_type):
    """Map a raw worker-type string to prefill/decode/mixed/unknown."""
    t = (worker_type or "unknown").lower()
    if t in ("prefill", "decode", "mixed"):
        return t
    return "unknown"


def parse_counter_anomaly(line):
    """Parse an H5 counter-anomaly line.

    Returns {ts, worker, anomaly_type} or None.

    Fix: TOKEN_PRESERVED_RE must be tried before COUNTER_PRESERVED_RE —
    a "token counter preserved ..." line also contains the substring
    "counter preserved", so the original ordering made the
    "token-preserved" classification unreachable.
    """
    ts = extract_ts(line)
    m = DOUBLE_RELEASE_RE.search(line)
    if m:
        return {"ts": ts, "worker": m.group(1), "anomaly_type": "double-release"}
    m = COUNTER_CLEANED_RE.search(line)
    if m:
        return {"ts": ts, "worker": m.group(1), "anomaly_type": "counter-cleaned-up"}
    m = TOKEN_PRESERVED_RE.search(line)
    if m:
        return {"ts": ts, "worker": m.group(1), "anomaly_type": "token-preserved"}
    m = COUNTER_PRESERVED_RE.search(line)
    if m:
        return {"ts": ts, "worker": m.group(1), "anomaly_type": "counter-preserved"}
    return None


# ════════════════════════════════════════════════════════════════
# Main analysis entry point
# ════════════════════════════════════════════════════════════════


def analyze_load(log_file, tail=None):
    """Analyze load and counters.

    Returns:
        dict: {load_stats, worker_load, load_trend, counter_anomalies,
        select_release, token_stats, counter_last_state, pileup_detected,
        diagnoses, summary}
    """
    h7_lines = _grep_lines(log_file, r"\[stats\]", tail)
    h3_lines = _grep_lines(log_file, r"select worker|release worker|Failed to select", tail)
    h5_lines = _grep_lines(
        log_file,
        r"counter preserved|cleanup unhealthy|removed counters|counter already|double-release|preserved counters",
        tail,
    )
    h11_lines = _grep_lines(log_file, r"release [a-zA-Z_]+ tokens:", tail)

    # Parse periodic [stats] sample lines.
    stats_records = [rec for rec in (parse_stats_line(line) for line in h7_lines) if rec]

    # Aggregate total_running statistics.
    total_running_vals = [r["total_running"] for r in stats_records if "total_running" in r]
    load_stats = compute_statistics(total_running_vals) if total_running_vals else {}

    # Per-worker running distribution.
    worker_running = defaultdict(list)
    for r in stats_records:
        for w_url, running in r.get("workers", {}).items():
            worker_running[w_url].append(running)

    worker_load = []
    for w_url in sorted(worker_running.keys()):
        vals = worker_running[w_url]
        avg = sum(vals) / len(vals) if vals else 0
        worker_load.append(
            {
                "worker": _strip_scheme(w_url),
                "avg_running": round(avg, 1),
                "max_running": max(vals) if vals else 0,
                "samples": len(vals),
            }
        )

    # Load trend over time buckets.
    load_trend = (
        time_bucket(stats_records, window="auto", agg_specs=[("total_running", "mean")]) if stats_records else []
    )

    # Counter anomalies grouped by type and worker.
    counter_anomalies = defaultdict(lambda: defaultdict(int))
    for line in h5_lines:
        evt = parse_counter_anomaly(line)
        if evt:
            counter_anomalies[evt["anomaly_type"]][evt["worker"]] += 1

    anomaly_summary = [
        {"type": atype, "total": sum(workers.values()), "workers": dict(workers)}
        for atype, workers in counter_anomalies.items()
    ]

    # Select/release matching (empty scaffold when no H3 lines exist).
    sr_result = (
        match_select_release(h3_lines + h11_lines)
        if h3_lines
        else {
            "matched": [],
            "unmatched_selects": [],
            "unmatched_releases": [],
            "failed_selects": [],
            "per_worker": {},
            "id_coverage": {},
            "type_summary": {},
            "worker_type_profile": {},
        }
    )

    # Token statistics and last-known counter state.
    token_stats = _analyze_tokens(h3_lines, h11_lines)
    counter_last_state = _analyze_counter_last_state(h3_lines + h11_lines)

    # Request pile-up detection.
    pileup = _detect_pileup(stats_records)

    diagnoses = _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup)

    return {
        "load_stats": load_stats,
        "worker_load": worker_load,
        "load_trend": load_trend,
        "counter_anomalies": anomaly_summary,
        "select_release": sr_result,
        "token_stats": token_stats,
        "counter_last_state": counter_last_state,
        "pileup_detected": pileup,
        "diagnoses": diagnoses,
        "summary": f"{len(stats_records)} stats 采样, {len(worker_running)} Worker(s)",
    }
+def _analyze_tokens(h3_lines, h11_lines): + """分析 token 分配与释放。""" + token_alloc = defaultdict(list) + token_release = defaultdict(list) + + for line in h3_lines: + m = SELECT_TOKENS_RE.search(line) + if m: + token_alloc[m.group(2)].append(int(m.group(3))) + + for line in h11_lines: + m = RELEASE_TOKENS_RE.search(line) + if m: + token_release[m.group(2)].append(int(m.group(3))) + + result = [] + all_workers = set(token_alloc.keys()) | set(token_release.keys()) + for w in sorted(all_workers): + allocs = token_alloc.get(w, []) + releases = token_release.get(w, []) + result.append( + { + "worker": _strip_scheme(w), + "alloc_count": len(allocs), + "alloc_avg": round(sum(allocs) / len(allocs), 0) if allocs else 0, + "release_count": len(releases), + } + ) + return result + + +def _analyze_counter_last_state(lines): + """统计每个 worker 的 request/token counter 最后一条计数日志值与动作类型。""" + state = defaultdict( + lambda: { + "req_last_action": "-", + "req_last_value": "-", + "token_last_action": "-", + "token_last_value": "-", + "last_ts": "", + } + ) + for line in lines: + ts = extract_ts(line) or "" + m = SELECT_REQ_COUNT_RE.search(line) + if m: + w = m.group(2) + state[w]["req_last_action"] = "select" + state[w]["req_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_REQ_COUNT_RE.search(line) + if m: + w = m.group(1) + state[w]["req_last_action"] = "release" + state[w]["req_last_value"] = m.group(2) + state[w]["last_ts"] = ts + continue + m = SELECT_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "select" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + m = RELEASE_TOKENS_RE.search(line) + if m: + w = m.group(2) + state[w]["token_last_action"] = "release" + state[w]["token_last_value"] = m.group(3) + state[w]["last_ts"] = ts + continue + + result = [] + for w in sorted(state.keys()): + s = state[w] + result.append({"worker": _strip_scheme(w), **s}) + return result + + +def 
_detect_pileup(stats_records): + """检测请求堆积:total_running 连续上升 >5 个采样点。""" + if len(stats_records) < 5: + return False + vals = [r.get("total_running", 0) for r in stats_records] + max_consecutive = 0 + current = 0 + for i in range(1, len(vals)): + if vals[i] > vals[i - 1]: + current += 1 + max_consecutive = max(max_consecutive, current) + else: + current = 0 + return max_consecutive >= 5 + + +def _diagnose(load_stats, worker_load, anomaly_summary, sr_result, token_stats, pileup): + """生成负载诊断。""" + diagnoses = [] + + if pileup: + diagnoses.append( + {"severity": "HIGH", "message": "total_running 持续上升,疑似请求堆积", "source_layer": "FD 后端"} + ) + + # 空闲 Worker + for w in worker_load: + if w["avg_running"] == 0 and w["samples"] > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{w["worker"]} running 持续 =0(空闲或故障未移除)', + "source_layer": "Router", + } + ) + + # 负载严重不均 + if load_stats.get("stddev", 0) > 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'负载标准差 {load_stats["stddev"]},分布不均衡', + "source_layer": "Router", + } + ) + + # Counter 异常 + for a in anomaly_summary: + if a["type"] == "double-release" and a["total"] > 0: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'double-release {a["total"]} 次(计数器逻辑 bug)', + "source_layer": "Router", + } + ) + + id_cov = sr_result.get("id_coverage", {}) + has_correlatable_ids = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) > 0 + + # Select/Release 不一致(仅在存在可关联 ID 时启用,避免无 ID 场景误报) + if has_correlatable_ids: + for w_url, pw in sr_result.get("per_worker", {}).items(): + delta = pw.get("delta", 0) + if delta >= 3: + diagnoses.append( + { + "severity": "MEDIUM", + "message": f"{_strip_scheme(w_url)} select-release 差值 {delta}(可能存在在途请求堆积)", + "source_layer": "FD 后端", + } + ) + + # Token 计数器潜在泄漏 + for t in token_stats: + if t.get("alloc_count", 0) > t.get("release_count", 0): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'{t["worker"]} token alloc/release 不平衡 
({t["alloc_count"]}/{t["release_count"]})', + "source_layer": "Router", + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py new file mode 100644 index 00000000000..5cbdc829bf6 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/load_report.py @@ -0,0 +1,257 @@ +#!/usr/bin/env python3 +"""Load report formatter.""" + +from chart import render_bar, render_sparkline, render_table + + +def _strip_scheme(url): + import re + return re.sub(r"^https?://", "", url) + + +def format_load_report(result): + """将分析结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_text) + """ + sections = ["## 负载与计数器分析", ""] + sections.append(f' {result["summary"]}') + sections.append("") + detail_sections = ["# 负载与计数器详情", ""] + detail_sections.append(f'总结: {result["summary"]}') + detail_sections.append("") + + if result["diagnoses"]: + sections.append("### 诊断") + sections.append("") + sections.append(f' 共 {len(result["diagnoses"])} 条诊断,见详情: 
[detail/load_diagnoses.md](../detail/load_diagnoses.md)') + sections.append("") + detail_sections.append("## 诊断") + detail_sections.append("") + for d in result["diagnoses"]: + detail_sections.append(f'[{d["severity"]}] [{d["source_layer"]}] {d["message"]}') + detail_sections.append("") + + # 负载概览 + ls = result.get("load_stats", {}) + if ls: + sections.append("### 负载概览 (total_running)") + sections.append("") + sections.append(" 说明: stats 采样来自 `[stats]` 周期日志(通常每 5s 一条),用于观察当前并发与负载变化趋势。") + sections.append( + f' mean={ls.get("mean",0)} p50={ls.get("p50",0)} p90={ls.get("p90",0)} ' + f'p99={ls.get("p99",0)} max={ls.get("max",0)} stddev={ls.get("stddev",0)}' + ) + sections.append("") + + # Per-Worker 负载 + if result["worker_load"]: + sections.append("### Per-Worker 负载") + sections.append("") + bar_data = [ + {"label": w["worker"][:25], "value": min(100, w["avg_running"] * 5), "count": w["avg_running"]} + for w in result["worker_load"] + ] + sections.append(render_bar(bar_data, show_count=True)) + sections.append("") + + # 负载趋势 + if result["load_trend"] and len(result["load_trend"]) > 1: + sections.append("### 负载趋势") + sections.append("") + sections.append( + render_sparkline( + result["load_trend"], value_field="total_running_mean", title="Total Running", y_label="req" + ) + ) + sections.append("") + + # Counter 异常 + if result["counter_anomalies"]: + sections.append("### 计数器异常") + sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) + sections.append(f' {a["type"]}: {a["total"]} 次 [{workers_str}]') + sections.append("") + detail_sections.append("## 计数器异常") + detail_sections.append("") + for a in result["counter_anomalies"]: + workers_str = ", ".join(f'{_strip_scheme(w)}({c})' for w, c in a["workers"].items()) + detail_sections.append(f'- {a["type"]}: {a["total"]} 次 [{workers_str}]') + detail_sections.append("") + + # 按 prefill / decode / mixed 分类统计 + type_summary = 
result.get("select_release", {}).get("type_summary", {}) + if type_summary: + sections.append("### 按类型统计(prefill / decode / mixed)") + sections.append("") + type_rows = [] + for t in ("prefill", "decode", "mixed", "unknown"): + s = type_summary.get(t) + if not s: + continue + token_display = "-" + if t == "prefill": + token_display = f'{s.get("token_selects",0)}/{s.get("token_releases",0)}' + elif t == "mixed" and (s.get("token_selects", 0) > 0 or s.get("token_releases", 0) > 0): + token_display = f'{s.get("token_selects",0)}/{s.get("token_releases",0)}' + type_rows.append( + { + "type": t, + "counter(S/R)": f'{s.get("counter_selects",0)}/{s.get("counter_releases",0)}', + "token(S/R)": token_display, + } + ) + if type_rows: + sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) + sections.append("") + sections.append(" 说明: prefill/mixed 的 token-select 同时表示 request counter + token counter 增加;decode 仅 request counter。") + sections.append(" 说明: `release prefill tokens` 会被识别为 token-release;worker type 按该 worker URL 在 select 中的类型映射(prefill/decode/mixed)。") + if type_summary.get("unknown"): + sections.append(" 说明: unknown 表示日志里缺少 worker type,且无法从邻近 select/release 关系推断。") + sections.append("") + detail_sections.append("## 按类型统计") + detail_sections.append("") + detail_sections.append(render_table(type_rows, columns=["type", "counter(S/R)", "token(S/R)"])) + detail_sections.append("") + + id_cov = result.get("select_release", {}).get("id_coverage", {}) + if id_cov: + sections.append("### 请求标识覆盖(基于 select 近似请求数)") + sections.append("") + sections.append( + " total={total} | with_request_id={with_rid} | without_request_id={without_rid} | " + "with_alt_id={with_alt} | without_any_id={without_any}".format( + total=id_cov.get("total_requests_estimated", 0), + with_rid=id_cov.get("with_request_id", 0), + without_rid=id_cov.get("without_request_id", 0), + with_alt=id_cov.get("with_alt_id", 0), + without_any=id_cov.get("without_any_id", 0), + ) + 
) + if id_cov.get("without_any_id", 0) > 0: + sections.append(" ℹ 无 request/session/trace/req_id 时,不做退化匹配,仅统计为 untracked。") + sections.append(" 字段说明: total=select 事件总数估算;with_request_id=含 request_id;without_request_id=不含 request_id;with_alt_id=含 req_id/trace_id/session_id;without_any_id=四类 ID 都缺失。") + sections.append("") + detail_sections.append("## 请求标识覆盖字段说明") + detail_sections.append("") + detail_sections.append( + "- total: select 事件总数(近似请求数)\n" + "- with_request_id: 携带 request_id 的 select 数\n" + "- without_request_id: 未携带 request_id 的 select 数\n" + "- with_alt_id: 无 request_id 但携带 req_id/trace_id/session_id 的 select 数\n" + "- without_any_id: 四类 ID 都没有,无法做请求级关联" + ) + detail_sections.append("") + + # Select/Release 匹配 + sr = result.get("select_release", {}) + if sr.get("per_worker"): + sections.append("### Select/Release 匹配") + sections.append("") + id_cov = sr.get("id_coverage", {}) + no_correlatable_id = (id_cov.get("with_request_id", 0) + id_cov.get("with_alt_id", 0)) == 0 + table_data = [] + for w_url, pw in sorted(sr["per_worker"].items()): + delta_display = "N/A" if no_correlatable_id else str(pw["delta"]) + table_data.append( + { + "Worker": _strip_scheme(w_url), + "ReqSelect": str(pw["selects"]), + "ReqRelease": str(pw["releases"]), + "ReqDelta": delta_display, + "TokenSelect": str(pw.get("token_selects", 0)), + "TokenRelease": str(pw.get("token_releases", 0)), + } + ) + sections.append( + render_table( + table_data, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"}, + ) + ) + sections.append("") + if no_correlatable_id: + sections.append(" ℹ 当前样本无可关联 ID,Delta 不用于请求泄漏结论。") + sections.append("") + sections.append(" ℹ ReqDelta>0 可能仅表示仍有在途请求(尚未完成推理),需结合时间窗口观察。") + sections.append("") + sections.append(" 说明: TokenSelect 按 worker type 统计(prefill + mixed 的 select 都计入),不依赖日志里是否出现 tokens 字段。") + sections.append("") + 
detail_sections.append("## Select/Release Per-Worker") + detail_sections.append("") + + if sr.get("worker_type_profile"): + sections.append("### Worker URL 类型画像(基于 select)") + sections.append("") + rows = [] + for w, p in sorted(sr["worker_type_profile"].items()): + rows.append( + { + "Worker": _strip_scheme(w), + "Dominant": p.get("dominant_type", "unknown"), + "Prefill": p.get("prefill", 0), + "Decode": p.get("decode", 0), + "Mixed": p.get("mixed", 0), + } + ) + sections.append( + render_table( + rows, + columns=["Worker", "Dominant", "Prefill", "Decode", "Mixed"], + right_align={"Prefill", "Decode", "Mixed"}, + ) + ) + sections.append("") + detail_sections.append( + render_table( + table_data, + columns=["Worker", "ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"], + right_align={"ReqSelect", "ReqRelease", "ReqDelta", "TokenSelect", "TokenRelease"}, + ) + ) + detail_sections.append("") + + if sr.get("failed_selects"): + sections.append(f' ⚠ Failed to select: {len(sr["failed_selects"])} 次') + sections.append(" 解释: 路由在该时刻未能选出可用 worker,通常意味着可用池不足或健康状态异常。") + sections.append("") + detail_sections.append("## Failed to select") + detail_sections.append("") + for f in sr["failed_selects"]: + detail_sections.append(f'- [{f.get("ts","")}] line={f.get("line","")}') + detail_sections.append("") + + # Token 统计 + if result.get("token_stats"): + sections.append("### Token 计数器") + sections.append("") + sections.append( + render_table( + result["token_stats"], + columns=["worker", "alloc_count", "alloc_avg", "release_count"], + right_align={"alloc_count", "alloc_avg", "release_count"}, + ) + ) + sections.append("") + + if result.get("counter_last_state"): + sections.append("### 计数器末状态") + sections.append("") + sections.append(" 末状态详情见: [detail/load_counter_state.md](../detail/load_counter_state.md)") + sections.append("") + detail_sections.append("## Counter / Token Counter 末状态(最后一条计数日志)") + detail_sections.append("") + detail_sections.append( + render_table( 
+ result["counter_last_state"], + columns=["worker", "req_last_action", "req_last_value", "token_last_action", "token_last_value", "last_ts"], + right_align={"req_last_value", "token_last_value"}, + ) + ) + detail_sections.append("") + + return "\n".join(sections), "\n".join(detail_sections) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py new file mode 100644 index 00000000000..ba4c7bd1051 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/analyzers/trace.py @@ -0,0 +1,616 @@ +#!/usr/bin/env python3 +""" +Trace Analyzer — 请求追踪 + +通过 session_id / trace_id / request_id / req_id 追踪单个或多个请求的 +完整生命周期,重建事件链,检测不完整生命周期。 +""" + +import os +import re +import subprocess +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from log_parser import ( + extract_tags, + extract_ts, + match_select_release, + parse_cache_strategy_line, + parse_http_line, +) + +# ════════════════════════════════════════════════════════════════ +# 事件识别正则 +# ════════════════════════════════════════════════════════════════ + +PARSING_COMPLETE_RE = re.compile(r"Parsing completed.*worker selection") +URL_RE = r"((?:https?://)?[A-Za-z0-9.-]+(?::\d+)?)" +SELECT_WORKER_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_WORKER_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*{URL_RE}") +RELEASE_TOKENS_RE = re.compile(rf"release prefill tokens:\s*{URL_RE},\s*tokens:\s*(\d+)") +REQUEST_COMPLETE_RE = re.compile(r"Request completed successfully") +TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)") + +# Prefill 事件 +PREFILL_FIRST_CHUNK_RE = re.compile(rf"\[prefill\] first chunk received.*?{URL_RE}") +PREFILL_DONE_RE = re.compile(rf"\[prefill\] non-stream prefill response done.*?{URL_RE}") +PREFILL_ERROR_RE = re.compile(rf"\[prefill\] (scanner error|copy 
error).*?{URL_RE}") +PREFILL_DEFER_RE = re.compile(rf"\[prefill\] release in defer.*?{URL_RE}") +PREFILL_ERR_PATH_RE = re.compile(rf"\[prefill\] release in CommonCompletions defer \(error path\).*?{URL_RE}") +FAILED_SELECT_RE = re.compile(r"Failed to select") + + +def _strip_scheme(url): + return re.sub(r"^https?://", "", url) + + +# ════════════════════════════════════════════════════════════════ +# 主分析函数 +# ════════════════════════════════════════════════════════════════ + + +def analyze_trace(log_file, trace_ids, tail=None): + """追踪指定 ID 的请求生命周期。 + + Args: + log_file: 日志文件路径 + trace_ids: ID 列表(逗号分隔的字符串或列表) + tail: 尾部行数限制 + + Returns: + dict: {traces: {id: {events, lifecycle_complete, diagnoses}}, summary} + """ + auto_discovery_summary = "" + if isinstance(trace_ids, str): + normalized = trace_ids.strip().lower() + if normalized in ("all", "full", "all_ids", "全部", "全量"): + trace_ids, auto_discovery_summary = _discover_full_trace_targets(log_file, tail=tail) + else: + trace_ids = [tid.strip() for tid in trace_ids.split(",") if tid.strip()] + + if not trace_ids: + return {"traces": {}, "summary": "未指定追踪 ID"} + + traces = {} + for tid in trace_ids: + # Grep 搜索四种 context tag + pattern = f"session_id:{tid}|trace_id:{tid}|request_id:{tid}|req_id:{tid}" + matching_lines = _grep_lines(log_file, pattern, tail) + + if not matching_lines: + traces[tid] = { + "events": [], + "lifecycle_complete": False, + "diagnoses": [{"severity": "INFO", "message": f"未找到 ID={tid} 的匹配行"}], + "matched_tag": None, + "related_ids": {}, + } + continue + + # 识别匹配到的 tag 类型,并展开 session 下所有 request_id + first_tags = extract_tags(matching_lines[0]) + is_session = tid in [first_tags.get("session_id", "")] + + # 如果是 session_id,收集所有关联的 request_id + related_request_ids = set() + if is_session: + for line in matching_lines: + tags = extract_tags(line) + rid = tags.get("request_id", "") + if rid: + related_request_ids.add(rid) + + # 为每个 request_id 额外搜索行 + extra_lines = [] + for rid in 
related_request_ids: + rid_lines = _grep_lines(log_file, f"request_id:{rid}", tail) + extra_lines.extend(rid_lines) + all_lines = list(set(matching_lines + extra_lines)) + else: + all_lines = matching_lines + + # 解析事件链 + events = _parse_event_chain(all_lines) + lifecycle_complete = _check_lifecycle_complete(events) + sr_check = match_select_release(all_lines) + diagnoses = _diagnose_trace(events, lifecycle_complete, sr_check) + + tag_coverage = _build_id_coverage_stats(all_lines) + tag_combos = _build_id_combo_stats(all_lines) + matched_tags = _detect_matched_tags(all_lines, tid) + traces[tid] = { + "events": events, + "lifecycle_complete": lifecycle_complete, + "diagnoses": diagnoses, + "sr_check": sr_check, + "matched_tag": _format_matched_tag(matched_tags), + "matched_tags": matched_tags, + "related_ids": { + "request_ids": sorted(related_request_ids) if is_session else [], + }, + "id_coverage": tag_coverage, + "id_combos": tag_combos, + } + + total_traced = len(traces) + complete = sum(1 for t in traces.values() if t["lifecycle_complete"]) + + summary = f"{total_traced} ID(s) 追踪, {complete} 生命周期完整" + if auto_discovery_summary: + summary += f" | {auto_discovery_summary}" + + return {"traces": traces, "summary": summary} + + +def _discover_full_trace_targets(log_file, tail=None): + """全量追踪目标发现。 + + 规则: + 1) 有 session_id 的优先按 session_id 追踪 + 2) 无 session 但有 trace_id 的按 trace_id 追踪 + 3) 剩余“孤立”的 request_id/req_id 单独追踪 + """ + lines = _grep_lines(log_file, r"session_id:|trace_id:|request_id:|req_id:", tail=tail) + if not lines: + return [], "全量追踪未发现任何可用 ID" + + session_ids = set() + trace_ids = set() + all_request_ids = set() + request_ids_with_session_or_trace = set() + + for line in lines: + tags = extract_tags(line) + sid = tags.get("session_id") + tid = tags.get("trace_id") + rid = tags.get("request_id") or tags.get("req_id") + has_session = bool(sid) + has_trace = bool(tid) + has_request = bool(rid) + + if has_session: + session_ids.add(sid) + if has_trace: + 
trace_ids.add(tid) + if has_request: + all_request_ids.add(rid) + if has_session or has_trace: + request_ids_with_session_or_trace.add(rid) + + standalone_request_ids = all_request_ids - request_ids_with_session_or_trace + + targets = [] + chosen = set() + for bucket in (sorted(session_ids), sorted(trace_ids), sorted(standalone_request_ids)): + for _id in bucket: + if _id and _id not in chosen: + chosen.add(_id) + targets.append(_id) + + summary = ( + "全量ID发现: " + f"session={len(session_ids)}, trace={len(trace_ids)}, " + f"standalone_request={len(standalone_request_ids)}, total_targets={len(targets)}" + ) + return targets, summary + + +def _parse_event_chain(lines): + """从匹配行重建事件链,按时间排序。""" + events = [] + + for line in lines: + ts = extract_ts(line) + tags = extract_tags(line) + + # HTTP 请求进入/完成 + http = parse_http_line(line) + if http: + events.append( + { + "ts": ts, + "type": "HTTP", + "tags": tags, + "method": http["method"], + "path": http["path"], + "status": http["status"], + "latency_ms": http["latency_ms"], + "raw": line.strip(), + } + ) + continue + + # Parsing completed + if PARSING_COMPLETE_RE.search(line): + events.append({"ts": ts, "type": "PARSING_COMPLETE", "tags": tags, "raw": line.strip()}) + continue + + # Cache-aware strategy + strategy = parse_cache_strategy_line(line) + if strategy: + events.append( + { + "ts": ts, + "type": "CACHE_STRATEGY", + "tags": tags, + "strategy": strategy.get("strategy"), + "selected": strategy.get("selected", ""), + "selected_hitRatio": strategy.get("selected_hitRatio", 0), + "raw": line.strip(), + } + ) + continue + + # Select worker + m = SELECT_WORKER_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "SELECT_WORKER", + "tags": tags, + "worker_type": m.group(1) or "unknown", + "worker": m.group(2), + "raw": line.strip(), + } + ) + continue + + # Release worker + m = RELEASE_WORKER_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "RELEASE_WORKER", + "tags": tags, + "worker_type": 
m.group(1) or "unknown", + "worker": m.group(2), + "raw": line.strip(), + } + ) + continue + + # Release tokens + m = RELEASE_TOKENS_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "RELEASE_TOKENS", + "tags": tags, + "worker": m.group(1), + "tokens": int(m.group(2)), + "raw": line.strip(), + } + ) + continue + + # Prefill events + m = PREFILL_FIRST_CHUNK_RE.search(line) + if m: + events.append( + {"ts": ts, "type": "PREFILL_FIRST_CHUNK", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) + continue + m = PREFILL_DONE_RE.search(line) + if m: + events.append({"ts": ts, "type": "PREFILL_DONE", "tags": tags, "worker": m.group(1), "raw": line.strip()}) + continue + m = PREFILL_ERROR_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "PREFILL_ERROR", + "tags": tags, + "error": m.group(1), + "worker": m.group(2), + "raw": line.strip(), + } + ) + continue + m = PREFILL_DEFER_RE.search(line) + if m: + events.append( + {"ts": ts, "type": "PREFILL_DEFER_RELEASE", "tags": tags, "worker": m.group(1), "raw": line.strip()} + ) + continue + m = PREFILL_ERR_PATH_RE.search(line) + if m: + events.append( + { + "ts": ts, + "type": "PREFILL_ERROR_PATH_RELEASE", + "tags": tags, + "worker": m.group(1), + "raw": line.strip(), + } + ) + continue + + # Request completed + if REQUEST_COMPLETE_RE.search(line): + events.append({"ts": ts, "type": "REQUEST_COMPLETE", "tags": tags, "raw": line.strip()}) + continue + + # ts_ms + m = TS_MS_RE.search(line) + if m: + events.append({"ts": ts, "type": "TS_MS", "tags": tags, "ts_ms": m.group(1), "raw": line.strip()}) + continue + + # Failed to select + if FAILED_SELECT_RE.search(line): + events.append({"ts": ts, "type": "FAILED_SELECT", "tags": tags, "raw": line.strip()}) + continue + + # 按时间排序 + events.sort(key=lambda e: e.get("ts") or "") + return events + + +def _check_lifecycle_complete(events): + """检查生命周期是否完整。""" + types = {e["type"] for e in events} + has_entry = "HTTP" in types or "PARSING_COMPLETE" 
in types + has_exit = "REQUEST_COMPLETE" in types or ( + "HTTP" in types and any(e["type"] == "HTTP" and e.get("status") for e in events) + ) + has_select = "SELECT_WORKER" in types + has_release = "RELEASE_WORKER" in types + + return has_entry and has_exit and (not has_select or has_release) + + +def _diagnose_trace(events, lifecycle_complete, sr_check=None): + """生成追踪诊断。""" + diagnoses = [] + types = [e["type"] for e in events] + + if not lifecycle_complete: + if "SELECT_WORKER" in types and "RELEASE_WORKER" not in types: + diagnoses.append({"severity": "HIGH", "message": "有 select 但无 release — 疑似请求卡住"}) + elif "HTTP" not in types and "PARSING_COMPLETE" not in types: + diagnoses.append({"severity": "MEDIUM", "message": "未找到 HTTP 入口事件"}) + elif "REQUEST_COMPLETE" not in types: + diagnoses.append({"severity": "MEDIUM", "message": "未检测到请求完成事件 — 疑似异常退出"}) + + if "PREFILL_ERROR" in types: + for e in events: + if e["type"] == "PREFILL_ERROR": + diagnoses.append( + {"severity": "HIGH", "message": f'Prefill 错误: {e.get("error","")} @ {e.get("worker","")}'} + ) + + if "FAILED_SELECT" in types: + diagnoses.append({"severity": "HIGH", "message": "Failed to select worker — 无可用 Worker"}) + + if sr_check: + if sr_check.get("unmatched_selects"): + diagnoses.append( + { + "severity": "HIGH", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_selects"])} 个 unmatched select', + } + ) + if sr_check.get("unmatched_releases"): + diagnoses.append( + { + "severity": "MEDIUM", + "message": f'match-select-release 检测到 {len(sr_check["unmatched_releases"])} 个 unmatched release', + } + ) + + return diagnoses + + +# ════════════════════════════════════════════════════════════════ +# 报告格式化 +# ════════════════════════════════════════════════════════════════ + + +def format_trace_report(result): + """将追踪结果格式化为终端报告。 + + Returns: + tuple: (summary_text, detail_dict) + summary_text: 总结部分(概览 + 诊断 + 生命周期状态) + detail_dict: {trace_id: event_chain_text} 各 ID 的完整事件链 + """ + sections = ["## 
请求追踪", ""] + sections.append(f' {result["summary"]}') + sections.append("") + + detail_dict = {} + + for tid, trace in result["traces"].items(): + sections.append(f"### ID: {tid}") + if trace.get("matched_tag"): + sections.append(f' 匹配类型: {trace["matched_tag"]}') + if trace.get("id_coverage"): + c = trace["id_coverage"] + sections.append( + " ID统计: " + f'request_only={c["request_only"]}, session_only={c["session_only"]}, trace_only={c["trace_only"]}' + ) + if trace.get("related_ids", {}).get("request_ids"): + sections.append(f' 关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') + + status = "完整" if trace["lifecycle_complete"] else "不完整" + sections.append(f" 生命周期: {status}") + sections.append("") + + # 诊断 + if trace["diagnoses"]: + for d in trace["diagnoses"]: + sections.append(f' [{d["severity"]}] {d["message"]}') + sections.append("") + + # 事件链 → 拆分到 detail_dict + if trace["events"]: + detail_lines = [f"# 请求追踪事件链: {tid}", ""] + if trace.get("matched_tag"): + detail_lines.append(f'匹配类型: {trace["matched_tag"]}') + if trace.get("id_coverage"): + c = trace["id_coverage"] + detail_lines.append("ID覆盖统计:") + detail_lines.append( + f'- only_request_id: {c["request_only"]} | only_session_id: {c["session_only"]} | only_trace_id: {c["trace_only"]}' + ) + if trace.get("id_combos"): + detail_lines.append("") + detail_lines.append("标签组合明细(按唯一ID计数):") + for item in trace["id_combos"]: + detail_lines.append( + f'- combo={item["combo"]} | count={item["count"]} | ids={", ".join(item["ids"])}' + ) + if trace.get("related_ids", {}).get("request_ids"): + detail_lines.append(f'关联 request_ids: {", ".join(trace["related_ids"]["request_ids"])}') + detail_lines.append(f"生命周期: {status}") + detail_lines.append("") + detail_lines.append("## 事件链(整理)") + detail_lines.append("") + for evt in trace["events"]: + line = f' [{evt.get("ts","")}] {evt["type"]}' + if evt.get("worker"): + line += f' → {_strip_scheme(evt["worker"])}' + if evt.get("status"): + line += f' [{evt["status"]}]' 
+ if evt.get("latency_ms"): + line += f' {evt["latency_ms"]}ms' + if evt.get("strategy"): + line += f' strategy={evt["strategy"]}' + if evt.get("selected_hitRatio"): + line += f' hitRatio={evt["selected_hitRatio"]}' + if evt.get("tokens"): + line += f' tokens={evt["tokens"]}' + if evt.get("error"): + line += f' error={evt["error"]}' + if evt.get("ts_ms"): + line += f' ts_ms={evt["ts_ms"]}' + detail_lines.append(line) + detail_lines.append("") + detail_lines.append("## 原始日志 RAW") + detail_lines.append("") + for evt in trace["events"]: + if evt.get("raw"): + detail_lines.append(evt["raw"]) + detail_lines.append("") + detail_dict[tid] = "\n".join(detail_lines) + + # 主报告中添加引用和摘要 + safe_tid = tid.replace("/", "_") + sections.append(f' 事件数: {len(trace["events"])}') + sections.append( + f" > 完整事件链: [detail/trace/trace_{safe_tid}.md](../detail/trace/trace_{safe_tid}.md)" + ) + sections.append("") + + return "\n".join(sections), detail_dict + + +# ════════════════════════════════════════════════════════════════ +# Grep 工具 +# ════════════════════════════════════════════════════════════════ + + +def _grep_lines(log_file, pattern, tail=None): + try: + if tail: + cmd = f"tail -n {tail} {_shell_quote(log_file)} | grep -E {_shell_quote(pattern)}" + else: + cmd = f"grep -E {_shell_quote(pattern)} {_shell_quote(log_file)}" + result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=60) + if result.returncode > 1: + return [] + return [line for line in result.stdout.split("\n") if line.strip()] + except (subprocess.TimeoutExpired, FileNotFoundError): + return [] + + +def _shell_quote(s): + return "'" + s.replace("'", "'\\''") + "'" + + +def _detect_matched_tags(lines, target_id): + matched = set() + for line in lines: + tags = extract_tags(line) + for key in ("request_id", "trace_id", "session_id", "req_id"): + if tags.get(key) == target_id: + matched.add(key) + return sorted(matched) + + +def _format_matched_tag(matched_tags): + if not matched_tags: + return 
"unknown" + if len(matched_tags) == 1: + return matched_tags[0] + return "+".join(matched_tags) + + +def _build_id_coverage_stats(lines): + request_only_ids = set() + session_only_ids = set() + trace_only_ids = set() + + for line in lines: + tags = extract_tags(line) + req_val = tags.get("request_id") or tags.get("req_id") + session_val = tags.get("session_id") + trace_val = tags.get("trace_id") + has_request = bool(req_val) + has_session = bool(session_val) + has_trace = bool(trace_val) + + if has_request and not has_session and not has_trace: + request_only_ids.add(req_val) + if has_session and not has_request and not has_trace: + session_only_ids.add(session_val) + if has_trace and not has_request and not has_session: + trace_only_ids.add(trace_val) + + return { + "request_only": len(request_only_ids), + "session_only": len(session_only_ids), + "trace_only": len(trace_only_ids), + } + + +def _build_id_combo_stats(lines): + combo_to_ids = {} + for line in lines: + tags = extract_tags(line) + keys = [] + if tags.get("request_id"): + keys.append("request_id") + if tags.get("req_id"): + keys.append("req_id") + if tags.get("session_id"): + keys.append("session_id") + if tags.get("trace_id"): + keys.append("trace_id") + combo = "+".join(keys) if keys else "no_id_tag" + + ids = [] + if tags.get("request_id"): + ids.append(tags["request_id"]) + if tags.get("req_id"): + ids.append(tags["req_id"]) + if tags.get("session_id"): + ids.append(tags["session_id"]) + if tags.get("trace_id"): + ids.append(tags["trace_id"]) + id_key = "|".join(ids) if ids else "" + + combo_to_ids.setdefault(combo, set()).add(id_key) + + rows = [] + for combo, ids in combo_to_ids.items(): + rows.append({"combo": combo, "count": len(ids), "ids": sorted(ids)}) + rows.sort(key=lambda x: x["count"], reverse=True) + return rows diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py new file mode 100644 
index 00000000000..1eaea1369f8 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/chart.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python3 +""" +Chart — 终端可视化渲染工具 + +提供 sparkline 折线图、Unicode 柱状图、Markdown 表格、Worker 时间线的渲染函数。 +所有函数返回字符串(不直接打印),方便组装到报告中。 + +Python 3 stdlib only,零依赖。 +""" + +from datetime import datetime + +# ════════════════════════════════════════════════════════════════ +# Sparkline 折线图 +# ════════════════════════════════════════════════════════════════ + +BLOCK_CHARS = " ▁▂▃▄▅▆▇█" + + +def render_sparkline( + records, value_field="value", bucket_field="bucket", title=None, y_label=None, y_range=None, width=60 +): + """渲染 8 级 Unicode sparkline 折线图。 + + Args: + records: dict 列表,每个 dict 包含 bucket_field 和 value_field + value_field: 数值字段名 + bucket_field: 时间桶字段名 + title: 图表标题 + y_label: Y 轴标签(如 '%') + y_range: Y 轴范围 (min, max) 元组,None 则自动 + width: 图表宽度(字符数) + + Returns: + str: 渲染后的图表文本 + """ + if not records: + return " (no data)" + + all_values = [] + for r in records: + v = r.get(value_field) + if v is not None: + all_values.append(float(v)) + + if not all_values: + return " (no numeric data)" + + # Y 轴范围 + if y_range: + y_min, y_max = y_range + else: + y_min = min(all_values) + y_max = max(all_values) + if y_max == y_min: + y_min = 0 if y_max > 0 else y_max - 1 + y_max = max(y_max, 1) + + y_span = y_max - y_min if y_max != y_min else 1 + + # 降采样 + n = len(records) + if n > width: + step = n / width + sampled = [] + for i in range(width): + start_idx = int(i * step) + end_idx = int((i + 1) * step) + chunk = records[start_idx:end_idx] + vals = [float(r.get(value_field, 0)) for r in chunk if r.get(value_field) is not None] + avg_record = { + bucket_field: chunk[0].get(bucket_field, ""), + value_field: sum(vals) / len(vals) if vals else 0, + } + sampled.append(avg_record) + records = sampled + + lines = [] + + # 标题行 + def fmt_val(v): + if abs(v) >= 1000: + return f"{v:.0f}" + elif abs(v) >= 10: + return f"{v:.1f}" + return 
f"{v:.2f}" + + header_parts = [] + if title: + header_parts.append(title) + header_parts.append(f"min={fmt_val(min(all_values))}") + header_parts.append(f"max={fmt_val(max(all_values))}") + if y_label: + header_parts.append(f"({y_label})") + lines.append(" " + " ".join(header_parts)) + + # Sparkline 字符 + spark_chars = [] + for r in records: + v = r.get(value_field) + if v is None: + spark_chars.append(" ") + continue + v = float(v) + normalized = (v - y_min) / y_span + level = max(0, min(8, round(normalized * 8))) + spark_chars.append(BLOCK_CHARS[level]) + lines.append(" " + "".join(spark_chars)) + + # X 轴标签 + data_width = len(records) + if data_width > 0: + + def short_bucket(r): + b = str(r.get(bucket_field, "")) + if " " in b: + b = b.split(" ")[-1] + return b[:5] if len(b) >= 5 else b + + lbl_width = 6 + max_labels = max(1, data_width // lbl_width) + n_records = len(records) + + if n_records <= 2: + indices = list(range(n_records)) + elif n_records <= max_labels: + indices = [0, n_records - 1] + else: + n_labels = min(5, max(2, max_labels)) + indices = [int(i * (n_records - 1) / (n_labels - 1)) for i in range(n_labels)] + + label_line = [" "] * (data_width + lbl_width + 2) + last_end = -1 + for idx in indices: + lbl = short_bucket(records[idx]) + pos = idx + if pos < last_end: + continue + for ci, c in enumerate(lbl): + p = pos + ci + if p < len(label_line): + label_line[p] = c + last_end = pos + len(lbl) + 1 + lines.append(" " + "".join(label_line).rstrip()) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Unicode 柱状图 +# ════════════════════════════════════════════════════════════════ + + +def render_bar(data, bar_width=20, show_count=False): + """渲染 Unicode 柱状图。 + + Args: + data: dict 列表,每个 dict 包含 label, value(百分比 0-100), 可选 count + bar_width: 柱状图宽度(字符数) + show_count: 是否显示绝对数量 + + Returns: + str: 渲染后的图表文本 + """ + if not data: + return " (no data)" + + max_label_len = max(len(str(d.get("label", ""))) for d 
in data) + max_label_len = max(max_label_len, 4) + + lines = [] + for d in data: + label = str(d.get("label", "")) + value = float(d.get("value", 0)) + count = d.get("count") + + filled = round(value / 100 * bar_width) if value > 0 else 0 + filled = max(1, filled) if value > 0 else 0 + filled = min(bar_width, filled) + empty = bar_width - filled + bar = "█" * filled + "░" * empty + + line = f" {label:<{max_label_len}} {bar} {value:>5.1f}%" + if show_count and count is not None: + line += f" (N={count})" + lines.append(line) + + return "\n".join(lines) + + +# ════════════════════════════════════════════════════════════════ +# Markdown 表格 +# ════════════════════════════════════════════════════════════════ + + +def render_table(data, columns=None, right_align=None): + """渲染 Markdown 表格。 + + Args: + data: dict 列表 + columns: 列名列表,None 则用第一条记录的所有 key + right_align: 右对齐的列名集合 + + Returns: + str: 渲染后的表格文本 + """ + if not data: + return " (no data)" + + if columns is None: + columns = list(data[0].keys()) + if right_align is None: + right_align = set() + + # 计算列宽 + col_widths = {} + for col in columns: + col_widths[col] = len(col) + for row in data: + val = str(row.get(col, "")) + col_widths[col] = max(col_widths[col], len(val)) + + # 表头 + header_parts = [] + sep_parts = [] + for col in columns: + w = col_widths[col] + if col in right_align: + header_parts.append(f" {col:>{w}} ") + sep_parts.append("-" * (w + 1) + ":") + else: + header_parts.append(f" {col:<{w}} ") + sep_parts.append(":" + "-" * (w + 1)) + + lines = [] + lines.append("|" + "|".join(header_parts) + "|") + lines.append("|" + "|".join(sep_parts) + "|") + + # 数据行 + for row in data: + row_parts = [] + for col in columns: + val = str(row.get(col, "")) + w = col_widths[col] + if col in right_align: + row_parts.append(f" {val:>{w}} ") + else: + row_parts.append(f" {val:<{w}} ") + lines.append("|" + "|".join(row_parts) + "|") + + return "\n".join(lines) + + +# 
════════════════════════════════════════════════════════════════ +# Worker 可用性时间线 +# ════════════════════════════════════════════════════════════════ + + +def render_timeline(data, width=40): + """渲染 Worker 可用性时间线。 + + Args: + data: dict,结构为: + { + 'start': 'YYYY/MM/DD HH:MM:SS', + 'end': 'YYYY/MM/DD HH:MM:SS', + 'workers': { + 'http://ip:port': [ + {'from': 'ts', 'to': 'ts', 'status': 'up'|'down'}, + ... + ], + ... + } + } + width: 时间线宽度(字符数) + + Returns: + str: 渲染后的时间线文本 + """ + if not data: + return " (no data)" + + start_str = data.get("start", "") + end_str = data.get("end", "") + workers = data.get("workers", {}) + + if not workers or not start_str or not end_str: + return " (insufficient data)" + + # Parse time range + try: + if "/" in start_str: + fmt = "%Y/%m/%d %H:%M:%S" + else: + fmt = "%H:%M:%S" + t_start = datetime.strptime(start_str, fmt) + t_end = datetime.strptime(end_str, fmt) + except ValueError: + return f" (cannot parse time range: {start_str} ~ {end_str})" + + total_seconds = (t_end - t_start).total_seconds() + if total_seconds <= 0: + total_seconds = 1 + + lines = [] + + for worker_url, periods in workers.items(): + # Short label: just IP:PORT + label = worker_url.replace("http://", "") + bar = ["░"] * width + + for period in periods: + p_start_str = period.get("from", start_str) + p_end_str = period.get("to", end_str) + status = period.get("status", "up") + + try: + if "/" in p_start_str: + p_start = datetime.strptime(p_start_str, "%Y/%m/%d %H:%M:%S") + p_end = datetime.strptime(p_end_str, "%Y/%m/%d %H:%M:%S") + else: + p_start = datetime.strptime(p_start_str, "%H:%M:%S") + p_end = datetime.strptime(p_end_str, "%H:%M:%S") + except ValueError: + continue + + start_pos = int((p_start - t_start).total_seconds() / total_seconds * width) + end_pos = int((p_end - t_start).total_seconds() / total_seconds * width) + start_pos = max(0, min(width - 1, start_pos)) + end_pos = max(0, min(width, end_pos)) + + char = "█" if status == "up" else "░" + for i 
in range(start_pos, end_pos): + bar[i] = char + + up_count = bar.count("█") + uptime_pct = round(up_count / width * 100, 1) + + max_label_len = max(len(w.replace("http://", "")) for w in workers) + lines.append(f' {label:<{max_label_len}} {"".join(bar)} {uptime_pct}%') + + # Time axis + if lines: + max_label_len = max(len(w.replace("http://", "")) for w in workers) + padding = " " * (max_label_len + 4) + start_lbl = start_str.split(" ")[-1] if " " in start_str else start_str + end_lbl = end_str.split(" ")[-1] if " " in end_str else end_str + gap = width - len(start_lbl) - len(end_lbl) + lines.append(f'{padding}{start_lbl}{" " * max(1, gap)}{end_lbl}') + lines.append(f"{padding}█ = online ░ = offline") + + return "\n".join(lines) diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py new file mode 100644 index 00000000000..99864e1de16 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/log_parser.py @@ -0,0 +1,1116 @@ +#!/usr/bin/env python3 +""" +Router Log Parser — FastDeploy Go Router 日志解析原语 + +支持两种调用方式: +1. 作为模块导入:from log_parser import parse_http_line, parse_cache_strategy_line, ... +2. 作为 CLI 工具:grep 'pattern' logfile | python3 log_parser.py parse-http + +Python 3 stdlib only,零依赖。 +""" + +import argparse +import json +import re +import sys +from collections import defaultdict +from datetime import datetime, timedelta + +# ════════════════════════════════════════════════════════════════ +# 通用解析原语 +# ════════════════════════════════════════════════════════════════ + +# Go time.Duration.String() parser: handles 1h2m3.456s, 500µs, 150.5ms, etc. 
DURATION_RE = re.compile(r"(\d+(?:\.\d+)?)(h|m(?!s)|s|ms|[µu]s|ns)")


def parse_go_duration_ms(s):
    """Convert the output of Go's time.Duration.String() to milliseconds.

    Examples: '1.5s' -> 1500.0, '500µs' -> 0.5, '1m30s' -> 90000.0
    """
    total_ms = 0.0
    for part in DURATION_RE.finditer(s):
        value = float(part.group(1))
        unit = part.group(2)
        if unit == "h":
            total_ms += value * 3600000
        elif unit == "m":
            total_ms += value * 60000
        elif unit == "s":
            total_ms += value * 1000
        elif unit == "ms":
            total_ms += value
        elif unit in ("µs", "us"):
            total_ms += value / 1000
        elif unit == "ns":
            total_ms += value / 1000000
    return total_ms


def parse_go_map(s):
    """Parse Go's fmt.Sprintf('%v', map) output: map[key1:val1 key2:val2 ...].

    Handles the clash between colons inside URLs and the Go map key/value
    separator by splitting on the last colon of each token.  An empty map
    'map[]' yields an empty dict.
    """
    body = re.search(r"map\[(.*?)\]", s)
    if body is None:
        return {}
    content = body.group(1).strip()
    out = {}
    for token in content.split():
        sep = token.rfind(":")
        if sep <= 0:
            continue
        key, raw = token[:sep], token[sep + 1 :]
        try:
            out[key] = float(raw) if "." in raw else int(raw)
        except ValueError:
            out[key] = raw
    return out


# Timestamp: YYYY/MM/DD HH:MM:SS
TS_RE = re.compile(r"(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})")

# ts_ms: 2025-01-15 18:25:33.123
TS_MS_RE = re.compile(r"ts_ms=(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d+)")


def extract_ts(line):
    """Pull a YYYY/MM/DD HH:MM:SS timestamp out of a log line, or None."""
    found = TS_RE.search(line)
    return found.group(1) if found else None


def parse_ts(ts_str):
    """Parse a YYYY/MM/DD HH:MM:SS timestamp into a datetime."""
    return datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S")


# ════════════════════════════════════════════════════════════════
# Time-range filtering
# ════════════════════════════════════════════════════════════════

import os
import subprocess
import tempfile

_FULL_DT_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})\s+(\d{1,2}):(\d{2})(?::(\d{2}))?$")
_DATE_ONLY_RE = re.compile(r"^(\d{4})[/-](\d{1,2})[/-](\d{1,2})$")
_SHORT_DATE_RE = re.compile(r"^(\d{1,2})[/-](\d{1,2})(?:\s+(\d{1,2}):(\d{2})(?::(\d{2}))?)?$")
_TIME_ONLY_RE = re.compile(r"^(\d{1,2}):(\d{2})(?::(\d{2}))?$")


def _get_log_boundary_ts(log_file, which="first"):
    """Read the timestamp on the first or last line of a log file, or None."""
    tool = "head" if which == "first" else "tail"
    try:
        proc = subprocess.run([tool, "-1", log_file], capture_output=True, text=True, timeout=5)
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return None
    return extract_ts(proc.stdout) if proc.returncode == 0 else None


def complete_time_arg(time_str, log_file, is_end=False):
    """Normalize a flexible time argument to 'YYYY/MM/DD HH:MM:SS'.

    Accepted shapes:
      'YYYY/MM/DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS', 'YYYY/MM/DD',
      'MM/DD', 'MM/DD HH:MM', 'HH:MM:SS', 'HH:MM'

    Completion rules:
      - missing year: taken from the log's first line
      - missing date: taken from the log's last line
      - missing time: start -> 00:00:00, end -> 23:59:59

    Returns the completed 'YYYY/MM/DD HH:MM:SS' string (or the stripped
    input unchanged when no shape matches).
    """
    if time_str is None:
        return None
    text = time_str.strip()
    default_time = "23:59:59" if is_end else "00:00:00"

    # Case 1: full date + time.
    m = _FULL_DT_RE.match(text)
    if m:
        year, month, day = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2)
        hour, minute = m.group(4).zfill(2), m.group(5)
        second = (m.group(6) or "00").zfill(2)
        return f"{year}/{month}/{day} {hour}:{minute}:{second}"

    # Case 2: date only, YYYY/MM/DD.
    m = _DATE_ONLY_RE.match(text)
    if m:
        year, month, day = m.group(1), m.group(2).zfill(2), m.group(3).zfill(2)
        return f"{year}/{month}/{day} {default_time}"

    # Case 3: short date MM/DD, optionally with HH:MM[:SS].
    m = _SHORT_DATE_RE.match(text)
    if m:
        month, day = m.group(1).zfill(2), m.group(2).zfill(2)
        first_ts = _get_log_boundary_ts(log_file, "first")
        year = first_ts[:4] if first_ts else str(datetime.now().year)
        if m.group(3):  # a time part is present
            hour, minute = m.group(3).zfill(2), m.group(4)
            second = (m.group(5) or "00").zfill(2)
            return f"{year}/{month}/{day} {hour}:{minute}:{second}"
        return f"{year}/{month}/{day} {default_time}"

    # Case 4: time only, HH:MM[:SS].
    m = _TIME_ONLY_RE.match(text)
    if m:
        hour, minute = m.group(1).zfill(2), m.group(2)
        second = (m.group(3) or "00").zfill(2)
        last_ts = _get_log_boundary_ts(log_file, "last")
        date_part = last_ts[:10] if last_ts else f"{datetime.now().year}/01/01"
        return f"{date_part} {hour}:{minute}:{second}"

    # Fallback: pass the (stripped) input through unchanged.
    return text


def filter_file_by_time_range(log_file, start_str=None, end_str=None):
    """Pre-filter a log file by time range using awk.

    'YYYY/MM/DD HH:MM:SS' timestamps compare correctly as plain strings, so
    the awk script uses lexicographic comparison.  Lines without a timestamp
    (e.g. panic stack continuations) are kept.

    Args:
        log_file: path of the original log file
        start_str: inclusive start 'YYYY/MM/DD HH:MM:SS', or None
        end_str: inclusive end 'YYYY/MM/DD HH:MM:SS', or None

    Returns:
        tuple: (file_path, is_temp) — when is_temp is True the caller is
        responsible for deleting the file.
    """
    if not start_str and not end_str:
        return (log_file, False)

    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False, prefix="router_filtered_")
    tmp.close()

    awk_script = r"""{
    ts = ""
    if (match($0, /[0-9]{4}\/[0-9]{2}\/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}/)) {
        ts = substr($0, RSTART, RLENGTH)
    }
    if (ts == "") { print; next }
    if ((start == "" || ts >= start) && (end == "" || ts <= end)) print
    }"""

    cmd = ["awk", "-v", f'start={start_str or ""}', "-v", f'end={end_str or ""}', awk_script, log_file]

    try:
        with open(tmp.name, "w") as outf:
            result = subprocess.run(cmd, stdout=outf, stderr=subprocess.PIPE, text=True, timeout=120)
        if result.returncode != 0:
            os.unlink(tmp.name)
            return (log_file, False)
    except (subprocess.TimeoutExpired, OSError):
        if os.path.exists(tmp.name):
            os.unlink(tmp.name)
        return (log_file, False)

    return (tmp.name, True)


def filter_file_by_recent_minutes(log_file, minutes):
    """Filter to the last N minutes, anchored at the log's final timestamp.

    Returns:
        tuple: (file_path, is_temp) — when is_temp is True the caller is
        responsible for deleting the file.
    """
    if minutes is None or minutes <= 0:
        return (log_file, False)

    last_ts = _get_log_boundary_ts(log_file, "last")
    if not last_ts:
        return (log_file, False)

    try:
        end_dt = parse_ts(last_ts)
    except ValueError:
        return (log_file, False)

    start_dt = end_dt - timedelta(minutes=minutes)
    return filter_file_by_time_range(
        log_file,
        start_str=start_dt.strftime("%Y/%m/%d %H:%M:%S"),
        end_str=end_dt.strftime("%Y/%m/%d %H:%M:%S"),
    )


# Context tags: [session_id:...], [request_id:...], [trace_id:...], [req_id:...]
TAG_RE = re.compile(r"\[(session_id|request_id|trace_id|req_id):([^\]]+)\]")


def extract_tags(line):
    """Extract the context tags present on a log line as a dict."""
    return {m.group(1): m.group(2) for m in TAG_RE.finditer(line)}


# Log level
LEVEL_RE = re.compile(r"\[(INFO|ERROR|WARN|DEBUG)\]")


def extract_level(line):
    """Extract the log level of a line, or None when absent."""
    found = LEVEL_RE.search(line)
    return found.group(1) if found else None


# ════════════════════════════════════════════════════════════════
# HTTP request-line parsing (category H1)
# ════════════════════════════════════════════════════════════════

# H1 pattern: [METHOD] /path HTTP/1.1 STATUS LATENCY CLIENT_IP
HTTP_RE = re.compile(
    r"\[(POST|GET|PUT|DELETE|PATCH|HEAD|OPTIONS)\]\s+"
    r"(/\S*)\s+HTTP/\d\.\d\s+"
    r"(\d{3})\s+"
    r"(\S+)\s+"  # latency (Go duration string)
    r"(\d+\.\d+\.\d+\.\d+)"  # client IPv4
)


def parse_http_line(line, inference_only=False):
    """Parse an H1 HTTP request line.

    Example input:
        [INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 1.234567s 10.0.0.1

    Args:
        line: the raw log line
        inference_only: when True, keep only the inference paths
            (/v1/chat/completions, /v1/completions)

    Returns a dict, or None when the line does not match.
    """
    m = HTTP_RE.search(line)
    if m is None:
        return None

    method, path, status, latency_raw, client_ip = m.groups()
    if inference_only and path not in ("/v1/chat/completions", "/v1/completions"):
        return None

    record = {
        "ts": extract_ts(line) or "",
        "method": method,
        "path": path,
        "status": int(status),
        "latency_ms": round(parse_go_duration_ms(latency_raw), 3),
        "client_ip": client_ip,
    }

    tags = extract_tags(line)
    if tags:
        record["tags"] = tags

    return record


# ════════════════════════════════════════════════════════════════
# Cache-Aware strategy-line parsing (category H6)
# ════════════════════════════════════════════════════════════════

URL_RE = r"(?:https?://)?[A-Za-z0-9.-]+(?::\d+)?"
+STRATEGY_RE = re.compile(r"final strategy:\s*(\w+)") +SELECTED_RE = re.compile(rf"selected=({URL_RE})(?:,|\s|$)") +REASON_RE = re.compile(r"reason:\s*(.+?)(?:,\s*loads=|\.?\s*ts_ms=|$)") + + +def parse_cache_strategy_line(line): + """解析 cache-aware prefill 策略行。 + + 返回 dict 或 None(如果不是策略行)。 + """ + sm = STRATEGY_RE.search(line) + if not sm: + return None + + ts = extract_ts(line) + strategy = sm.group(1) + record = {"ts": ts or "", "strategy": strategy} + + sel_m = SELECTED_RE.search(line) + if sel_m: + record["selected"] = sel_m.group(1) + + reason_m = REASON_RE.search(line) + if reason_m and strategy == "process_tokens": + record["reason"] = reason_m.group(1).strip() + + hr_match = re.search(r"hitRatios=(map\[.*?\])", line) + if hr_match: + hit_ratios = parse_go_map(hr_match.group(1)) + record["hitRatios"] = hit_ratios + if "selected" in record: + record["selected_hitRatio"] = hit_ratios.get(record["selected"], 0) + else: + record["hitRatios"] = {} + if "selected" in record: + record["selected_hitRatio"] = 0 + + loads_match = re.search(r"loads=(map\[.*?\])", line) + if loads_match: + record["loads"] = parse_go_map(loads_match.group(1)) + + ts_ms_m = TS_MS_RE.search(line) + if ts_ms_m: + record["ts_ms"] = ts_ms_m.group(1) + + tags = extract_tags(line) + if tags: + record["tags"] = tags + + return record + + +# ════════════════════════════════════════════════════════════════ +# Stats 行解析(类别 H7) +# ════════════════════════════════════════════════════════════════ + +TOTAL_RUNNING_RE = re.compile(r"total_running=(\d+)") +WORKER_RUNNING_RE = re.compile(rf"({URL_RE}): running=(\d+)") +CACHE_HR_RE = re.compile(r"cache_hit_rate=([\d.]+)%\s*\(hits=(\d+)/total=(\d+)\)") + + +def parse_stats_line(line): + """解析 [stats] 统计行。 + + 注意:hits 和 total 是 per-interval 的(每 5s 重置),累计值必须 sum 所有行。 + + 返回 dict 或 None(如果不是 stats 行)。 + """ + if "[stats]" not in line: + return None + + ts = extract_ts(line) + record = {"ts": ts or ""} + + tr_m = TOTAL_RUNNING_RE.search(line) + if tr_m: + 
record["total_running"] = int(tr_m.group(1)) + + workers = {} + for wm in WORKER_RUNNING_RE.finditer(line): + workers[wm.group(1)] = int(wm.group(2)) + record["workers"] = workers + + chr_m = CACHE_HR_RE.search(line) + if chr_m: + record["cache_hit_rate"] = float(chr_m.group(1)) + record["hits"] = int(chr_m.group(2)) + record["total"] = int(chr_m.group(3)) + + return record + + +# ════════════════════════════════════════════════════════════════ +# 错误消息模板归一化 +# ════════════════════════════════════════════════════════════════ + +NORMALIZE_PATTERNS = [ + (re.compile(r"https?://[\w.:]+"), "{url}"), + (re.compile(r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", re.I), "{uuid}"), + (re.compile(r"\d+\.\d+\.\d+\.\d+:\d+"), "{ip:port}"), + (re.compile(r"\b\d+\b"), "{N}"), +] + +# Message extraction: everything after "logger.go:NN: " (and optional context tags) +MSG_RE = re.compile(r"logger\.go:\d+:\s*(?:\[[^\]]*\]\s*)*(.+)") + + +def normalize_message(msg): + """将错误消息中的变量替换为占位符模板。""" + for pat, repl in NORMALIZE_PATTERNS: + msg = pat.sub(repl, msg) + return msg + + +def parse_error_line(line): + """解析 ERROR/WARN 行并进行模板归一化。 + + 返回 dict: {ts, level, original, template, tags} + """ + ts = extract_ts(line) + level = extract_level(line) + tags = extract_tags(line) + + mm = MSG_RE.search(line) + original = mm.group(1).strip() if mm else line + + template = normalize_message(original) + + record = { + "ts": ts or "", + "level": level or "", + "original": original, + "template": template, + } + if tags: + record["tags"] = tags + + return record + + +# ════════════════════════════════════════════════════════════════ +# Select/Release 事件匹配 +# ════════════════════════════════════════════════════════════════ + +SELECT_RE = re.compile(rf"select worker\s*(?:\((\w+)\))?:\s*({URL_RE})") +RELEASE_RE = re.compile(rf"release worker\s*(?:\((\w+)\))?:\s*({URL_RE})") +FAILED_SELECT_RE = re.compile(r"Failed to select") +SELECT_TOKENS_RE = re.compile(rf"select worker 
\((\w+)\):\s*({URL_RE}),\s*tokens:\s*(\d+)") +RELEASE_TOKENS_RE = re.compile(rf"release (?:([a-zA-Z_]+)\s+)?tokens:\s*({URL_RE}),\s*tokens:\s*(\d+)") + + +def _parse_ts_safe(ts): + if not ts: + return None + try: + return parse_ts(ts) + except ValueError: + return None + + +def _select_match_key(tags): + """构建请求关联 key,优先 request_id,其次 req_id/trace_id/session_id。""" + if not tags: + return (None, None) + rid = tags.get("request_id") + if rid: + return ("request_id", f"request_id:{rid}") + for k in ("req_id", "trace_id", "session_id"): + v = tags.get(k) + if v: + return ("alt_id", f"{k}:{v}") + return (None, None) + + +def _normalize_worker_type(worker_type): + """归一化 worker type。""" + t = (worker_type or "unknown").lower() + if t in ("prefill", "decode", "mixed"): + return t + return "unknown" + + +def _normalize_worker_url_key(url): + if not url: + return "" + return re.sub(r"^https?://", "", str(url).strip().rstrip("/")) + + +def _infer_release_worker_type(release, selects, fallback_window_s=120): + """为未显式标注 type 的 release 近似推断 worker type。 + + 优先级: + 1) 同 worker、时间上最近且不晚于 release 的 select type + 2) 若无可解析时间戳,则使用同 worker 的最后一个 select type + 3) 推断失败返回 unknown + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [s for s in selects if s.get("worker") == worker] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + # 回退:按出现顺序取同 worker 的最近 select + return _normalize_worker_type(candidates[-1].get("type")) + + +def _infer_token_release_worker_type(release, selects, fallback_window_s=120): + """为 token release 推断 worker 
type(prefill/mixed)。 + + 注意:日志文本通常固定为 `release prefill tokens`,即使 mixed 也可能走这条日志。 + 因此 token release 的类型优先依据同 worker 的邻近 select 推断。 + """ + worker = release.get("worker") + if not worker: + return "unknown" + + r_ts = _parse_ts_safe(release.get("ts")) + candidates = [ + s + for s in selects + if s.get("worker") == worker and _normalize_worker_type(s.get("type")) in ("prefill", "mixed") + ] + if not candidates: + return "unknown" + + if r_ts: + best = None + best_delta = None + for s in candidates: + s_ts = _parse_ts_safe(s.get("ts")) + if not s_ts: + continue + delta = (r_ts - s_ts).total_seconds() + if delta < 0 or delta > fallback_window_s: + continue + if best_delta is None or delta < best_delta: + best = s + best_delta = delta + if best is not None: + return _normalize_worker_type(best.get("type")) + + return _normalize_worker_type(candidates[-1].get("type")) + + +def match_select_release(lines, fallback_window_s=120): + """匹配 select/release worker 事件对。 + + Args: + lines: 日志行列表(字符串) + + Returns: + dict: {matched, unmatched_selects, failed_selects, per_worker} + """ + selects = [] + releases = [] + failed_selects = [] + + for line_no, line in enumerate(lines, 1): + ts = extract_ts(line) + tags = extract_tags(line) + + # Token-bearing select + tm = SELECT_TOKENS_RE.search(line) + if tm: + selects.append( + { + "ts": ts, + "worker": tm.group(2), + "worker_key": _normalize_worker_url_key(tm.group(2)), + "type": _normalize_worker_type(tm.group(1)), + "tags": tags, + "tokens": int(tm.group(3)), + "line": line_no, + } + ) + continue + + # Token-bearing release + trm = RELEASE_TOKENS_RE.search(line) + if trm: + token_type = trm.group(1) + releases.append( + { + "ts": ts, + "worker": trm.group(2), + "worker_key": _normalize_worker_url_key(trm.group(2)), + # 文本默认按 prefill 记,再结合同 worker 邻近 select 做纠偏(mixed 场景) + "type": f'{_normalize_worker_type(token_type or "prefill")}_tokens', + "raw_token_type": token_type or "", + "tags": tags, + "tokens": int(trm.group(3)), + 
"line": line_no, + } + ) + continue + + sm = SELECT_RE.search(line) + if sm: + selects.append( + { + "ts": ts, + "worker": sm.group(2), + "worker_key": _normalize_worker_url_key(sm.group(2)), + "type": _normalize_worker_type(sm.group(1)), + "tags": tags, + "tokens": None, + "line": line_no, + } + ) + continue + + rm = RELEASE_RE.search(line) + if rm: + releases.append( + { + "ts": ts, + "worker": rm.group(2), + "worker_key": _normalize_worker_url_key(rm.group(2)), + "type": _normalize_worker_type(rm.group(1)), + "tags": tags, + "tokens": None, + "line": line_no, + } + ) + continue + + if FAILED_SELECT_RE.search(line): + failed_selects.append({"ts": ts, "tags": tags, "line": line_no}) + + # Match by worker FIFO(select -> 同 worker 下一条 release) + matched = [] + unmatched_selects = [] + release_used = set() + + # 请求生命周期匹配只使用 request counter release(排除 token release) + # 说明:request_id 只用于覆盖率观测,不参与 select/release 配对条件。 + counter_release_indexes = [i for i, r in enumerate(releases) if not str(r.get("type", "")).endswith("_tokens")] + # 请求 ID 覆盖(按 select 事件近似请求数) + total_req_est = len(selects) + with_request_id = 0 + with_alt_id = 0 + without_any_id = 0 + + pending_selects = [] + for s in selects: + key_type, key = _select_match_key(s.get("tags", {})) + if key_type == "request_id": + with_request_id += 1 + elif key_type == "alt_id": + with_alt_id += 1 + else: + without_any_id += 1 + + pending_selects.append(s) + + for s in pending_selects: + sdt = _parse_ts_safe(s.get("ts")) + best_idx = None + best_ts = None + for ri in counter_release_indexes: + if ri in release_used: + continue + r = releases[ri] + if r.get("worker_key") != s.get("worker_key"): + continue + rdt = _parse_ts_safe(r.get("ts")) + # 优先选择时间不早于 select 的最早 release;解析失败则按出现顺序 + if sdt and rdt and rdt < sdt: + continue + if best_idx is None: + best_idx = ri + best_ts = rdt + elif rdt and best_ts and rdt < best_ts: + best_idx = ri + best_ts = rdt + + if best_idx is not None: + r = releases[best_idx] + s_key_type, 
s_key = _select_match_key(s.get("tags", {})) + r_key_type, r_key = _select_match_key(r.get("tags", {})) + if s_key and r_key: + if s_key == r_key: + id_check = "match" + else: + id_check = "mismatch" + elif s_key and not r_key: + id_check = "select_only" + elif (not s_key) and r_key: + id_check = "release_only" + else: + id_check = "both_missing" + + matched.append( + { + "request_id": s["tags"].get("request_id", ""), + "worker": s["worker"], + "select_ts": s["ts"], + "release_ts": r["ts"], + "type": s["type"], + "match_method": "worker_fifo", + "id_check": id_check, + } + ) + release_used.add(best_idx) + else: + unmatched_selects.append( + { + "worker": s["worker"], + "select_ts": s["ts"], + "type": s["type"], + "tags": s["tags"], + "note": "no matching release found (worker FIFO)", + } + ) + + # Per-worker summary(按 worker type 统计,不依赖日志中的 tokens 字段) + # 规则:prefill/mixed 的 select 均计入 token_selects。 + per_worker = defaultdict(lambda: {"selects": 0, "releases": 0, "token_selects": 0, "token_releases": 0}) + for s in selects: + s_type = _normalize_worker_type(s.get("type")) + wkey = s.get("worker_key") or _normalize_worker_url_key(s.get("worker")) + per_worker[wkey]["selects"] += 1 + if s_type in ("prefill", "mixed"): + per_worker[wkey]["token_selects"] += 1 + for r in releases: + wkey = r.get("worker_key") or _normalize_worker_url_key(r.get("worker")) + if str(r.get("type", "")).endswith("_tokens"): + per_worker[wkey]["token_releases"] += 1 + else: + per_worker[wkey]["releases"] += 1 + + pw_result = {} + for w, counts in per_worker.items(): + pw_result[w] = { + "selects": counts["selects"], + "releases": counts["releases"], + "delta": counts["selects"] - counts["releases"], + "token_selects": counts["token_selects"], + "token_releases": counts["token_releases"], + } + + # 基于 select 构建 worker URL -> dominant type 映射 + per_worker_type_counts = defaultdict(lambda: defaultdict(int)) + for s in selects: + wkey = s.get("worker_key") or 
_normalize_worker_url_key(s.get("worker")) + per_worker_type_counts[wkey][_normalize_worker_type(s.get("type"))] += 1 + worker_dominant_type = {} + for w, counts in per_worker_type_counts.items(): + worker_dominant_type[w] = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] if counts else "unknown" + + # 为未显式标注 type 的 release 推断 worker type(避免大量 unknown) + inferred_release_types = {} + for i, r in enumerate(releases): + r_type_raw = str(r.get("type", "")) + if r_type_raw.endswith("_tokens"): + base_t = _normalize_worker_type(r_type_raw.replace("_tokens", "")) + # token release 按 worker URL 对应的 select 类型映射,不做邻近时间纠偏 + mapped_t = worker_dominant_type.get( + r.get("worker_key") or _normalize_worker_url_key(r.get("worker")), "unknown" + ) + if mapped_t in ("prefill", "decode", "mixed"): + base_t = mapped_t + inferred_release_types[i] = f"{base_t}_tokens" + continue + base_t = _normalize_worker_type(r_type_raw) + if base_t != "unknown": + inferred_release_types[i] = base_t + continue + inferred_release_types[i] = _infer_release_worker_type(r, selects, fallback_window_s=fallback_window_s) + + # 按 worker type 分类统计(prefill/decode/mixed,必要时保留 unknown) + type_summary = defaultdict( + lambda: { + "counter_selects": 0, + "counter_releases": 0, + "token_selects": 0, + "token_releases": 0, + } + ) + for s in selects: + s_type = _normalize_worker_type(s.get("type")) + type_summary[s_type]["counter_selects"] += 1 + if s_type in ("prefill", "mixed"): + type_summary[s_type]["token_selects"] += 1 + for i, r in enumerate(releases): + inferred = inferred_release_types.get(i, _normalize_worker_type(str(r.get("type", "")))) + r_type = _normalize_worker_type(str(inferred).replace("_tokens", "")) + if str(inferred).endswith("_tokens"): + type_summary[r_type]["token_releases"] += 1 + else: + type_summary[r_type]["counter_releases"] += 1 + + # 每个 worker URL 的类型画像(基于 select) + worker_type_profile = {} + for w, counts in per_worker_type_counts.items(): + dominant = "unknown" + if counts: + 
dominant = sorted(counts.items(), key=lambda kv: -kv[1])[0][0] + worker_type_profile[w] = { + "dominant_type": dominant, + "prefill": counts.get("prefill", 0), + "decode": counts.get("decode", 0), + "mixed": counts.get("mixed", 0), + "unknown": counts.get("unknown", 0), + } + + return { + "matched": matched, + "unmatched_selects": unmatched_selects, + "unmatched_releases": [], + "failed_selects": failed_selects, + "per_worker": pw_result, + "id_coverage": { + "total_requests_estimated": total_req_est, + "with_request_id": with_request_id, + "without_request_id": total_req_est - with_request_id, + "with_alt_id": with_alt_id, + "without_any_id": without_any_id, + }, + "type_summary": dict(type_summary), + "worker_type_profile": worker_type_profile, + } + + +# ════════════════════════════════════════════════════════════════ +# 不支持请求检测 +# ════════════════════════════════════════════════════════════════ + +# Router 已知路由白名单 (method, path) +KNOWN_ROUTES = { + ("POST", "/v1/chat/completions"), + ("POST", "/v1/completions"), + ("POST", "/register"), + ("GET", "/registered_number"), + ("GET", "/registered"), + ("GET", "/health_generate"), + ("GET", "/metrics"), +} + + +def find_unsupported_requests(lines): + """从 HTTP 日志行中筛选不匹配任何已知路由的请求。 + + Returns: + dict: {details: [...], summary: {total, unique_paths: {path: count}}} + """ + details = [] + path_counts = defaultdict(int) + + for line in lines: + record = parse_http_line(line) + if not record: + continue + key = (record["method"], record["path"]) + if key not in KNOWN_ROUTES: + details.append( + { + "ts": record["ts"], + "method": record["method"], + "path": record["path"], + "status": record["status"], + "client_ip": record["client_ip"], + } + ) + path_counts[f"{record['method']} {record['path']}"] += 1 + + return { + "details": details, + "summary": { + "total": len(details), + "unique_paths": dict(path_counts), + }, + } + + +def _cli_unsupported_requests(args): + """CLI: 检测不支持的请求。""" + lines = [line.rstrip("\n") for 
line in sys.stdin] + result = find_unsupported_requests(lines) + + if args.summary_only: + print(json.dumps(result["summary"], ensure_ascii=False)) + else: + print(json.dumps(result, ensure_ascii=False)) + + +# ════════════════════════════════════════════════════════════════ +# CLI 入口 +# ════════════════════════════════════════════════════════════════ + + +def _cli_parse_stream(parse_fn): + """通用 CLI 流式解析:从 stdin 读入日志行,输出 JSON Lines 到 stdout。""" + parsed = 0 + skipped = 0 + for line in sys.stdin: + line = line.rstrip("\n") + record = parse_fn(line) + if record: + print(json.dumps(record, ensure_ascii=False)) + parsed += 1 + else: + skipped += 1 + print(f"Parsed {parsed} lines, skipped {skipped}", file=sys.stderr) + + +def _cli_parse_http(args): + """CLI: 解析 HTTP 请求行。""" + parsed = 0 + skipped = 0 + for line in sys.stdin: + line = line.rstrip("\n") + record = parse_http_line(line, inference_only=args.inference_only) + if record: + print(json.dumps(record, ensure_ascii=False)) + parsed += 1 + else: + skipped += 1 + print(f"Parsed {parsed} lines, skipped {skipped}", file=sys.stderr) + + +def _cli_normalize_errors(args): + """CLI: 归一化错误消息。""" + parsed = 0 + for line in sys.stdin: + line = line.rstrip("\n") + record = parse_error_line(line) + print(json.dumps(record, ensure_ascii=False)) + parsed += 1 + print(f"Normalized {parsed} lines", file=sys.stderr) + + +def _cli_match_select_release(args): + """CLI: 匹配 select/release 事件。""" + lines = [line.rstrip("\n") for line in sys.stdin] + result = match_select_release(lines) + print(json.dumps(result, ensure_ascii=False)) + + +def _cli_self_test(args): + """运行内置测试。""" + passed = 0 + failed = 0 + + def check(name, got, expected): + nonlocal passed, failed + if got == expected: + print(f" PASS: {name}") + passed += 1 + else: + print(f" FAIL: {name}") + print(f" expected: {expected}") + print(f" got: {got}") + failed += 1 + + print("=== Testing parse_go_duration_ms ===") + check("simple seconds", parse_go_duration_ms("1.5s"), 
1500.0) + check("milliseconds", parse_go_duration_ms("150ms"), 150.0) + check("fractional ms", parse_go_duration_ms("150.5ms"), 150.5) + check("microseconds µs", parse_go_duration_ms("500µs"), 0.5) + check("microseconds us", parse_go_duration_ms("500us"), 0.5) + check("nanoseconds", parse_go_duration_ms("500ns"), 0.0005) + check("composite m+s", parse_go_duration_ms("1m30s"), 90000.0) + check("composite h+m+s", parse_go_duration_ms("1h2m3s"), 3723000.0) + check("composite h+m+fractional_s", parse_go_duration_ms("1h2m3.456s"), 3723456.0) + check("pure minutes", parse_go_duration_ms("2m"), 120000.0) + check("zero", parse_go_duration_ms("0s"), 0.0) + check("sub-ms decimal", parse_go_duration_ms("2.798235ms"), 2.798235) + + print("\n=== Testing parse_go_map ===") + check("single entry", parse_go_map("map[http://10.0.0.1:9263:100]"), {"http://10.0.0.1:9263": 100}) + check( + "multi entry", + parse_go_map("map[http://10.0.0.1:9263:100 http://10.0.0.2:9867:50]"), + {"http://10.0.0.1:9263": 100, "http://10.0.0.2:9867": 50}, + ) + check("empty map", parse_go_map("map[]"), {}) + check("float values", parse_go_map("map[http://10.0.0.1:9263:0.85]"), {"http://10.0.0.1:9263": 0.85}) + + print("\n=== Testing extract_ts ===") + check("standard", extract_ts("[INFO] 2025/01/15 18:25:33 logger.go:45: msg"), "2025/01/15 18:25:33") + check("no timestamp", extract_ts("no timestamp here"), None) + + print("\n=== Testing extract_tags ===") + check( + "session+request", + extract_tags("[session_id:abc] [request_id:def]"), + {"session_id": "abc", "request_id": "def"}, + ) + check( + "all four", + extract_tags("[trace_id:t1] [req_id:r1] [session_id:s1] [request_id:rq1]"), + {"trace_id": "t1", "req_id": "r1", "session_id": "s1", "request_id": "rq1"}, + ) + check("no tags", extract_tags("no tags here"), {}) + + print("\n=== Testing parse_http_line ===") + http_line = "[INFO] 2025/01/15 18:25:33 logger.go:45: [POST] /v1/chat/completions HTTP/1.1 200 2.798235ms 10.0.0.1" + r = 
parse_http_line(http_line) + check("http method", r["method"], "POST") + check("http path", r["path"], "/v1/chat/completions") + check("http status", r["status"], 200) + check("http latency", r["latency_ms"], 2.798) + check("http client_ip", r["client_ip"], "10.0.0.1") + + r_infer = parse_http_line( + "[INFO] 2025/01/15 18:25:33 logger.go:45: [GET] /health HTTP/1.1 200 1ms 10.0.0.1", inference_only=True + ) + check("inference_only filters health", r_infer, None) + + print("\n=== Testing normalize_message ===") + check("url", normalize_message("Failed to connect to http://10.0.0.1:9965"), "Failed to connect to {url}") + check("uuid", normalize_message("request abc12345-1234-5678-9012-abcdef123456 failed"), "request {uuid} failed") + check( + "ip:port", + normalize_message("dial tcp 10.0.0.1:9965: connection refused"), + "dial tcp {ip:port}: connection refused", + ) + + print("\n=== Testing match_select_release (token release type inference) ===") + sample_lines = [ + "[INFO] 2026/04/12 10:00:00 logger.go:1: [request_id:r1] select worker (mixed): http://10.0.0.1:9965, count: 1", + "[INFO] 2026/04/12 10:00:01 logger.go:1: [request_id:r1] release prefill tokens: http://10.0.0.1:9965, tokens: 10", + "[INFO] 2026/04/12 10:00:02 logger.go:1: [request_id:r1] release worker: http://10.0.0.1:9965, count: 0", + ] + msr = match_select_release(sample_lines) + check("mixed token_releases inferred", msr["type_summary"].get("mixed", {}).get("token_releases", 0), 1) + check("prefill token_releases remains 0", msr["type_summary"].get("prefill", {}).get("token_releases", 0), 0) + + print(f'\n{"=" * 40}') + print(f"Results: {passed} passed, {failed} failed") + if failed: + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="FastDeploy Go Router Log Parser", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + sub = parser.add_subparsers(dest="command") + + p = sub.add_parser("parse-http", help="解析 HTTP 请求行 (H1) → JSON Lines") + 
p.add_argument("--inference-only", action="store_true", help="仅保留推理路径") + + sub.add_parser("parse-cache-strategy", help="解析 cache-aware 策略行 (H6) → JSON Lines") + sub.add_parser("parse-stats", help="解析 [stats] 统计行 (H7) → JSON Lines") + sub.add_parser("normalize-errors", help="ERROR/WARN 行模板归一化 → JSON Lines") + sub.add_parser("match-select-release", help="匹配 select/release worker 事件") + p = sub.add_parser("unsupported-requests", help="检测不匹配已知路由的请求") + p.add_argument("--summary-only", action="store_true", help="仅输出汇总(不含详细列表)") + sub.add_parser("self-test", help="运行内置测试") + + args = parser.parse_args() + + if args.command == "parse-http": + _cli_parse_http(args) + elif args.command == "parse-cache-strategy": + _cli_parse_stream(parse_cache_strategy_line) + elif args.command == "parse-stats": + _cli_parse_stream(parse_stats_line) + elif args.command == "normalize-errors": + _cli_normalize_errors(args) + elif args.command == "match-select-release": + _cli_match_select_release(args) + elif args.command == "unsupported-requests": + _cli_unsupported_requests(args) + elif args.command == "self-test": + _cli_self_test(args) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py new file mode 100644 index 00000000000..a197ee7aff0 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/stats.py @@ -0,0 +1,278 @@ +#!/usr/bin/env python3 +""" +Stats — 通用统计计算工具 + +提供百分位数、分布、时间窗口聚合、分组计数等通用统计函数。 +不含任何业务逻辑或日志格式依赖。 + +Python 3 stdlib only,零依赖。 +""" + +import math +from collections import defaultdict +from datetime import datetime, timedelta + +# ════════════════════════════════════════════════════════════════ +# 百分位数与基础统计 +# ════════════════════════════════════════════════════════════════ + + +def percentile(sorted_vals, p): + """从已排序列表计算第 p 百分位数(线性插值)。""" + if not sorted_vals: 
+ return 0.0 + n = len(sorted_vals) + k = (p / 100.0) * (n - 1) + f = math.floor(k) + c = math.ceil(k) + if f == c: + return sorted_vals[int(k)] + return sorted_vals[f] * (c - k) + sorted_vals[c] * (k - f) + + +def compute_statistics(values, percentiles_list=None, distribution_spec=None): + """计算一组数值的统计量。 + + Args: + values: 数值列表 + percentiles_list: 要计算的百分位数列表,默认 [50, 90, 95, 99] + distribution_spec: 分布区间规格字符串,如 '0-20,20-40,40-60,60-80,80-100' + + Returns: + dict: {count, min, max, mean, sum, stddev, p50, p90, ..., distribution} + """ + if percentiles_list is None: + percentiles_list = [50, 90, 95, 99] + + if not values: + result = {"count": 0, "min": 0, "max": 0, "mean": 0, "sum": 0, "stddev": 0} + for p in percentiles_list: + result[f"p{p}"] = 0 + if distribution_spec is not None: + result["distribution"] = [] + return result + + sorted_vals = sorted(values) + n = len(sorted_vals) + total = sum(sorted_vals) + mean = total / n + variance = sum((x - mean) ** 2 for x in sorted_vals) / n + stddev = math.sqrt(variance) + + result = { + "count": n, + "min": round(sorted_vals[0], 3), + "max": round(sorted_vals[-1], 3), + "mean": round(mean, 3), + "sum": round(total, 3), + "stddev": round(stddev, 3), + } + + for p in percentiles_list: + result[f"p{p}"] = round(percentile(sorted_vals, p), 3) + + if distribution_spec is not None: + result["distribution"] = compute_distribution(sorted_vals, distribution_spec) + + return result + + +def compute_distribution(sorted_vals, spec_str): + """根据区间规格计算分布直方图。 + + spec_str 示例:'0-20,20-40,40-60,60-80,80-100' + 每个区间是左闭右开 [lo, hi)。 + """ + buckets = _parse_distribution_spec(spec_str) + n = len(sorted_vals) + result = [] + for b in buckets: + if b[0] == "lt": + count = sum(1 for v in sorted_vals if v < b[1]) + label = b[2] + elif b[0] == "gt": + count = sum(1 for v in sorted_vals if v > b[1]) + label = b[2] + elif b[0] == "range": + count = sum(1 for v in sorted_vals if b[1] <= v < b[2]) + label = b[3] + else: + continue + 
result.append({"range": label, "count": count, "pct": round(count / n * 100, 1) if n else 0}) + return result + + +def _parse_distribution_spec(spec_str): + """解析分布区间规格:'<100,100-500,>1000' → bucket 定义列表。""" + buckets = [] + for part in spec_str.split(","): + part = part.strip() + if part.startswith("<"): + buckets.append(("lt", float(part[1:]), part)) + elif part.startswith(">"): + buckets.append(("gt", float(part[1:]), part)) + elif "-" in part: + lo, hi = part.split("-", 1) + buckets.append(("range", float(lo), float(hi), part)) + return buckets + + +# ════════════════════════════════════════════════════════════════ +# 时间窗口聚合 +# ════════════════════════════════════════════════════════════════ + + +def time_bucket(records, window="auto", agg_specs=None, ts_field="ts"): + """按时间窗口聚合记录。 + + Args: + records: dict 列表,每个 dict 必须有 ts_field 字段 + window: 窗口大小 '5s'/'1m'/'5m'/'auto' + agg_specs: 聚合规格列表 [(field, func), ...],如 [('selected_hitRatio', 'mean')] + func 支持:count, sum, mean, min, max, pNN + ts_field: 时间戳字段名 + + Returns: + list[dict]: 每个窗口一条记录 {bucket, count, field_func, ...} + """ + if agg_specs is None: + agg_specs = [("_", "count")] + + if not records: + return [] + + window_td = _parse_window(window, records, ts_field) + + # 按窗口分组 + buckets = defaultdict(list) + for r in records: + ts_str = r.get(ts_field, "") + if not ts_str: + continue + try: + dt = datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S") + except ValueError: + continue + bucket_dt = _align_to_bucket(dt, window_td) + bucket_key = bucket_dt.strftime("%Y/%m/%d %H:%M:%S") + buckets[bucket_key].append(r) + + # 按时间排序并聚合 + result = [] + for bucket_key in sorted(buckets.keys()): + bucket_records = buckets[bucket_key] + entry = {"bucket": bucket_key, "count": len(bucket_records)} + + for field, func in agg_specs: + if field == "_": + if func == "count": + entry["count"] = len(bucket_records) + continue + + values = [] + for r in bucket_records: + v = r.get(field) + if v is not None: + try: + 
values.append(float(v)) + except (ValueError, TypeError): + pass + + out_key = f"{field}_{func}" + entry[out_key] = _aggregate_values(values, func) + + result.append(entry) + + return result + + +def _parse_window(window_str, records, ts_field): + """解析窗口字符串为 timedelta。'auto' 根据数据跨度自动选择。""" + if window_str == "auto": + timestamps = [] + for r in records: + ts_str = r.get(ts_field, "") + if ts_str: + try: + timestamps.append(datetime.strptime(ts_str, "%Y/%m/%d %H:%M:%S")) + except ValueError: + pass + if len(timestamps) < 2: + return timedelta(minutes=1) + span = max(timestamps) - min(timestamps) + if span < timedelta(minutes=30): + return timedelta(seconds=5) + elif span < timedelta(hours=3): + return timedelta(minutes=1) + else: + return timedelta(minutes=5) + elif window_str.endswith("s"): + return timedelta(seconds=int(window_str[:-1])) + elif window_str.endswith("m"): + return timedelta(minutes=int(window_str[:-1])) + elif window_str.endswith("h"): + return timedelta(hours=int(window_str[:-1])) + return timedelta(minutes=1) + + +def _align_to_bucket(dt, window_td): + """将 datetime 对齐到窗口边界。""" + secs = max(1, int(window_td.total_seconds())) + epoch = datetime(dt.year, dt.month, dt.day) + offset = int((dt - epoch).total_seconds()) + aligned = (offset // secs) * secs + return epoch + timedelta(seconds=aligned) + + +def _aggregate_values(values, func): + """用指定函数聚合一组数值。""" + if not values: + return 0 + if func == "count": + return len(values) + elif func == "sum": + return round(sum(values), 3) + elif func == "mean": + return round(sum(values) / len(values), 3) + elif func == "min": + return round(min(values), 3) + elif func == "max": + return round(max(values), 3) + elif func.startswith("p"): + p = int(func[1:]) + return round(percentile(sorted(values), p), 3) + return 0 + + +# ════════════════════════════════════════════════════════════════ +# 分组计数 +# ════════════════════════════════════════════════════════════════ + + +def count_by(records, field, top_n=None): + 
"""按指定字段分组计数。 + + Args: + records: dict 列表 + field: 分组字段名 + top_n: 只返回前 N 个(按计数降序) + + Returns: + list[dict]: [{value, count, pct}],按计数降序排列 + """ + counts = defaultdict(int) + total = 0 + for r in records: + val = r.get(field) + if val is not None: + counts[str(val)] += 1 + total += 1 + + result = [] + for val, count in sorted(counts.items(), key=lambda x: -x[1]): + result.append({"value": val, "count": count, "pct": round(count / total * 100, 1) if total else 0}) + + if top_n: + result = result[:top_n] + + return result diff --git a/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py new file mode 100644 index 00000000000..b00521e6b01 --- /dev/null +++ b/fastdeploy/golang_router/.claude/skills/troubleshoot/scripts/troubleshoot.py @@ -0,0 +1,559 @@ +#!/usr/bin/env python3 +""" +Troubleshoot — FastDeploy Go Router 综合问题排查主编排器 + +Usage: + python3 troubleshoot.py [options] + +Options: + --errors 仅分析错误日志 + --latency 仅分析延迟 + --health 仅分析 Worker 健康 + --cache 仅分析 Cache 调度 + --load 仅分析负载与计数器 + --trace ID 追踪指定请求(支持逗号分隔多 ID;传 all 可全量追踪) + --tail N 仅分析尾部 N 行(支持 5000/1k/1w 等行数写法) + --start TIME 起始时间(如 "16:00:00"、"03/31 16:00") + --end TIME 结束时间(如 "17:00:00"、"2026/03/31 17:00:00") + --output DIR 详细报告导出目录(默认: skill_output/troubleshoot//) + +支持维度:errors, latency, health, cache, load, trace +""" + +import argparse +import re +import os +import sys +from datetime import datetime +from pathlib import Path + +# 确保能 import 同级模块 +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from analyzers.cache import analyze_cache, format_cache_report +from analyzers.errors import analyze_errors, format_errors_report +from analyzers.health import analyze_health, format_health_report +from analyzers.latency import analyze_latency, format_latency_report +from analyzers.load import analyze_load +from analyzers.load_report import format_load_report +from analyzers.trace import 
analyze_trace, format_trace_report +from log_parser import ( + complete_time_arg, + filter_file_by_time_range, +) + + +def determine_log_file(user_path=None): + """确定日志文件路径。 + + 搜索顺序: + 1. 用户指定路径(直接使用,不质疑) + 2. logs/router.log + 3. fd-router.log(golang_router 根目录) + """ + if user_path: + p = Path(user_path).expanduser() + if p.is_file(): + return str(p) + print(f"ERROR: 文件不存在: {user_path}", file=sys.stderr) + print( + "提示: 若路径含空格/括号,请使用引号,例如: " + "python3 scripts/troubleshoot.py 'fastdeploy/golang_router/logs/fd-router (2).log' --load", + file=sys.stderr, + ) + sys.exit(1) + + # 统一基于脚本位置与当前工作目录搜索,避免 CWD 差异导致找不到日志。 + script_dir = Path(__file__).resolve().parent + golang_router_dir = script_dir.parents[2] # .../fastdeploy/golang_router + cwd = Path.cwd() + + # 精确候选(优先常见命名) + exact_candidates = [ + golang_router_dir / "logs" / "router.log", + golang_router_dir / "fd-router.log", + cwd / "logs" / "router.log", + cwd / "fd-router.log", + cwd / "fastdeploy" / "golang_router" / "logs" / "router.log", + cwd / "fastdeploy" / "golang_router" / "fd-router.log", + ] + for p in exact_candidates: + if p.is_file(): + return str(p) + + # 模糊候选:支持 fd-router (2).log 等命名 + pattern_roots = [ + golang_router_dir / "logs", + golang_router_dir, + cwd / "logs", + cwd, + cwd / "fastdeploy" / "golang_router" / "logs", + cwd / "fastdeploy" / "golang_router", + ] + dynamic_candidates = [] + for root in pattern_roots: + if not root.is_dir(): + continue + dynamic_candidates.extend(sorted(root.glob("fd-router*.log"))) + dynamic_candidates.extend(sorted(root.glob("router*.log"))) + + if dynamic_candidates: + return str(dynamic_candidates[0]) + + print("ERROR: 未找到日志文件。请指定路径或检查 logs/ 目录。", file=sys.stderr) + print("已搜索: logs/router.log, fd-router.log, fd-router*.log, router*.log", file=sys.stderr) + sys.exit(1) + + +def parse_tail_arg(tail_str): + """解析 --tail 参数:支持数字及 k/w 缩写。""" + if tail_str is None: + return None + s = str(tail_str).strip().lower() + m = re.fullmatch(r"(\d+)([kw])?", s) + if not 
m: + raise ValueError("--tail 仅支持行数(如 5000、1k、1w)。按时间请改用 --start/--end") + value = int(m.group(1)) + unit = m.group(2) + if unit == "k": + value *= 1000 + elif unit == "w": + value *= 10000 + if value <= 0: + raise ValueError("--tail 行数必须 > 0") + return {"type": "lines", "value": value} + + +def determine_status(results): + """根据分析结果判定全局状态。""" + reasons = [] + + # Errors 维度 + errors_result = results.get("errors") + if errors_result: + if errors_result["panic_list"]: + return "CRITICAL", f'{len(errors_result["panic_list"])} Panic 事件' + if errors_result["error_rate"] > 20: + return "CRITICAL", f'错误率 {errors_result["error_rate"]}%' + if errors_result["error_rate"] > 5: + reasons.append(f'错误率 {errors_result["error_rate"]}%') + for s in errors_result["status_code_dist"]: + code = str(s["value"]) + if code in ("502", "503") and s["count"] > 0: + reasons.append(f'{code}: {s["count"]}') + + # Latency 维度 + latency_result = results.get("latency") + if latency_result: + for d in latency_result.get("diagnoses", []): + if d["severity"] == "CRITICAL": + return "CRITICAL", d["message"] + if d["severity"] == "HIGH": + reasons.append(d["message"]) + + # Health 维度 + health_result = results.get("health") + if health_result: + for d in health_result.get("diagnoses", []): + if d["severity"] == "CRITICAL": + return "CRITICAL", d["message"] + if d["severity"] == "HIGH": + reasons.append(d["message"]) + + # Load 维度 + load_result = results.get("load") + if load_result: + for d in load_result.get("diagnoses", []): + if d["severity"] == "CRITICAL": + return "CRITICAL", d["message"] + if d["severity"] == "HIGH": + reasons.append(d["message"]) + + # Cache 维度 + cache_result = results.get("cache") + if cache_result: + for d in cache_result.get("diagnoses", []): + if d["severity"] == "HIGH": + reasons.append(d["message"]) + + if reasons: + # 去重但保留完整信息 + deduped = [] + seen = set() + for r in reasons: + if r not in seen: + deduped.append(r) + seen.add(r) + return "DEGRADED", ";".join(deduped) + + 
if not results: + return "HEALTHY", "无分析数据" + + return "HEALTHY", "无严重问题" + + +def format_full_report(results, status, status_reason): + """组装完整报告。 + + Returns: + tuple: (report_text, details) + report_text: 主报告文本(总结 + 可视化) + details: dict 包含需要拆分到独立文件的详情数据 + - 'health_events': str 或 None + - 'load_select_release': str 或 None + - 'trace_files': {trace_id: text} 或 {}(写入 detail/trace/) + """ + parts = [] + details = { + "health_events": None, + "load_select_release": None, + "latency_diagnoses": None, + "cache_diagnosis": None, + "load_diagnoses": None, + "load_counter_state": None, + "cache_session_stickiness": None, + "cache_suboptimal": None, + "cache_eviction": None, + "cache_fallback": None, + "cache_cross": None, + "errors_topn": None, + "trace_files": {}, + } + + # 状态行 + parts.append(f"STATUS: {status} — {status_reason}") + parts.append( + "状态定义: HEALTHY=无明显异常;DEGRADED=服务可用但存在性能/稳定性问题(需关注);CRITICAL=服务不可用或高风险故障。" + ) + parts.append("=" * 60) + parts.append("") + + # 各维度报告 + if "errors" in results: + parts.append(format_errors_report(results["errors"])) + if results["errors"].get("error_top_n"): + lines = [ + "# Errors TopN 详情", + "", + "| 模板 | 数量 | 级别 | 来源层 | 影响 |", + "|:--|--:|:--|:--|:--|", + ] + for e in results["errors"]["error_top_n"]: + lines.append( + f'| {e.get("template","")} | {e.get("count",0)} | {e.get("level","")} | {e.get("source_layer","")} | {e.get("impact","-")} |' + ) + lines.append("") + lines.append("## 涉及 URLs") + lines.append("") + for e in results["errors"]["error_top_n"]: + urls = e.get("urls") or [] + if not urls: + continue + lines.append(f'- 模板: {e.get("template","")}') + for u in urls: + lines.append(f" - {u}") + lines.append("") + details["errors_topn"] = "\n".join(lines) + + if "latency" in results: + parts.append(format_latency_report(results["latency"])) + if results["latency"].get("diagnoses"): + lines = ["# 延迟诊断详情", ""] + for d in results["latency"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] {d.get("message","")}') 
+ lines.append("") + details["latency_diagnoses"] = "\n".join(lines) + + if "health" in results: + summary, detail = format_health_report(results["health"]) + parts.append(summary) + if detail: + details["health_events"] = detail + + if "load" in results: + summary, detail = format_load_report(results["load"]) + parts.append(summary) + if detail: + details["load_select_release"] = detail + if results["load"].get("diagnoses"): + lines = ["# Load 诊断详情", ""] + for d in results["load"]["diagnoses"]: + lines.append(f'[{d.get("severity","")}] [{d.get("source_layer","")}] {d.get("message","")}') + lines.append("") + details["load_diagnoses"] = "\n".join(lines) + if results["load"].get("counter_last_state"): + rows = results["load"]["counter_last_state"] + lines = [ + "# Load Counter 末状态", + "", + "| worker | req_last_action | req_last_value | token_last_action | token_last_value | last_ts |", + "|:--|:--|--:|:--|--:|:--|", + ] + for r in rows: + lines.append( + f'| {r.get("worker","")} | {r.get("req_last_action","-")} | {r.get("req_last_value","-")} | {r.get("token_last_action","-")} | {r.get("token_last_value","-")} | {r.get("last_ts","")} |' + ) + lines.append("") + details["load_counter_state"] = "\n".join(lines) + + if "cache" in results: + summary, detail = format_cache_report(results["cache"]) + parts.append(summary) + if detail: + details["cache_diagnosis"] = detail + c = results["cache"] + lines = ["# Cache Session 粘性详情", ""] + if c.get("session_stickiness"): + for sid, s in c["session_stickiness"].items(): + lines.append( + f'- {sid}: req={s.get("total_requests",0)}, stickiness={s.get("stickiness_pct",0)}%, switches={s.get("switches",0)}' + ) + else: + lines.append("- 无可用样本(需要同一 session 至少 2 次请求)。") + lines.append("") + details["cache_session_stickiness"] = "\n".join(lines) + + lines = ["# Cache 非最优选择详情", ""] + if c.get("suboptimal_selections"): + for x in c["suboptimal_selections"][:200]: + lines.append( + f'- [{x.get("ts","")}] selected={x.get("selected","")} 
best={x.get("best_hr_worker","")} reason={x.get("reason","")}' + ) + else: + lines.append("- 未发现非最优选择。") + lines.append("") + details["cache_suboptimal"] = "\n".join(lines) + + lines = ["# Cache 驱逐影响详情", ""] + if c.get("eviction_impact"): + for x in c["eviction_impact"][:200]: + lines.append( + f'- session={x.get("session_id","")} interval={x.get("interval_mins",0)}m hitRatio_after={x.get("hitRatio_after",0)} evicted={x.get("evicted",False)}' + ) + else: + lines.append("- 未检测到超时驱逐样本。") + lines.append("") + details["cache_eviction"] = "\n".join(lines) + + lines = ["# Cache Fallback 原因详情", ""] + if c.get("fallback_reasons"): + for x in c["fallback_reasons"]: + lines.append(f'- {x.get("value","")}: {x.get("count",0)} ({x.get("pct",0)}%)') + else: + lines.append("- 未出现 fallback 记录。") + lines.append("") + details["cache_fallback"] = "\n".join(lines) + + lines = ["# Cache 交叉诊断详情", ""] + if c.get("cross_diagnosis"): + for x in c["cross_diagnosis"]: + lines.append( + f'- diagnosis={x.get("diagnosis","")}, action={x.get("action","")}, avg_stickiness={x.get("avg_stickiness_pct",0)}%' + ) + else: + lines.append("- 样本不足,未生成交叉诊断。") + lines.append("") + details["cache_cross"] = "\n".join(lines) + + if "trace" in results: + summary, detail_dict = format_trace_report(results["trace"]) + parts.append(summary) + if detail_dict: + details["trace_files"] = detail_dict + + return "\n".join(parts), details + + +def save_detailed_report(report_text, output_dir, details=None): + """保存报告到文件。 + + Args: + report_text: 主报告文本 + output_dir: 输出目录 + details: 详情数据 dict(来自 format_full_report) + """ + summary_dir = os.path.join(output_dir, "summary") + detail_dir = os.path.join(output_dir, "detail") + os.makedirs(summary_dir, exist_ok=True) + os.makedirs(detail_dir, exist_ok=True) + filepath = os.path.join(summary_dir, "troubleshoot_report.md") + + with open(filepath, "w", encoding="utf-8") as f: + f.write("# Router Troubleshooting Report\n") + f.write(f'> Generated at 
{datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n') + f.write(report_text) + + # 保存详情到 detail/ 子目录 + if details: + if details.get("health_events"): + health_path = os.path.join(detail_dir, "health_events.md") + with open(health_path, "w", encoding="utf-8") as f: + f.write(details["health_events"]) + + if details.get("load_select_release"): + load_path = os.path.join(detail_dir, "load_select_release.md") + with open(load_path, "w", encoding="utf-8") as f: + f.write(details["load_select_release"]) + + if details.get("latency_diagnoses"): + latency_path = os.path.join(detail_dir, "latency_diagnoses.md") + with open(latency_path, "w", encoding="utf-8") as f: + f.write(details["latency_diagnoses"]) + + if details.get("cache_diagnosis"): + cache_path = os.path.join(detail_dir, "cache_diagnosis.md") + with open(cache_path, "w", encoding="utf-8") as f: + f.write(details["cache_diagnosis"]) + if details.get("load_diagnoses"): + with open(os.path.join(detail_dir, "load_diagnoses.md"), "w", encoding="utf-8") as f: + f.write(details["load_diagnoses"]) + if details.get("load_counter_state"): + with open(os.path.join(detail_dir, "load_counter_state.md"), "w", encoding="utf-8") as f: + f.write(details["load_counter_state"]) + if details.get("cache_session_stickiness") is not None: + with open(os.path.join(detail_dir, "cache_session_stickiness.md"), "w", encoding="utf-8") as f: + f.write(details["cache_session_stickiness"]) + if details.get("cache_suboptimal") is not None: + with open(os.path.join(detail_dir, "cache_suboptimal.md"), "w", encoding="utf-8") as f: + f.write(details["cache_suboptimal"]) + if details.get("cache_eviction") is not None: + with open(os.path.join(detail_dir, "cache_eviction.md"), "w", encoding="utf-8") as f: + f.write(details["cache_eviction"]) + if details.get("cache_fallback") is not None: + with open(os.path.join(detail_dir, "cache_fallback.md"), "w", encoding="utf-8") as f: + f.write(details["cache_fallback"]) + if details.get("cache_cross") is not 
None: + with open(os.path.join(detail_dir, "cache_cross.md"), "w", encoding="utf-8") as f: + f.write(details["cache_cross"]) + if details.get("errors_topn"): + with open(os.path.join(detail_dir, "errors_topn.md"), "w", encoding="utf-8") as f: + f.write(details["errors_topn"]) + + trace_detail_dir = os.path.join(detail_dir, "trace") + if details.get("trace_files"): + os.makedirs(trace_detail_dir, exist_ok=True) + for trace_id, trace_text in details.get("trace_files", {}).items(): + safe_id = trace_id.replace("/", "_") + trace_path = os.path.join(trace_detail_dir, f"trace_{safe_id}.md") + with open(trace_path, "w", encoding="utf-8") as f: + f.write(trace_text) + + return filepath + + +def main(): + parser = argparse.ArgumentParser( + description="FastDeploy Go Router Troubleshooting", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("log_file", nargs="?", help="日志文件路径") + parser.add_argument("--errors", action="store_true", help="仅分析错误日志") + parser.add_argument("--latency", action="store_true", help="仅分析延迟") + parser.add_argument("--health", action="store_true", help="仅分析 Worker 健康") + parser.add_argument("--cache", action="store_true", help="仅分析 Cache 调度") + parser.add_argument("--load", action="store_true", help="仅分析负载与计数器") + parser.add_argument("--trace", metavar="ID", help="追踪指定请求(逗号分隔多 ID;传 all 可全量追踪)") + parser.add_argument("--tail", help="尾部行数(如 5000、1k、1w)。按时间请使用 --start/--end") + parser.add_argument( + "--start", default=None, help='起始时间(如 "16:00:00"、"03/31 16:00"、"2026/03/31 16:00:00")' + ) + parser.add_argument("--end", default=None, help='结束时间(如 "17:00:00"、"03/31 17:00"、"2026/03/31 17:00:00")') + parser.add_argument("--output", help="详细报告导出目录(默认:skill_output/troubleshoot//)") + + args = parser.parse_args() + + # 确定日志文件 + log_file = determine_log_file(args.log_file) + print(f"日志文件: {log_file}", file=sys.stderr) + + # --tail 与 --start/--end 不能混用(两者是不同的范围选择方式) + if args.tail and (args.start or args.end): + 
print("Error: --tail 与 --start/--end 不能同时使用,请选择其一", file=sys.stderr) + sys.exit(1) + + # 时间范围预过滤(--start 和 --end 可单独或同时指定) + import atexit + + start_ts = None + end_ts = None + if args.start or args.end: + start_ts = complete_time_arg(args.start, log_file, is_end=False) if args.start else None + end_ts = complete_time_arg(args.end, log_file, is_end=True) if args.end else None + filtered_path, is_temp = filter_file_by_time_range(log_file, start_ts, end_ts) + if is_temp: + atexit.register(lambda p=filtered_path: os.unlink(p) if os.path.exists(p) else None) + log_file = filtered_path + print(f'时间范围过滤: {start_ts or "..."} ~ {end_ts or "..."}', file=sys.stderr) + + tail_arg = parse_tail_arg(args.tail) + tail = None + if tail_arg and tail_arg["type"] == "lines": + tail = tail_arg["value"] + + # 确定分析模式 + any_mode = args.errors or args.latency or args.health or args.cache or args.load or args.trace + run_errors = args.errors or (not any_mode) + run_latency = args.latency or (not any_mode) + run_health = args.health or (not any_mode) + run_load = args.load or (not any_mode) + run_cache = args.cache or (not any_mode) + run_trace = bool(args.trace) # trace 需要指定 ID(支持 all),全量扫描不自动调用 + + results = {} + step = 0 + total_steps = sum([run_errors, run_latency, run_health, run_cache, run_load, run_trace]) + + # 执行分析 + if run_errors: + step += 1 + print(f"[{step}/{total_steps}] 分析错误日志...", file=sys.stderr) + results["errors"] = analyze_errors(log_file, tail=tail) + + if run_latency: + step += 1 + print(f"[{step}/{total_steps}] 分析请求延迟...", file=sys.stderr) + results["latency"] = analyze_latency(log_file, tail=tail) + + if run_health: + step += 1 + print(f"[{step}/{total_steps}] 分析 Worker 健康...", file=sys.stderr) + results["health"] = analyze_health(log_file, tail=tail) + + if run_cache: + step += 1 + print(f"[{step}/{total_steps}] 分析 Cache 调度...", file=sys.stderr) + results["cache"] = analyze_cache(log_file, tail=tail) + + if run_load: + step += 1 + print(f"[{step}/{total_steps}] 
分析负载与计数器...", file=sys.stderr) + results["load"] = analyze_load(log_file, tail=tail) + + if run_trace: + step += 1 + print(f"[{step}/{total_steps}] 追踪请求...", file=sys.stderr) + results["trace"] = analyze_trace(log_file, args.trace, tail=tail) + + # 判定状态 + status, status_reason = determine_status(results) + + # 输出报告 + report, details = format_full_report(results, status, status_reason) + print(report) + + # 保存详细报告 + run_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + if args.output: + output_base = args.output + else: + script_dir = os.path.dirname(os.path.abspath(__file__)) + golang_router_root = os.path.normpath(os.path.join(script_dir, "..", "..", "..", "..")) + output_base = os.path.join(golang_router_root, "skill_output", "troubleshoot") + output_dir = os.path.join(output_base, run_timestamp) + filepath = save_detailed_report(report, output_dir, details=details) + print(f"\n详细报告已保存到: {filepath}", file=sys.stderr) + + +if __name__ == "__main__": + main() diff --git a/fastdeploy/golang_router/.gitignore b/fastdeploy/golang_router/.gitignore new file mode 100644 index 00000000000..58b5c84d190 --- /dev/null +++ b/fastdeploy/golang_router/.gitignore @@ -0,0 +1,2 @@ +# Generated skill analysis outputs +skill_output/ diff --git a/fastdeploy/golang_router/cmd/main.go b/fastdeploy/golang_router/cmd/main.go index e0e8c98e137..c3670622ab2 100644 --- a/fastdeploy/golang_router/cmd/main.go +++ b/fastdeploy/golang_router/cmd/main.go @@ -41,7 +41,15 @@ func main() { } // Initialize logger - logger.Init(cfg.Log.Level, cfg.Log.Output) + logCfg := logger.Config{ + Level: cfg.Log.Level, + Output: cfg.Log.Output, + Dir: cfg.Log.Dir, + MaxAgeDays: cfg.Log.MaxAgeDays, + MaxTotalSizeMB: cfg.Log.MaxTotalSizeMB, + CleanupIntervalSecs: cfg.Log.CleanupIntervalSecs, + } + logger.Init(logCfg) defer logger.CloseLogFile() // Initialize manager @@ -59,6 +67,7 @@ func main() { go scheduler_handler.StartBackupCleanupTask(context.Background(), intervalCleanupSecs) statsIntervalSecs := 
cfg.Scheduler.StatsIntervalSecs go scheduler_handler.StartStatsReporter(context.Background(), statsIntervalSecs) + go logger.StartLogCleanup(context.Background(), logCfg) // Start server addr := ":" + cfg.Server.Port diff --git a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml index be4b11227d2..075d8eec5fd 100644 --- a/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_config/config/config.example.yaml @@ -29,3 +29,7 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + dir: "logs" # log directory; default: logs + max-age-days: 7 # max days to keep log files; default: 7 + max-total-size-mb: 500 # max total log size in MB; default: 500 + cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml b/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml index be4b11227d2..5e1091b0eef 100644 --- a/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml +++ b/fastdeploy/golang_router/examples/run_with_default_workers/config/config.example.yaml @@ -29,3 +29,6 @@ manager: log: level: "info" # debug, info, warn, error output: "file" # stdout, file + max-age-days: 7 # max days to keep log files; default: 7 + max-total-size-mb: 500 # max total log size in MB; default: 500 + cleanup-interval-secs: 3600 # cleanup check interval in seconds; default: 3600 diff --git a/fastdeploy/golang_router/internal/config/config.go b/fastdeploy/golang_router/internal/config/config.go index 2cb8226961d..7a6dc3fc504 100644 --- a/fastdeploy/golang_router/internal/config/config.go +++ b/fastdeploy/golang_router/internal/config/config.go @@ -49,8 +49,12 @@ type SchedulerConfig struct { } type LogConfig 
struct { - Level string `yaml:"level"` // debug, info, warn, error - Output string `yaml:"output"` // stdout, file + Level string `yaml:"level"` // debug, info, warn, error + Output string `yaml:"output"` // stdout, file + Dir string `yaml:"dir"` // log directory; defaults to "logs" + MaxAgeDays int `yaml:"max-age-days"` // max days to keep log files; 0 = use default (7) + MaxTotalSizeMB int `yaml:"max-total-size-mb"` // max total log size in MB; 0 = use default (500) + CleanupIntervalSecs float64 `yaml:"cleanup-interval-secs"` // cleanup check interval in seconds; 0 = use default (3600) } func Load(configPath, listenPort string, isSplitwise bool) (*Config, error) { @@ -81,6 +85,15 @@ func Load(configPath, listenPort string, isSplitwise bool) (*Config, error) { if cfg.Log.Level == "" { cfg.Log.Level = "info" } + if cfg.Log.MaxAgeDays == 0 { + cfg.Log.MaxAgeDays = 7 + } + if cfg.Log.MaxTotalSizeMB == 0 { + cfg.Log.MaxTotalSizeMB = 500 + } + if cfg.Log.CleanupIntervalSecs == 0 { + cfg.Log.CleanupIntervalSecs = 3600 + } if cfg.Manager.HealthCheckEndpoint == "" { cfg.Manager.HealthCheckEndpoint = "/health" } diff --git a/fastdeploy/golang_router/internal/gateway/completions_test.go b/fastdeploy/golang_router/internal/gateway/completions_test.go index 825544ff5e3..4fea9736ad6 100644 --- a/fastdeploy/golang_router/internal/gateway/completions_test.go +++ b/fastdeploy/golang_router/internal/gateway/completions_test.go @@ -20,7 +20,7 @@ import ( ) func TestMain(m *testing.M) { - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) gin.SetMode(gin.TestMode) os.Exit(m.Run()) } diff --git a/fastdeploy/golang_router/internal/manager/health_test.go b/fastdeploy/golang_router/internal/manager/health_test.go index bc42031d85f..f50ea2d00b2 100644 --- a/fastdeploy/golang_router/internal/manager/health_test.go +++ b/fastdeploy/golang_router/internal/manager/health_test.go @@ -15,7 +15,7 @@ import ( func init() { // Initialize logger for all 
tests - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) } func TestCheckServiceHealth(t *testing.T) { diff --git a/fastdeploy/golang_router/internal/middleware/logger_test.go b/fastdeploy/golang_router/internal/middleware/logger_test.go index da9c7290567..47b63742547 100644 --- a/fastdeploy/golang_router/internal/middleware/logger_test.go +++ b/fastdeploy/golang_router/internal/middleware/logger_test.go @@ -12,7 +12,7 @@ import ( func init() { // Initialize logger to avoid nil pointer dereference in recovery middleware - logger.Init("info", "stdout") + logger.Init(logger.Config{Level: "info", Output: "stdout"}) } func TestLoggerMiddleware(t *testing.T) { diff --git a/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go b/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go index 48737c03c72..2259087d619 100644 --- a/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go +++ b/fastdeploy/golang_router/internal/scheduler/handler/prefill_cache_aware.go @@ -384,6 +384,9 @@ func (c *radixPrefixCache) Record(tokens []int, worker string) { // evictionWorker periodically evicts inactive nodes func (c *radixPrefixCache) evictionWorker(interval time.Duration) { + if interval <= 0 { + return + } ticker := time.NewTicker(interval) defer ticker.Stop() for { diff --git a/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go b/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go index d3b6dacfdc4..e1155e3686b 100644 --- a/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go +++ b/fastdeploy/golang_router/internal/scheduler/handler/tokenizer_test.go @@ -586,13 +586,13 @@ func TestParseTokensFromBody(t *testing.T) { name: "invalid JSON format", input: []byte(`invalid json`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, { name: "empty body", 
input: []byte(``), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, { name: "large array of tokens", @@ -610,13 +610,13 @@ func TestParseTokensFromBody(t *testing.T) { name: "non-array input_ids", input: []byte(`{"input_ids": "not an array"}`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, { name: "malformed array", input: []byte(`{"input_ids": [1, "two", 3]}`), expected: nil, - err: errors.New("tokenizer response missing tokens"), + err: errors.New("tokenizer response unmarshal failed"), }, } @@ -629,8 +629,8 @@ func TestParseTokensFromBody(t *testing.T) { t.Errorf("parseTokensFromBody() error = %v, wantErr %v", err, tt.err) return } - if err != nil && tt.err != nil && err.Error() != tt.err.Error() { - t.Errorf("parseTokensFromBody() error message = %v, want %v", err.Error(), tt.err.Error()) + if err != nil && tt.err != nil && !strings.Contains(err.Error(), tt.err.Error()) { + t.Errorf("parseTokensFromBody() error message = %v, want containing %v", err.Error(), tt.err.Error()) return } diff --git a/fastdeploy/golang_router/pkg/logger/logger.go b/fastdeploy/golang_router/pkg/logger/logger.go index 8e213fc0c9f..daa23d55450 100644 --- a/fastdeploy/golang_router/pkg/logger/logger.go +++ b/fastdeploy/golang_router/pkg/logger/logger.go @@ -1,12 +1,27 @@ package logger import ( + "context" + "fmt" "log" "os" + "path/filepath" + "sort" + "strings" "sync" - "context" + "time" ) +// Config holds logger configuration. 
+type Config struct { + Level string + Output string + Dir string // log directory; defaults to "logs" + MaxAgeDays int + MaxTotalSizeMB int + CleanupIntervalSecs float64 +} + var ( infoLogger *log.Logger errorLogger *log.Logger @@ -14,37 +29,260 @@ var ( debugLogger *log.Logger level string once sync.Once - logFile *os.File + writer *rotatingWriter // nil when output is stdout ) +// nowFunc is overridable in tests for time-dependent logic. +var nowFunc = time.Now + type contextKey string + const TraceIDKey contextKey = "trace_id" const ReqIDKey contextKey = "req_id" const RequestIDKey contextKey = "request_id" const SessionIDKey contextKey = "session_id" -// Init initialize logger -func Init(logLevel, output string) { - once.Do(func() { - level = logLevel +// gracePeriod is how long we keep the previous day's file open after rotation. +const gracePeriod = 5 * time.Minute + +// rotatingWriter implements io.Writer with day-level rotation and dual-file writes. +// Current day's log is written to "router-YYYY-MM-DD.log" and "router.log" is a +// symlink pointing to the current day's file. On day change a new date file is +// created and the symlink is updated. During a short grace period after rotation, +// log lines whose timestamp belongs to the previous day are written to the old file. 
+type rotatingWriter struct {
+	mu          sync.Mutex
+	currentFile *os.File // today's router-<date>.log
+	prevFile    *os.File // previous day's router-<date>.log during grace period (may be nil)
+	currentDate string   // "2006-01-02"
+	prevDate    string   // previous date during grace period
+	graceUntil  time.Time // when to close prevFile
+	retryAfter  time.Time // earliest time to retry a failed rotation (backoff)
+	logDir      string
+}
+
+func newRotatingWriter(logDir string) (*rotatingWriter, error) {
+	today := nowFunc().Format("2006-01-02")
+	datePath := filepath.Join(logDir, "router-"+today+".log")
+	symlinkPath := filepath.Join(logDir, "router.log")
+
+	// Migration: if router.log is a regular file (legacy), rename it to the date file.
+	if info, err := os.Lstat(symlinkPath); err == nil && info.Mode().IsRegular() {
+		os.Rename(symlinkPath, datePath)
+	}
+
+	// Open the date file (append mode).
+	f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644)
+	if err != nil {
+		return nil, err
+	}
+
+	// Create/update symlink: router.log -> router-<date>.log
+	if err := updateSymlink(symlinkPath, "router-"+today+".log"); err != nil {
+		fmt.Fprintf(os.Stderr, "[WARN] Symlink %s may be stale: %v\n", symlinkPath, err)
+	}
+
+	return &rotatingWriter{
+		currentFile: f,
+		currentDate: today,
+		logDir:      logDir,
+	}, nil
+}
+
+// needsRotate checks if rotation is needed under the lock.
+func (w *rotatingWriter) needsRotate(today string) (bool, string) {
+	w.mu.Lock()
+	defer w.mu.Unlock()
+	needs := today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter))
+	return needs, w.logDir
+}
+
+// tryOpenRotateFile checks if rotation is needed and pre-opens the new log file
+// outside the lock to avoid blocking other writers on slow file I/O.
+func (w *rotatingWriter) tryOpenRotateFile(today string) *os.File { + needs, logDir := w.needsRotate(today) + if !needs { + return nil + } + + datePath := filepath.Join(logDir, "router-"+today+".log") + f, err := os.OpenFile(datePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) + if err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to open new log file %s: %v, keeping current file\n", datePath, err) + return nil + } + return f +} + +func (w *rotatingWriter) Write(p []byte) (n int, err error) { + today := nowFunc().Format("2006-01-02") + + // Pre-open new file outside the lock to reduce lock-held I/O time. + preOpened := w.tryOpenRotateFile(today) + + w.mu.Lock() + defer w.mu.Unlock() + + // Authoritative rotation check under lock. + if today != w.currentDate && (w.retryAfter.IsZero() || !nowFunc().Before(w.retryAfter)) { + if preOpened != nil { + w.commitRotate(today, preOpened) + preOpened = nil // ownership transferred + } else { + // File open failed; set backoff so we don't retry on every Write. + w.retryAfter = nowFunc().Add(30 * time.Second) + } + } + // If another goroutine already rotated, close the unused pre-opened file. + if preOpened != nil { + preOpened.Close() + } + + // Close previous file if grace period expired. + if w.prevFile != nil && nowFunc().After(w.graceUntil) { + w.prevFile.Close() + w.prevFile = nil + w.prevDate = "" + } + + // During grace period, route log lines to the correct file based on timestamp. + target := w.currentFile + if w.prevFile != nil { + if logDate := parseLogDate(p); logDate == w.prevDate { + target = w.prevFile + } + } + + return target.Write(p) +} + +func (w *rotatingWriter) Close() error { + w.mu.Lock() + defer w.mu.Unlock() + if w.prevFile != nil { + w.prevFile.Close() + w.prevFile = nil + } + if w.currentFile != nil { + return w.currentFile.Close() + } + return nil +} + +// commitRotate finalises the rotation with a pre-opened file. Must be called with w.mu held. 
+func (w *rotatingWriter) commitRotate(newDate string, f *os.File) {
+	// Rotation succeeded — clear any retry backoff.
+	w.retryAfter = time.Time{}
+
+	// Close any lingering previous file.
+	if w.prevFile != nil {
+		w.prevFile.Close()
+		w.prevFile = nil
+	}
+
+	// Keep the old date file open for grace period writes.
+	w.prevFile = w.currentFile
+	w.prevDate = w.currentDate
+	w.graceUntil = nowFunc().Add(gracePeriod)
+
+	w.currentFile = f
+	w.currentDate = newDate
+
+	// Update symlink: router.log -> router-<date>.log
+	symlinkPath := filepath.Join(w.logDir, "router.log")
+	if err := updateSymlink(symlinkPath, "router-"+newDate+".log"); err != nil {
+		fmt.Fprintf(os.Stderr, "[WARN] Symlink %s may be stale (points to old date): %v\n", symlinkPath, err)
+	}
+}
+
+// updateSymlink atomically replaces symlinkPath to point to target.
+// It tries os.Remove + os.Symlink first; if remove fails (e.g. permission denied)
+// it falls back to a temp-symlink + os.Rename for an atomic swap attempt.
+func updateSymlink(symlinkPath, target string) error {
+	// Fast path: remove old, create new.
+	if err := os.Remove(symlinkPath); err != nil && !os.IsNotExist(err) {
+		// Remove failed (e.g. permission issue). Try atomic rename as fallback.
+		tmp := symlinkPath + ".tmp"
+		if err2 := os.Symlink(target, tmp); err2 != nil {
+			return fmt.Errorf("remove old symlink: %w; create temp symlink: %v", err, err2)
+		}
+		if err2 := os.Rename(tmp, symlinkPath); err2 != nil {
+			os.Remove(tmp) // best-effort cleanup
+			return fmt.Errorf("remove old symlink: %w; rename temp symlink: %v", err, err2)
+		}
+		return nil
+	}
+	if err := os.Symlink(target, symlinkPath); err != nil {
+		return fmt.Errorf("create symlink: %w", err)
+	}
+	return nil
+}
+
+// parseLogDate extracts the date from a log line produced by log.LstdFlags.
+// Format: "[LEVEL] 2006/01/02 15:04:05 ..."
+// Returns "2006-01-02" or empty string on parse failure.
+func parseLogDate(p []byte) string {
+	// Find the date pattern "YYYY/MM/DD" in the log prefix.
+	// log.LstdFlags produces: "2006/01/02 15:04:05" after the logger prefix.
+	// The prefix is like "[INFO] " (7 chars), so the date starts around index 7.
+	s := string(p)
+	for i := 0; i+10 <= len(s); i++ {
+		c := s[i]
+		if c >= '0' && c <= '9' && i+10 <= len(s) && s[i+4] == '/' && s[i+7] == '/' {
+			// Found a candidate "YYYY/MM/DD" — validate it.
+			year := s[i : i+4]
+			month := s[i+5 : i+7]
+			day := s[i+8 : i+10]
+			if !isAllDigits(month) || !isAllDigits(day) {
+				continue
+			}
+			m := (month[0]-'0')*10 + (month[1] - '0')
+			d := (day[0]-'0')*10 + (day[1] - '0')
+			if m < 1 || m > 12 || d < 1 || d > 31 {
+				continue
+			}
+			_ = year // NOTE: only the first year byte is verified to be a digit; good enough for our own log lines
+			return year + "-" + month + "-" + day
+		}
+	}
+	return ""
+}
+
+// isAllDigits returns true if every byte in s is an ASCII digit.
+func isAllDigits(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] < '0' || s[i] > '9' {
+			return false
+		}
+	}
+	return true
+}
+
+// Init initializes the logger.
+func Init(cfg Config) { + once.Do(func() { + level = cfg.Level flags := log.LstdFlags | log.Lshortfile - if output == "file" { - // Check if logs directory exists - if _, err := os.Stat("logs"); os.IsNotExist(err) { - if err := os.MkdirAll("logs", 0755); err != nil { + if cfg.Output == "file" { + logDir := cfg.Dir + if logDir == "" { + logDir = "logs" + } + if _, err := os.Stat(logDir); os.IsNotExist(err) { + if err := os.MkdirAll(logDir, 0755); err != nil { log.Fatalln("Failed to create logs directory:", err) } } - logFile, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + var err error + writer, err = newRotatingWriter(logDir) if err != nil { - log.Fatalln("Failed to open log file:", err) + log.Fatalln("Failed to create rotating log writer:", err) } - infoLogger = log.New(logFile, "[INFO] ", flags) - errorLogger = log.New(logFile, "[ERROR] ", flags) - warnLogger = log.New(logFile, "[WARN] ", flags) - debugLogger = log.New(logFile, "[DEBUG] ", flags) + infoLogger = log.New(writer, "[INFO] ", flags) + errorLogger = log.New(writer, "[ERROR] ", flags) + warnLogger = log.New(writer, "[WARN] ", flags) + debugLogger = log.New(writer, "[DEBUG] ", flags) } else { infoLogger = log.New(os.Stdout, "[INFO] ", flags) errorLogger = log.New(os.Stderr, "[ERROR] ", flags) @@ -54,9 +292,134 @@ func Init(logLevel, output string) { }) } +// CloseLogFile closes the log file if in file output mode. func CloseLogFile() { - if logFile != nil { - logFile.Close() + if writer != nil { + writer.Close() + } +} + +// StartLogCleanup blocks running periodic log cleanup; call it in a goroutine. +// It deletes archived log files older than MaxAgeDays and trims total log size +// to stay under MaxTotalSizeMB. 
+func StartLogCleanup(ctx context.Context, cfg Config) { + if cfg.Output != "file" { + return + } + if cfg.CleanupIntervalSecs <= 0 { + return + } + + logDir := cfg.Dir + if logDir == "" { + logDir = "logs" + } + + ticker := time.NewTicker(time.Duration(cfg.CleanupIntervalSecs * float64(time.Second))) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + cleanupLogs(logDir, cfg.MaxAgeDays, cfg.MaxTotalSizeMB) + } + } +} + +type logFileInfo struct { + name string + path string + date time.Time + size int64 +} + +// cleanupLogs removes archived log files based on age and total size limits. +func cleanupLogs(logDir string, maxAgeDays, maxTotalSizeMB int) { + entries, err := os.ReadDir(logDir) + if err != nil { + fmt.Fprintf(os.Stderr, "[WARN] Failed to read log directory for cleanup: %v\n", err) + return + } + + now := nowFunc() + today := now.Format("2006-01-02") + var archives []logFileInfo + + for _, entry := range entries { + if entry.IsDir() { + continue + } + name := entry.Name() + + // router.log is now a symlink; skip it. + if name == "router.log" { + continue + } + + // Match archived files: router-YYYY-MM-DD.log + if !strings.HasPrefix(name, "router-") || !strings.HasSuffix(name, ".log") { + continue + } + dateStr := strings.TrimPrefix(name, "router-") + dateStr = strings.TrimSuffix(dateStr, ".log") + fileDate, err := time.Parse("2006-01-02", dateStr) + if err != nil { + continue + } + // Never delete today's active date file. + if dateStr == today { + continue + } + info, err := entry.Info() + if err != nil { + continue + } + archives = append(archives, logFileInfo{ + name: name, + path: filepath.Join(logDir, name), + date: fileDate, + size: info.Size(), + }) + } + + // Sort by date ascending (oldest first). + sort.Slice(archives, func(i, j int) bool { + return archives[i].date.Before(archives[j].date) + }) + + // Phase 1: Age-based cleanup. 
+ if maxAgeDays > 0 { + cutoff := now.AddDate(0, 0, -maxAgeDays) + remaining := archives[:0] + for _, f := range archives { + if f.date.Before(cutoff) { + if err := os.Remove(f.path); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove log file %s: %v\n", f.path, err) + } + } else { + remaining = append(remaining, f) + } + } + archives = remaining + } + + // Phase 2: Size-based cleanup. + if maxTotalSizeMB > 0 { + maxBytes := int64(maxTotalSizeMB) * 1024 * 1024 + var totalSize int64 + for _, f := range archives { + totalSize += f.size + } + for len(archives) > 0 && totalSize > maxBytes { + oldest := archives[0] + if err := os.Remove(oldest.path); err != nil { + fmt.Fprintf(os.Stderr, "[ERROR] Failed to remove log file %s: %v\n", oldest.path, err) + } + totalSize -= oldest.size + archives = archives[1:] + } } } diff --git a/fastdeploy/golang_router/pkg/logger/logger_test.go b/fastdeploy/golang_router/pkg/logger/logger_test.go index 59faeee2a4d..1d9874ded6f 100644 --- a/fastdeploy/golang_router/pkg/logger/logger_test.go +++ b/fastdeploy/golang_router/pkg/logger/logger_test.go @@ -4,13 +4,15 @@ import ( "bytes" "context" "os" + "path/filepath" "strings" "testing" + "time" ) func TestLoggerInit(t *testing.T) { t.Run("stdout output", func(t *testing.T) { - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) if infoLogger == nil || errorLogger == nil || warnLogger == nil || debugLogger == nil { t.Error("Loggers should be initialized") @@ -24,7 +26,7 @@ func TestLoggerInit(t *testing.T) { defer os.RemoveAll("logs") // sync.Once prevents re-init, so manually verify file creation logic - f, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0666) + f, err := os.OpenFile("logs/router.log", os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0644) if err != nil { t.Fatalf("Failed to create log file: %v", err) } @@ -117,7 +119,7 @@ func TestLogLevels(t *testing.T) { func TestLogFunctions(t *testing.T) { var buf bytes.Buffer - 
Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) level = "debug" // Redirect output @@ -132,7 +134,7 @@ func TestLogFunctions(t *testing.T) { } func TestContextPrefix(t *testing.T) { - Init("debug", "stdout") + Init(Config{Level: "debug", Output: "stdout"}) level = "debug" t.Run("nil context produces no prefix", func(t *testing.T) { @@ -151,7 +153,7 @@ func TestContextPrefix(t *testing.T) { } }) - t.Run("context without request_id produces [request_id:null]", func(t *testing.T) { + t.Run("context without request_id produces no request_id prefix", func(t *testing.T) { var buf bytes.Buffer oldOutput := infoLogger.Writer() defer func() { infoLogger.SetOutput(oldOutput) }() @@ -160,8 +162,11 @@ func TestContextPrefix(t *testing.T) { ctx := context.Background() Info(ctx, "mixed mode log") output := buf.String() - if !strings.Contains(output, "[request_id:null]") { - t.Errorf("context without request_id should produce [request_id:null], got: %s", output) + if strings.Contains(output, "[request_id:") { + t.Errorf("context without request_id should not produce request_id prefix, got: %s", output) + } + if !strings.Contains(output, "mixed mode log") { + t.Errorf("message should be present, got: %s", output) } }) @@ -179,3 +184,176 @@ func TestContextPrefix(t *testing.T) { } }) } + +func TestParseLogDate(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + {"standard INFO log line", "[INFO] 2024/03/15 10:30:45 some message", "2024-03-15"}, + {"standard ERROR log line", "[ERROR] 2024/01/02 09:00:00 error occurred", "2024-01-02"}, + {"standard WARN log line", "[WARN] 2025/12/31 23:59:59 warning msg", "2025-12-31"}, + {"standard DEBUG log line", "[DEBUG] 2024/06/01 00:00:00 debug info", "2024-06-01"}, + {"empty string", "", ""}, + {"no date pattern", "no date here at all", ""}, + {"incomplete date - only year", "2024/", ""}, + {"incomplete date - year and month", "[INFO] 2024/03", ""}, + {"short input", "abc", ""}, 
+ {"date without log prefix", "2024/03/15 10:30:45 message", "2024-03-15"}, + {"date at different position", "prefix 2024/11/20 rest", "2024-11-20"}, + {"slash but not date", "path/to/file is not a date", ""}, + {"single character input", "x", ""}, + {"exactly 10 chars non-date", "abcdefghij", ""}, + {"boundary - first day of year", "[INFO] 2024/01/01 00:00:00 new year", "2024-01-01"}, + {"boundary - last day of year", "[INFO] 2024/12/31 23:59:59 year end", "2024-12-31"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := parseLogDate([]byte(tt.input)) + if got != tt.expected { + t.Errorf("parseLogDate(%q) = %q, want %q", tt.input, got, tt.expected) + } + }) + } +} + +func TestStartLogCleanup(t *testing.T) { + t.Run("cleanup runs for file output and respects cancellation", func(t *testing.T) { + tmpDir := t.TempDir() + + originalNowFunc := nowFunc + fixedNow := time.Date(2026, 4, 10, 12, 0, 0, 0, time.UTC) + nowFunc = func() time.Time { return fixedNow } + defer func() { nowFunc = originalNowFunc }() + + // Create archived logs: one older than 1 day and one recent. 
+ oldLog := filepath.Join(tmpDir, "router-2026-04-07.log") + recentLog := filepath.Join(tmpDir, "router-2026-04-09.log") + todayLog := filepath.Join(tmpDir, "router-2026-04-10.log") + for _, p := range []string{oldLog, recentLog, todayLog} { + if err := os.WriteFile(p, []byte("test"), 0644); err != nil { + t.Fatalf("failed to create test log %s: %v", p, err) + } + } + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + defer close(done) + StartLogCleanup(ctx, Config{ + Output: "file", + Dir: tmpDir, + MaxAgeDays: 2, + CleanupIntervalSecs: 0.01, + }) + }() + + waitForCondition(t, 500*time.Millisecond, func() bool { + _, err := os.Stat(oldLog) + return os.IsNotExist(err) + }, "old log should be removed by StartLogCleanup") + + if _, err := os.Stat(recentLog); err != nil { + t.Fatalf("recent log should be kept, stat err: %v", err) + } + if _, err := os.Stat(todayLog); err != nil { + t.Fatalf("today log should be kept, stat err: %v", err) + } + + cancel() + select { + case <-done: + case <-time.After(500 * time.Millisecond): + t.Fatal("StartLogCleanup did not stop after context cancellation") + } + }) + + t.Run("non-file output returns immediately", func(t *testing.T) { + done := make(chan struct{}) + go func() { + defer close(done) + StartLogCleanup(context.Background(), Config{Output: "stdout", CleanupIntervalSecs: 1}) + }() + select { + case <-done: + case <-time.After(200 * time.Millisecond): + t.Fatal("StartLogCleanup should return immediately for non-file output") + } + }) +} + +func TestRotatingWriterCrossDayGracePeriodIntegration(t *testing.T) { + tmpDir := t.TempDir() + + originalNowFunc := nowFunc + defer func() { nowFunc = originalNowFunc }() + + current := time.Date(2026, 4, 10, 23, 59, 59, 0, time.UTC) + nowFunc = func() time.Time { return current } + + w, err := newRotatingWriter(tmpDir) + if err != nil { + t.Fatalf("failed to create rotating writer: %v", err) + } + defer w.Close() + + if _, err = 
w.Write([]byte("[INFO] 2026/04/10 23:59:59 first day line\n")); err != nil { + t.Fatalf("failed to write day-1 line: %v", err) + } + + current = time.Date(2026, 4, 11, 0, 0, 1, 0, time.UTC) + if _, err = w.Write([]byte("[INFO] 2026/04/11 00:00:01 second day line\n")); err != nil { + t.Fatalf("failed to write day-2 line: %v", err) + } + + if _, err = w.Write([]byte("[INFO] 2026/04/10 23:59:58 late previous-day line\n")); err != nil { + t.Fatalf("failed to write late previous-day line: %v", err) + } + + day1Bytes, err := os.ReadFile(filepath.Join(tmpDir, "router-2026-04-10.log")) + if err != nil { + t.Fatalf("failed to read day-1 log: %v", err) + } + day1Content := string(day1Bytes) + if !strings.Contains(day1Content, "first day line") { + t.Fatalf("day-1 log missing initial line, content: %s", day1Content) + } + if !strings.Contains(day1Content, "late previous-day line") { + t.Fatalf("day-1 log missing late previous-day line, content: %s", day1Content) + } + + day2Bytes, err := os.ReadFile(filepath.Join(tmpDir, "router-2026-04-11.log")) + if err != nil { + t.Fatalf("failed to read day-2 log: %v", err) + } + day2Content := string(day2Bytes) + if !strings.Contains(day2Content, "second day line") { + t.Fatalf("day-2 log missing day-2 line, content: %s", day2Content) + } + if strings.Contains(day2Content, "late previous-day line") { + t.Fatalf("late previous-day line should not be in day-2 file, content: %s", day2Content) + } + + symlinkTarget, err := os.Readlink(filepath.Join(tmpDir, "router.log")) + if err != nil { + t.Fatalf("failed to read symlink: %v", err) + } + if symlinkTarget != "router-2026-04-11.log" { + t.Fatalf("router.log symlink target = %s, want router-2026-04-11.log", symlinkTarget) + } +} + +func waitForCondition(t *testing.T, timeout time.Duration, cond func() bool, msg string) { + t.Helper() + + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + if cond() { + return + } + time.Sleep(10 * time.Millisecond) + } + t.Fatal(msg) +}