From f9f1868a951686f8310942898f85d0aa9e8a277d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 11:46:07 +0800 Subject: [PATCH 01/12] feat(telemetry): opt-in anonymous ops telemetry per command Emit one dimensions-only metric per dws invocation (error rate, latency, command distribution, version/platform health) to an operator-configured sink. Independent of the audit machinery and OFF by default. - internal/telemetry: Event (10 coarse dimensions, no content/identity) + env-driven Forwarder (DWS_TELEMETRY_ENABLED/URL/TOKEN/TIMEOUT_MS) - wire emitTelemetry into executeInvocation's defer, reusing the existing outcome/err_class/duration already computed for command-end logging - docs/telemetry.md: fields, privacy boundary, SLS ingest + 4 alert rules - tests cover enable gating, POST contract, and the privacy boundary (param content must never leak into the payload) --- docs/telemetry.md | 89 ++++++++++++++++ internal/app/runner.go | 5 +- internal/app/telemetry_runtime.go | 63 +++++++++++ internal/app/telemetry_runtime_test.go | 113 ++++++++++++++++++++ internal/telemetry/event.go | 68 ++++++++++++ internal/telemetry/telemetry.go | 139 +++++++++++++++++++++++++ internal/telemetry/telemetry_test.go | 120 +++++++++++++++++++++ 7 files changed, 596 insertions(+), 1 deletion(-) create mode 100644 docs/telemetry.md create mode 100644 internal/app/telemetry_runtime.go create mode 100644 internal/app/telemetry_runtime_test.go create mode 100644 internal/telemetry/event.go create mode 100644 internal/telemetry/telemetry.go create mode 100644 internal/telemetry/telemetry_test.go diff --git a/docs/telemetry.md b/docs/telemetry.md new file mode 100644 index 00000000..f33188b3 --- /dev/null +++ b/docs/telemetry.md @@ -0,0 +1,89 @@ +# 运维遥测(Telemetry) + +dws 可以为**每一次命令调用**上报一条**匿名、纯维度**的运维指标,用于监控 +错误率、延迟、命令分布和版本/平台健康度。它是审计([audit](./audit.md))的运维侧 +对应物,但刻意做得**小得多**: + +- 只采**粗维度**,绝不采对象名、自由文本、peer 
id、设备指纹、自然语言原文。 + 没有"脱敏档",因为压根没有敏感字段可脱。 +- **独立于审计**:和 `DWS_AUDIT_*` 互不相关,可以只开遥测不开合规审计。 +- **默认全关**。不设 `DWS_TELEMETRY_ENABLED` 时,dws 不产生任何遥测,热路径零影响。 + +> 这是开源 CLI,集中上报必须 **opt-in + 明确告知**。默认不上报一个字节。 + +## 启用 + +| 环境变量 | 说明 | 示例 | +|---|---|---| +| `DWS_TELEMETRY_ENABLED` | 启用遥测(需同时配 URL 才生效) | `true` | +| `DWS_TELEMETRY_URL` | 上报端点,每次调用 POST 一条 JSON | `https://telemetry.example.com/dws` | +| `DWS_TELEMETRY_TOKEN` | 端点的 Bearer 鉴权(可选) | `xxxxx` | +| `DWS_TELEMETRY_TIMEOUT_MS` | 单次上报超时上限,毫秒(默认 1500) | `1500` | + +## 上报字段(全部) + +```json +{ + "schema_version": "1", + "ts": "2026-06-04T11:38:24+08:00", + "trace_id": "76a04f9eba0ad00c", // == 传输层 execution_id,可与服务端日志 join + "corp_id": "ding...", // 租户维度,best-effort(取自登录 token) + "cli_version": "1.0.34", // 版本健康:"这版本是不是把某命令搞挂了" + "channel": "openclaw", // 哪个 agent/集成在调用(DWS_CHANNEL) + "os": "darwin", // 粗平台,非 PII + "module": "doc", + "command": "doc", + "subcommand": "create_document", + "outcome": "ok", // ok | error + "err_class": "", // outcome=error 时的错误分类 + "exit_code": 0, + "duration_ms": 73 // 调用墙钟耗时,用于 P99 +} +``` + +**刻意不采**(看这个 struct 就能验证隐私边界):用户身份(user_id/姓名)、 +对象名/id、自由文本、设备 id/序列号、请求/响应 body。 + +## 接收端契约 + +任何 HTTP 服务都能接: + +``` +POST / +Content-Type: application/json +Authorization: Bearer # 对应 DWS_TELEMETRY_TOKEN +X-Dws-Telemetry-Schema: 1 +Body: 一条遥测事件 JSON +返回 2xx 即成功 +``` + +## 接入阿里云 SLS(生产推荐) + +SLS(日志服务)自带写入 / 存储 / 检索 / Dashboard / 告警,是运维监控的标准选型: + +1. **建库**:SLS 控制台建 Project + Logstore(如 `dws-telemetry`),设留存天数; + 给 `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` + 开字段索引,`duration_ms` 设为 long 型索引(要算 P99)。 +2. **建接收端点**:用**函数计算 FC** HTTP 触发器最省运维——校验 Bearer 后把 body + 作为一条日志 `PutLogs` 写进 Logstore(整条 JSON 放 `event` 字段,另抽 + `command`/`outcome`/`duration_ms`/`cli_version` 做索引列)。 +3. 
**下发**:把 FC 地址作为 `DWS_TELEMETRY_URL` 配到各端 dws。 + +### 上手就能用的 4 条告警(SLS 告警规则) + +| 告警 | SLS 查询(示意) | 触发 | +|---|---|---| +| 错误率突增 | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 5% | +| P99 延迟超标 | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | +| 某命令大面积失败 | `* \| select command, count_if(outcome='error') c group by command order by c desc` | 单命令 c 突增 | +| 调用量跌零 | `* \| select count(*)` | 5 分钟内 == 0 | + +告警通知渠道直接选钉钉机器人。 + +## 数据落在哪 / 两条流 + +- **不开 = 不出本机。** dws 不内置任何厂商默认上报地址。 +- **企业自有监控**:`DWS_TELEMETRY_URL` 指向企业自己的 SLS ingest。 +- **平台侧统一监控**:URL 指向钉钉的遥测 ingest——技术可行,但必须 opt-in + 告知。 + 因为本遥测**只含匿名维度**,隐私边界天然干净,适合做平台运维大盘。 +- 合规全量留痕是另一条线,走 [audit](./audit.md) 的企业自有 sink,别和遥测混。 diff --git a/internal/app/runner.go b/internal/app/runner.go index eb026b15..188d9ff6 100644 --- a/internal/app/runner.go +++ b/internal/app/runner.go @@ -299,9 +299,12 @@ func (r *runtimeRunner) executeInvocation(ctx context.Context, endpoint string, errReason = retErr.Error() } } + dur := time.Since(invokeStart) logging.LogCommandEnd(fl, execID, invocation.CanonicalProduct, invocation.Tool, - retErr == nil, time.Since(invokeStart), errCat, errReason) + retErr == nil, dur, errCat, errReason) + // Anonymous ops telemetry (opt-in, dimensions only). No-op when disabled. + emitTelemetry(execID, invocation, retErr == nil, errCat, dur) }() // Check if this product has plugin-level auth credentials registered. diff --git a/internal/app/telemetry_runtime.go b/internal/app/telemetry_runtime.go new file mode 100644 index 00000000..1444ae75 --- /dev/null +++ b/internal/app/telemetry_runtime.go @@ -0,0 +1,63 @@ +// Copyright 2026 Alibaba Group +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package app + +import ( + "os" + "runtime" + "time" + + authpkg "github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/auth" + "github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/executor" + "github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/telemetry" +) + +// emitTelemetry ships one anonymous operational metric for a finished +// invocation. It is cheap to skip: telemetry.NewForwarderFromEnv returns nil +// (after a single env read) when telemetry is disabled, so the hot path pays +// nothing and never loads the token or touches request content. +// +// Only coarse dimensions are collected here — there is intentionally no path +// that reads param values, object names, or natural-language input. +func emitTelemetry(execID string, inv executor.Invocation, ok bool, errClass string, dur time.Duration) { + fwd := telemetry.NewForwarderFromEnv() + if fwd == nil { + return + } + + ev := telemetry.New(time.Now(), execID) + ev.CLIVersion = version + ev.Channel = os.Getenv(envDWSChannel) + ev.OS = runtime.GOOS + ev.Module = inv.CanonicalProduct + ev.Command = inv.CanonicalProduct + ev.Subcommand = inv.Tool + ev.DurationMS = dur.Milliseconds() + + // corp_id is the only identity-adjacent dimension, kept for per-tenant + // health. Best-effort: a missing/locked token simply omits it. 
+ if td, err := authpkg.LoadTokenData(defaultConfigDir()); err == nil && td != nil { + ev.CorpID = td.CorpID + } + + if ok { + ev.Outcome = "ok" + } else { + ev.Outcome = "error" + ev.ErrClass = errClass + ev.ExitCode = 1 + } + + _ = fwd.Emit(ev) +} diff --git a/internal/app/telemetry_runtime_test.go b/internal/app/telemetry_runtime_test.go new file mode 100644 index 00000000..aa29dbc5 --- /dev/null +++ b/internal/app/telemetry_runtime_test.go @@ -0,0 +1,113 @@ +// Copyright 2026 Alibaba Group +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package app + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/executor" + "github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/telemetry" +) + +// TestEmitTelemetryWiresEvent proves the app-layer hook assembles a correct, +// content-free event and ships it when telemetry is enabled. +func TestEmitTelemetryWiresEvent(t *testing.T) { + var body []byte + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + body, _ = io.ReadAll(r.Body) + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + t.Setenv(telemetry.EnvEnabled, "true") + t.Setenv(telemetry.EnvURL, srv.URL) + t.Setenv(envDWSChannel, "openclaw") + + inv := executor.Invocation{ + CanonicalProduct: "doc", + Tool: "create", + // Params carry content; telemetry must NOT read any of it. 
+ Params: map[string]any{"title": "Q3 财报", "doc_id": "doc-secret-123"}, + } + + emitTelemetry("trace-xyz", inv, false, "validation", 123*time.Millisecond) + + if len(body) == 0 { + t.Fatal("no telemetry was POSTed") + } + var ev telemetry.Event + if err := json.Unmarshal(body, &ev); err != nil { + t.Fatalf("non-JSON body %q: %v", body, err) + } + + if ev.TraceID != "trace-xyz" { + t.Errorf("trace_id=%q", ev.TraceID) + } + if ev.Command != "doc" || ev.Subcommand != "create" { + t.Errorf("command/subcommand=%q/%q", ev.Command, ev.Subcommand) + } + if ev.Outcome != "error" || ev.ErrClass != "validation" || ev.ExitCode != 1 { + t.Errorf("outcome wiring wrong: %+v", ev) + } + if ev.DurationMS != 123 { + t.Errorf("duration_ms=%d, want 123", ev.DurationMS) + } + if ev.Channel != "openclaw" { + t.Errorf("channel=%q, want openclaw", ev.Channel) + } + if ev.OS == "" { + t.Error("os dimension should be set") + } + + // Privacy boundary: no param content may ever leak into the wire payload. + raw := string(body) + for _, secret := range []string{"Q3 财报", "doc-secret-123", "title"} { + if contains(raw, secret) { + t.Errorf("telemetry payload leaked content %q: %s", secret, raw) + } + } +} + +// TestEmitTelemetryNoopWhenDisabled proves the hot path is silent when off. 
+func TestEmitTelemetryNoopWhenDisabled(t *testing.T) { + called := false + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + t.Setenv(telemetry.EnvEnabled, "") + t.Setenv(telemetry.EnvURL, srv.URL) + + emitTelemetry("t", executor.Invocation{CanonicalProduct: "doc", Tool: "get"}, true, "", time.Second) + + if called { + t.Fatal("telemetry was sent while disabled") + } +} + +func contains(s, sub string) bool { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return true + } + } + return false +} diff --git a/internal/telemetry/event.go b/internal/telemetry/event.go new file mode 100644 index 00000000..338c66e1 --- /dev/null +++ b/internal/telemetry/event.go @@ -0,0 +1,68 @@ +// Copyright 2026 Alibaba Group +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package telemetry emits anonymous, dimensions-only operational metrics for +// each dws command invocation — error rate, latency, command distribution and +// version/platform health. It is the ops-monitoring counterpart to the audit +// package, but deliberately MUCH smaller: +// +// - It carries ONLY coarse dimensions: never object names, free text, peer +// ids, device fingerprints, or the user's natural-language intent. There is +// nothing to redact because nothing sensitive is ever collected. 
+// - It is independent of the audit package and its DWS_AUDIT_* switches, so +// an operator can run ops telemetry without enabling compliance auditing. +// - It is OFF by default. The CLI emits nothing unless the operator opts in +// via DWS_TELEMETRY_ENABLED, and the destination is operator-configured. +// +// What it deliberately does NOT collect (so reviewers can verify the privacy +// boundary by inspecting this struct alone): actor identity (user id/name), +// target object names/ids, free-text intent, device id / serial number, and +// request/response bodies. +package telemetry + +import "time" + +// SchemaVersion is bumped on any breaking change to Event's JSON shape. +const SchemaVersion = "1" + +// Event is the full operational record for one dws command. Every field is a +// low-cardinality (or trace) dimension safe to ship to a central ops sink. +type Event struct { + SchemaVersion string `json:"schema_version"` + Timestamp time.Time `json:"ts"` + TraceID string `json:"trace_id"` // == transport execution_id, for joining with server-side logs + + CorpID string `json:"corp_id,omitempty"` // tenant dimension, for per-org health; best-effort + CLIVersion string `json:"cli_version,omitempty"` // "did this release break a command at scale" + Channel string `json:"channel,omitempty"` // which integration/agent drove dws (DWS_CHANNEL) + OS string `json:"os,omitempty"` // runtime.GOOS — coarse platform, not PII + + Module string `json:"module"` // operated product, e.g. "doc" + Command string `json:"command"` // skill command + Subcommand string `json:"subcommand"` // skill subcommand, e.g. "create" + + Outcome string `json:"outcome"` // "ok" | "error" + ErrClass string `json:"err_class,omitempty"` // error category when outcome=error + ExitCode int `json:"exit_code"` + DurationMS int64 `json:"duration_ms"` // wall-clock latency of the invocation +} + +// New stamps the schema version, timestamp and trace id. 
The caller supplies +// the wall clock so callers stay testable and deterministic. +func New(ts time.Time, traceID string) *Event { + return &Event{ + SchemaVersion: SchemaVersion, + Timestamp: ts, + TraceID: traceID, + } +} diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go new file mode 100644 index 00000000..6483c4a1 --- /dev/null +++ b/internal/telemetry/telemetry.go @@ -0,0 +1,139 @@ +// Copyright 2026 Alibaba Group +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package telemetry + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "os" + "strings" + "time" + + "github.com/DingTalk-Real-AI/dingtalk-workspace-cli/pkg/configmeta" +) + +// Environment variables that drive telemetry. All default OFF: the CLI emits +// nothing unless the operator opts in, and the destination is operator-set. +const ( + // EnvEnabled turns ops telemetry on ("true"/"1"). Independent of DWS_AUDIT_*. + EnvEnabled = "DWS_TELEMETRY_ENABLED" + // EnvURL is the ingest endpoint that receives one JSON Event per POST. + // Empty disables forwarding even when EnvEnabled is set. + EnvURL = "DWS_TELEMETRY_URL" + // EnvToken is an optional bearer for the ingest endpoint. + EnvToken = "DWS_TELEMETRY_TOKEN" + // EnvTimeoutMS bounds how long a single POST may block command exit. + EnvTimeoutMS = "DWS_TELEMETRY_TIMEOUT_MS" +) + +// defaultTimeout caps how long telemetry may delay command exit. 
Telemetry is a +// side effect, never a gate: a slow or dead sink must not punish the user. +const defaultTimeout = 1500 * time.Millisecond + +func init() { + for _, it := range []configmeta.ConfigItem{ + {Name: EnvEnabled, Category: configmeta.CategoryDebug, Description: "启用匿名运维遥测(仅维度,无内容无身份,默认关)", Example: "true"}, + {Name: EnvURL, Category: configmeta.CategoryDebug, Description: "遥测上报端点(每次调用 POST 一条 JSON)", Example: "https://telemetry.example.com/dws"}, + {Name: EnvToken, Category: configmeta.CategoryDebug, Description: "遥测端点的 Bearer 鉴权(可选)", Sensitive: true, Example: "xxxxx"}, + {Name: EnvTimeoutMS, Category: configmeta.CategoryDebug, Description: "单次上报超时上限(毫秒,默认 1500)", Example: "1500"}, + } { + configmeta.Register(it) + } +} + +// Enabled reports whether telemetry should run. It requires BOTH the opt-in +// switch and a destination — neither alone does anything. +func Enabled() bool { + return truthy(os.Getenv(EnvEnabled)) && strings.TrimSpace(os.Getenv(EnvURL)) != "" +} + +// Forwarder ships events to the operator-configured endpoint. Best-effort: a +// transport error or non-2xx is returned for logging but never blocks beyond +// the timeout, and the command's own result is unaffected. +type Forwarder struct { + URL string + Token string + Client *http.Client +} + +// NewForwarderFromEnv builds a Forwarder from the env, or returns nil when +// telemetry is disabled. A nil *Forwarder's Emit is a safe no-op. +func NewForwarderFromEnv() *Forwarder { + if !Enabled() { + return nil + } + return &Forwarder{ + URL: strings.TrimSpace(os.Getenv(EnvURL)), + Token: strings.TrimSpace(os.Getenv(EnvToken)), + Client: &http.Client{Timeout: timeoutFromEnv()}, + } +} + +// Emit POSTs e as a single JSON object. A nil receiver is a no-op so callers +// never need a guard. Errors are returned (best-effort) but the bounded client +// timeout guarantees command exit is never delayed past the configured cap. 
+func (f *Forwarder) Emit(e *Event) error { + if f == nil || e == nil { + return nil + } + data, err := json.Marshal(e) + if err != nil { + return err + } + ctx, cancel := context.WithTimeout(context.Background(), f.Client.Timeout) + defer cancel() + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, f.URL, bytes.NewReader(data)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/json") + req.Header.Set("X-Dws-Telemetry-Schema", SchemaVersion) + if f.Token != "" { + req.Header.Set("Authorization", "Bearer "+f.Token) + } + resp, err := f.Client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return fmt.Errorf("telemetry: sink returned %d", resp.StatusCode) + } + return nil +} + +func timeoutFromEnv() time.Duration { + raw := strings.TrimSpace(os.Getenv(EnvTimeoutMS)) + if raw == "" { + return defaultTimeout + } + var ms int + if _, err := fmt.Sscanf(raw, "%d", &ms); err != nil || ms <= 0 { + return defaultTimeout + } + return time.Duration(ms) * time.Millisecond +} + +func truthy(s string) bool { + switch strings.ToLower(strings.TrimSpace(s)) { + case "1", "true", "yes", "on": + return true + default: + return false + } +} diff --git a/internal/telemetry/telemetry_test.go b/internal/telemetry/telemetry_test.go new file mode 100644 index 00000000..9fffef14 --- /dev/null +++ b/internal/telemetry/telemetry_test.go @@ -0,0 +1,120 @@ +// Copyright 2026 Alibaba Group +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package telemetry + +import ( + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestEnabledRequiresBothSwitchAndURL(t *testing.T) { + cases := []struct { + name, enabled, url string + want bool + }{ + {"both set", "true", "https://x.example/dws", true}, + {"only switch", "true", "", false}, + {"only url", "", "https://x.example/dws", false}, + {"neither", "", "", false}, + {"falsey switch", "0", "https://x.example/dws", false}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + t.Setenv(EnvEnabled, c.enabled) + t.Setenv(EnvURL, c.url) + if got := Enabled(); got != c.want { + t.Fatalf("Enabled()=%v, want %v", got, c.want) + } + }) + } +} + +func TestNewForwarderFromEnvNilWhenDisabled(t *testing.T) { + t.Setenv(EnvEnabled, "") + t.Setenv(EnvURL, "") + if f := NewForwarderFromEnv(); f != nil { + t.Fatalf("expected nil forwarder when disabled, got %+v", f) + } + // Emit on a nil forwarder must be a safe no-op. 
+ var nilFwd *Forwarder + if err := nilFwd.Emit(New(time.Unix(0, 0), "t")); err != nil { + t.Fatalf("nil Emit should be no-op, got %v", err) + } +} + +func TestForwarderEmitPostsJSON(t *testing.T) { + var gotBody []byte + var gotAuth, gotSchema string + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotBody, _ = io.ReadAll(r.Body) + gotAuth = r.Header.Get("Authorization") + gotSchema = r.Header.Get("X-Dws-Telemetry-Schema") + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + t.Setenv(EnvEnabled, "true") + t.Setenv(EnvURL, srv.URL) + t.Setenv(EnvToken, "secret-token") + + fwd := NewForwarderFromEnv() + if fwd == nil { + t.Fatal("expected a forwarder when enabled") + } + + ev := New(time.Unix(1700000000, 0).UTC(), "trace-123") + ev.CLIVersion = "1.2.3" + ev.Command = "doc" + ev.Subcommand = "create" + ev.Outcome = "ok" + ev.DurationMS = 42 + + if err := fwd.Emit(ev); err != nil { + t.Fatalf("Emit: %v", err) + } + + if gotAuth != "Bearer secret-token" { + t.Errorf("Authorization=%q, want Bearer secret-token", gotAuth) + } + if gotSchema != SchemaVersion { + t.Errorf("schema header=%q, want %q", gotSchema, SchemaVersion) + } + + var decoded Event + if err := json.Unmarshal(gotBody, &decoded); err != nil { + t.Fatalf("server got non-JSON body %q: %v", gotBody, err) + } + if decoded.TraceID != "trace-123" || decoded.Command != "doc" || + decoded.Subcommand != "create" || decoded.Outcome != "ok" || decoded.DurationMS != 42 { + t.Errorf("decoded event mismatch: %+v", decoded) + } +} + +func TestForwarderEmitReturnsErrorOnNon2xx(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer srv.Close() + + t.Setenv(EnvEnabled, "true") + t.Setenv(EnvURL, srv.URL) + fwd := NewForwarderFromEnv() + if err := fwd.Emit(New(time.Unix(0, 0), "t")); err == nil { + t.Fatal("expected error on 500 response") + } +} From 
efa5c48b543741f7cc10277ab0b3c4785c1cf824 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 11:51:20 +0800 Subject: [PATCH 02/12] docs(telemetry): reference FC->SLS ingest endpoint A minimal Flask web service to deploy as a Function Compute Web Function: verifies the bearer token, then writes each telemetry Event to an SLS Logstore via PutLogs (SLS cannot accept the raw signed-less POST directly). Promotes the query dimensions to their own columns and keeps the full event verbatim. Includes deploy walkthrough, local smoke test, and 4 alert rules. --- docs/telemetry/fc-sls-ingest/README.md | 83 +++++++++++++ docs/telemetry/fc-sls-ingest/app.py | 116 ++++++++++++++++++ docs/telemetry/fc-sls-ingest/requirements.txt | 3 + 3 files changed, 202 insertions(+) create mode 100644 docs/telemetry/fc-sls-ingest/README.md create mode 100644 docs/telemetry/fc-sls-ingest/app.py create mode 100644 docs/telemetry/fc-sls-ingest/requirements.txt diff --git a/docs/telemetry/fc-sls-ingest/README.md b/docs/telemetry/fc-sls-ingest/README.md new file mode 100644 index 00000000..2d1a232c --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/README.md @@ -0,0 +1,83 @@ +# dws 遥测接收端(函数计算 FC → SLS) + +这是 [运维遥测](../../telemetry.md) 的**参考接收端**:dws 把一条遥测 JSON POST 过来, +SLS 不能直接收裸 POST(写入要签名),所以这里垫一个最小 HTTP 服务,校验 token 后用 +`PutLogs` 写进 SLS。部署成函数计算(FC)的 **Web 函数**即可,不用关心 FC handler 签名。 + +``` +dws ──POST 一条 JSON──▶ 本服务(FC Web 函数) ──PutLogs──▶ SLS Logstore ──▶ 大盘/告警 +``` + +## 文件 + +- `app.py` — Flask 服务:`POST /` 校验 Bearer → 解析 JSON → 写 SLS;`GET /` 健康检查 +- `requirements.txt` — 依赖(flask / gunicorn / aliyun-log-python-sdk) + +## 一、先在 SLS 建库(控制台点几下) + +1. 建 **Project**(如 `dws-ops`)和 **Logstore**(如 `dws-telemetry`),设留存天数。 +2. 开索引:给 `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` + 设为 **text**;给 `duration_ms` / `exit_code` 设为 **long**(要做 P99 和聚合)。 + +## 二、部署本服务为 FC Web 函数 + +1. 
函数计算控制台 → 创建函数 → **Web 函数** → Python 运行时。 +2. 上传本目录代码(含 `requirements.txt`,FC 会自动装依赖)。 +3. **启动命令**填:`gunicorn -b 0.0.0.0:9000 app:app`,**监听端口** `9000`。 +4. 给函数**绑定一个服务角色**,授权 `AliyunLogFullAccess`(或更小的 PutLogs 权限)。 + 这样就不用把 AccessKey 写进环境变量——FC 会自动注入 STS 临时凭证,`app.py` 已优先读它。 +5. 配 **环境变量**: + + | 变量 | 值 | 说明 | + |---|---|---| + | `SLS_ENDPOINT` | `cn-hangzhou.log.aliyuncs.com` | 按你的地域改 | + | `SLS_PROJECT` | `dws-ops` | 第一步建的 Project | + | `SLS_LOGSTORE` | `dws-telemetry` | 第一步建的 Logstore | + | `INGEST_TOKEN` | 自己生成一串随机串 | 必须和 dws 侧 `DWS_TELEMETRY_TOKEN` 一致 | + +6. 部署后拿到函数的 HTTP 触发器地址(形如 `https://xxx.cn-hangzhou.fcapp.run`)。 + +## 三、把 dws 接上 + +在跑 dws 的环境里(或由上层 agent 注入): + +```bash +export DWS_TELEMETRY_ENABLED=true +export DWS_TELEMETRY_URL="https://xxx.cn-hangzhou.fcapp.run" # 上一步的函数地址 +export DWS_TELEMETRY_TOKEN="<和 INGEST_TOKEN 相同的随机串>" +``` + +跑几条命令,到 SLS Logstore 查询页就能看到一条条记录。 + +## 四、本地先验证(可选,不依赖 FC) + +```bash +cd docs/telemetry/fc-sls-ingest +python3 -m venv .venv && . .venv/bin/activate +pip install -r requirements.txt +export SLS_ENDPOINT=... SLS_PROJECT=... SLS_LOGSTORE=... INGEST_TOKEN=dev +export ALIBABA_CLOUD_ACCESS_KEY_ID=... ALIBABA_CLOUD_ACCESS_KEY_SECRET=... 
+python app.py # 监听 :9000 +# 另开一个终端: +curl -XPOST localhost:9000/ -H 'Authorization: Bearer dev' \ + -H 'Content-Type: application/json' \ + -d '{"schema_version":"1","command":"doc","outcome":"ok","duration_ms":42}' +# 返回 204 即写入成功;去 SLS 控制台查 dws-telemetry。 +``` + +## 五、配告警(SLS 控制台 → 告警) + +| 告警 | 查询(示意) | 触发 | +|---|---|---| +| 错误率突增 | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 0.05 | +| P99 延迟超标 | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | +| 某命令大面积失败 | `* \| select command, count_if(outcome='error') c group by command order by c desc` | 单命令 c 突增 | +| 调用量跌零 | `* \| select count(*) as n` | n == 0(5 分钟窗口) | + +通知渠道直接选钉钉机器人 webhook。 + +## 安全须知 + +- `INGEST_TOKEN` 用强随机串,并和 dws 侧保持一致;不要留空。 +- 优先用 FC 服务角色(STS),不要把长期 AccessKey 写进环境变量。 +- 本服务只接**匿名维度**数据,不含用户内容/身份——隐私边界由 dws 客户端保证。 diff --git a/docs/telemetry/fc-sls-ingest/app.py b/docs/telemetry/fc-sls-ingest/app.py new file mode 100644 index 00000000..ca95f350 --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/app.py @@ -0,0 +1,116 @@ +# Copyright 2026 Alibaba Group +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# +# Reference telemetry ingest for dws (DingTalk Workspace CLI). +# +# Role: the "translator" between dws and SLS. dws POSTs one telemetry Event as +# JSON; SLS cannot accept that raw POST (its write API must be signed), so this +# tiny HTTP service verifies the bearer token, then writes the event into an SLS +# Logstore via PutLogs. +# +# Deploy as a Function Compute (FC) "Web Function": +# startup command: gunicorn -b 0.0.0.0:9000 app:app +# listen port: 9000 +# See README.md in this directory for the full walkthrough. 
+ +import json +import os +import time + +from flask import Flask, request, abort + +from aliyun.log import LogClient, LogItem, PutLogsRequest + +app = Flask(__name__) + +# --- configuration (set these as FC environment variables) ------------------- +# SLS target. SLS_ENDPOINT looks like "cn-hangzhou.log.aliyuncs.com". +SLS_ENDPOINT = os.environ["SLS_ENDPOINT"] +SLS_PROJECT = os.environ["SLS_PROJECT"] +SLS_LOGSTORE = os.environ["SLS_LOGSTORE"] + +# Shared secret the CLI sends as `Authorization: Bearer `. This must +# match DWS_TELEMETRY_TOKEN on the dws side. Empty disables auth (NOT advised). +INGEST_TOKEN = os.environ.get("INGEST_TOKEN", "") + +# Credentials: prefer the STS credentials FC injects when a service role is +# bound (recommended — no long-lived keys in env). Fall back to explicit keys. +AK_ID = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", "") +AK_SECRET = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET", "") +STS_TOKEN = os.environ.get("ALIBABA_CLOUD_SECURITY_TOKEN", "") + +# A new client per process is fine for FC; reuse across warm invocations. +_client = LogClient(SLS_ENDPOINT, AK_ID, AK_SECRET, securityToken=STS_TOKEN or None) + +# Fields lifted out of the event into their own SLS columns so they are +# query/aggregation-friendly (the full event is also stored verbatim). +_INDEX_FIELDS = ( + "trace_id", + "corp_id", + "cli_version", + "channel", + "os", + "module", + "command", + "subcommand", + "outcome", + "err_class", + "exit_code", + "duration_ms", +) + + +@app.get("/") +def health(): + # FC health checks and humans hitting the URL land here. + return "dws telemetry ingest ok\n", 200 + + +@app.post("/") +def ingest(): + # 1) Auth: constant-ish bearer check. + if INGEST_TOKEN: + auth = request.headers.get("Authorization", "") + if auth != f"Bearer {INGEST_TOKEN}": + abort(401) + + # 2) Parse one telemetry Event. 
+ try: + event = request.get_json(force=True) + if not isinstance(event, dict): + raise ValueError("body is not a JSON object") + except Exception as e: # noqa: BLE001 - reject any malformed body + abort(400, description=f"bad json: {e}") + + # 3) Build the SLS log item. Keep the full event verbatim in `event`, + # and promote the dimensions to their own columns for querying. + item = LogItem() + item.set_time(int(time.time())) + contents = [("event", json.dumps(event, ensure_ascii=False))] + for k in _INDEX_FIELDS: + if k in event and event[k] is not None: + contents.append((k, str(event[k]))) + item.set_contents(contents) + + # 4) Write to SLS. topic/source are coarse routing labels. + req = PutLogsRequest( + project=SLS_PROJECT, + logstore=SLS_LOGSTORE, + topic=event.get("schema_version", ""), + source="dws-telemetry", + logitems=[item], + ) + try: + _client.put_logs(req) + except Exception as e: # noqa: BLE001 - surface SLS errors as 502 + # Telemetry is best-effort on the client; returning non-2xx just makes + # the CLI log a forward failure. Never crash the worker. 
+ abort(502, description=f"sls put_logs failed: {e}") + + return "", 204 + + +if __name__ == "__main__": + # Local dev: python app.py, then POST to http://127.0.0.1:9000/ + app.run(host="0.0.0.0", port=9000) diff --git a/docs/telemetry/fc-sls-ingest/requirements.txt b/docs/telemetry/fc-sls-ingest/requirements.txt new file mode 100644 index 00000000..655290b9 --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/requirements.txt @@ -0,0 +1,3 @@ +flask>=3.0 +gunicorn>=21.0 +aliyun-log-python-sdk>=0.9.0 From f65fe99777784694fecc551bae80f2c9737f4db6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:16:36 +0800 Subject: [PATCH 03/12] docs(telemetry): zero-dep local sink + local-test flow + public/private boundary - localsink.py: stdlib-only HTTP collector to test the full dws->HTTP pipeline without SLS or Function Compute - telemetry.md: local-test walkthrough (incl. a mini local dashboard) and a section spelling out that the SLS project / endpoint / token live in the deployer's own infra and never enter this open-source repo --- docs/telemetry.md | 62 +++++++++++++++++++ docs/telemetry/fc-sls-ingest/localsink.py | 74 +++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 docs/telemetry/fc-sls-ingest/localsink.py diff --git a/docs/telemetry.md b/docs/telemetry.md index f33188b3..df4e9890 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -57,6 +57,68 @@ Body: 一条遥测事件 JSON 返回 2xx 即成功 ``` +## 本地测试(零依赖,不碰 SLS) + +上 SLS 之前,先在本机把整条链路跑通。用 `fc-sls-ingest/localsink.py` +(纯 Python 标准库,不用 `pip install` 任何东西)当接收端: + +```bash +# 1. 起本地接收端(带一个测试 token) +cd docs/telemetry/fc-sls-ingest +TOKEN=dev python3 localsink.py # 监听 127.0.0.1:8799,落盘 /tmp/dws_telemetry.jsonl + +# 2. 另开一个终端,把 dws 指向它 +export DWS_TELEMETRY_ENABLED=true +export DWS_TELEMETRY_URL=http://127.0.0.1:8799 +export DWS_TELEMETRY_TOKEN=dev + +# 3. 
跑几条命令(--mock 不联网、不需要真实后端,也会触发上报) +dws doc create --title 测试 --mock +dws drive list --mock +``` + +接收端会实时打印每条事件,并追加到 `/tmp/dws_telemetry.jsonl`。验证要点: + +- 事件含 `command/outcome/duration_ms/cli_version/channel/os` 等维度; +- 把命令参数(如 `--title 测试`)和报文对照,确认**内容没出现在报文里**; +- 不带 token POST 应被拒(401)。 + +落盘后可以本地先模拟一把"大盘"会算的指标: + +```bash +python3 - <<'PY' +import json, collections +rows=[json.loads(l) for l in open('/tmp/dws_telemetry.jsonl') if l.strip()] +by=collections.defaultdict(lambda:{'n':0,'err':0,'dur':[]}) +for r in rows: + k=f"{r['command']} {r['subcommand']}"; b=by[k] + b['n']+=1; b['err']+=(r['outcome']!='ok'); b['dur'].append(r.get('duration_ms',0)) +for k,v in sorted(by.items(), key=lambda x:-x[1]['n']): + d=v['dur']; print(f"{k:<26}调用{v['n']:>4} 失败{v['err']:>3} avg{sum(d)//len(d):>5}ms max{max(d):>5}ms") +PY +``` + +> 说明:遥测只在命令真正进入 MCP 调用阶段才上报。若命令在参数解析层就报错 +> (未到调用),不会产生遥测——这是预期行为。 + +## 开源代码与内部资源的边界(公私边界) + +dws 是开源仓库,但**遥测数据进哪个 SLS、绑哪个内部应用,是部署方自己的事,不进仓库**。 +这条边界是设计出来的,不是巧合: + +| | 在哪 | 包含什么 | 进仓库吗 | +|---|---|---|---| +| dws 二进制 + 本目录 FC/local 参考代码 | 公开仓库 | 只会 POST 到 `DWS_TELEMETRY_URL`;**无 endpoint、无密钥、无应用名** | ✅ | +| SLS Project / FC 实例 / 真实 URL+token | 部署方内部基础设施 | 真实地址、鉴权、日志库;阿里内部还需绑定一个内部应用 | ❌ 永不进仓库,靠环境变量注入 | + +代码里**绝不硬编码任何厂商上报地址**,URL 一律运行时从环境变量读取。所以"代码公开" +与"数据落到部署方内部 SLS"天然解耦:换部署方只是换一组环境变量,仓库无需改动, +也看不到任何一方的真实配置。 + +> 阿里内部场景:SLS Project 需挂在一个 AONE 应用下(资源治理要求)。把它绑到 dws +> 后端所属的应用(如钉钉 MCP 网关应用)即可;这个绑定关系、真实 URL 与 token 全部 +> 留在内部,公开仓库不感知。 + ## 接入阿里云 SLS(生产推荐) SLS(日志服务)自带写入 / 存储 / 检索 / Dashboard / 告警,是运维监控的标准选型: diff --git a/docs/telemetry/fc-sls-ingest/localsink.py b/docs/telemetry/fc-sls-ingest/localsink.py new file mode 100644 index 00000000..5ab0d163 --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/localsink.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 +# Copyright 2026 Alibaba Group +# Licensed under the Apache License, Version 2.0 (the "License"). +# +# Local telemetry sink for testing dws telemetry WITHOUT SLS or Function Compute. 
+# +# Zero dependencies (Python standard library only). It mimics the ingest +# contract: accepts `POST /` with a JSON body, optionally checks the bearer +# token, pretty-prints each event and appends the raw line to a JSONL file. +# Use this to validate the whole pipeline (dws -> HTTP) before touching any +# cloud infrastructure. +# +# Usage: +# python3 localsink.py # listen on :8799, no auth +# PORT=9000 TOKEN=dev python3 localsink.py +# +# Then point dws at it: +# export DWS_TELEMETRY_ENABLED=true +# export DWS_TELEMETRY_URL=http://127.0.0.1:8799 +# # export DWS_TELEMETRY_TOKEN=dev # only if you set TOKEN above + +import json +import os +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer + +PORT = int(os.environ.get("PORT", "8799")) +TOKEN = os.environ.get("TOKEN", "") +OUTFILE = os.environ.get("OUTFILE", "/tmp/dws_telemetry.jsonl") + +_count = 0 + + +class Handler(BaseHTTPRequestHandler): + def do_GET(self): # health check + self._send(200, "dws local telemetry sink ok\n") + + def do_POST(self): + global _count + if TOKEN and self.headers.get("Authorization", "") != f"Bearer {TOKEN}": + self._send(401, "unauthorized\n") + return + n = int(self.headers.get("Content-Length", "0")) + raw = self.rfile.read(n) + try: + event = json.loads(raw) + except Exception as e: # noqa: BLE001 + self._send(400, f"bad json: {e}\n") + return + + _count += 1 + with open(OUTFILE, "ab") as f: + f.write(raw + b"\n") + print(f"\n#{_count} ({len(raw)} bytes) -> {OUTFILE}") + print(json.dumps(event, indent=2, ensure_ascii=False), flush=True) + self._send(204, "") + + def _send(self, code, body): + data = body.encode("utf-8") + self.send_response(code) + self.send_header("Content-Length", str(len(data))) + self.end_headers() + if data: + self.wfile.write(data) + + def log_message(self, *args): # silence default access logging + pass + + +if __name__ == "__main__": + open(OUTFILE, "w").close() # truncate previous run + auth = f"(bearer required: {TOKEN!r})" if TOKEN else 
"(no auth)" + print(f"dws local telemetry sink listening on http://127.0.0.1:{PORT} {auth}") + print(f"capturing to {OUTFILE}\n") + ThreadingHTTPServer(("127.0.0.1", PORT), Handler).serve_forever() From a80a3a4521ee1d88bb15ba8647ca76c78eeb1d62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 14:31:26 +0800 Subject: [PATCH 04/12] feat(telemetry): FC ingest dry-run mode (deploy & verify before SLS) app.py now auto-detects mode: with no SLS_* env (or TELEMETRY_DRYRUN=true) it logs each event to stdout and returns 204 instead of writing to SLS, and the aliyun-log SDK is imported lazily so dry-run needs no extra dependency. Lets you deploy to Function Compute and confirm the client->FC pipeline end-to-end before provisioning any SLS resource. GET / reports the active mode. README documents the deploy-then-wire-SLS flow. --- docs/telemetry/fc-sls-ingest/README.md | 41 ++++-- docs/telemetry/fc-sls-ingest/app.py | 121 +++++++++++------- docs/telemetry/fc-sls-ingest/requirements.txt | 1 + 3 files changed, 107 insertions(+), 56 deletions(-) diff --git a/docs/telemetry/fc-sls-ingest/README.md b/docs/telemetry/fc-sls-ingest/README.md index 2d1a232c..2d5379cb 100644 --- a/docs/telemetry/fc-sls-ingest/README.md +++ b/docs/telemetry/fc-sls-ingest/README.md @@ -19,14 +19,29 @@ dws ──POST 一条 JSON──▶ 本服务(FC Web 函数) ──PutLogs─ 2. 开索引:给 `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` 设为 **text**;给 `duration_ms` / `exit_code` 设为 **long**(要做 P99 和聚合)。 +## 两种运行模式(自动判断) + +`app.py` 按环境变量自动切换,**不用改代码**: + +| 模式 | 触发条件 | 行为 | +|---|---|---| +| **dry-run** | 缺任一 SLS 变量,或设 `TELEMETRY_DRYRUN=true` | 收到事件只打到 stdout(FC 会进函数日志),返回 204。**不依赖 aliyun-log SDK**,适合先验证管线 | +| **SLS** | `SLS_ENDPOINT`+`SLS_PROJECT`+`SLS_LOGSTORE` 都配齐 | 校验后 `PutLogs` 写进 Logstore | + +`GET /` 健康检查会回显当前模式(`mode=dry-run` / `mode=sls`),部署后一眼可辨。 + ## 二、部署本服务为 FC Web 函数 1. 函数计算控制台 → 创建函数 → **Web 函数** → Python 运行时。 2. 
上传本目录代码(含 `requirements.txt`,FC 会自动装依赖)。 3. **启动命令**填:`gunicorn -b 0.0.0.0:9000 app:app`,**监听端口** `9000`。 -4. 给函数**绑定一个服务角色**,授权 `AliyunLogFullAccess`(或更小的 PutLogs 权限)。 - 这样就不用把 AccessKey 写进环境变量——FC 会自动注入 STS 临时凭证,`app.py` 已优先读它。 -5. 配 **环境变量**: +4. **先空跑验证(强烈建议)**:第一次只配 `INGEST_TOKEN`,**不配 SLS 变量**(或加 + `TELEMETRY_DRYRUN=true`)。部署后 `GET /` 应显示 `mode=dry-run`;把 dws 指过来跑 + 几条命令,去 FC 的**函数日志**里能看到 `DRYRUN {...}` 行,就证明"客户端→FC"这段通了。 + 这一步**不需要 SLS、不需要建库、不需要 SDK**。 +5. **再接 SLS**:给函数**绑定一个服务角色**,授权 `AliyunLogFullAccess`(或更小的 + PutLogs 权限)——这样不用把 AccessKey 写进环境变量,FC 自动注入 STS 临时凭证, + `app.py` 已优先读它。然后补上 SLS 环境变量,`GET /` 变成 `mode=sls` 即生效: | 变量 | 值 | 说明 | |---|---|---| @@ -49,22 +64,28 @@ export DWS_TELEMETRY_TOKEN="<和 INGEST_TOKEN 相同的随机串>" 跑几条命令,到 SLS Logstore 查询页就能看到一条条记录。 -## 四、本地先验证(可选,不依赖 FC) +## 四、本地先验证(可选,不依赖 FC / SLS) + +最省事的本地验证用 `localsink.py`(纯标准库,零依赖),见 +[telemetry.md 的「本地测试」](../../telemetry.md#本地测试零依赖不碰-sls)。 + +也可以直接本地跑本服务的 **dry-run 模式**(不配 SLS、不用装 aliyun-log): ```bash cd docs/telemetry/fc-sls-ingest -python3 -m venv .venv && . .venv/bin/activate -pip install -r requirements.txt -export SLS_ENDPOINT=... SLS_PROJECT=... SLS_LOGSTORE=... INGEST_TOKEN=dev -export ALIBABA_CLOUD_ACCESS_KEY_ID=... ALIBABA_CLOUD_ACCESS_KEY_SECRET=... 
-python app.py # 监听 :9000 +pip install flask # dry-run 只需 flask;aliyun-log 仅 SLS 模式才要 +INGEST_TOKEN=dev python3 app.py # 不配 SLS_* -> 自动 dry-run,监听 :9000 # 另开一个终端: +curl -s localhost:9000/ # 应回显 mode=dry-run curl -XPOST localhost:9000/ -H 'Authorization: Bearer dev' \ -H 'Content-Type: application/json' \ -d '{"schema_version":"1","command":"doc","outcome":"ok","duration_ms":42}' -# 返回 204 即写入成功;去 SLS 控制台查 dws-telemetry。 +# 返回 204;事件会以 DRYRUN {...} 打印在 app.py 的终端里。 ``` +要本地连真 SLS 验证,再补 `SLS_ENDPOINT/SLS_PROJECT/SLS_LOGSTORE` 和一组 AccessKey +(`pip install -r requirements.txt` 装上 aliyun-log),`GET /` 会变成 `mode=sls`。 + ## 五、配告警(SLS 控制台 → 告警) | 告警 | 查询(示意) | 触发 | diff --git a/docs/telemetry/fc-sls-ingest/app.py b/docs/telemetry/fc-sls-ingest/app.py index ca95f350..f97e38ac 100644 --- a/docs/telemetry/fc-sls-ingest/app.py +++ b/docs/telemetry/fc-sls-ingest/app.py @@ -9,6 +9,14 @@ # tiny HTTP service verifies the bearer token, then writes the event into an SLS # Logstore via PutLogs. # +# Two modes (auto-detected): +# - SLS mode: all of SLS_ENDPOINT / SLS_PROJECT / SLS_LOGSTORE are set +# (and TELEMETRY_DRYRUN is not truthy) -> writes to SLS. +# - dry-run mode: otherwise -> just logs the event to stdout and returns 204. +# Lets you deploy to Function Compute and verify the pipeline +# end-to-end BEFORE wiring up SLS. The aliyun-log SDK is only +# imported in SLS mode, so dry-run needs no extra dependency. +# # Deploy as a Function Compute (FC) "Web Function": # startup command: gunicorn -b 0.0.0.0:9000 app:app # listen port: 9000 @@ -16,33 +24,17 @@ import json import os +import sys import time from flask import Flask, request, abort -from aliyun.log import LogClient, LogItem, PutLogsRequest - app = Flask(__name__) -# --- configuration (set these as FC environment variables) ------------------- -# SLS target. SLS_ENDPOINT looks like "cn-hangzhou.log.aliyuncs.com". 
-SLS_ENDPOINT = os.environ["SLS_ENDPOINT"] -SLS_PROJECT = os.environ["SLS_PROJECT"] -SLS_LOGSTORE = os.environ["SLS_LOGSTORE"] - # Shared secret the CLI sends as `Authorization: Bearer <token>`. This must # match DWS_TELEMETRY_TOKEN on the dws side. Empty disables auth (NOT advised). INGEST_TOKEN = os.environ.get("INGEST_TOKEN", "") -# Credentials: prefer the STS credentials FC injects when a service role is -# bound (recommended — no long-lived keys in env). Fall back to explicit keys. -AK_ID = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", "") -AK_SECRET = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET", "") -STS_TOKEN = os.environ.get("ALIBABA_CLOUD_SECURITY_TOKEN", "") - -# A new client per process is fine for FC; reuse across warm invocations. -_client = LogClient(SLS_ENDPOINT, AK_ID, AK_SECRET, securityToken=STS_TOKEN or None) - # Fields lifted out of the event into their own SLS columns so they are # query/aggregation-friendly (the full event is also stored verbatim). _INDEX_FIELDS = ( @@ -61,30 +53,40 @@ ) -@app.get("/") -def health(): - # FC health checks and humans hitting the URL land here. - return "dws telemetry ingest ok\n", 200 +def _truthy(v): + return str(v).strip().lower() in ("1", "true", "yes", "on") -@app.post("/") -def ingest(): - # 1) Auth: constant-ish bearer check. - if INGEST_TOKEN: - auth = request.headers.get("Authorization", "") - if auth != f"Bearer {INGEST_TOKEN}": - abort(401) +def _sls_mode(): + """SLS mode requires the three SLS vars and no explicit dry-run override.""" + if _truthy(os.environ.get("TELEMETRY_DRYRUN", "")): + return False + return all(os.environ.get(k) for k in ("SLS_ENDPOINT", "SLS_PROJECT", "SLS_LOGSTORE")) - # 2) Parse one telemetry Event. - try: - event = request.get_json(force=True) - if not isinstance(event, dict): - raise ValueError("body is not a JSON object") - except Exception as e: # noqa: BLE001 - reject any malformed body - abort(400, description=f"bad json: {e}") - # 3) Build the SLS log item. 
Keep the full event verbatim in `event`, - # and promote the dimensions to their own columns for querying. +# Lazily-built SLS client (only constructed once, only in SLS mode). Kept module +# global so warm FC invocations reuse it. +_client = None + + +def _get_client(): + global _client + if _client is None: + # Imported here (not at module load) so dry-run works without the SDK. + from aliyun.log import LogClient + + _client = LogClient( + os.environ["SLS_ENDPOINT"], + os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", ""), + os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET", ""), + securityToken=os.environ.get("ALIBABA_CLOUD_SECURITY_TOKEN", "") or None, + ) + return _client + + +def _write_sls(event): + from aliyun.log import LogItem, PutLogsRequest + item = LogItem() item.set_time(int(time.time())) contents = [("event", json.dumps(event, ensure_ascii=False))] @@ -93,24 +95,51 @@ def ingest(): contents.append((k, str(event[k]))) item.set_contents(contents) - # 4) Write to SLS. topic/source are coarse routing labels. req = PutLogsRequest( - project=SLS_PROJECT, - logstore=SLS_LOGSTORE, + project=os.environ["SLS_PROJECT"], + logstore=os.environ["SLS_LOGSTORE"], topic=event.get("schema_version", ""), source="dws-telemetry", logitems=[item], ) + _get_client().put_logs(req) + + +@app.get("/") +def health(): + mode = "sls" if _sls_mode() else "dry-run" + return f"dws telemetry ingest ok (mode={mode})\n", 200 + + +@app.post("/") +def ingest(): + # 1) Auth: bearer check. + if INGEST_TOKEN: + if request.headers.get("Authorization", "") != f"Bearer {INGEST_TOKEN}": + abort(401) + + # 2) Parse one telemetry Event. try: - _client.put_logs(req) - except Exception as e: # noqa: BLE001 - surface SLS errors as 502 - # Telemetry is best-effort on the client; returning non-2xx just makes - # the CLI log a forward failure. Never crash the worker. 
- abort(502, description=f"sls put_logs failed: {e}") + event = request.get_json(force=True) + if not isinstance(event, dict): + raise ValueError("body is not a JSON object") + except Exception as e: # noqa: BLE001 - reject any malformed body + abort(400, description=f"bad json: {e}") + + # 3) Dispatch by mode. + if _sls_mode(): + try: + _write_sls(event) + except Exception as e: # noqa: BLE001 - surface SLS errors, never crash + abort(502, description=f"sls put_logs failed: {e}") + else: + # dry-run: emit to stdout (captured by FC function logs) so you can + # confirm the pipeline before SLS is wired up. + print("DRYRUN " + json.dumps(event, ensure_ascii=False), file=sys.stdout, flush=True) return "", 204 if __name__ == "__main__": # Local dev: python app.py, then POST to http://127.0.0.1:9000/ - app.run(host="0.0.0.0", port=9000) + app.run(host="0.0.0.0", port=int(os.environ.get("PORT", "9000"))) diff --git a/docs/telemetry/fc-sls-ingest/requirements.txt b/docs/telemetry/fc-sls-ingest/requirements.txt index 655290b9..7c26998c 100644 --- a/docs/telemetry/fc-sls-ingest/requirements.txt +++ b/docs/telemetry/fc-sls-ingest/requirements.txt @@ -1,3 +1,4 @@ flask>=3.0 gunicorn>=21.0 +# aliyun-log is only needed in SLS mode; dry-run mode (no SLS_* env) runs without it. aliyun-log-python-sdk>=0.9.0 From 6770c56ea52ff32ae77976edcff17c6b15f3a367 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 15:10:07 +0800 Subject: [PATCH 05/12] test(telemetry): one-shot local smoke script (no SLS/cloud/login) scripts/dev/telemetry_smoke.sh builds dws, starts the zero-dep local sink, fires --mock commands and asserts the pipeline: events received with all expected dimensions, bearer token enforced (401), and the privacy boundary (a sentinel command argument must never appear in any payload). Exits non-zero on failure, so it can gate pre-push / CI. 
--- scripts/dev/telemetry_smoke.sh | 94 ++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100755 scripts/dev/telemetry_smoke.sh diff --git a/scripts/dev/telemetry_smoke.sh b/scripts/dev/telemetry_smoke.sh new file mode 100755 index 00000000..55b6abb6 --- /dev/null +++ b/scripts/dev/telemetry_smoke.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# Copyright 2026 Alibaba Group +# Licensed under the Apache License, Version 2.0 (the "License"). +# +# One-shot local smoke test for dws telemetry — NO SLS, NO cloud, NO login. +# Builds dws, starts the zero-dependency local sink, fires a few --mock +# commands, then asserts the pipeline end-to-end: +# - events are received with the expected dimensions +# - command argument content never leaks into the payload (privacy boundary) +# - the bearer token is enforced (401 without it) +# Exits non-zero on any failure, so it is safe to wire into CI / pre-push. +# +# Usage: bash scripts/dev/telemetry_smoke.sh + +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +PORT="${PORT:-8799}" +TOKEN="dev" +SENTINEL="PRIVATE_SENTINEL_$$" # unique per run; must NOT appear in any event +BIN="$(mktemp -t dws-smoke.XXXXXX)" +OUT="$(mktemp -t dws-tel.XXXXXX.jsonl)" +SINK_PID="" + +cleanup() { + if [ -n "$SINK_PID" ]; then + kill "$SINK_PID" 2>/dev/null || true + wait "$SINK_PID" 2>/dev/null || true # absorb the job-control "Terminated" notice + fi + rm -f "$BIN" "$OUT" +} +trap cleanup EXIT + +say() { printf '\n\033[1m%s\033[0m\n' "$*"; } +fail() { printf '\033[31mFAIL: %s\033[0m\n' "$*" >&2; exit 1; } + +say "[1/5] build dws" +( cd "$ROOT" && go build -o "$BIN" ./cmd ) +echo " -> $BIN" + +say "[2/5] start local sink on :$PORT" +TOKEN="$TOKEN" PORT="$PORT" OUTFILE="$OUT" \ + python3 "$ROOT/docs/telemetry/fc-sls-ingest/localsink.py" >/dev/null 2>&1 & +SINK_PID=$! 
+sleep 1.2 +curl -fsS "http://127.0.0.1:$PORT/" >/dev/null || fail "sink not responding on :$PORT" +echo " -> sink up (pid $SINK_PID)" + +say "[3/5] auth enforced (POST without token must be 401)" +code="$(curl -s -o /dev/null -w '%{http_code}' -XPOST "http://127.0.0.1:$PORT/" -d '{}')" +[ "$code" = "401" ] || fail "expected 401 without token, got $code" +echo " -> 401 OK" + +say "[4/5] run --mock commands (telemetry on)" +export DWS_TELEMETRY_ENABLED=true +export DWS_TELEMETRY_URL="http://127.0.0.1:$PORT" +export DWS_TELEMETRY_TOKEN="$TOKEN" +export DWS_CHANNEL="smoke-test" +"$BIN" doc create --title "$SENTINEL" --mock >/dev/null 2>&1 || true +"$BIN" doc create --title other --mock >/dev/null 2>&1 || true +"$BIN" drive list --mock >/dev/null 2>&1 || true +sleep 1 + +say "[5/5] assert captured events" +python3 - "$OUT" "$SENTINEL" <<'PY' +import json, sys, collections +path, sentinel = sys.argv[1], sys.argv[2] +rows = [json.loads(l) for l in open(path) if l.strip()] +if len(rows) < 3: + print(f"FAIL: expected >=3 events, got {len(rows)}", file=sys.stderr); sys.exit(1) + +required = ("schema_version","trace_id","cli_version","os","command","subcommand","outcome","duration_ms") +for r in rows: + miss = [k for k in required if k not in r] + if miss: + print(f"FAIL: event missing fields {miss}: {r}", file=sys.stderr); sys.exit(1) + if r["channel"] != "smoke-test": + print(f"FAIL: channel not propagated: {r.get('channel')!r}", file=sys.stderr); sys.exit(1) + +# privacy boundary: the sentinel title must never appear anywhere in the payload +raw = open(path, encoding="utf-8").read() +if sentinel in raw: + print("FAIL: command content LEAKED into telemetry payload", file=sys.stderr); sys.exit(1) + +by = collections.defaultdict(lambda: {"n":0,"err":0,"d":[]}) +for r in rows: + k=f"{r['command']}/{r['subcommand']}"; b=by[k] + b["n"]+=1; b["err"]+=(r["outcome"]!="ok"); b["d"].append(r["duration_ms"]) +print(f" {len(rows)} events, all dimensions present, no content leak") +for 
k,v in sorted(by.items(), key=lambda x:-x[1]['n']): + d=v["d"]; print(f" {k:<26} calls {v['n']} err {v['err']} avg {sum(d)//len(d)}ms max {max(d)}ms") +PY + +say "PASS — telemetry pipeline healthy" From 966e0341702bcc5fc3efca7fc2b75818c85f5728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:12:33 +0800 Subject: [PATCH 06/12] docs(telemetry): switch all telemetry strings and docs to English Open-source repo convention: configmeta descriptions, docs/telemetry.md and the FC ingest README are now English (code comments were already English). No behavior change; tests still pass. --- docs/telemetry.md | 201 ++++++++++++++----------- docs/telemetry/fc-sls-ingest/README.md | 139 +++++++++-------- internal/app/telemetry_runtime_test.go | 4 +- internal/telemetry/telemetry.go | 8 +- 4 files changed, 190 insertions(+), 162 deletions(-) diff --git a/docs/telemetry.md b/docs/telemetry.md index df4e9890..9d33be3b 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -1,89 +1,98 @@ -# 运维遥测(Telemetry) +# Operational Telemetry -dws 可以为**每一次命令调用**上报一条**匿名、纯维度**的运维指标,用于监控 -错误率、延迟、命令分布和版本/平台健康度。它是审计([audit](./audit.md))的运维侧 -对应物,但刻意做得**小得多**: +dws can emit one **anonymous, dimensions-only** operational metric for **every +command invocation**, to monitor error rate, latency, command distribution and +version/platform health. It is the ops-monitoring counterpart to +[audit](./audit.md), but deliberately **much smaller**: -- 只采**粗维度**,绝不采对象名、自由文本、peer id、设备指纹、自然语言原文。 - 没有"脱敏档",因为压根没有敏感字段可脱。 -- **独立于审计**:和 `DWS_AUDIT_*` 互不相关,可以只开遥测不开合规审计。 -- **默认全关**。不设 `DWS_TELEMETRY_ENABLED` 时,dws 不产生任何遥测,热路径零影响。 +- It collects **coarse dimensions only** — never object names, free text, peer + ids, device fingerprints, or natural-language intent. There is no "redaction + tier" because nothing sensitive is ever collected. 
+- It is **independent of auditing**: unrelated to the `DWS_AUDIT_*` switches, so + you can run telemetry without enabling compliance auditing. +- It is **off by default**. With `DWS_TELEMETRY_ENABLED` unset, dws emits + nothing and the hot path is unaffected. -> 这是开源 CLI,集中上报必须 **opt-in + 明确告知**。默认不上报一个字节。 +> This is an open-source CLI: any centralized reporting must be **opt-in and +> disclosed**. Nothing is reported by default. -## 启用 +## Enabling -| 环境变量 | 说明 | 示例 | +| Environment variable | Description | Example | |---|---|---| -| `DWS_TELEMETRY_ENABLED` | 启用遥测(需同时配 URL 才生效) | `true` | -| `DWS_TELEMETRY_URL` | 上报端点,每次调用 POST 一条 JSON | `https://telemetry.example.com/dws` | -| `DWS_TELEMETRY_TOKEN` | 端点的 Bearer 鉴权(可选) | `xxxxx` | -| `DWS_TELEMETRY_TIMEOUT_MS` | 单次上报超时上限,毫秒(默认 1500) | `1500` | +| `DWS_TELEMETRY_ENABLED` | Enable telemetry (also needs URL to take effect) | `true` | +| `DWS_TELEMETRY_URL` | Ingest endpoint; one JSON event POSTed per command | `https://telemetry.example.com/dws` | +| `DWS_TELEMETRY_TOKEN` | Bearer token for the endpoint (optional) | `xxxxx` | +| `DWS_TELEMETRY_TIMEOUT_MS` | Per-POST timeout cap, ms (default 1500) | `1500` | -## 上报字段(全部) +## Reported fields (all of them) ```json { "schema_version": "1", "ts": "2026-06-04T11:38:24+08:00", - "trace_id": "76a04f9eba0ad00c", // == 传输层 execution_id,可与服务端日志 join - "corp_id": "ding...", // 租户维度,best-effort(取自登录 token) - "cli_version": "1.0.34", // 版本健康:"这版本是不是把某命令搞挂了" - "channel": "openclaw", // 哪个 agent/集成在调用(DWS_CHANNEL) - "os": "darwin", // 粗平台,非 PII + "trace_id": "76a04f9eba0ad00c", // == transport execution_id; join with server-side logs + "corp_id": "ding...", // tenant dimension, best-effort (from the login token) + "cli_version": "1.0.34", // version health: "did this release break a command" + "channel": "openclaw", // which agent/integration is driving dws (DWS_CHANNEL) + "os": "darwin", // coarse platform, not PII "module": "doc", "command": "doc", "subcommand": 
"create_document", "outcome": "ok", // ok | error - "err_class": "", // outcome=error 时的错误分类 + "err_class": "", // error category when outcome=error "exit_code": 0, - "duration_ms": 73 // 调用墙钟耗时,用于 P99 + "duration_ms": 73 // wall-clock latency, for P99 } ``` -**刻意不采**(看这个 struct 就能验证隐私边界):用户身份(user_id/姓名)、 -对象名/id、自由文本、设备 id/序列号、请求/响应 body。 +**Deliberately NOT collected** (verify the privacy boundary by reading this +struct): user identity (user_id/name), object names/ids, free text, device +id/serial number, request/response bodies. -## 接收端契约 +## Ingest contract -任何 HTTP 服务都能接: +Any HTTP service can receive it: ``` POST / Content-Type: application/json -Authorization: Bearer <token> # 对应 DWS_TELEMETRY_TOKEN +Authorization: Bearer <token> # matches DWS_TELEMETRY_TOKEN X-Dws-Telemetry-Schema: 1 -Body: 一条遥测事件 JSON -返回 2xx 即成功 +Body: one telemetry event as JSON +2xx means success ``` -## 本地测试(零依赖,不碰 SLS) +## Local testing (zero dependencies, no SLS) -上 SLS 之前,先在本机把整条链路跑通。用 `fc-sls-ingest/localsink.py` -(纯 Python 标准库,不用 `pip install` 任何东西)当接收端: +Before touching SLS, validate the whole pipeline locally. Use +`fc-sls-ingest/localsink.py` (standard library only, no `pip install`) as the +receiver: ```bash -# 1. 起本地接收端(带一个测试 token) +# 1. start the local sink (with a test token) cd docs/telemetry/fc-sls-ingest -TOKEN=dev python3 localsink.py # 监听 127.0.0.1:8799,落盘 /tmp/dws_telemetry.jsonl +TOKEN=dev python3 localsink.py # listens on 127.0.0.1:8799, writes /tmp/dws_telemetry.jsonl -# 2. 另开一个终端,把 dws 指向它 +# 2. in another terminal, point dws at it export DWS_TELEMETRY_ENABLED=true export DWS_TELEMETRY_URL=http://127.0.0.1:8799 export DWS_TELEMETRY_TOKEN=dev -# 3. 跑几条命令(--mock 不联网、不需要真实后端,也会触发上报) -dws doc create --title 测试 --mock +# 3. 
run a few commands (--mock needs no network or real backend, still emits) +dws doc create --title test --mock dws drive list --mock ``` -接收端会实时打印每条事件,并追加到 `/tmp/dws_telemetry.jsonl`。验证要点: +The sink prints each event live and appends it to `/tmp/dws_telemetry.jsonl`. +Things to check: -- 事件含 `command/outcome/duration_ms/cli_version/channel/os` 等维度; -- 把命令参数(如 `--title 测试`)和报文对照,确认**内容没出现在报文里**; -- 不带 token POST 应被拒(401)。 +- events carry `command/outcome/duration_ms/cli_version/channel/os`; +- compare a command argument (e.g. `--title test`) against the payload to + confirm **content never appears** in it; +- a POST without the token is rejected (401). -落盘后可以本地先模拟一把"大盘"会算的指标: +You can also reproduce locally what the dashboard would compute: ```bash python3 - <<'PY' @@ -94,58 +103,70 @@ for r in rows: k=f"{r['command']} {r['subcommand']}"; b=by[k] b['n']+=1; b['err']+=(r['outcome']!='ok'); b['dur'].append(r.get('duration_ms',0)) for k,v in sorted(by.items(), key=lambda x:-x[1]['n']): - d=v['dur']; print(f"{k:<26}调用{v['n']:>4} 失败{v['err']:>3} avg{sum(d)//len(d):>5}ms max{max(d):>5}ms") + d=v['dur']; print(f"{k:<26}calls {v['n']:>4} err {v['err']:>3} avg {sum(d)//len(d):>5}ms max {max(d):>5}ms") PY ``` -> 说明:遥测只在命令真正进入 MCP 调用阶段才上报。若命令在参数解析层就报错 -> (未到调用),不会产生遥测——这是预期行为。 +> Note: telemetry only emits once a command actually reaches the MCP call stage. +> If a command fails at argument parsing (before the call), no telemetry is +> produced — this is expected. -## 开源代码与内部资源的边界(公私边界) +## Open-source code vs internal resources (the public/private boundary) -dws 是开源仓库,但**遥测数据进哪个 SLS、绑哪个内部应用,是部署方自己的事,不进仓库**。 -这条边界是设计出来的,不是巧合: +dws is an open-source repo, but **which SLS the telemetry lands in, and which +internal application it binds to, is the deployer's own concern and never enters +the repo**. This boundary is by design, not by accident: -| | 在哪 | 包含什么 | 进仓库吗 | +| | Where | Contains | In the repo? 
| |---|---|---|---| -| dws 二进制 + 本目录 FC/local 参考代码 | 公开仓库 | 只会 POST 到 `DWS_TELEMETRY_URL`;**无 endpoint、无密钥、无应用名** | ✅ | -| SLS Project / FC 实例 / 真实 URL+token | 部署方内部基础设施 | 真实地址、鉴权、日志库;阿里内部还需绑定一个内部应用 | ❌ 永不进仓库,靠环境变量注入 | - -代码里**绝不硬编码任何厂商上报地址**,URL 一律运行时从环境变量读取。所以"代码公开" -与"数据落到部署方内部 SLS"天然解耦:换部署方只是换一组环境变量,仓库无需改动, -也看不到任何一方的真实配置。 - -> 阿里内部场景:SLS Project 需挂在一个 AONE 应用下(资源治理要求)。把它绑到 dws -> 后端所属的应用(如钉钉 MCP 网关应用)即可;这个绑定关系、真实 URL 与 token 全部 -> 留在内部,公开仓库不感知。 - -## 接入阿里云 SLS(生产推荐) - -SLS(日志服务)自带写入 / 存储 / 检索 / Dashboard / 告警,是运维监控的标准选型: - -1. **建库**:SLS 控制台建 Project + Logstore(如 `dws-telemetry`),设留存天数; - 给 `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` - 开字段索引,`duration_ms` 设为 long 型索引(要算 P99)。 -2. **建接收端点**:用**函数计算 FC** HTTP 触发器最省运维——校验 Bearer 后把 body - 作为一条日志 `PutLogs` 写进 Logstore(整条 JSON 放 `event` 字段,另抽 - `command`/`outcome`/`duration_ms`/`cli_version` 做索引列)。 -3. **下发**:把 FC 地址作为 `DWS_TELEMETRY_URL` 配到各端 dws。 - -### 上手就能用的 4 条告警(SLS 告警规则) - -| 告警 | SLS 查询(示意) | 触发 | +| dws binary + the FC/local reference code in this dir | public repo | only POSTs to `DWS_TELEMETRY_URL`; **no endpoint, no secret, no app name** | yes | +| SLS project / FC instance / real URL+token | the deployer's own infrastructure | real address, auth, log store; inside Alibaba it also binds to an internal application | no — never committed, injected via env | + +The code **never hardcodes any vendor reporting endpoint**; the URL is always +read from the environment at runtime. So "the code is open" and "the data lands +in the deployer's own SLS" are naturally decoupled: switching deployers is just +a different set of environment variables, with no repo change and no party's +real config visible. + +> Alibaba-internal note: an SLS project must belong to an AONE application +> (resource governance). Bind it to the application that owns the dws backend +> (e.g. 
the DingTalk MCP gateway app); the binding, real URL and token all stay +> internal and the public repo never knows about them. + +## Wiring up Alibaba Cloud SLS (recommended for production) + +SLS (Log Service) provides ingestion / storage / search / dashboards / alerting +out of the box, and is the standard choice for ops monitoring: + +1. **Create the store**: in the SLS console create a Project + Logstore (e.g. + `dws-telemetry`) with a retention period; index + `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` + as text and `duration_ms` as a long field (needed for P99). +2. **Create the ingest endpoint**: a **Function Compute (FC)** HTTP trigger is + the lowest-ops option — it verifies the bearer token and writes the body as + one log via `PutLogs` (store the full JSON in an `event` field, and promote + `command`/`outcome`/`duration_ms`/`cli_version` to indexed columns). +3. **Roll out**: set the FC address as `DWS_TELEMETRY_URL` on each dws install. + +### Four ready-to-use alerts (SLS alert rules) + +| Alert | SLS query (illustrative) | Trigger | |---|---|---| -| 错误率突增 | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 5% | -| P99 延迟超标 | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | -| 某命令大面积失败 | `* \| select command, count_if(outcome='error') c group by command order by c desc` | 单命令 c 突增 | -| 调用量跌零 | `* \| select count(*)` | 5 分钟内 == 0 | - -告警通知渠道直接选钉钉机器人。 - -## 数据落在哪 / 两条流 - -- **不开 = 不出本机。** dws 不内置任何厂商默认上报地址。 -- **企业自有监控**:`DWS_TELEMETRY_URL` 指向企业自己的 SLS ingest。 -- **平台侧统一监控**:URL 指向钉钉的遥测 ingest——技术可行,但必须 opt-in + 告知。 - 因为本遥测**只含匿名维度**,隐私边界天然干净,适合做平台运维大盘。 -- 合规全量留痕是另一条线,走 [audit](./audit.md) 的企业自有 sink,别和遥测混。 +| Error-rate spike | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 5% | +| P99 latency breach | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | +| One command failing at scale | `* \| select command, 
count_if(outcome='error') c group by command order by c desc` | c spikes for one command | +| Traffic dropped to zero | `* \| select count(*)` | == 0 within 5 minutes | + +Route the alert notifications straight to a DingTalk bot. + +## Where data lands / the two streams + +- **Off = nothing leaves the machine.** dws ships no built-in vendor endpoint. +- **Enterprise's own monitoring**: point `DWS_TELEMETRY_URL` at the + enterprise's own SLS ingest. +- **Platform-side monitoring**: pointing the URL at DingTalk's telemetry ingest + is technically fine, but must be opt-in and disclosed. Because this telemetry + carries **anonymous dimensions only**, the privacy boundary is clean and it + suits a platform-wide ops dashboard. +- Full compliance trails are a separate stream — use [audit](./audit.md)'s + enterprise-owned sink; do not mix it with telemetry. diff --git a/docs/telemetry/fc-sls-ingest/README.md b/docs/telemetry/fc-sls-ingest/README.md index 2d5379cb..9355fbba 100644 --- a/docs/telemetry/fc-sls-ingest/README.md +++ b/docs/telemetry/fc-sls-ingest/README.md @@ -1,104 +1,111 @@ -# dws 遥测接收端(函数计算 FC → SLS) +# dws telemetry ingest (Function Compute → SLS) -这是 [运维遥测](../../telemetry.md) 的**参考接收端**:dws 把一条遥测 JSON POST 过来, -SLS 不能直接收裸 POST(写入要签名),所以这里垫一个最小 HTTP 服务,校验 token 后用 -`PutLogs` 写进 SLS。部署成函数计算(FC)的 **Web 函数**即可,不用关心 FC handler 签名。 +This is the **reference receiver** for [operational telemetry](../../telemetry.md): +dws POSTs one telemetry JSON, and SLS cannot accept that raw POST (its write API +must be signed), so this minimal HTTP service verifies the token and writes to +SLS via `PutLogs`. Deploy it as a Function Compute (FC) **Web Function** — no +need to worry about FC handler signatures. 
``` -dws ──POST 一条 JSON──▶ 本服务(FC Web 函数) ──PutLogs──▶ SLS Logstore ──▶ 大盘/告警 +dws ──POST one JSON──▶ this service (FC Web Function) ──PutLogs──▶ SLS Logstore ──▶ dashboard/alerts ``` -## 文件 +## Files -- `app.py` — Flask 服务:`POST /` 校验 Bearer → 解析 JSON → 写 SLS;`GET /` 健康检查 -- `requirements.txt` — 依赖(flask / gunicorn / aliyun-log-python-sdk) +- `app.py` — Flask service: `POST /` verifies the bearer → parses JSON → writes to SLS; `GET /` is a health check +- `localsink.py` — zero-dependency local sink for testing without SLS/FC +- `requirements.txt` — dependencies (flask / gunicorn / aliyun-log-python-sdk) -## 一、先在 SLS 建库(控制台点几下) +## 1. Create the store in SLS (a few clicks in the console) -1. 建 **Project**(如 `dws-ops`)和 **Logstore**(如 `dws-telemetry`),设留存天数。 -2. 开索引:给 `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` - 设为 **text**;给 `duration_ms` / `exit_code` 设为 **long**(要做 P99 和聚合)。 +1. Create a **Project** (e.g. `dws-ops`) and a **Logstore** (e.g. `dws-telemetry`) with a retention period. +2. Add indexes: `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` + as **text**; `duration_ms` / `exit_code` as **long** (for P99 and aggregation). -## 两种运行模式(自动判断) +## Two run modes (auto-detected) -`app.py` 按环境变量自动切换,**不用改代码**: +`app.py` switches automatically by environment variable — **no code change**: -| 模式 | 触发条件 | 行为 | +| Mode | Trigger | Behavior | |---|---|---| -| **dry-run** | 缺任一 SLS 变量,或设 `TELEMETRY_DRYRUN=true` | 收到事件只打到 stdout(FC 会进函数日志),返回 204。**不依赖 aliyun-log SDK**,适合先验证管线 | -| **SLS** | `SLS_ENDPOINT`+`SLS_PROJECT`+`SLS_LOGSTORE` 都配齐 | 校验后 `PutLogs` 写进 Logstore | - -`GET /` 健康检查会回显当前模式(`mode=dry-run` / `mode=sls`),部署后一眼可辨。 - -## 二、部署本服务为 FC Web 函数 - -1. 函数计算控制台 → 创建函数 → **Web 函数** → Python 运行时。 -2. 上传本目录代码(含 `requirements.txt`,FC 会自动装依赖)。 -3. **启动命令**填:`gunicorn -b 0.0.0.0:9000 app:app`,**监听端口** `9000`。 -4. 
**先空跑验证(强烈建议)**:第一次只配 `INGEST_TOKEN`,**不配 SLS 变量**(或加 - `TELEMETRY_DRYRUN=true`)。部署后 `GET /` 应显示 `mode=dry-run`;把 dws 指过来跑 - 几条命令,去 FC 的**函数日志**里能看到 `DRYRUN {...}` 行,就证明"客户端→FC"这段通了。 - 这一步**不需要 SLS、不需要建库、不需要 SDK**。 -5. **再接 SLS**:给函数**绑定一个服务角色**,授权 `AliyunLogFullAccess`(或更小的 - PutLogs 权限)——这样不用把 AccessKey 写进环境变量,FC 自动注入 STS 临时凭证, - `app.py` 已优先读它。然后补上 SLS 环境变量,`GET /` 变成 `mode=sls` 即生效: - - | 变量 | 值 | 说明 | +| **dry-run** | any SLS var missing, or `TELEMETRY_DRYRUN=true` | logs each event to stdout (captured by FC function logs) and returns 204. **No aliyun-log SDK needed**; good for validating the pipeline first | +| **SLS** | `SLS_ENDPOINT` + `SLS_PROJECT` + `SLS_LOGSTORE` all set | verifies, then `PutLogs` into the Logstore | + +`GET /` reports the active mode (`mode=dry-run` / `mode=sls`) — obvious at a glance after deploy. + +## 2. Deploy as an FC Web Function + +1. FC console → create function → **Web Function** → Python runtime. +2. Upload this directory (including `requirements.txt`; FC installs deps automatically). +3. **Startup command**: `gunicorn -b 0.0.0.0:9000 app:app`, **listen port** `9000`. +4. **Dry-run first (strongly recommended)**: on the first deploy set only + `INGEST_TOKEN` and **no SLS vars** (or add `TELEMETRY_DRYRUN=true`). After + deploy, `GET /` should show `mode=dry-run`; point dws at it, run a few + commands, and look for `DRYRUN {...}` lines in the **FC function logs** — that + proves the "client → FC" leg works. This step needs **no SLS, no store, no SDK**. +5. **Then wire SLS**: bind a **service role** to the function granting + `AliyunLogFullAccess` (or a narrower PutLogs permission) — that way no + AccessKey goes into env, FC injects STS temporary credentials, and `app.py` + reads them preferentially. 
Then add the SLS environment variables and `GET /` + flips to `mode=sls`: + + | Variable | Value | Notes | |---|---|---| - | `SLS_ENDPOINT` | `cn-hangzhou.log.aliyuncs.com` | 按你的地域改 | - | `SLS_PROJECT` | `dws-ops` | 第一步建的 Project | - | `SLS_LOGSTORE` | `dws-telemetry` | 第一步建的 Logstore | - | `INGEST_TOKEN` | 自己生成一串随机串 | 必须和 dws 侧 `DWS_TELEMETRY_TOKEN` 一致 | + | `SLS_ENDPOINT` | `cn-hangzhou.log.aliyuncs.com` | change for your region | + | `SLS_PROJECT` | `dws-ops` | the Project from step 1 | + | `SLS_LOGSTORE` | `dws-telemetry` | the Logstore from step 1 | + | `INGEST_TOKEN` | a random string you generate | must match `DWS_TELEMETRY_TOKEN` on the dws side | -6. 部署后拿到函数的 HTTP 触发器地址(形如 `https://xxx.cn-hangzhou.fcapp.run`)。 +6. After deploy, take the function's HTTP trigger URL (e.g. `https://xxx.cn-hangzhou.fcapp.run`). -## 三、把 dws 接上 +## 3. Wire up dws -在跑 dws 的环境里(或由上层 agent 注入): +In the environment that runs dws (or injected by the orchestrating agent): ```bash export DWS_TELEMETRY_ENABLED=true -export DWS_TELEMETRY_URL="https://xxx.cn-hangzhou.fcapp.run" # 上一步的函数地址 -export DWS_TELEMETRY_TOKEN="<和 INGEST_TOKEN 相同的随机串>" +export DWS_TELEMETRY_URL="https://xxx.cn-hangzhou.fcapp.run" # the function URL from above +export DWS_TELEMETRY_TOKEN="<the same random string as INGEST_TOKEN>" ``` -跑几条命令,到 SLS Logstore 查询页就能看到一条条记录。 +Run a few commands and you'll see records appear in the SLS Logstore query page. -## 四、本地先验证(可选,不依赖 FC / SLS) +## 4. Local validation first (optional, no FC / no SLS) -最省事的本地验证用 `localsink.py`(纯标准库,零依赖),见 -[telemetry.md 的「本地测试」](../../telemetry.md#本地测试零依赖不碰-sls)。 +The simplest local check uses `localsink.py` (standard library, zero deps); see +[the "Local testing" section in telemetry.md](../../telemetry.md#local-testing-zero-dependencies-no-sls).
-也可以直接本地跑本服务的 **dry-run 模式**(不配 SLS、不用装 aliyun-log): +You can also run this service's **dry-run mode** locally (no SLS, no aliyun-log): ```bash cd docs/telemetry/fc-sls-ingest -pip install flask # dry-run 只需 flask;aliyun-log 仅 SLS 模式才要 -INGEST_TOKEN=dev python3 app.py # 不配 SLS_* -> 自动 dry-run,监听 :9000 -# 另开一个终端: -curl -s localhost:9000/ # 应回显 mode=dry-run +pip install flask # dry-run needs only flask; aliyun-log is for SLS mode +INGEST_TOKEN=dev python3 app.py # no SLS_* -> auto dry-run, listens on :9000 +# in another terminal: +curl -s localhost:9000/ # should report mode=dry-run curl -XPOST localhost:9000/ -H 'Authorization: Bearer dev' \ -H 'Content-Type: application/json' \ -d '{"schema_version":"1","command":"doc","outcome":"ok","duration_ms":42}' -# 返回 204;事件会以 DRYRUN {...} 打印在 app.py 的终端里。 +# returns 204; the event prints as DRYRUN {...} in the app.py terminal. ``` -要本地连真 SLS 验证,再补 `SLS_ENDPOINT/SLS_PROJECT/SLS_LOGSTORE` 和一组 AccessKey -(`pip install -r requirements.txt` 装上 aliyun-log),`GET /` 会变成 `mode=sls`。 +To validate against real SLS locally, add `SLS_ENDPOINT/SLS_PROJECT/SLS_LOGSTORE` +and an AccessKey pair (`pip install -r requirements.txt` for aliyun-log); `GET /` +becomes `mode=sls`. -## 五、配告警(SLS 控制台 → 告警) +## 5. 
Configure alerts (SLS console → Alerts) -| 告警 | 查询(示意) | 触发 | +| Alert | Query (illustrative) | Trigger | |---|---|---| -| 错误率突增 | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 0.05 | -| P99 延迟超标 | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | -| 某命令大面积失败 | `* \| select command, count_if(outcome='error') c group by command order by c desc` | 单命令 c 突增 | -| 调用量跌零 | `* \| select count(*) as n` | n == 0(5 分钟窗口) | +| Error-rate spike | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 0.05 | +| P99 latency breach | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | +| One command failing at scale | `* \| select command, count_if(outcome='error') c group by command order by c desc` | c spikes for one command | +| Traffic dropped to zero | `* \| select count(*) as n` | n == 0 (5-minute window) | -通知渠道直接选钉钉机器人 webhook。 +Route notifications straight to a DingTalk bot webhook. -## 安全须知 +## Security notes -- `INGEST_TOKEN` 用强随机串,并和 dws 侧保持一致;不要留空。 -- 优先用 FC 服务角色(STS),不要把长期 AccessKey 写进环境变量。 -- 本服务只接**匿名维度**数据,不含用户内容/身份——隐私边界由 dws 客户端保证。 +- Use a strong random `INGEST_TOKEN`, keep it in sync with the dws side, and never leave it empty. +- Prefer an FC service role (STS); do not put long-lived AccessKeys in env vars. +- This service only receives **anonymous dimensions** — no user content/identity; the privacy boundary is enforced by the dws client. diff --git a/internal/app/telemetry_runtime_test.go b/internal/app/telemetry_runtime_test.go index aa29dbc5..fdd20bd5 100644 --- a/internal/app/telemetry_runtime_test.go +++ b/internal/app/telemetry_runtime_test.go @@ -43,7 +43,7 @@ func TestEmitTelemetryWiresEvent(t *testing.T) { CanonicalProduct: "doc", Tool: "create", // Params carry content; telemetry must NOT read any of it. 
- Params: map[string]any{"title": "Q3 财报", "doc_id": "doc-secret-123"}, + Params: map[string]any{"title": "Q3-Earnings-Report", "doc_id": "doc-secret-123"}, } emitTelemetry("trace-xyz", inv, false, "validation", 123*time.Millisecond) @@ -77,7 +77,7 @@ func TestEmitTelemetryWiresEvent(t *testing.T) { // Privacy boundary: no param content may ever leak into the wire payload. raw := string(body) - for _, secret := range []string{"Q3 财报", "doc-secret-123", "title"} { + for _, secret := range []string{"Q3-Earnings-Report", "doc-secret-123", "title"} { if contains(raw, secret) { t.Errorf("telemetry payload leaked content %q: %s", secret, raw) } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 6483c4a1..4d7985ac 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -46,10 +46,10 @@ const defaultTimeout = 1500 * time.Millisecond func init() { for _, it := range []configmeta.ConfigItem{ - {Name: EnvEnabled, Category: configmeta.CategoryDebug, Description: "启用匿名运维遥测(仅维度,无内容无身份,默认关)", Example: "true"}, - {Name: EnvURL, Category: configmeta.CategoryDebug, Description: "遥测上报端点(每次调用 POST 一条 JSON)", Example: "https://telemetry.example.com/dws"}, - {Name: EnvToken, Category: configmeta.CategoryDebug, Description: "遥测端点的 Bearer 鉴权(可选)", Sensitive: true, Example: "xxxxx"}, - {Name: EnvTimeoutMS, Category: configmeta.CategoryDebug, Description: "单次上报超时上限(毫秒,默认 1500)", Example: "1500"}, + {Name: EnvEnabled, Category: configmeta.CategoryDebug, Description: "Enable anonymous ops telemetry (dimensions only, no content/identity; off by default)", Example: "true"}, + {Name: EnvURL, Category: configmeta.CategoryDebug, Description: "Telemetry ingest endpoint (one JSON event POSTed per command)", Example: "https://telemetry.example.com/dws"}, + {Name: EnvToken, Category: configmeta.CategoryDebug, Description: "Bearer token for the telemetry endpoint (optional)", Sensitive: true, Example: "xxxxx"}, + {Name: EnvTimeoutMS, 
Category: configmeta.CategoryDebug, Description: "Per-POST timeout cap in milliseconds (default 1500)", Example: "1500"}, } { configmeta.Register(it) } From 1f368d3ee42ac118909e3b8fb7be5239e52ca3a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:26:43 +0800 Subject: [PATCH 07/12] docs(telemetry): translate telemetry docs and config strings to English Convert all Chinese content in the telemetry surface to English so the public repo leaves no localized traces: - docs/telemetry.md (full doc) - docs/telemetry/fc-sls-ingest/README.md (FC->SLS receiver guide) - internal/telemetry/telemetry.go (config item descriptions) - internal/app/telemetry_runtime_test.go (test fixture string) No behavior change; English-only wording. --- docs/telemetry.md | 203 ++++++++++++++----------- docs/telemetry/fc-sls-ingest/README.md | 146 ++++++++++-------- internal/app/telemetry_runtime_test.go | 4 +- internal/telemetry/telemetry.go | 8 +- 4 files changed, 199 insertions(+), 162 deletions(-) diff --git a/docs/telemetry.md b/docs/telemetry.md index df4e9890..4b6ffe25 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -1,89 +1,99 @@ -# 运维遥测(Telemetry) +# Ops Telemetry -dws 可以为**每一次命令调用**上报一条**匿名、纯维度**的运维指标,用于监控 -错误率、延迟、命令分布和版本/平台健康度。它是审计([audit](./audit.md))的运维侧 -对应物,但刻意做得**小得多**: +dws can emit one **anonymous, dimensions-only** ops metric per **command +invocation**, used to monitor error rate, latency, command distribution, and +version/platform health. It is the ops-side counterpart of [audit](./audit.md), +but deliberately **far smaller**: -- 只采**粗维度**,绝不采对象名、自由文本、peer id、设备指纹、自然语言原文。 - 没有"脱敏档",因为压根没有敏感字段可脱。 -- **独立于审计**:和 `DWS_AUDIT_*` 互不相关,可以只开遥测不开合规审计。 -- **默认全关**。不设 `DWS_TELEMETRY_ENABLED` 时,dws 不产生任何遥测,热路径零影响。 +- Collects **coarse dimensions only** — never object names, free text, peer ids, + device fingerprints, or natural-language input. 
There is no "redaction profile" + because there are no sensitive fields to redact in the first place. +- **Independent of audit**: unrelated to `DWS_AUDIT_*`; you can enable telemetry + without enabling compliance audit. +- **Off by default**. With `DWS_TELEMETRY_ENABLED` unset, dws produces no + telemetry at all — zero impact on the hot path. -> 这是开源 CLI,集中上报必须 **opt-in + 明确告知**。默认不上报一个字节。 +> This is an open-source CLI, so centralized reporting must be **opt-in + +> explicitly disclosed**. By default, not a single byte is reported. -## 启用 +## Enabling -| 环境变量 | 说明 | 示例 | +| Environment variable | Description | Example | |---|---|---| -| `DWS_TELEMETRY_ENABLED` | 启用遥测(需同时配 URL 才生效) | `true` | -| `DWS_TELEMETRY_URL` | 上报端点,每次调用 POST 一条 JSON | `https://telemetry.example.com/dws` | -| `DWS_TELEMETRY_TOKEN` | 端点的 Bearer 鉴权(可选) | `xxxxx` | -| `DWS_TELEMETRY_TIMEOUT_MS` | 单次上报超时上限,毫秒(默认 1500) | `1500` | +| `DWS_TELEMETRY_ENABLED` | Enable telemetry (only takes effect when a URL is also set) | `true` | +| `DWS_TELEMETRY_URL` | Ingest endpoint; one JSON event is POSTed per invocation | `https://telemetry.example.com/dws` | +| `DWS_TELEMETRY_TOKEN` | Bearer auth for the endpoint (optional) | `xxxxx` | +| `DWS_TELEMETRY_TIMEOUT_MS` | Per-report timeout cap, in ms (default 1500) | `1500` | -## 上报字段(全部) +## Reported fields (complete) ```json { "schema_version": "1", "ts": "2026-06-04T11:38:24+08:00", - "trace_id": "76a04f9eba0ad00c", // == 传输层 execution_id,可与服务端日志 join - "corp_id": "ding...", // 租户维度,best-effort(取自登录 token) - "cli_version": "1.0.34", // 版本健康:"这版本是不是把某命令搞挂了" - "channel": "openclaw", // 哪个 agent/集成在调用(DWS_CHANNEL) - "os": "darwin", // 粗平台,非 PII + "trace_id": "76a04f9eba0ad00c", // == transport execution_id, joinable with server-side logs + "corp_id": "ding...", // tenant dimension, best-effort (from the login token) + "cli_version": "1.0.34", // version health: "did this release break a command" + "channel": "openclaw", // which agent/integration drove the call 
(DWS_CHANNEL) + "os": "darwin", // coarse platform, not PII "module": "doc", "command": "doc", "subcommand": "create_document", "outcome": "ok", // ok | error - "err_class": "", // outcome=error 时的错误分类 + "err_class": "", // error category when outcome=error "exit_code": 0, - "duration_ms": 73 // 调用墙钟耗时,用于 P99 + "duration_ms": 73 // wall-clock latency of the call, used for P99 } ``` -**刻意不采**(看这个 struct 就能验证隐私边界):用户身份(user_id/姓名)、 -对象名/id、自由文本、设备 id/序列号、请求/响应 body。 +**Deliberately not collected** (verify the privacy boundary by reading the +struct): user identity (user_id / name), object names/ids, free text, device +id/serial, request/response body. -## 接收端契约 +## Receiver contract -任何 HTTP 服务都能接: +Any HTTP service can receive it: ``` POST / Content-Type: application/json -Authorization: Bearer <token> # 对应 DWS_TELEMETRY_TOKEN +Authorization: Bearer <token> # matches DWS_TELEMETRY_TOKEN X-Dws-Telemetry-Schema: 1 -Body: 一条遥测事件 JSON -返回 2xx 即成功 +Body: one telemetry event JSON +Return 2xx for success ``` -## 本地测试(零依赖,不碰 SLS) +## Local testing (zero dependencies, no SLS) -上 SLS 之前,先在本机把整条链路跑通。用 `fc-sls-ingest/localsink.py` -(纯 Python 标准库,不用 `pip install` 任何东西)当接收端: +Before going to SLS, run the whole pipeline locally. Use +`fc-sls-ingest/localsink.py` (pure Python standard library, no `pip install` +needed) as the receiver: ```bash -# 1. 起本地接收端(带一个测试 token) +# 1. Start the local receiver (with a test token) cd docs/telemetry/fc-sls-ingest -TOKEN=dev python3 localsink.py # 监听 127.0.0.1:8799,落盘 /tmp/dws_telemetry.jsonl +TOKEN=dev python3 localsink.py # listens on 127.0.0.1:8799, writes /tmp/dws_telemetry.jsonl -# 2. 另开一个终端,把 dws 指向它 +# 2. In another terminal, point dws at it export DWS_TELEMETRY_ENABLED=true export DWS_TELEMETRY_URL=http://127.0.0.1:8799 export DWS_TELEMETRY_TOKEN=dev -# 3. 跑几条命令(--mock 不联网、不需要真实后端,也会触发上报) -dws doc create --title 测试 --mock +# 3.
Run a few commands (--mock needs no network or real backend, still emits telemetry) +dws doc create --title test --mock dws drive list --mock ``` -接收端会实时打印每条事件,并追加到 `/tmp/dws_telemetry.jsonl`。验证要点: +The receiver prints each event in real time and appends to +`/tmp/dws_telemetry.jsonl`. Things to verify: -- 事件含 `command/outcome/duration_ms/cli_version/channel/os` 等维度; -- 把命令参数(如 `--title 测试`)和报文对照,确认**内容没出现在报文里**; -- 不带 token POST 应被拒(401)。 +- Events carry dimensions such as `command/outcome/duration_ms/cli_version/channel/os`; +- Compare command arguments (e.g. `--title test`) against the payload and confirm + the **content does not appear in the payload**; +- A POST without the token must be rejected (401). -落盘后可以本地先模拟一把"大盘"会算的指标: +Once written to disk, you can locally simulate the kind of metrics a dashboard +would compute: ```bash python3 - <<'PY' @@ -94,58 +104,71 @@ for r in rows: k=f"{r['command']} {r['subcommand']}"; b=by[k] b['n']+=1; b['err']+=(r['outcome']!='ok'); b['dur'].append(r.get('duration_ms',0)) for k,v in sorted(by.items(), key=lambda x:-x[1]['n']): - d=v['dur']; print(f"{k:<26}调用{v['n']:>4} 失败{v['err']:>3} avg{sum(d)//len(d):>5}ms max{max(d):>5}ms") + d=v['dur']; print(f"{k:<26}calls{v['n']:>4} err{v['err']:>3} avg{sum(d)//len(d):>5}ms max{max(d):>5}ms") PY ``` -> 说明:遥测只在命令真正进入 MCP 调用阶段才上报。若命令在参数解析层就报错 -> (未到调用),不会产生遥测——这是预期行为。 +> Note: telemetry is only emitted once a command actually reaches the MCP-call +> stage. If a command fails at argument parsing (before the call), no telemetry is +> produced — this is expected behavior. -## 开源代码与内部资源的边界(公私边界) +## Boundary between open-source code and internal resources (public/private split) -dws 是开源仓库,但**遥测数据进哪个 SLS、绑哪个内部应用,是部署方自己的事,不进仓库**。 -这条边界是设计出来的,不是巧合: +dws is an open-source repository, but **which SLS the telemetry lands in and which +internal app it binds to is the deployer's own concern and never goes into the +repo**. 
This boundary is by design, not accident: -| | 在哪 | 包含什么 | 进仓库吗 | +| | Where | Contains | In repo? | |---|---|---|---| -| dws 二进制 + 本目录 FC/local 参考代码 | 公开仓库 | 只会 POST 到 `DWS_TELEMETRY_URL`;**无 endpoint、无密钥、无应用名** | ✅ | -| SLS Project / FC 实例 / 真实 URL+token | 部署方内部基础设施 | 真实地址、鉴权、日志库;阿里内部还需绑定一个内部应用 | ❌ 永不进仓库,靠环境变量注入 | - -代码里**绝不硬编码任何厂商上报地址**,URL 一律运行时从环境变量读取。所以"代码公开" -与"数据落到部署方内部 SLS"天然解耦:换部署方只是换一组环境变量,仓库无需改动, -也看不到任何一方的真实配置。 - -> 阿里内部场景:SLS Project 需挂在一个 AONE 应用下(资源治理要求)。把它绑到 dws -> 后端所属的应用(如钉钉 MCP 网关应用)即可;这个绑定关系、真实 URL 与 token 全部 -> 留在内部,公开仓库不感知。 - -## 接入阿里云 SLS(生产推荐) - -SLS(日志服务)自带写入 / 存储 / 检索 / Dashboard / 告警,是运维监控的标准选型: - -1. **建库**:SLS 控制台建 Project + Logstore(如 `dws-telemetry`),设留存天数; - 给 `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` - 开字段索引,`duration_ms` 设为 long 型索引(要算 P99)。 -2. **建接收端点**:用**函数计算 FC** HTTP 触发器最省运维——校验 Bearer 后把 body - 作为一条日志 `PutLogs` 写进 Logstore(整条 JSON 放 `event` 字段,另抽 - `command`/`outcome`/`duration_ms`/`cli_version` 做索引列)。 -3. **下发**:把 FC 地址作为 `DWS_TELEMETRY_URL` 配到各端 dws。 - -### 上手就能用的 4 条告警(SLS 告警规则) - -| 告警 | SLS 查询(示意) | 触发 | +| dws binary + the FC/local reference code in this dir | Public repo | Only POSTs to `DWS_TELEMETRY_URL`; **no endpoint, no secret, no app name** | ✅ | +| SLS Project / FC instance / real URL+token | Deployer's internal infra | Real address, auth, logstore; inside Alibaba it also binds to an internal app | ❌ Never in the repo; injected via env vars | + +The code **never hardcodes any vendor reporting address**; the URL is always read +from an environment variable at runtime. So "code is public" and "data lands in +the deployer's internal SLS" are naturally decoupled: switching deployers is just +a different set of env vars, the repo needs no change, and no party's real config +is visible. + +> Inside Alibaba: the SLS Project must hang under an AONE app (resource-governance +> requirement). Bind it to the app that owns the dws backend (e.g. 
the DingTalk +> MCP gateway app); that binding, the real URL, and the token all stay internal — +> the public repo is unaware of them. + +## Wiring up Alibaba Cloud SLS (recommended for production) + +SLS (Log Service) ships with ingest / storage / search / dashboards / alerting — +a standard choice for ops monitoring: + +1. **Create the store**: in the SLS console create a Project + Logstore (e.g. + `dws-telemetry`), set retention days; index the fields `command` / + `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel`, and set + `duration_ms` as a long-typed index (needed for P99). +2. **Create the receiver endpoint**: a **Function Compute (FC)** HTTP trigger is + the lowest-ops option — after validating the Bearer, write the body as a single + log via `PutLogs` into the Logstore (put the whole JSON in an `event` field and + also extract `command`/`outcome`/`duration_ms`/`cli_version` as indexed + columns). +3. **Roll out**: set the FC address as `DWS_TELEMETRY_URL` on each dws endpoint. 
+ +### Four ready-to-use alerts (SLS alert rules) + +| Alert | SLS query (illustrative) | Trigger | |---|---|---| -| 错误率突增 | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 5% | -| P99 延迟超标 | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | -| 某命令大面积失败 | `* \| select command, count_if(outcome='error') c group by command order by c desc` | 单命令 c 突增 | -| 调用量跌零 | `* \| select count(*)` | 5 分钟内 == 0 | - -告警通知渠道直接选钉钉机器人。 - -## 数据落在哪 / 两条流 - -- **不开 = 不出本机。** dws 不内置任何厂商默认上报地址。 -- **企业自有监控**:`DWS_TELEMETRY_URL` 指向企业自己的 SLS ingest。 -- **平台侧统一监控**:URL 指向钉钉的遥测 ingest——技术可行,但必须 opt-in + 告知。 - 因为本遥测**只含匿名维度**,隐私边界天然干净,适合做平台运维大盘。 -- 合规全量留痕是另一条线,走 [audit](./audit.md) 的企业自有 sink,别和遥测混。 +| Error-rate spike | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 5% | +| P99 latency over budget | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | +| One command failing broadly | `* \| select command, count_if(outcome='error') c group by command order by c desc` | c spikes for a single command | +| Call volume drops to zero | `* \| select count(*)` | == 0 within 5 minutes | + +The alert notification channel can be a DingTalk bot directly. + +## Where the data lands / two flows + +- **Off = never leaves the machine.** dws ships no default vendor reporting address. +- **Enterprise self-hosted monitoring**: point `DWS_TELEMETRY_URL` at the + enterprise's own SLS ingest. +- **Platform-side unified monitoring**: point the URL at DingTalk's telemetry + ingest — technically possible, but must be opt-in + disclosed. Because this + telemetry **contains only anonymous dimensions**, the privacy boundary is clean + by construction, suitable for a platform ops dashboard. +- Full compliance trails are a separate track — use the enterprise's own sink via + [audit](./audit.md); don't mix it with telemetry. 
diff --git a/docs/telemetry/fc-sls-ingest/README.md b/docs/telemetry/fc-sls-ingest/README.md index 2d5379cb..1e18e4cb 100644 --- a/docs/telemetry/fc-sls-ingest/README.md +++ b/docs/telemetry/fc-sls-ingest/README.md @@ -1,104 +1,118 @@ -# dws 遥测接收端(函数计算 FC → SLS) +# dws Telemetry Receiver (Function Compute FC → SLS) -这是 [运维遥测](../../telemetry.md) 的**参考接收端**:dws 把一条遥测 JSON POST 过来, -SLS 不能直接收裸 POST(写入要签名),所以这里垫一个最小 HTTP 服务,校验 token 后用 -`PutLogs` 写进 SLS。部署成函数计算(FC)的 **Web 函数**即可,不用关心 FC handler 签名。 +This is the **reference receiver** for [Ops Telemetry](../../telemetry.md): dws +POSTs one telemetry JSON over, but SLS cannot accept a raw POST directly (writes +must be signed), so this minimal HTTP service sits in between — it validates the +token and writes to SLS via `PutLogs`. Deploy it as a Function Compute (FC) **web +function**; you don't have to deal with the FC handler signature. ``` -dws ──POST 一条 JSON──▶ 本服务(FC Web 函数) ──PutLogs──▶ SLS Logstore ──▶ 大盘/告警 +dws ──POST one JSON──▶ this service (FC web fn) ──PutLogs──▶ SLS Logstore ──▶ dashboard/alerts ``` -## 文件 +## Files -- `app.py` — Flask 服务:`POST /` 校验 Bearer → 解析 JSON → 写 SLS;`GET /` 健康检查 -- `requirements.txt` — 依赖(flask / gunicorn / aliyun-log-python-sdk) +- `app.py` — Flask service: `POST /` validates Bearer → parses JSON → writes SLS; `GET /` health check +- `requirements.txt` — dependencies (flask / gunicorn / aliyun-log-python-sdk) -## 一、先在 SLS 建库(控制台点几下) +## 1. Create the store in SLS first (a few clicks in the console) -1. 建 **Project**(如 `dws-ops`)和 **Logstore**(如 `dws-telemetry`),设留存天数。 -2. 开索引:给 `command` / `subcommand` / `outcome` / `cli_version` / `corp_id` / `channel` - 设为 **text**;给 `duration_ms` / `exit_code` 设为 **long**(要做 P99 和聚合)。 +1. Create a **Project** (e.g. `dws-ops`) and a **Logstore** (e.g. + `dws-telemetry`), set retention days. +2. 
Enable indexes: set `command` / `subcommand` / `outcome` / `cli_version` / + `corp_id` / `channel` as **text**; set `duration_ms` / `exit_code` as **long** + (needed for P99 and aggregation). -## 两种运行模式(自动判断) +## Two run modes (auto-detected) -`app.py` 按环境变量自动切换,**不用改代码**: +`app.py` switches automatically by environment variables — **no code change**: -| 模式 | 触发条件 | 行为 | +| Mode | Trigger | Behavior | |---|---|---| -| **dry-run** | 缺任一 SLS 变量,或设 `TELEMETRY_DRYRUN=true` | 收到事件只打到 stdout(FC 会进函数日志),返回 204。**不依赖 aliyun-log SDK**,适合先验证管线 | -| **SLS** | `SLS_ENDPOINT`+`SLS_PROJECT`+`SLS_LOGSTORE` 都配齐 | 校验后 `PutLogs` 写进 Logstore | - -`GET /` 健康检查会回显当前模式(`mode=dry-run` / `mode=sls`),部署后一眼可辨。 - -## 二、部署本服务为 FC Web 函数 - -1. 函数计算控制台 → 创建函数 → **Web 函数** → Python 运行时。 -2. 上传本目录代码(含 `requirements.txt`,FC 会自动装依赖)。 -3. **启动命令**填:`gunicorn -b 0.0.0.0:9000 app:app`,**监听端口** `9000`。 -4. **先空跑验证(强烈建议)**:第一次只配 `INGEST_TOKEN`,**不配 SLS 变量**(或加 - `TELEMETRY_DRYRUN=true`)。部署后 `GET /` 应显示 `mode=dry-run`;把 dws 指过来跑 - 几条命令,去 FC 的**函数日志**里能看到 `DRYRUN {...}` 行,就证明"客户端→FC"这段通了。 - 这一步**不需要 SLS、不需要建库、不需要 SDK**。 -5. **再接 SLS**:给函数**绑定一个服务角色**,授权 `AliyunLogFullAccess`(或更小的 - PutLogs 权限)——这样不用把 AccessKey 写进环境变量,FC 自动注入 STS 临时凭证, - `app.py` 已优先读它。然后补上 SLS 环境变量,`GET /` 变成 `mode=sls` 即生效: - - | 变量 | 值 | 说明 | +| **dry-run** | Any SLS variable missing, or `TELEMETRY_DRYRUN=true` | Received events are printed to stdout (FC captures this in function logs) and return 204. **Does not require the aliyun-log SDK** — good for validating the pipeline first | +| **SLS** | `SLS_ENDPOINT`+`SLS_PROJECT`+`SLS_LOGSTORE` all set | After validation, `PutLogs` writes into the Logstore | + +The `GET /` health check echoes the current mode (`mode=dry-run` / `mode=sls`), +so it's obvious right after deploy. + +## 2. Deploy this service as an FC web function + +1. Function Compute console → Create function → **Web function** → Python runtime. +2. Upload this directory's code (incl. 
`requirements.txt`; FC installs deps + automatically). +3. **Startup command**: `gunicorn -b 0.0.0.0:9000 app:app`, **listen port** `9000`. +4. **Dry-run validation first (strongly recommended)**: on the first deploy set + only `INGEST_TOKEN` and **leave the SLS variables unset** (or add + `TELEMETRY_DRYRUN=true`). After deploy, `GET /` should show `mode=dry-run`; + point dws at it, run a few commands, and you'll see `DRYRUN {...}` lines in FC's + **function logs** — proving the "client → FC" leg works. This step **needs no + SLS, no store, no SDK**. +5. **Then wire up SLS**: **bind a service role** to the function and grant + `AliyunLogFullAccess` (or a narrower PutLogs permission) — this way you don't + put an AccessKey in env vars; FC injects STS temporary credentials and `app.py` + reads them first. Then add the SLS env vars; once `GET /` becomes `mode=sls` + it's live: + + | Variable | Value | Note | |---|---|---| - | `SLS_ENDPOINT` | `cn-hangzhou.log.aliyuncs.com` | 按你的地域改 | - | `SLS_PROJECT` | `dws-ops` | 第一步建的 Project | - | `SLS_LOGSTORE` | `dws-telemetry` | 第一步建的 Logstore | - | `INGEST_TOKEN` | 自己生成一串随机串 | 必须和 dws 侧 `DWS_TELEMETRY_TOKEN` 一致 | + | `SLS_ENDPOINT` | `cn-hangzhou.log.aliyuncs.com` | change to your region | + | `SLS_PROJECT` | `dws-ops` | the Project from step 1 | + | `SLS_LOGSTORE` | `dws-telemetry` | the Logstore from step 1 | + | `INGEST_TOKEN` | a random string you generate | must match dws-side `DWS_TELEMETRY_TOKEN` | -6. 部署后拿到函数的 HTTP 触发器地址(形如 `https://xxx.cn-hangzhou.fcapp.run`)。 +6. After deploy, grab the function's HTTP trigger address (like + `https://xxx.cn-hangzhou.fcapp.run`). -## 三、把 dws 接上 +## 3. 
Wire dws up -在跑 dws 的环境里(或由上层 agent 注入): +In the environment where dws runs (or injected by the host agent): ```bash export DWS_TELEMETRY_ENABLED=true -export DWS_TELEMETRY_URL="https://xxx.cn-hangzhou.fcapp.run" # 上一步的函数地址 -export DWS_TELEMETRY_TOKEN="<和 INGEST_TOKEN 相同的随机串>" +export DWS_TELEMETRY_URL="https://xxx.cn-hangzhou.fcapp.run" # the function address from above +export DWS_TELEMETRY_TOKEN="<the same random string as INGEST_TOKEN>" ``` -跑几条命令,到 SLS Logstore 查询页就能看到一条条记录。 +Run a few commands and you'll see records appear in the SLS Logstore query page. -## 四、本地先验证(可选,不依赖 FC / SLS) +## 4. Validate locally first (optional, no FC / SLS needed) -最省事的本地验证用 `localsink.py`(纯标准库,零依赖),见 -[telemetry.md 的「本地测试」](../../telemetry.md#本地测试零依赖不碰-sls)。 +The simplest local validation uses `localsink.py` (pure standard library, zero +deps), see [the "Local testing" section in telemetry.md](../../telemetry.md#local-testing-zero-dependencies-no-sls). -也可以直接本地跑本服务的 **dry-run 模式**(不配 SLS、不用装 aliyun-log): +You can also run this service's **dry-run mode** locally (no SLS, no aliyun-log): ```bash cd docs/telemetry/fc-sls-ingest -pip install flask # dry-run 只需 flask;aliyun-log 仅 SLS 模式才要 -INGEST_TOKEN=dev python3 app.py # 不配 SLS_* -> 自动 dry-run,监听 :9000 -# 另开一个终端: -curl -s localhost:9000/ # 应回显 mode=dry-run +pip install flask # dry-run only needs flask; aliyun-log is only for SLS mode +INGEST_TOKEN=dev python3 app.py # no SLS_* -> auto dry-run, listens on :9000 +# in another terminal: +curl -s localhost:9000/ # should echo mode=dry-run curl -XPOST localhost:9000/ -H 'Authorization: Bearer dev' \ -H 'Content-Type: application/json' \ -d '{"schema_version":"1","command":"doc","outcome":"ok","duration_ms":42}' -# 返回 204;事件会以 DRYRUN {...} 打印在 app.py 的终端里。 +# returns 204; the event prints as DRYRUN {...} in the app.py terminal.
``` -要本地连真 SLS 验证,再补 `SLS_ENDPOINT/SLS_PROJECT/SLS_LOGSTORE` 和一组 AccessKey -(`pip install -r requirements.txt` 装上 aliyun-log),`GET /` 会变成 `mode=sls`。 +To validate against real SLS locally, add `SLS_ENDPOINT/SLS_PROJECT/SLS_LOGSTORE` +and an AccessKey (`pip install -r requirements.txt` to install aliyun-log), and +`GET /` will become `mode=sls`. -## 五、配告警(SLS 控制台 → 告警) +## 5. Configure alerts (SLS console → Alerts) -| 告警 | 查询(示意) | 触发 | +| Alert | Query (illustrative) | Trigger | |---|---|---| -| 错误率突增 | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 0.05 | -| P99 延迟超标 | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | -| 某命令大面积失败 | `* \| select command, count_if(outcome='error') c group by command order by c desc` | 单命令 c 突增 | -| 调用量跌零 | `* \| select count(*) as n` | n == 0(5 分钟窗口) | +| Error-rate spike | `* \| select count_if(outcome='error')*1.0/count(*) as err_rate` | err_rate > 0.05 | +| P99 latency over budget | `* \| select approx_percentile(duration_ms, 0.99) as p99` | p99 > 3000 | +| One command failing broadly | `* \| select command, count_if(outcome='error') c group by command order by c desc` | c spikes for a single command | +| Call volume drops to zero | `* \| select count(*) as n` | n == 0 (5-minute window) | -通知渠道直接选钉钉机器人 webhook。 +The notification channel can be a DingTalk bot webhook directly. -## 安全须知 +## Security notes -- `INGEST_TOKEN` 用强随机串,并和 dws 侧保持一致;不要留空。 -- 优先用 FC 服务角色(STS),不要把长期 AccessKey 写进环境变量。 -- 本服务只接**匿名维度**数据,不含用户内容/身份——隐私边界由 dws 客户端保证。 +- Use a strong random string for `INGEST_TOKEN`, keep it in sync with the dws + side, and never leave it empty. +- Prefer the FC service role (STS); do not put a long-lived AccessKey in env vars. +- This service only accepts **anonymous dimension** data — no user content or + identity; the privacy boundary is guaranteed by the dws client. 
diff --git a/internal/app/telemetry_runtime_test.go b/internal/app/telemetry_runtime_test.go index aa29dbc5..7b21876e 100644 --- a/internal/app/telemetry_runtime_test.go +++ b/internal/app/telemetry_runtime_test.go @@ -43,7 +43,7 @@ func TestEmitTelemetryWiresEvent(t *testing.T) { CanonicalProduct: "doc", Tool: "create", // Params carry content; telemetry must NOT read any of it. - Params: map[string]any{"title": "Q3 财报", "doc_id": "doc-secret-123"}, + Params: map[string]any{"title": "Q3 Earnings", "doc_id": "doc-secret-123"}, } emitTelemetry("trace-xyz", inv, false, "validation", 123*time.Millisecond) @@ -77,7 +77,7 @@ func TestEmitTelemetryWiresEvent(t *testing.T) { // Privacy boundary: no param content may ever leak into the wire payload. raw := string(body) - for _, secret := range []string{"Q3 财报", "doc-secret-123", "title"} { + for _, secret := range []string{"Q3 Earnings", "doc-secret-123", "title"} { if contains(raw, secret) { t.Errorf("telemetry payload leaked content %q: %s", secret, raw) } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 6483c4a1..170d99f9 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -46,10 +46,10 @@ const defaultTimeout = 1500 * time.Millisecond func init() { for _, it := range []configmeta.ConfigItem{ - {Name: EnvEnabled, Category: configmeta.CategoryDebug, Description: "启用匿名运维遥测(仅维度,无内容无身份,默认关)", Example: "true"}, - {Name: EnvURL, Category: configmeta.CategoryDebug, Description: "遥测上报端点(每次调用 POST 一条 JSON)", Example: "https://telemetry.example.com/dws"}, - {Name: EnvToken, Category: configmeta.CategoryDebug, Description: "遥测端点的 Bearer 鉴权(可选)", Sensitive: true, Example: "xxxxx"}, - {Name: EnvTimeoutMS, Category: configmeta.CategoryDebug, Description: "单次上报超时上限(毫秒,默认 1500)", Example: "1500"}, + {Name: EnvEnabled, Category: configmeta.CategoryDebug, Description: "Enable anonymous ops telemetry (dimensions only, no content or identity; off by default)", Example: 
"true"}, + {Name: EnvURL, Category: configmeta.CategoryDebug, Description: "Telemetry ingest endpoint (one JSON event POSTed per invocation)", Example: "https://telemetry.example.com/dws"}, + {Name: EnvToken, Category: configmeta.CategoryDebug, Description: "Bearer auth for the telemetry endpoint (optional)", Sensitive: true, Example: "xxxxx"}, + {Name: EnvTimeoutMS, Category: configmeta.CategoryDebug, Description: "Per-report timeout cap in milliseconds (default 1500)", Example: "1500"}, } { configmeta.Register(it) } From 9b293fbab6d7b4e3ad542cfd038f7e214da54d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 16:36:14 +0800 Subject: [PATCH 08/12] style(telemetry): gofmt event.go (align struct tag comments) --- internal/telemetry/event.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/telemetry/event.go b/internal/telemetry/event.go index 338c66e1..94a2aa2f 100644 --- a/internal/telemetry/event.go +++ b/internal/telemetry/event.go @@ -51,7 +51,7 @@ type Event struct { Command string `json:"command"` // skill command Subcommand string `json:"subcommand"` // skill subcommand, e.g. 
"create" - Outcome string `json:"outcome"` // "ok" | "error" + Outcome string `json:"outcome"` // "ok" | "error" ErrClass string `json:"err_class,omitempty"` // error category when outcome=error ExitCode int `json:"exit_code"` DurationMS int64 `json:"duration_ms"` // wall-clock latency of the invocation From 84c1948f5d89e44767eee4c48e66e64dbfe8f0f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 17:17:56 +0800 Subject: [PATCH 09/12] feat(telemetry): default-on for downstream builds via ldflags, with opt-out + disclosure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lets a downstream "fleet" distribution ship telemetry on-by-default to its own ingest, while the open-source build stays opt-in and off — and hardcodes no endpoint. - internal/telemetry/telemetry.go: - build-time vars defaultURL/defaultToken (empty in OSS; injected via -ldflags by a downstream build) - Enabled() posture: DWS_TELEMETRY_DISABLED hard opt-out wins; explicit DWS_TELEMETRY_ENABLED overrides; otherwise on only when a default endpoint is baked in. Env URL/token override the build defaults. - ShowNoticeOnce(): one-time stderr disclosure (marker ~/.dws/.telemetry_notice_shown) - new DWS_TELEMETRY_DISABLED env + configmeta registration - internal/app/telemetry_runtime.go: print the disclosure once when telemetry first activates - internal/telemetry/telemetry_test.go: cover baked-in default-on + opt-out (OSS opt-in cases unchanged) - docs/telemetry.md: document default posture, ldflags injection, opt-out, disclosure Verified e2e: a build with a baked-in endpoint and no env defaults on, prints the notice once, and reports; DWS_TELEMETRY_DISABLED=true suppresses it. 
--- docs/telemetry.md | 56 +++++++++++-- internal/app/telemetry_runtime.go | 5 ++ internal/telemetry/telemetry.go | 117 ++++++++++++++++++++++----- internal/telemetry/telemetry_test.go | 29 +++++++ 4 files changed, 183 insertions(+), 24 deletions(-) diff --git a/docs/telemetry.md b/docs/telemetry.md index 4b6ffe25..64ff135e 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -10,21 +10,65 @@ but deliberately **far smaller**: because there are no sensitive fields to redact in the first place. - **Independent of audit**: unrelated to `DWS_AUDIT_*`; you can enable telemetry without enabling compliance audit. -- **Off by default**. With `DWS_TELEMETRY_ENABLED` unset, dws produces no - telemetry at all — zero impact on the hot path. +- **Default posture depends on the build** (see [Default posture](#default-posture)): + the open-source build is **off** (pure opt-in); a downstream distribution may + bake in a default endpoint and ship **on by default**, with a one-time + disclosure and an opt-out. -> This is an open-source CLI, so centralized reporting must be **opt-in + -> explicitly disclosed**. By default, not a single byte is reported. +> This is an open-source CLI: the **public build never reports a byte and never +> hardcodes an endpoint**. Any on-by-default behavior lives only in a downstream +> build that injects its own endpoint — and even then it is disclosed once and +> can be opted out of. 
## Enabling | Environment variable | Description | Example | |---|---|---| -| `DWS_TELEMETRY_ENABLED` | Enable telemetry (only takes effect when a URL is also set) | `true` | -| `DWS_TELEMETRY_URL` | Ingest endpoint; one JSON event is POSTed per invocation | `https://telemetry.example.com/dws` | +| `DWS_TELEMETRY_ENABLED` | Explicitly enable/disable; overrides the build default either way | `true` / `false` | +| `DWS_TELEMETRY_DISABLED` | Hard opt-out; wins over everything (the off switch for on-by-default builds) | `true` | +| `DWS_TELEMETRY_URL` | Ingest endpoint; overrides the build-time default; one JSON event POSTed per invocation | `https://telemetry.example.com/dws` | | `DWS_TELEMETRY_TOKEN` | Bearer auth for the endpoint (optional) | `xxxxx` | | `DWS_TELEMETRY_TIMEOUT_MS` | Per-report timeout cap, in ms (default 1500) | `1500` | +## Default posture + +`Enabled()` resolves like this: + +1. `DWS_TELEMETRY_DISABLED=true` → **off** (always wins). +2. No destination (no `DWS_TELEMETRY_URL` and no baked-in default) → **off**. +3. `DWS_TELEMETRY_ENABLED` set → its value wins (`true`/`false`). +4. Otherwise → **on only if the build baked in a default endpoint**; a bare env + URL in the open-source build stays opt-in (off until `DWS_TELEMETRY_ENABLED=true`). + +**Open-source build** → off; an operator opts in with `DWS_TELEMETRY_ENABLED=true` +plus a `DWS_TELEMETRY_URL`. + +**Downstream "fleet" build (on by default)** → inject a default endpoint at build +time via `-ldflags`, so every install of that distribution reports to the +operator's own ingest out of the box (users opt out with +`DWS_TELEMETRY_DISABLED=true`): + +```bash +go build -ldflags "\ + -X github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/telemetry.defaultURL=https:///dws \ + -X github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/telemetry.defaultToken=" ./cmd +``` + +The public repo never hardcodes a real endpoint — only your build does. 
This +keeps "code is open source" and "data lands in the operator's own sink" +decoupled. + +### One-time disclosure + +The first time telemetry is active on a machine, dws prints a one-time notice to +stderr and writes a marker (`~/.dws/.telemetry_notice_shown`) so it never repeats: + +``` +ℹ️ dws reports anonymous operational telemetry (command, outcome, latency, version + — no content, no identity) to help monitor stability. Opt out anytime with + DWS_TELEMETRY_DISABLED=true. Details: docs/telemetry.md +``` + ## Reported fields (complete) ```json diff --git a/internal/app/telemetry_runtime.go b/internal/app/telemetry_runtime.go index 1444ae75..32f5a7d3 100644 --- a/internal/app/telemetry_runtime.go +++ b/internal/app/telemetry_runtime.go @@ -36,6 +36,11 @@ func emitTelemetry(execID string, inv executor.Invocation, ok bool, errClass str return } + // Disclosure: print the telemetry notice once per machine the first time it + // becomes active (required because a downstream build may ship it on by + // default). Best-effort; never blocks the command. + telemetry.ShowNoticeOnce(defaultConfigDir()) + ev := telemetry.New(time.Now(), execID) ev.CLIVersion = version ev.Channel = os.Getenv(envDWSChannel) diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 170d99f9..64a270c2 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -20,34 +20,62 @@ import ( "fmt" "net/http" "os" + "path/filepath" "strings" "time" "github.com/DingTalk-Real-AI/dingtalk-workspace-cli/pkg/configmeta" ) -// Environment variables that drive telemetry. All default OFF: the CLI emits -// nothing unless the operator opts in, and the destination is operator-set. +// Environment variables that drive telemetry. +// +// Posture depends on the build (see the defaultURL build-time var and Enabled): +// - Open-source build: OFF; the operator must opt in with EnvEnabled + EnvURL. 
+// - Downstream build with a baked-in default endpoint: ON by default, so a +// fleet reports to the operator's ingest out of the box; users opt out with +// EnvDisabled. const ( - // EnvEnabled turns ops telemetry on ("true"/"1"). Independent of DWS_AUDIT_*. + // EnvEnabled explicitly turns telemetry on/off ("true"/"1" or "false"/"0"), + // overriding the build posture either way. EnvEnabled = "DWS_TELEMETRY_ENABLED" - // EnvURL is the ingest endpoint that receives one JSON Event per POST. - // Empty disables forwarding even when EnvEnabled is set. + // EnvDisabled is a hard opt-out. When truthy it disables telemetry no matter + // what the build default or EnvEnabled says. + EnvDisabled = "DWS_TELEMETRY_DISABLED" + // EnvURL is the ingest endpoint that receives one JSON Event per POST. It + // overrides the build-time default endpoint when set. EnvURL = "DWS_TELEMETRY_URL" - // EnvToken is an optional bearer for the ingest endpoint. + // EnvToken is an optional bearer for the ingest endpoint. Overrides the + // build-time default token when set. EnvToken = "DWS_TELEMETRY_TOKEN" // EnvTimeoutMS bounds how long a single POST may block command exit. EnvTimeoutMS = "DWS_TELEMETRY_TIMEOUT_MS" ) +// Build-time defaults, empty in the open-source build so telemetry stays opt-in +// and OFF. A downstream distribution may inject these via -ldflags to ship +// telemetry on-by-default to its own ingest, e.g.: +// +// go build -ldflags "\ +// -X github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/telemetry.defaultURL=https:///dws \ +// -X github.com/DingTalk-Real-AI/dingtalk-workspace-cli/internal/telemetry.defaultToken=" ./cmd +// +// The public repo never hardcodes a real endpoint; only a downstream build does. +// This keeps "code is open source" and "data lands in the operator's own sink" +// decoupled. +var ( + defaultURL string + defaultToken string +) + // defaultTimeout caps how long telemetry may delay command exit. 
Telemetry is a // side effect, never a gate: a slow or dead sink must not punish the user. const defaultTimeout = 1500 * time.Millisecond func init() { for _, it := range []configmeta.ConfigItem{ - {Name: EnvEnabled, Category: configmeta.CategoryDebug, Description: "Enable anonymous ops telemetry (dimensions only, no content or identity; off by default)", Example: "true"}, - {Name: EnvURL, Category: configmeta.CategoryDebug, Description: "Telemetry ingest endpoint (one JSON event POSTed per invocation)", Example: "https://telemetry.example.com/dws"}, + {Name: EnvEnabled, Category: configmeta.CategoryDebug, Description: "Explicitly enable/disable ops telemetry (overrides the build default)", Example: "true"}, + {Name: EnvDisabled, Category: configmeta.CategoryDebug, Description: "Hard opt-out of ops telemetry (wins over everything)", Example: "true"}, + {Name: EnvURL, Category: configmeta.CategoryDebug, Description: "Telemetry ingest endpoint (overrides the build-time default; one JSON event POSTed per invocation)", Example: "https://telemetry.example.com/dws"}, {Name: EnvToken, Category: configmeta.CategoryDebug, Description: "Bearer auth for the telemetry endpoint (optional)", Sensitive: true, Example: "xxxxx"}, {Name: EnvTimeoutMS, Category: configmeta.CategoryDebug, Description: "Per-report timeout cap in milliseconds (default 1500)", Example: "1500"}, } { @@ -55,30 +83,83 @@ func init() { } } -// Enabled reports whether telemetry should run. It requires BOTH the opt-in -// switch and a destination — neither alone does anything. +// resolvedURL returns the effective ingest endpoint: the env override if set, +// otherwise the build-time default (empty in the open-source build). +func resolvedURL() string { + if u := strings.TrimSpace(os.Getenv(EnvURL)); u != "" { + return u + } + return strings.TrimSpace(defaultURL) +} + +// resolvedToken returns the effective bearer token: env override, else default. 
+func resolvedToken() string { + if t := strings.TrimSpace(os.Getenv(EnvToken)); t != "" { + return t + } + return strings.TrimSpace(defaultToken) +} + +// Enabled reports whether telemetry should run. +// +// - EnvDisabled (hard opt-out) always wins. +// - With no destination (no env URL and no baked-in default) nothing is sent. +// - EnvEnabled, when set, is an explicit operator override either way. +// - Otherwise: ON only when a default endpoint is baked into the build +// (downstream distribution). A bare env URL in the open-source build stays +// opt-in (off until EnvEnabled is also set). func Enabled() bool { - return truthy(os.Getenv(EnvEnabled)) && strings.TrimSpace(os.Getenv(EnvURL)) != "" + if truthy(os.Getenv(EnvDisabled)) { + return false + } + if resolvedURL() == "" { + return false + } + if v := strings.TrimSpace(os.Getenv(EnvEnabled)); v != "" { + return truthy(v) + } + return strings.TrimSpace(defaultURL) != "" +} + +// noticeText is the one-time disclosure shown when telemetry is active. Keep it +// short, factual, and actionable (how to opt out). +const noticeText = "ℹ️ dws reports anonymous operational telemetry (command, outcome, latency, version — no content, no identity) to help monitor stability. Opt out anytime with DWS_TELEMETRY_DISABLED=true. Details: docs/telemetry.md" + +// ShowNoticeOnce prints the telemetry disclosure to stderr the first time +// telemetry is active on this machine, then writes a marker so it never repeats. +// Best-effort: any filesystem error silently skips — telemetry, including its +// disclosure, must never disrupt the command. 
+func ShowNoticeOnce(configDir string) { + if strings.TrimSpace(configDir) == "" { + return + } + marker := filepath.Join(configDir, ".telemetry_notice_shown") + if _, err := os.Stat(marker); err == nil { + return + } + fmt.Fprintln(os.Stderr, noticeText) + _ = os.WriteFile(marker, []byte(time.Now().UTC().Format(time.RFC3339)+"\n"), 0o644) } -// Forwarder ships events to the operator-configured endpoint. Best-effort: a -// transport error or non-2xx is returned for logging but never blocks beyond -// the timeout, and the command's own result is unaffected. +// Forwarder ships events to the configured endpoint. Best-effort: a transport +// error or non-2xx is returned for logging but never blocks beyond the timeout, +// and the command's own result is unaffected. type Forwarder struct { URL string Token string Client *http.Client } -// NewForwarderFromEnv builds a Forwarder from the env, or returns nil when -// telemetry is disabled. A nil *Forwarder's Emit is a safe no-op. +// NewForwarderFromEnv builds a Forwarder using the effective URL/token, or +// returns nil when telemetry is disabled. A nil *Forwarder's Emit is a safe +// no-op. func NewForwarderFromEnv() *Forwarder { if !Enabled() { return nil } return &Forwarder{ - URL: strings.TrimSpace(os.Getenv(EnvURL)), - Token: strings.TrimSpace(os.Getenv(EnvToken)), + URL: resolvedURL(), + Token: resolvedToken(), Client: &http.Client{Timeout: timeoutFromEnv()}, } } diff --git a/internal/telemetry/telemetry_test.go b/internal/telemetry/telemetry_test.go index 9fffef14..359c7fc2 100644 --- a/internal/telemetry/telemetry_test.go +++ b/internal/telemetry/telemetry_test.go @@ -44,6 +44,35 @@ func TestEnabledRequiresBothSwitchAndURL(t *testing.T) { } } +func TestEnabledWithBakedInDefaultEndpoint(t *testing.T) { + // Simulate a downstream build that injected a default endpoint via -ldflags. 
+ orig := defaultURL + defaultURL = "https://fleet.example/dws" + t.Cleanup(func() { defaultURL = orig }) + + cases := []struct { + name, enabled, disabled, url string + want bool + }{ + {"default on (no env)", "", "", "", true}, + {"hard opt-out wins", "", "true", "", false}, + {"hard opt-out beats explicit enable", "true", "true", "", false}, + {"explicit disable via enabled=false", "false", "", "", false}, + {"explicit enable", "true", "", "", true}, + {"env url overrides default, still on", "", "", "https://other.example/dws", true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + t.Setenv(EnvEnabled, c.enabled) + t.Setenv(EnvDisabled, c.disabled) + t.Setenv(EnvURL, c.url) + if got := Enabled(); got != c.want { + t.Fatalf("Enabled()=%v, want %v", got, c.want) + } + }) + } +} + func TestNewForwarderFromEnvNilWhenDisabled(t *testing.T) { t.Setenv(EnvEnabled, "") t.Setenv(EnvURL, "") From f687c509cc6a4b8cd596bd6a61bf60b23326a605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 20:36:02 +0800 Subject: [PATCH 10/12] docs(telemetry): add deployable receiver artifacts (Dockerfile + Serverless Devs) Make the FC->SLS reference receiver deployable without hand-steps: - Dockerfile: container image (AONE / any container platform), gunicorn on :9000, app.py auto-detects dry-run vs SLS mode from env. Built + run + received a live event locally. - s.yaml + deploy.sh: Serverless Devs spec for public Aliyun FC (s build && s deploy). - .dockerignore: keep the image to app.py + requirements.txt. No behavior change to the receiver; packaging only. 
--- docs/telemetry/fc-sls-ingest/.dockerignore | 6 +++ docs/telemetry/fc-sls-ingest/Dockerfile | 22 ++++++++ docs/telemetry/fc-sls-ingest/deploy.sh | 32 ++++++++++++ docs/telemetry/fc-sls-ingest/s.yaml | 58 ++++++++++++++++++++++ 4 files changed, 118 insertions(+) create mode 100644 docs/telemetry/fc-sls-ingest/.dockerignore create mode 100644 docs/telemetry/fc-sls-ingest/Dockerfile create mode 100755 docs/telemetry/fc-sls-ingest/deploy.sh create mode 100644 docs/telemetry/fc-sls-ingest/s.yaml diff --git a/docs/telemetry/fc-sls-ingest/.dockerignore b/docs/telemetry/fc-sls-ingest/.dockerignore new file mode 100644 index 00000000..5d75e3ff --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/.dockerignore @@ -0,0 +1,6 @@ +localsink.py +README.md +s.yaml +deploy.sh +Dockerfile +.dockerignore diff --git a/docs/telemetry/fc-sls-ingest/Dockerfile b/docs/telemetry/fc-sls-ingest/Dockerfile new file mode 100644 index 00000000..79df921d --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/Dockerfile @@ -0,0 +1,22 @@ +# dws telemetry ingest — container image (AONE-deployable / any container platform). +# +# Build: docker build -t dws-telemetry-ingest . +# Run: docker run -p 9000:9000 -e INGEST_TOKEN= dws-telemetry-ingest # dry-run +# docker run -p 9000:9000 -e INGEST_TOKEN= \ +# -e SLS_ENDPOINT=... -e SLS_PROJECT=... -e SLS_LOGSTORE=... dws-telemetry-ingest # -> SLS +# +# For AONE: point the app's build at this Dockerfile; expose port 9000; set the +# env vars (INGEST_TOKEN required; SLS_* to write to an internal SLS Logstore). +# Grant the running identity SLS PutLogs so app.py uses injected creds. +FROM python:3.10-slim + +WORKDIR /app +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt +COPY app.py . + +ENV PORT=9000 +EXPOSE 9000 + +# gunicorn web server; app.py auto-detects dry-run vs SLS mode from env. 
+CMD ["gunicorn", "-b", "0.0.0.0:9000", "--workers", "2", "--timeout", "30", "app:app"] diff --git a/docs/telemetry/fc-sls-ingest/deploy.sh b/docs/telemetry/fc-sls-ingest/deploy.sh new file mode 100755 index 00000000..013ae0b7 --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/deploy.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# Deploy the dws telemetry ingest to Alibaba Cloud Function Compute via Serverless Devs. +# +# Prereqs (one-time): +# npm i -g @serverless-devs/s +# s config add # add an Aliyun credential alias named "default" +# +# Usage: +# INGEST_TOKEN= ./deploy.sh # dry-run-first deploy (no SLS) +# INGEST_TOKEN= SLS_ENDPOINT=cn-hangzhou.log.aliyuncs.com \ +# SLS_PROJECT=dws-ops SLS_LOGSTORE=dws-telemetry ./deploy.sh # write to SLS +set -euo pipefail +cd "$(dirname "$0")" + +: "${INGEST_TOKEN:?set INGEST_TOKEN to a random shared secret (must match dws-side DWS_TELEMETRY_TOKEN)}" +export SLS_ENDPOINT="${SLS_ENDPOINT:-}" SLS_PROJECT="${SLS_PROJECT:-}" SLS_LOGSTORE="${SLS_LOGSTORE:-}" + +echo "==> building (installs flask/gunicorn/aliyun-log from requirements.txt)" +s build --use-docker || s build + +echo "==> deploying" +s deploy -y + +echo "==> function info (look for the http trigger URL)" +s info + +echo +echo "Verify:" +echo " curl / # expect: mode=dry-run (or mode=sls once SLS_* set)" +echo " curl -XPOST / -H \"Authorization: Bearer \$INGEST_TOKEN\" \\" +echo " -H 'Content-Type: application/json' -d '{\"schema_version\":\"1\",\"command\":\"doc\",\"outcome\":\"ok\",\"duration_ms\":42}'" +echo "Then set DWS_TELEMETRY_URL= + DWS_TELEMETRY_TOKEN=\$INGEST_TOKEN (or bake via ldflags)." diff --git a/docs/telemetry/fc-sls-ingest/s.yaml b/docs/telemetry/fc-sls-ingest/s.yaml new file mode 100644 index 00000000..6dd8a733 --- /dev/null +++ b/docs/telemetry/fc-sls-ingest/s.yaml @@ -0,0 +1,58 @@ +# Serverless Devs (s) deploy spec for the dws telemetry ingest (FC web function). 
+# +# One-time prep: +# s config add # add an Aliyun credential alias named "default" +# export INGEST_TOKEN= # shared secret (must equal dws-side DWS_TELEMETRY_TOKEN) +# # optional — omit for a dry-run-first deploy, add later to write to SLS: +# # export SLS_ENDPOINT=cn-hangzhou.log.aliyuncs.com +# # export SLS_PROJECT=dws-ops +# # export SLS_LOGSTORE=dws-telemetry +# +# Deploy: s build && s deploy -y (or just ./deploy.sh) +# URL: s info (look for the http trigger url) +edition: 3.0.0 +name: dws-telemetry-ingest +access: default + +vars: + region: cn-hangzhou # change to your region + +resources: + ingest: + component: fc3 + props: + region: ${vars.region} + functionName: dws-telemetry-ingest + description: dws ops telemetry ingest (FC web function -> SLS) + runtime: python3.10 + code: ./ + handler: app.app # unused by web functions, but a value is required + timeout: 30 + memorySize: 512 + cpu: 0.35 + diskSize: 512 + instanceConcurrency: 20 + # Grant SLS write so app.py uses FC-injected STS creds (no AccessKey in env). 
+ # Fill in your account uid + a role that has AliyunLogFullAccess (or PutLogs): + # role: acs:ram:::role/ + environmentVariables: + INGEST_TOKEN: ${env('INGEST_TOKEN')} + SLS_ENDPOINT: ${env('SLS_ENDPOINT')} + SLS_PROJECT: ${env('SLS_PROJECT')} + SLS_LOGSTORE: ${env('SLS_LOGSTORE')} + customRuntimeConfig: + command: + - gunicorn + args: + - "-b" + - "0.0.0.0:9000" + - "app:app" + port: 9000 + triggers: + - triggerName: http + triggerType: http + triggerConfig: + authType: anonymous # app.py enforces the Bearer token itself + methods: + - GET + - POST From 4f96c94494f4e94680b41003dc77d8d7de3c8934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com> Date: Thu, 4 Jun 2026 21:29:48 +0800 Subject: [PATCH 11/12] =?UTF-8?q?feat(telemetry):=20local=20file=20sink=20?= =?UTF-8?q?(DWS=5FTELEMETRY=5FFILE)=20=E2=80=94=20lightest,=20server-less?= =?UTF-8?q?=20monitoring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a zero-infra sink: when DWS_TELEMETRY_FILE is set, each event is appended as one JSON line to that local file instead of being POSTed — no receiver, no FC, no SLS. Ideal for local/per-machine stability monitoring; aggregate the file with a small script (see docs/telemetry.md). File sink takes precedence over URL and, when set, enables telemetry (with the same DWS_TELEMETRY_DISABLED opt-out). - telemetry.go: EnvFile + resolvedFile (with ~ expansion); Enabled() counts a file sink as a destination; Forwarder.File appends JSONL in Emit. - test: file sink enables + appends valid JSON lines + opt-out still wins. - docs: "Local monitoring (lightest)" section + one-line aggregation. 
--- docs/telemetry.md | 30 +++++++++++++++ internal/telemetry/telemetry.go | 48 +++++++++++++++++++++--- internal/telemetry/telemetry_test.go | 56 ++++++++++++++++++++++++++++ 3 files changed, 129 insertions(+), 5 deletions(-) diff --git a/docs/telemetry.md b/docs/telemetry.md index 64ff135e..094b5ac9 100644 --- a/docs/telemetry.md +++ b/docs/telemetry.md @@ -26,6 +26,7 @@ but deliberately **far smaller**: |---|---|---| | `DWS_TELEMETRY_ENABLED` | Explicitly enable/disable; overrides the build default either way | `true` / `false` | | `DWS_TELEMETRY_DISABLED` | Hard opt-out; wins over everything (the off switch for on-by-default builds) | `true` | +| `DWS_TELEMETRY_FILE` | **Local file sink** — append each event as one JSON line here instead of POSTing (no server, no network). Takes precedence over URL | `~/.dws/telemetry.jsonl` | | `DWS_TELEMETRY_URL` | Ingest endpoint; overrides the build-time default; one JSON event POSTed per invocation | `https://telemetry.example.com/dws` | | `DWS_TELEMETRY_TOKEN` | Bearer auth for the endpoint (optional) | `xxxxx` | | `DWS_TELEMETRY_TIMEOUT_MS` | Per-report timeout cap, in ms (default 1500) | `1500` | @@ -69,6 +70,35 @@ stderr and writes a marker (`~/.dws/.telemetry_notice_shown`) so it never repeat DWS_TELEMETRY_DISABLED=true. Details: docs/telemetry.md ``` +## Local monitoring (lightest — no server, no SLS) + +The smallest possible setup: point telemetry at a **local file**. No receiver, no +FC, no SLS — each machine appends its own events; you aggregate the file whenever. + +```bash +# turn it on (file sink alone enables telemetry) +export DWS_TELEMETRY_FILE=~/.dws/telemetry.jsonl + +# ... use dws normally ... 
+ +# one-line stability view (per command: calls / errors / avg latency) +python3 - <<'PY' +import json, collections, os +rows=[json.loads(l) for l in open(os.path.expanduser('~/.dws/telemetry.jsonl')) if l.strip()] +by=collections.defaultdict(lambda:{'n':0,'err':0,'dur':[]}) +for r in rows: + k=f"{r.get('command')}.{r.get('subcommand')}"; b=by[k] + b['n']+=1; b['err']+=(r.get('outcome')!='ok'); b['dur'].append(r.get('duration_ms',0)) +print(f"{'command':<28}{'calls':>6}{'err':>5}{'avg_ms':>8}") +for k,v in sorted(by.items(),key=lambda x:-x[1]['n']): + d=v['dur'] or [0]; print(f"{k:<28}{v['n']:>6}{v['err']:>5}{sum(d)//len(d):>8}") +PY +``` + +For a small fleet, collect each machine's `telemetry.jsonl` (rsync/scp) and run +the same aggregation over the combined files. Scale to the URL→ingest path only +when you outgrow this. + ## Reported fields (complete) ```json diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 64a270c2..ef93f4aa 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -49,6 +49,10 @@ const ( EnvToken = "DWS_TELEMETRY_TOKEN" // EnvTimeoutMS bounds how long a single POST may block command exit. EnvTimeoutMS = "DWS_TELEMETRY_TIMEOUT_MS" + // EnvFile is the lightest sink: when set, each event is appended as one JSON + // line to this local file instead of being POSTed — no server, no network. + // Ideal for local/per-machine stability monitoring. Takes precedence over URL. 
+	EnvFile = "DWS_TELEMETRY_FILE"
 )
 
 // Build-time defaults, empty in the open-source build so telemetry stays opt-in
@@ -78,11 +82,28 @@ func init() {
 		{Name: EnvURL, Category: configmeta.CategoryDebug, Description: "Telemetry ingest endpoint (overrides the build-time default; one JSON event POSTed per invocation)", Example: "https://telemetry.example.com/dws"},
 		{Name: EnvToken, Category: configmeta.CategoryDebug, Description: "Bearer auth for the telemetry endpoint (optional)", Sensitive: true, Example: "xxxxx"},
 		{Name: EnvTimeoutMS, Category: configmeta.CategoryDebug, Description: "Per-report timeout cap in milliseconds (default 1500)", Example: "1500"},
+		{Name: EnvFile, Category: configmeta.CategoryDebug, Description: "Local file sink: append each event as one JSON line here instead of POSTing (no server). Takes precedence over URL", Example: "~/.dws/telemetry.jsonl"},
 	} {
 		configmeta.Register(it)
 	}
 }
 
+// resolvedFile returns the file sink path from DWS_TELEMETRY_FILE (env only): trimmed,
+// with a leading ~ expanded. An empty result means no file sink is configured.
+func resolvedFile() string {
+	return expandHome(strings.TrimSpace(os.Getenv(EnvFile)))
+}
+
+// expandHome expands a leading "~" or "~/" via os.UserHomeDir; any other p, or a failed lookup, is returned unchanged.
+func expandHome(p string) string {
+	if p == "~" || strings.HasPrefix(p, "~/") {
+		if home, err := os.UserHomeDir(); err == nil {
+			return filepath.Join(home, strings.TrimPrefix(p, "~"))
+		}
+	}
+	return p
+}
+
 // resolvedURL returns the effective ingest endpoint: the env override if set,
 // otherwise the build-time default (empty in the open-source build).
 func resolvedURL() string {
@@ -112,13 +133,15 @@ func Enabled() bool {
 	if truthy(os.Getenv(EnvDisabled)) {
 		return false
 	}
-	if resolvedURL() == "" {
+	if resolvedURL() == "" && resolvedFile() == "" {
 		return false
 	}
 	if v := strings.TrimSpace(os.Getenv(EnvEnabled)); v != "" {
 		return truthy(v)
 	}
-	return strings.TrimSpace(defaultURL) != ""
+	// On when a default endpoint is baked into the build (downstream distribution)
+	// or a local file sink is explicitly set (user opted into local monitoring).
+	return strings.TrimSpace(defaultURL) != "" || resolvedFile() != ""
 }
 
 // noticeText is the one-time disclosure shown when telemetry is active. Keep it
@@ -147,6 +170,7 @@ func ShowNoticeOnce(configDir string) {
 type Forwarder struct {
 	URL    string
 	Token  string
+	File   string // local file sink; when set, append JSONL instead of POSTing
 	Client *http.Client
 }
 
@@ -160,13 +184,15 @@ func NewForwarderFromEnv() *Forwarder {
 	return &Forwarder{
 		URL:    resolvedURL(),
 		Token:  resolvedToken(),
+		File:   resolvedFile(),
 		Client: &http.Client{Timeout: timeoutFromEnv()},
 	}
 }
 
-// Emit POSTs e as a single JSON object. A nil receiver is a no-op so callers
-// never need a guard. Errors are returned (best-effort) but the bounded client
-// timeout guarantees command exit is never delayed past the configured cap.
+// Emit ships e as a single JSON object. With a file sink configured it appends
+// one JSON line locally (no network); otherwise it POSTs to the URL. A nil
+// receiver or nil event is a no-op so callers never need a guard. Errors are
+// returned (best-effort); only the POST path is bounded by the client timeout.
 func (f *Forwarder) Emit(e *Event) error {
 	if f == nil || e == nil {
 		return nil
@@ -175,6 +201,18 @@ func (f *Forwarder) Emit(e *Event) error {
 	if err != nil {
 		return err
 	}
+
+	// Local file sink (lightest path): append one JSON line, no network.
+	if f.File != "" {
+		fh, openErr := os.OpenFile(f.File, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
+		if openErr != nil {
+			return openErr
+		}
+		defer fh.Close()
+		_, writeErr := fh.Write(append(data, '\n'))
+		return writeErr
+	}
+
 	ctx, cancel := context.WithTimeout(context.Background(), f.Client.Timeout)
 	defer cancel()
 
diff --git a/internal/telemetry/telemetry_test.go b/internal/telemetry/telemetry_test.go
index 359c7fc2..f7648da6 100644
--- a/internal/telemetry/telemetry_test.go
+++ b/internal/telemetry/telemetry_test.go
@@ -18,6 +18,9 @@ import (
 	"io"
 	"net/http"
 	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"strings"
 	"testing"
 	"time"
 )
@@ -73,6 +76,63 @@ func TestEnabledWithBakedInDefaultEndpoint(t *testing.T) {
 	}
 }
 
+// TestFileSinkAppendsAndEnables verifies that DWS_TELEMETRY_FILE alone enables
+// telemetry, that Emit appends one JSON line per event, and that the hard
+// opt-out (DWS_TELEMETRY_DISABLED) still wins over a configured file sink.
+func TestFileSinkAppendsAndEnables(t *testing.T) {
+	path := filepath.Join(t.TempDir(), "telemetry.jsonl")
+	t.Setenv(EnvEnabled, "")
+	t.Setenv(EnvURL, "")
+	t.Setenv(EnvDisabled, "") // an ambient opt-out in the developer's shell must not fail the test
+	t.Setenv(EnvFile, path)
+
+	// A file sink alone is a destination -> enabled (local monitoring opt-in).
+	if !Enabled() {
+		t.Fatal("Enabled()=false, want true when DWS_TELEMETRY_FILE is set")
+	}
+	fwd := NewForwarderFromEnv()
+	if fwd == nil {
+		t.Fatal("expected a forwarder when file sink is set")
+	}
+	if fwd.File != path {
+		t.Fatalf("forwarder File=%q, want %q", fwd.File, path)
+	}
+
+	// Two events -> two JSON lines, no network.
+	for _, oc := range []string{"ok", "error"} {
+		ev := New(time.Unix(0, 0), "t")
+		ev.Command = "doc"
+		ev.Outcome = oc
+		if err := fwd.Emit(ev); err != nil {
+			t.Fatalf("Emit: %v", err)
+		}
+	}
+	data, err := os.ReadFile(path)
+	if err != nil {
+		t.Fatalf("read sink file: %v", err)
+	}
+	lines := 0
+	for _, l := range strings.Split(strings.TrimSpace(string(data)), "\n") {
+		if strings.TrimSpace(l) == "" {
+			continue
+		}
+		var ev Event
+		if err := json.Unmarshal([]byte(l), &ev); err != nil {
+			t.Fatalf("line not JSON: %q (%v)", l, err)
+		}
+		lines++
+	}
+	if lines != 2 {
+		t.Fatalf("file has %d JSON lines, want 2", lines)
+	}
+
+	// Hard opt-out still wins over a file sink.
+	t.Setenv(EnvDisabled, "true")
+	if Enabled() {
+		t.Fatal("Enabled()=true with DWS_TELEMETRY_DISABLED set, want false")
+	}
+}
+
 func TestNewForwarderFromEnvNilWhenDisabled(t *testing.T) {
 	t.Setenv(EnvEnabled, "")
 	t.Setenv(EnvURL, "")

From af9024293ffab0f3b36bd2c2a06a588d29c61a94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BF=AE=E9=9B=A8?= <47820304+PeterGuy326@users.noreply.github.com>
Date: Thu, 4 Jun 2026 22:24:59 +0800
Subject: [PATCH 12/12] feat(telemetry): localsink can act as a small LAN collector (HOST + APPEND)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the zero-dep local sink usable as a tiny central collector on your own
machine — e.g. "monitor on my computer" for a small team, no SLS/FC needed:

- HOST env (default 127.0.0.1; set 0.0.0.0 to accept POSTs from LAN machines)
- APPEND env (default truncate for tests; APPEND=1 keeps history across restarts)
- startup banner shows the real bind host + append mode

Token auth is strongly advised when binding 0.0.0.0. Verified: a dws pointed at
the machine's LAN IP lands events in the collector file.
---
 docs/telemetry/fc-sls-ingest/localsink.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/docs/telemetry/fc-sls-ingest/localsink.py b/docs/telemetry/fc-sls-ingest/localsink.py
index 5ab0d163..f25ed712 100644
--- a/docs/telemetry/fc-sls-ingest/localsink.py
+++ b/docs/telemetry/fc-sls-ingest/localsink.py
@@ -24,8 +24,14 @@ from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
 
 PORT = int(os.environ.get("PORT", "8799"))
+# HOST: 127.0.0.1 (default, local-only/safe). Set HOST=0.0.0.0 to accept POSTs
+# from other machines on your LAN — e.g. to use THIS computer as a small central
+# collector that teammates' dws report into (token auth strongly recommended then).
+HOST = os.environ.get("HOST", "127.0.0.1")
 TOKEN = os.environ.get("TOKEN", "")
 OUTFILE = os.environ.get("OUTFILE", "/tmp/dws_telemetry.jsonl")
+# APPEND=1 keeps history across restarts; default truncates. "", 0, false, no (any case) mean off.
+APPEND = os.environ.get("APPEND", "").strip().lower() not in ("", "0", "false", "no")
 
 _count = 0
 
 
@@ -67,8 +73,9 @@ def log_message(self, *args):  # silence default access logging
 
 
 if __name__ == "__main__":
-    open(OUTFILE, "w").close()  # truncate previous run
-    auth = f"(bearer required: {TOKEN!r})" if TOKEN else "(no auth)"
-    print(f"dws local telemetry sink listening on http://127.0.0.1:{PORT} {auth}")
-    print(f"capturing to {OUTFILE}\n")
-    ThreadingHTTPServer(("127.0.0.1", PORT), Handler).serve_forever()
+    if not APPEND:
+        open(OUTFILE, "w").close()  # truncate previous run (test default)
+    auth = "(bearer required)" if TOKEN else "(no auth — set TOKEN!)"  # never echo the secret itself
+    print(f"dws local telemetry sink listening on http://{HOST}:{PORT} {auth}")
+    print(f"capturing to {OUTFILE} (append={APPEND})\n")
+    ThreadingHTTPServer((HOST, PORT), Handler).serve_forever()