diff --git a/docs/Taskfile.yaml b/docs/Taskfile.yaml new file mode 100644 index 0000000..319b9e6 --- /dev/null +++ b/docs/Taskfile.yaml @@ -0,0 +1,75 @@ +version: '3' + +vars: + DIAGRAMS_DIR: "{{.USER_WORKING_DIR}}/docs/diagrams" + OUTPUT_FORMAT: "png" + +tasks: + generate: + desc: Generate all documentation artifacts (diagrams, etc.) + cmds: + - task: diagrams:render + silent: true + + diagrams: + desc: Generate all architecture diagrams from PlantUML + cmds: + - task: diagrams:render + silent: true + + diagrams:render: + desc: Render PlantUML diagrams to PNG format using Docker + cmds: + - | + set -e + echo "Rendering PlantUML diagrams..." + echo "" + + # Check if PlantUML files exist + if ! ls {{.DIAGRAMS_DIR}}/*.puml 1>/dev/null 2>&1; then + echo "No PlantUML source files found in {{.DIAGRAMS_DIR}}" + exit 0 + fi + + # Render using Docker (no local installation required) + docker run --rm \ + -v "{{.DIAGRAMS_DIR}}":/data \ + plantuml/plantuml:latest \ + -t{{.OUTPUT_FORMAT}} \ + /data/*.puml + + echo "" + echo "Diagrams rendered in {{.DIAGRAMS_DIR}}" + echo "" + echo "Generated files:" + ls -1 {{.DIAGRAMS_DIR}}/*.{{.OUTPUT_FORMAT}} 2>/dev/null | xargs -n1 basename || echo "No output files found" + silent: true + + diagrams:clean: + desc: Remove generated diagram files + cmds: + - | + rm -f {{.DIAGRAMS_DIR}}/*.png {{.DIAGRAMS_DIR}}/*.svg + echo "Generated diagram files removed" + silent: true + + diagrams:validate: + desc: Validate PlantUML syntax using Docker + cmds: + - | + set -e + echo "Validating PlantUML diagrams..." + + # Check if PlantUML files exist + if ! ls {{.DIAGRAMS_DIR}}/*.puml 1>/dev/null 2>&1; then + echo "No PlantUML source files found in {{.DIAGRAMS_DIR}}" + exit 0 + fi + + docker run --rm \ + -v "{{.DIAGRAMS_DIR}}":/data \ + plantuml/plantuml:latest \ + -syntax \ + /data/*.puml + echo "All diagrams are valid" + silent: true diff --git a/docs/architecture/customer-facing-logs.md b/docs/architecture/customer-facing-logs.md new file mode 100644 index 0000000..43f8f03 --- /dev/null +++ b/docs/architecture/customer-facing-logs.md @@ -0,0 +1,757 @@ +# Customer-Facing Logs + +Status: Draft +Scope (v1): AI Edge (HTTPProxy + WAF) logs and Compute workload stdout/stderr + +## Motivation + +Datum platform services emit operational signals — request logs, security +events, control-plane activity — that customers need visibility into for +debugging, compliance, and security investigation. Today there is no +customer-facing query surface for these logs. Customers running workloads on +AI Edge (Datum's HTTP proxy + WAF product) cannot answer basic questions +like "show me 5xx responses for my proxy in the last hour" or "which +requests did the WAF block." Customers running compute workloads have no +visibility into their workload output. + +This design defines a project-scoped, multi-tenant logs pipeline with a +structured HTTP query API backed by ClickHouse SQL. AI Edge and Compute are +the v1 scope: AI Edge +produces high-volume access logs and WAF events that are the most acute +customer need; Compute produces workload stdout/stderr, which is the minimum +viable debugging surface for compute customers. + +## Goals (v1) + +- Customers can query AI Edge access logs, WAF events, and Compute workload + output for their project through `datumctl`. +- All logs are tenant-isolated at storage and query time; cross-tenant + reads are structurally impossible. +- Each log event is stored exactly once. There is no duplication of records + across partitions. +- Log schemas are declared once by the producing service and surface + automatically as catalog metadata (resource types, label vocabulary, log + definitions). +- 7-day default retention for operational logs. Retention is platform-set + in v1; not user-controllable. + +## Non-Goals (v1) + +- Control-plane audit logs. Audit logs are collected by the activity + system (`milo-os/activity`) and stored separately; they do not flow + through this pipeline. +- Customer-configurable log export — deferred to a follow-on enhancement. + The NATS subject structure is designed to enable this without schema + changes when the time comes. +- Body-content redaction via regex; v1 redacts at attribute level only. +- Log-based metrics and alerting derived from log streams. +- Per-project ingestion quota. Volume protection in v1 is platform-set + defaults at the gateway; a `LogIngestionQuota` resource is a follow-on + enhancement. +- Structured OTLP export from within compute workloads (custom spans, + metrics). v1 covers stdout/stderr only; structured OTLP from workload + code requires a sidecar or per-workload endpoint and is a follow-on. + +## Layers + +### 1. Service Declaration + +Services declare what they emit in their `ServiceConfiguration` +(`services.miloapis.com/v1alpha1`). Two fields participate: + +- `spec.monitoredResourceTypes[]` — already fans out to + `billing.MonitoredResourceType`; now also fans out to a new + `telemetry.MonitoredResourceType`. +- `spec.logs[]` (new) — fans out to `telemetry.LogDefinition`. + +Each log event produces exactly one row — there is no duplication of records +across partitions. `org_id` and `project_id` are stamped by +platform-controlled producers at config time; the row policy enforces tenant +isolation at read time. + +AI Edge declaration: + +```yaml +apiVersion: services.miloapis.com/v1alpha1 +kind: ServiceConfiguration +metadata: + name: networking-datumapis-com +spec: + serviceRef: + name: networking-datumapis-com + phase: Published + monitoredResourceTypes: + - resourceTypeName: networking.datumapis.com/HTTPProxy + displayName: HTTP Proxy + gvk: + group: networking.datumapis.com + kind: HTTPProxy + labels: + - name: resource.group + description: API group of the resource (networking.datumapis.com). + - name: resource.kind + description: Resource kind (HTTPProxy). + - name: resource.name + description: Name of the HTTPProxy instance. + - name: resource.namespace + description: Project namespace the HTTPProxy belongs to. + - name: hostname + description: Hostname the request was received on. + logs: + - logID: networking.datumapis.com/httpproxy-access + displayName: HTTP Proxy Access Log + description: One entry per HTTP request handled by the proxy. + monitoredResourceType: networking.datumapis.com/HTTPProxy + + entrySchema: + - name: http.request.id + description: Per-request correlation ID (Envoy x-request-id). + - name: http.request.method + description: HTTP method (GET, POST, etc). + - name: http.response.status_code + description: HTTP response status returned to the client. + - name: url.path + description: Request path. + - name: client.address + description: Client IP. + - name: user_agent.original + description: Verbatim User-Agent header sent by the client. + - name: http.request.duration_ms + description: Request duration in milliseconds. + - name: edge.pop.ingress + description: PoP code that received the request (e.g. cdg1). + - name: edge.pop.upstream + description: PoP that routed to the upstream when different from ingress; empty when handled at ingress. + - name: waf.outcome + description: Summary of WAF decision for this request — allowed, blocked, or challenged. + - name: waf.matched_rules + description: Number of WAF rules that matched on this request. Non-zero implies a paired httpproxy-waf entry exists per matched rule. + categoryGroups: [allLogs] + + - logID: networking.datumapis.com/httpproxy-waf + displayName: HTTP Proxy WAF Event Log + description: One entry per WAF rule evaluation that matched or blocked. + monitoredResourceType: networking.datumapis.com/HTTPProxy + + entrySchema: + - name: http.request.id + description: Matches the http.request.id on the paired httpproxy-access entry. + - name: waf.rule.id + description: Identifier of the WAF rule that matched. + - name: waf.action + description: Action taken for this rule — block, log, challenge. + - name: waf.severity + description: Severity classification of the matched rule. + categoryGroups: [allLogs] +``` + +Compute declaration: + +```yaml +apiVersion: services.miloapis.com/v1alpha1 +kind: ServiceConfiguration +metadata: + name: compute-datumapis-com +spec: + serviceRef: + name: compute-datumapis-com + phase: Published + monitoredResourceTypes: + - resourceTypeName: compute.datumapis.com/Workload + displayName: Compute Workload + gvk: + group: compute.datumapis.com + kind: Workload + labels: + - name: resource.group + description: API group of the resource (compute.datumapis.com). + - name: resource.kind + description: Resource kind (Workload). + - name: resource.name + description: Name of the Workload. + - name: resource.namespace + description: Project namespace the Workload belongs to. + logs: + - logID: compute.datumapis.com/workload-stdout + displayName: Workload Output + description: Stdout and stderr from workload containers. + monitoredResourceType: compute.datumapis.com/Workload + + entrySchema: + - name: stream + description: Output stream — stdout or stderr. + - name: container.name + description: Container name within the workload. + categoryGroups: [allLogs] +``` + +### 2. Platform Catalog + +The telemetry operator (`milo-os/telemetry`) owns two new CRDs that the +`ServiceConfiguration` controller fans out into. + +`telemetry.MonitoredResourceType` — instance-identifying label vocabulary +for a resource Kind. Parallel to `billing.MonitoredResourceType`: + +```yaml +apiVersion: telemetry.miloapis.com/v1alpha1 +kind: MonitoredResourceType +metadata: + name: networking-datumapis-com-httpproxy +spec: + resourceTypeName: networking.datumapis.com/HTTPProxy + phase: Published + displayName: HTTP Proxy + gvk: + group: networking.datumapis.com + kind: HTTPProxy + labels: + - name: resource.group + - name: resource.kind + - name: resource.name + - name: resource.namespace + - name: hostname +``` + +`LogDefinition` — the log type catalog entry; references +`MonitoredResourceType` by `resourceTypeName`: + +```yaml +apiVersion: telemetry.miloapis.com/v1alpha1 +kind: LogDefinition +metadata: + name: networking-datumapis-com-httpproxy-access +spec: + logID: networking.datumapis.com/httpproxy-access + phase: Published + displayName: HTTP Proxy Access Log + monitoredResourceType: networking.datumapis.com/HTTPProxy + + entrySchema: + - name: http.request.id + - name: http.request.method + - name: http.response.status_code + - name: url.path + - name: client.address + - name: user_agent.original + - name: http.request.duration_ms + - name: edge.pop.ingress + - name: edge.pop.upstream + - name: waf.outcome + - name: waf.matched_rules + categoryGroups: [allLogs] +``` + +Both CRDs are server-managed: the `ServiceConfiguration` controller is the +sole writer. Customers read them via standard list/get to populate UIs and +discover available log types. + +### 3. Ingestion Pipeline + +![Ingestion Pipeline](../diagrams/ingestion-pipeline.png) + +Log records enter the pipeline from two sources with different trust models. +In both cases, `org_id` and `project_id` are resolved by the platform at +configuration time — not per-record at the gateway. The gateway receives +them pre-stamped and trusts them because the stamping components are +platform-controlled. + +#### 3a. AI Edge (Envoy) + +The HTTPProxy controller generates Envoy xDS configuration for each +`HTTPProxy` resource. At reconcile time, the controller knows the HTTPProxy's +namespace (= `project_id`) and the project's parent organization (= `org_id`). +It stamps both into the OTel access log resource attributes: + +```yaml +# Generated by HTTPProxy controller into Envoy xDS +access_log: + - name: envoy.access_loggers.open_telemetry + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.open_telemetry.v3.OpenTelemetryAccessLogConfig + common_config: + grpc_service: + envoy_grpc: {cluster_name: telemetry_gateway} + log_name: networking.datumapis.com/httpproxy-access + resource_attributes: + values: + - key: org_id + value: {string_value: "org-abc123"} # resolved at reconcile time + - key: project_id + value: {string_value: "proj-xyz456"} # = HTTPProxy's namespace + - key: resource.group + value: {string_value: "networking.datumapis.com"} + - key: resource.kind + value: {string_value: "HTTPProxy"} + - key: resource.name + value: {string_value: "my-proxy"} # = HTTPProxy.metadata.name + - key: resource.namespace + value: {string_value: "proj-xyz456"} + attributes: + values: + - key: http.request.id + value: {string_value: "%REQ(x-request-id)%"} + - key: http.request.method + value: {string_value: "%REQ(:method)%"} + - key: http.response.status_code + value: {string_value: "%RESPONSE_CODE%"} + - key: url.path + value: {string_value: "%REQ(x-forwarded-proto)%://%REQ(:authority)%%REQ(:path)%"} + - key: client.address + value: {string_value: "%DOWNSTREAM_REMOTE_ADDRESS_WITHOUT_PORT%"} + - key: user_agent.original + value: {string_value: "%REQ(user-agent)%"} + - key: http.request.duration_ms + value: {string_value: "%DURATION%"} + - key: waf.outcome + value: {string_value: "%FILTER_STATE(waf.outcome)%"} + - key: waf.matched_rules + value: {string_value: "%FILTER_STATE(waf.matched_rules)%"} +``` + +Envoy is a platform-controlled producer — Datum generates its configuration, +not the customer. The gateway therefore trusts the `org_id` and `project_id` +values it receives from Envoy without further catalog lookup. + +#### 3b. Compute (Workload stdout/stderr) + +Compute workload containers run on Kraftlet nodes managed by the +unikraft-provider. The container runtime writes stdout/stderr to files on the +node at: + +``` +/var/log/pods/__//0.log +``` + +An OTel Collector DaemonSet (contrib distribution) deployed to each Kraftlet +node scrapes these files using the `filelog` receiver, which handles the CRI-O +log format natively. The `k8sattributes` processor enriches each log line with +metadata from the pod's labels: + +- `org_id` — from pod label `datum.org_id` +- `project_id` — from pod label `datum.project_id` +- `resource.name` — from pod label `compute.datumapis.com/workload-name` +- `resource.namespace` — from pod label `compute.datumapis.com/workload-deployment-name` (or namespace) +- `container.name` — from the log file path +- `stream` — `stdout` or `stderr` + +The unikraft-provider stamps `datum.org_id` and `datum.project_id` on every +Pod it creates, sourcing the values from the parent Instance's labels, which +the compute controller stamps from the Workload's namespace (= `project_id`) +and the namespace's owning organization (= `org_id`): + +```go +// In unikraft-provider instance controller, when building the Pod spec: +pod.Labels["datum.org_id"] = instance.Labels["datum.org_id"] +pod.Labels["datum.project_id"] = instance.Labels["datum.project_id"] +``` + +The OTel Collector DaemonSet exports records as OTLP to the gateway with +`log_id` set to `compute.datumapis.com/workload-stdout`. The gateway treats it +as a platform-controlled producer; the pod label values are trusted. Using +the contrib collector for this DaemonSet is consistent with the collectors +already running in the edge and hub clusters — same image, same config +patterns. + +#### 3c. Gateway Responsibilities + +The gateway is the OTLP bridge — a thin Go service that accepts OTLP/HTTP, +applies schema validation, and writes to NATS. It is not responsible for +resolving tenant identity; that is done upstream by platform-controlled +producers. + +Gateway responsibilities: + +1. Receive OTLP log records over HTTP. +2. Validate that `org_id` and `project_id` are present on the record. +3. Look up the `LogDefinition` for the record's `log_id` from a local + in-memory cache (populated by a Kubernetes watch; updated on definition + changes without restart). Reject records with unregistered log IDs. +4. Validate that the record's resource attributes are a subset of the + vocabulary declared by the `LogDefinition`'s `MonitoredResourceType`. +5. Write one NATS message to subject + `telemetry.logs...`. +6. Acknowledge the OTLP request. + +No per-record catalog API calls. The only remote calls are the initial watch +connection to the Kubernetes API (for `LogDefinition` cache population) and +the NATS publish. + +A NATS JetStream subject sits between the gateway and ClickHouse. NATS +gives us: + +- **Backpressure.** If ClickHouse is down or slow, the consumer pauses; + NATS retains the backlog rather than the gateway dropping records. +- **Live tail.** The same stream feeds the `/tail` WebSocket handler without + polling ClickHouse — see Live Tail below. +- **Future log export.** The subject hierarchy + `telemetry.logs...` is designed so a future + export worker can subscribe to `telemetry.logs..>` to forward a + customer's logs to their own destination without any schema changes. + +A ClickHouse-writer consumer drains NATS into the `platform_logs` table +in batches. + +### 4. Storage + +Shared ClickHouse `platform_logs` table, OTel-aligned schema. `org_id` and +`project_id` are the partition key and lead the sort key. Each log event +is stored exactly once. + +```sql +CREATE TABLE platform_logs ( + org_id LowCardinality(String), + project_id LowCardinality(String), + timestamp UInt64, + observed_timestamp UInt64, + severity_number UInt8, + severity_text LowCardinality(String), + body String, + log_id LowCardinality(String), + resource_type LowCardinality(String), + resource_group LowCardinality(String), + resource_kind LowCardinality(String), + resource_name String, + resource_namespace LowCardinality(String), + attributes_string Map(String, String), + resources_string Map(String, String), + trace_id String, + span_id String +) +ENGINE = MergeTree() +PARTITION BY (org_id, toYYYYMM(toDateTime(timestamp / 1e9))) +ORDER BY (org_id, project_id, resource_type, resource_name, log_id, timestamp) +TTL toDateTime(timestamp / 1e9) + INTERVAL 7 DAY DELETE; +``` + +Row-level access control is enforced by a ClickHouse row policy on the +tenant-facing query user: + +```sql +CREATE ROW POLICY tenant_isolation ON platform_logs + FOR SELECT + USING org_id = getSetting('org_id') AND project_id = getSetting('project_id') + TO api_reader; +``` + +The query API sets these session settings from the authenticated request +context before executing any query. Datum operations staff use a separate +ClickHouse role that bypasses the row policy, allowing fleet-wide queries +without duplicating records. + +`resource_type` is the `group/Kind` of the Datum resource that produced +the log — e.g. `networking.datumapis.com/HTTPProxy` or +`compute.datumapis.com/Workload`. `resource_name` is the name of the +specific instance within the project. Together they are a denormalised +Kubernetes object reference, scoped within `(org_id, project_id)`. +A `resource_name` of `my-proxy` in one project is unrelated to the same +name in another. + +Top-level columns are chosen for the two common query shapes: + +- **Per-resource**: "give me all access logs for proxy XYZ". Served by + the `(org_id, project_id, resource_type, resource_name, log_id)` prefix + of the sort key. +- **Per-project**: "give me all logs for project X". Served by the + `(org_id, project_id)` prefix. + +### 5. Query API + +![Query Path](../diagrams/query-path.png) + +#### Why not Loki-compatible + +An earlier draft of this design used a Loki-compatible API +(`/loki/api/v1/...`). That approach was dropped for three reasons: + +1. **Translation layer cost.** A correct LogQL → SQL translator covering + line filters, label filters, unwrap expressions, and metric queries is a + non-trivial piece of software to build and maintain. Every ClickHouse + feature that doesn't map cleanly to LogQL — window functions, joins across + log types for request correlation, histogram aggregations — is either + inaccessible or requires escape hatches that break the abstraction. + +2. **SQL is the direction.** Axiom, Cloudflare, AWS CloudWatch Logs + Insights, and DataDog's analytics surface are all SQL or SQL-like (KQL, + APL). LogQL is a Grafana Cloud-specific protocol; the broader industry + has concluded that logs are a table and SQL is the right query language + for them. Building a LogQL shim would be swimming against that current. + +3. **We already have a working ClickHouse query path.** Our POC validated + ClickHouse SQL for log queries directly. `datumctl` calls the structured + API; internal ops uses the ClickHouse datasource in Grafana. There is no + tooling gap that Loki compatibility closes. + +#### Structured HTTP query API + +The customer query surface is a structured HTTP API exposed under the +project's control-plane endpoint: + +``` +POST {project-control-plane-endpoint}/telemetry/v1/query +POST {project-control-plane-endpoint}/telemetry/v1/tail (WebSocket upgrade) +GET {project-control-plane-endpoint}/telemetry/v1/log-definitions +GET {project-control-plane-endpoint}/telemetry/v1/log-definitions/{logID} +``` + +`{project-control-plane-endpoint}` is the same per-project control-plane +URL Milo already issues for Kubernetes API access. The project is resolved +from the endpoint; `org_id` and `project_id` are never passed by the caller. + +The Milo gateway enforces IAM before the request reaches the telemetry +handler. The handler executes ClickHouse SQL with `org_id` and `project_id` +injected from the authenticated endpoint context as session settings; the +row policy enforces them at the database layer. + +Query request body: + +```json +{ + "log_id": "networking.datumapis.com/httpproxy-access", + "resource_name": "my-proxy", + "start": "2026-06-19T10:00:00Z", + "end": "2026-06-19T11:00:00Z", + "filter": { + "http.response.status_code": "500" + }, + "limit": 100, + "order": "desc" +} +``` + +All filter fields map directly to top-level ClickHouse columns or +`attributes_string` map lookups — no query language to parse. The handler +returns log lines as a JSON array; `datumctl logs` wraps this endpoint. + +Label and series discovery is served from the `MonitoredResourceType` +catalog (`/log-definitions`). `datumctl` queries this endpoint to populate +filter completions. + +#### `datumctl` integration + +`datumctl logs` calls `POST /telemetry/v1/query` with a structured filter +built from its flags. Live tail calls `POST /telemetry/v1/tail` and streams +over the WebSocket. + +A `LogQuery` custom resource (Kubernetes-native, modelled on `AuditLogQuery` +in `milo-os/activity`) is supported for kubectl-native and GitOps workflows. +It translates the structured query spec to the same ClickHouse SQL path. + +### 6. Access Control + +Milo IAM gates access at the project boundary via standard Kubernetes +RBAC on the project's telemetry endpoint. Because the URL is the project +control-plane endpoint, the same RBAC that protects the rest of the +project's resources protects log queries — no separate access model. + +Datum operations staff access fleet-wide data through internal tooling +using a privileged ClickHouse role that bypasses the row policy. This +access is scoped to internal tooling and audit-logged; it does not require +storing any data twice. + +## Cross-Cutting Concerns + +### Retention + +Fixed defaults; not user-controllable in v1. + +| Category Group | Retention | +|---|---| +| `allLogs` | 7 days | + +Implemented via the table TTL on the `timestamp` column. Per-project or +per-category retention overrides are a follow-on enhancement. + +### Default Enablement + +`allLogs` collection is opt-in per project via a `LogCollectionPolicy` +resource. Customers don't get surprise bills from log volume tracking +workload activity they didn't request. + +For v1: proxy access logs default off, WAF events default on, compute +workload output default off. + +### Log Export (Future) + +Customer-configurable export to external destinations (OTLP endpoints, S3, +etc.) is a follow-on enhancement. The NATS subject hierarchy +`telemetry.logs...` is designed to support this +without schema changes: an export worker subscribes to +`telemetry.logs..>` as an independent JetStream consumer, reads the +same messages the ClickHouse writer already consumes, and forwards them to +the customer's configured destination. Credentials and endpoint +configuration for that destination are held by the export worker, not by the +storage or ingest layers. + +### Live Tail + +The `/tail` endpoint is served by a small handler that constructs a NATS +subject from the authenticated request context — +`telemetry.logs...>` for all project logs, or +`telemetry.logs...` when a specific log type is +requested. `org_id` and `project_id` are embedded in the subject, so NATS +delivers only that project's messages. Any additional filters (`resource_name`, +attribute values) are applied in-process before forwarding over the WebSocket. +This avoids polling ClickHouse and keeps tail latency in the low hundreds of +milliseconds. + +### Request Correlation + +A single HTTP request through AI Edge produces one access log entry +(`httpproxy-access`) and zero-or-more WAF entries (`httpproxy-waf`, one +per matched rule). All of them carry the same `http.request.id` +(Envoy's `x-request-id`, which already propagates through the filter +chain to the WAF sidecar). That's the join key. + +The model favours denormalisation on the access log for the common case: + +- `waf.outcome` (`allowed` / `blocked` / `challenged`) and + `waf.matched_rules` (count) are stamped directly on the access log, + so the high-frequency "show me blocked requests" query is a single + stream filter, not a join. +- The per-rule `httpproxy-waf` entries carry the rule id, action, and + severity, joined back to the access log by `http.request.id` when the + customer needs to drill in to "which rules fired on this request." + +```sql +SELECT * FROM platform_logs +WHERE log_id IN ( + 'networking.datumapis.com/httpproxy-access', + 'networking.datumapis.com/httpproxy-waf' + ) + AND attributes_string['http.request.id'] = 'phl94-1779186433904-397d1bd984ce' +ORDER BY timestamp +``` + +`edge.pop.ingress` and `edge.pop.upstream` live only on the access log; +WAF entries inherit them by joining on `http.request.id`. They are emission +context, not resource identity — one `HTTPProxy` serves from many PoPs — so +they are stamped at emission by the data plane and are not part of the +`MonitoredResourceType` vocabulary. The same reasoning is why the WAF entry +schema is lean: client IP, user agent, response status, PoP — anything that +already exists on the paired access entry is reached via the join rather than +duplicated on every matched-rule row. + +### Redaction + +- Platform-managed allowlist of attribute keys always dropped or hashed + at the gateway (`*.token`, `*.password`, `authorization`, ...). +- Customer-configurable `LogRedactionPolicy` resource — attribute-level + drop/hash rules only. +- Body content is **not** redacted in v1. Documented as a constraint; + services are pushed to put structured data in attributes. + +## Fan-Out Summary + +``` +ServiceConfiguration + spec.monitoredResourceTypes[] → billing.MonitoredResourceType (existing) + → telemetry.MonitoredResourceType (new) + spec.logs[] → telemetry.LogDefinition (new) +``` + +One log event → one NATS message → one ClickHouse row. + +## Changes from Previous Design + +- **Removed double-write (consumer/producer destinations).** The original + design wrote each log event twice — once to the customer's partition and + once to a service-team producer partition. This doubled storage cost + without providing isolation guarantees beyond what a privileged ClickHouse + role already provides. Datum operations staff access fleet-wide data via a + superuser role on internal tooling. + +- **Replaced `tenant.kind`/`tenant.name` with `org_id`/`project_id`.** + The original design had the gateway resolve tenant identity per record via + a catalog lookup. `org_id` and `project_id` are now stamped by + platform-controlled producers at configuration time (HTTPProxy controller + into Envoy xDS; unikraft-provider into pod labels). The gateway receives + them pre-resolved and trusts them without a catalog call. + +- **Added Compute as a v1 producer.** Workload stdout/stderr is collected + by an OTel Collector DaemonSet on Kraftlet nodes and forwarded to the gateway. + `org_id` and `project_id` flow through pod labels stamped by the + unikraft-provider. + +- **Removed `consumer_name` column.** With single-write storage and a + privileged role for fleet-wide access, the column serves no purpose. + +- **Replaced Loki-compatible query API with a structured HTTP API.** + The Loki wire protocol requires a LogQL → SQL translation layer that is + non-trivial to build correctly, limits access to ClickHouse-native + features, and ties the design to a Grafana Cloud-specific protocol the + broader industry is not converging on. Axiom, Cloudflare, AWS CloudWatch + Logs Insights, and DataDog's analytics surface are all SQL or SQL-like; + logs are a table and SQL is the right query language for them. The Grafana + ClickHouse datasource covers the same Grafana use cases without a + translation layer, validated in our working POC. `datumctl logs` calls the + structured API directly; a `LogQuery` CRD covers kubectl-native access. + +## v1 Delivery Slice + +In dependency order: + +1. CRDs: `MonitoredResourceType`, `LogDefinition`, + `LogCollectionPolicy`, `LogRedactionPolicy`, `LogQuery`. +2. Fan-out controllers in this operator for `MonitoredResourceType` and + `LogDefinition`. +3. NATS JetStream stream and ClickHouse `platform_logs` table. +4. OTLP gateway: validates `org_id`/`project_id` presence, validates + `log_id` against cached `LogDefinition`, validates resource attribute + vocabulary, writes to NATS. +5. ClickHouse writer consumer draining NATS into `platform_logs`. +6. AI Edge integration: HTTPProxy controller stamps `org_id`, `project_id`, + and resource identity into Envoy xDS OTel access log config. + `ServiceConfiguration` for `networking-datumapis-com` with the two log + definitions. +7. Compute integration: unikraft-provider stamps `datum.org_id` and + `datum.project_id` on pods. OTel Collector DaemonSet (contrib) deployed to + Kraftlet nodes with `filelog` receiver and `k8sattributes` processor. + `ServiceConfiguration` for `compute-datumapis-com` with + `workload-stdout` log definition. +8. Structured HTTP query API at + `{project-control-plane-endpoint}/telemetry/v1/...` backed by + ClickHouse SQL with row-policy enforcement, plus the NATS-backed + `/tail` WebSocket handler. +9. Catalog-backed log-definition discovery (`/log-definitions`). +10. Internal Grafana (Datum ops) ClickHouse datasource configuration and + fleet-wide dashboard templates. Customer query surface is `datumctl logs` + only; no customer-facing Grafana in v1. + +## Open Questions + +- Whether `LogCollectionPolicy` is project-scoped or finer-grained (per + `HTTPProxy` or per `Workload`). Project-scoped is the simpler v1. +- How catalog-backed label discovery handles tenant-specific label values + (e.g. the set of `resource.name` values that actually exist in the + project). Likely a hybrid: label names from catalog, values from + ClickHouse with a short cache. +- Whether Kraftlet writes workload stdout/stderr to + `/var/log/pods/__//0.log` — the standard + path for containerd/CRI-O runtimes. Unikraft VMs are not OCI containers; + the log path is a Kraftlet implementation detail. Needs confirmation from + the compute team before the OTel Collector DaemonSet config is finalised. +- Whether the OTel Collector DaemonSet on Kraftlet nodes requires a privileged + pod security context to read `/var/log/pods`. Likely yes; needs sign-off + from the compute team. +- Whether unikraft-provider stamping `datum.org_id` and `datum.project_id` + on Pods is feasible as described. This requires: (a) the compute controller + to stamp those labels on Instance objects, and (b) the unikraft-provider to + copy them from Instance to Pod. Neither is current behaviour; both need + agreement from the compute team. +- Whether Instance objects currently carry `datum.org_id` and + `datum.project_id` labels (set by the compute controller from the + Workload's namespace and owning organisation), or whether that stamping + also needs to be added. This is the upstream half of the pod label chain. +- Whether the WAF sidecar sets `waf.outcome` and `waf.matched_rules` as + Envoy filter state keys (referenced in the xDS snippet as + `%FILTER_STATE(waf.outcome)%` etc.). If it does not, the denormalised WAF + summary fields on the access log are not achievable without changes to the + WAF sidecar. Needs confirmation from whoever owns the WAF filter. +- Whether the HTTPProxy controller can write `resource_attributes` into the + Envoy `OpenTelemetryAccessLogConfig` xDS field. The field is a real Envoy + API (available since ~1.26) but depends on what level of xDS the HTTPProxy + controller currently generates. Needs confirmation from the networking team. +- Who owns the Datum operations ClickHouse role and what tooling uses it. + Suggested: internal Grafana instance in `datum-cloud/infra`, same pattern + as our current telemetry system Grafana. diff --git a/docs/diagrams/ingestion-pipeline.png b/docs/diagrams/ingestion-pipeline.png new file mode 100644 index 0000000..92783ca Binary files /dev/null and b/docs/diagrams/ingestion-pipeline.png differ diff --git a/docs/diagrams/ingestion-pipeline.puml b/docs/diagrams/ingestion-pipeline.puml new file mode 100644 index 0000000..8265b3d --- /dev/null +++ b/docs/diagrams/ingestion-pipeline.puml @@ -0,0 +1,32 @@ +@startuml ingestion-pipeline +!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Container.puml + +LAYOUT_LANDSCAPE() +LAYOUT_WITH_LEGEND() + +title Customer-Facing Logs — Ingestion Pipeline + +System_Ext(ai_edge, "AI Edge Data Plane", "Envoy + WAF sidecar. HTTPProxy controller stamps org_id, project_id, and resource identity into Envoy xDS access log config at reconcile time — no per-record catalog lookup.") + +System_Ext(compute, "Compute Workloads", "Unikraft VMs on Kraftlet nodes. OTel Collector DaemonSet (contrib) scrapes /var/log/pods via filelog receiver. k8sattributes processor reads datum.org_id and datum.project_id from pod labels stamped by unikraft-provider.") + +System_Ext(catalog, "Telemetry Catalog", "LogDefinition and MonitoredResourceType CRDs published by the telemetry-services-operator. Defines legal log IDs and resource attribute vocabulary per resource type.") + +System_Boundary(edge, "Edge Cluster") { + Container(gateway, "OTLP Gateway", "Go service", "Validates org_id and project_id are present. Validates log_id against cached LogDefinition (Kubernetes informer, no API call per record). Validates resource attribute vocabulary. Publishes one NATS message per record.") + ContainerQueue(nats_edge, "NATS Edge", "NATS leaf node", "Buffers records; forwards to hub via leaf node connection. Provides backpressure if hub is slow.") +} + +System_Boundary(hub, "Hub Cluster") { + ContainerQueue(nats_hub, "NATS Hub", "NATS JetStream", "Durable stream per subject telemetry.logs.... ClickHouse pulls directly; live-tail handler subscribes independently.") + ContainerDb(clickhouse, "ClickHouse", "platform_logs (MergeTree)", "logs_ingest table uses ENGINE=NATS to pull from JetStream. Materialized view extracts columns into platform_logs. Partitioned by (org_id, month). Sorted by (org_id, project_id, resource_type, resource_name, log_id, timestamp). TTL 7 days.") +} + +Rel(ai_edge, gateway, "OTLP/HTTP log records", "OTLP") +Rel(compute, gateway, "OTLP/HTTP log records", "OTLP") +Rel(gateway, catalog, "Validate log_id and attribute vocabulary", "informer cache") +Rel(gateway, nats_edge, "Publish one message per record", "NATS") +Rel(nats_edge, nats_hub, "Leaf node forwarding", "NATS") +Rel(nats_hub, clickhouse, "Pull via NATS engine table", "JetStream") + +@enduml diff --git a/docs/diagrams/query-path.png b/docs/diagrams/query-path.png new file mode 100644 index 0000000..01ee228 Binary files /dev/null and b/docs/diagrams/query-path.png differ diff --git a/docs/diagrams/query-path.puml b/docs/diagrams/query-path.puml new file mode 100644 index 0000000..c26a3de --- /dev/null +++ b/docs/diagrams/query-path.puml @@ -0,0 +1,32 @@ +@startuml query-path +!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Container.puml + +LAYOUT_LANDSCAPE() +LAYOUT_WITH_LEGEND() + +title Customer-Facing Logs — Query Path + +Person(customer, "Customer", "Project member. Authenticated via Milo IAM.") +Person(datum_ops, "Datum Ops", "Internal Datum staff. Fleet-wide access.") + +Container(datumctl, "datumctl", "CLI", "kubectl-style log streaming and querying. Calls the structured HTTP API.") + +System_Boundary(query_layer, "Query Layer") { + Container(query_api, "Query API", "Go service", "POST /telemetry/v1/query — structured JSON filter translated to ClickHouse SQL. POST /telemetry/v1/tail — WebSocket; subscribes to NATS subject constructed from auth context. GET /telemetry/v1/log-definitions — catalog-backed label discovery.") +} + +System_Boundary(hub, "Hub Cluster") { + ContainerQueue(nats_hub, "NATS Hub", "NATS JetStream", "Live tail source. Handler subscribes to telemetry.logs...> — org_id and project_id are encoded in the subject, not filtered post-subscription.") + ContainerDb(clickhouse, "ClickHouse", "platform_logs (MergeTree)", "Row policy for api_reader role: org_id = getSetting('org_id') AND project_id = getSetting('project_id'). Cross-tenant reads are structurally impossible. Privileged role for Datum ops bypasses row policy.") +} + +Container(grafana, "Grafana (internal)", "Datum ops tooling in datum-cloud/infra", "ClickHouse datasource with privileged role. Fleet-wide dashboards and ad-hoc log queries. Not customer-facing.") + +Rel(customer, datumctl, "datumctl logs ...") +Rel(datumctl, query_api, "POST /query, POST /tail", "HTTPS") +Rel(query_api, clickhouse, "SQL with session settings org_id + project_id from auth context", "TCP/9000") +Rel(query_api, nats_hub, "Subscribe for live tail", "NATS") +Rel(datum_ops, grafana, "Fleet-wide log queries and dashboards") +Rel(grafana, clickhouse, "SQL with privileged role (bypasses row policy)", "TCP/9000") + +@enduml