diff --git a/Taskfile.yaml b/Taskfile.yaml index 45a19142..a34a3994 100644 --- a/Taskfile.yaml +++ b/Taskfile.yaml @@ -1,6 +1,10 @@ version: '3' includes: + # Documentation tasks + docs: + taskfile: ./docs/Taskfile.yaml + dir: ./docs dev: taskfile: ./Taskfile.dev.yaml @@ -77,4 +81,9 @@ tasks: echo "" echo "🎉 All Prometheus rule tests passed." fi - silent: false \ No newline at end of file + silent: false + + generate: + desc: Run all generation tasks (currently documentation artifacts) + deps: + - task: docs:generate \ No newline at end of file diff --git a/docs/Taskfile.yaml b/docs/Taskfile.yaml new file mode 100644 index 00000000..a526bfc8 --- /dev/null +++ b/docs/Taskfile.yaml @@ -0,0 +1,69 @@ +version: '3' + +vars: + DIAGRAMS_DIR: "{{.ROOT_DIR}}/docs/diagrams" + OUTPUT_FORMAT: "png" + PLANTUML_IMAGE: plantuml/plantuml:1.2026.4 + +tasks: + generate: + desc: Generate all documentation artifacts (diagrams, etc.) + cmds: + - task: diagrams:render + silent: true + + diagrams: + desc: Generate all architecture diagrams from PlantUML + cmds: + - task: diagrams:render + silent: true + + diagrams:render: + desc: Render PlantUML diagrams to PNG format using Docker + cmds: + - | + set -e + echo "Rendering PlantUML diagrams..." + echo "" + + # Check if PlantUML files exist + if ! 
ls {{.DIAGRAMS_DIR}}/*.puml >/dev/null 2>&1; then + echo "❌ Error: PlantUML source files (*.puml) not found in {{.DIAGRAMS_DIR}}" + exit 1 + fi + + # Render using Docker (no local installation required) + docker run --rm \ + -v "{{.DIAGRAMS_DIR}}":/data \ + {{.PLANTUML_IMAGE}} \ + -t{{.OUTPUT_FORMAT}} \ + /data/*.puml + + echo "" + echo "✅ Diagrams rendered in {{.DIAGRAMS_DIR}}" + echo "" + echo "Generated files:" + ls -1 {{.DIAGRAMS_DIR}}/*.{{.OUTPUT_FORMAT}} 2>/dev/null | xargs -n1 basename || echo "No output files found" + silent: true + + diagrams:clean: + desc: Remove generated diagram files + cmds: + - | + rm -f {{.DIAGRAMS_DIR}}/*.png {{.DIAGRAMS_DIR}}/*.svg + echo "✅ Generated diagram files removed" + silent: true + + diagrams:validate: + desc: Validate PlantUML syntax using Docker + cmds: + - | + set -e + echo "Validating PlantUML diagrams..." + docker run --rm \ + -v "{{.DIAGRAMS_DIR}}":/data \ + {{.PLANTUML_IMAGE}} \ + -syntax \ + /data/*.puml + echo "✅ All diagrams are valid" + silent: true diff --git a/docs/diagrams/http-metering-c4.png b/docs/diagrams/http-metering-c4.png new file mode 100644 index 00000000..c3c184d6 Binary files /dev/null and b/docs/diagrams/http-metering-c4.png differ diff --git a/docs/diagrams/http-metering-c4.puml b/docs/diagrams/http-metering-c4.puml new file mode 100644 index 00000000..33fda98a --- /dev/null +++ b/docs/diagrams/http-metering-c4.puml @@ -0,0 +1,25 @@ +@startuml http-metering-c4 +!include https://raw.githubusercontent.com/plantuml-stdlib/C4-PlantUML/master/C4_Container.puml + +LAYOUT_WITH_LEGEND() + +title C4 Container Diagram - HTTP Traffic Metering System + +Person(client, "End User / Client", "Requests services exposed via Datum Cloud Edge") + +System_Boundary(edge_cluster, "Edge Cluster") { + Container(envoy, "Envoy Gateway Proxy", "Envoy/Go", "Handles ingress HTTP traffic, terminates TLS, enforces WAF/rate-limiting, emits JSON access logs to stdout") + Container(vector_collector, 
"billing-usage-collector-vector", "Vector DaemonSet (Billing)", "Tails Envoy container logs, parses JSON access logs, translates to CloudEvents, and forwards them to the Billing System") + Container(nso, "Network Services Operator", "Go", "Deploys Envoy Gateway and configures EnvoyProxy logging policies") +} + +System_Boundary(control_plane, "Platform Control Plane") { + Container(billing_system, "Billing System & Service Catalog", "Platform Service", "Handles service registration, event validation, attribution, and storage") +} + +Rel(client, envoy, "Sends HTTPS requests to", "HTTPS") +Rel(nso, envoy, "Configures & manages", "Kubernetes API / EnvoyProxy CR") +Rel_D(envoy, vector_collector, "Outputs JSON access logs to", "stdout / container logs") +Rel_D(vector_collector, billing_system, "Forwards batched events to", "HTTPS CloudEvents") + +@enduml diff --git a/docs/diagrams/http-metering-sequence.png b/docs/diagrams/http-metering-sequence.png new file mode 100644 index 00000000..9691e563 Binary files /dev/null and b/docs/diagrams/http-metering-sequence.png differ diff --git a/docs/diagrams/http-metering-sequence.puml b/docs/diagrams/http-metering-sequence.puml new file mode 100644 index 00000000..a11424f2 --- /dev/null +++ b/docs/diagrams/http-metering-sequence.puml @@ -0,0 +1,35 @@ +@startuml http-metering-sequence +skinparam BoxPadding 10 +skinparam ParticipantPadding 10 + +actor Client as client +box "Edge Cluster Node" #LightBlue + participant "Envoy Gateway (Proxy)" as envoy + participant "billing-usage-collector-vector\n(DaemonSet)" as vector +end box + +box "Platform Control Plane" #LightYellow + participant "Billing System" as billing +end box + +client -> envoy : 1. HTTP Request (GET /path) +activate envoy +envoy -> envoy : Route lookup & WAF checks +envoy -> client : 2. HTTP Response (200 OK with Egress Bytes) +deactivate envoy + +note over envoy : Request completed +envoy -> vector : 3. 
Write structured JSON access log to stdout\n(contains bytes, duration, route name/namespace) +activate vector + +vector -> vector : Tail logs, parse JSON,\nand map to CloudEvent\n(No enrichment) + +vector -> billing : 4. Forward batched CloudEvents\n(HTTPS batch ingest) +activate billing + +billing -> billing : Validate, attribute, and persist +billing --> vector : 200 OK / 202 Accepted +deactivate billing +deactivate vector + +@enduml diff --git a/docs/enhancements/http-metering/http-traffic-metering.md b/docs/enhancements/http-metering/http-traffic-metering.md new file mode 100644 index 00000000..7df3a435 --- /dev/null +++ b/docs/enhancements/http-metering/http-traffic-metering.md @@ -0,0 +1,495 @@ +--- +status: provisional +stage: alpha +latest-milestone: "v0.x" +--- + +# HTTP Traffic Metering for Network Services + +- [Summary](#summary) +- [Motivation](#motivation) + - [Goals](#goals) + - [Non-Goals](#non-goals) +- [Proposal](#proposal) + - [Data and Control Flow Diagrams](#data-and-control-flow-diagrams) + - [User Stories](#user-stories) + - [Notes/Constraints/Caveats](#notesconstraintscaveats) + - [Risks and Mitigations](#risks-and-mitigations) +- [Design Details](#design-details) + - [Monitored Resource: HTTPRoute](#monitored-resource-httproute) + - [Service and ServiceConfiguration Definitions](#service-and-serviceconfiguration-definitions) + - [Kustomize Bundle Layout](#kustomize-bundle-layout) +- [Production Readiness Review Questionnaire](#production-readiness-review-questionnaire) + - [Feature Enablement and Rollback](#feature-enablement-and-rollback) + - [Rollout, Upgrade and Rollback Planning](#rollout-upgrade-and-rollback-planning) + - [Monitoring Requirements](#monitoring-requirements) + - [Dependencies](#dependencies) + - [Scalability](#scalability) + - [Troubleshooting](#troubleshooting) +- [Open Decisions](#open-decisions) +- [Implementation History](#implementation-history) +- [Drawbacks](#drawbacks) +- [Alternatives](#alternatives) + +## 
Summary + +Network Services operates an Envoy Gateway-based edge proxy that routes HTTP traffic, terminates TLS, and enforces WAF and rate-limit policies on behalf of platform customers. Today, it lacks a billing presence: there is no registration in the service catalog, no meter definitions, and no integration with the durable usage pipeline. + +This enhancement defines the architecture, data structures, and metadata required to bring HTTP traffic metering and service catalog registration to Network Services. Under this design: +- Network Services' identity and billable metrics are declared via a standard `Service` and companion `ServiceConfiguration` resource (`services.miloapis.com/v1alpha1`). +- Envoy Gateway proxies are instrumented to write structured JSON access logs, which are scraped, parsed, and forwarded as CloudEvents to the platform's billing pipeline by the existing `billing-usage-collector-vector` DaemonSet. + +## Motivation + +Network Services is a core utility that incurs direct infrastructure costs. Capturing consumption signals is necessary for platform billing and cost-attribution. + +Because `MeterDefinition` fields (such as `meterName` and `measurement.unit`) are immutable once published, establishing correct definitions in the `Draft`/`Provisional` phase is critical. Doing so avoids costly SDK upgrades, meter deprecation cycles, and data migrations. + +### Goals + +- Establish Network Services' identity in the platform service catalog to make it discoverable and activatable by platform consumers. +- Define clear, usage-based billing metrics for HTTP traffic (requests, bandwidth, and connection time) so customers pay proportionally to their consumption. +- Design a reliable, zero-data-loss telemetry collection path that ensures accurate billing without impacting proxy performance or request latency. +- Provide clear architectural visibility into the edge-to-billing data flow for platform operators. 
+ +### Non-Goals + +- **Pricing tiers, currencies, and billing cycle schedules:** This design only concerns measuring and reporting raw usage quantities. Determining pricing rates, tier discounts, billing schedules, and invoice calculations is out of scope. +- **Runtime traffic enforcement and quota limits:** Telemetry collection does not gate or throttle traffic. Rate limiting, WAF enforcement, and bandwidth capping remain governed by separate gateway policies, not by the billing pipeline. + +## Proposal + +We propose to register the service and implement HTTP traffic metering via access log scraping. + +- The **Monitored Resource** is the Kubernetes Gateway API `HTTPRoute` resource, representing the customer-facing HTTP endpoint. +- **Catalog Registration** is handled via declarative YAML configurations. The service catalog fan-out controller automatically creates `MonitoredResourceType` and `MeterDefinition` resources in the billing namespace. +- **Telemetry Emission** is handled by instrumenting the Envoy Gateway instances to write structured JSON access logs to stdout. The node-level `billing-usage-collector-vector` DaemonSet (already deployed as part of the billing pipeline) tails these log files directly, parses and maps the raw logs into CloudEvents, handles local disk buffering, and reliably forwards them to the central Billing System. 
+ +### Data and Control Flow Diagrams + +#### System Container Architecture (C4 Container Diagram) + +The following diagram outlines the relationship between the Edge Cluster components and the Platform Control Plane Billing Pipeline: + +![C4 Diagram](../../diagrams/http-metering-c4.png) + +#### Operational Data Flow (Sequence Diagram) + +The sequence below details the path of a client request, the generation of the telemetry log, and its propagation to the billing pipeline: + +![Sequence Diagram](../../diagrams/http-metering-sequence.png) + +### User Stories + +#### Story 1 +As a platform customer, I want to deploy `HTTPRoute` resources to expose my services and pay only for the volume of requests and bandwidth my application consumes. + +#### Story 2 +As a platform administrator, I want to track active connections and bytes transferred per HTTPRoute in the billing portal to manage capacity and attribute costs to projects. + +### Notes/Constraints/Caveats + +- **Log Schema Locking:** Since Vector relies on parsing the Envoy stdout logs, any changes to the Envoy access log format must be carefully managed to avoid breaking the parsing pipeline. +- **Persistent Connections:** Traditional access logs only print a line once the request/connection finishes. For long-running connections (e.g. WebSockets), connection seconds are computed upon termination. We need to evaluate whether periodic sample-based heartbeats are required for extremely long connections (days/weeks). + +### Risks and Mitigations + +- **Risk:** High traffic volume (e.g. 10k+ requests per second) generating a high rate of log output, consuming excessive disk I/O and CPU. + - *Mitigation:* Vector Agent performs on-node log parsing, batching, and compression, reducing transport overhead. Ingestion Gateways will be scaled horizontally to handle load peaks. +- **Risk:** Node restarts resulting in the loss of in-flight logs. 
+ - *Mitigation:* Standard Docker/Kubernetes container runtimes persist log files on the host disk, allowing the Vector Agent to resume tailing from the last read offset upon restart. + +--- + +## Design Details + +### Monitored Resource: HTTPRoute + +Rather than tracking usage at the `Gateway` or `HTTPProxy` level, we meter consumption per **`HTTPRoute`**. This is because: +1. It maps directly to a customer's specific application service surface. +2. It separates L7 (HTTP) billing from future L4 (TCP/UDP) billing, which would be represented by `TCPRoute` or `UDPRoute` resources. + +We define four billing metrics for `HTTPRoute` resources: + +| Signal | Proposed Metric Name | Unit (UCUM) | Metric Kind | +|--------|----------------------|-------------|-------------| +| Request count | `networking.datumapis.com/gateway/requests` | `{request}` | Delta | +| Egress bytes | `networking.datumapis.com/gateway/egress-bytes` | `By` | Delta | +| Ingress bytes | `networking.datumapis.com/gateway/ingress-bytes` | `By` | Delta | +| Connection seconds | `networking.datumapis.com/gateway/connection-seconds` | `s` | Delta | + +#### Dimensions + +Each metric will record the following dimensions: +- `region`: Deployment region. +- `gateway`: Name of the parent Gateway. +- `gateway_namespace`: Namespace of the parent Gateway. +- `gateway_class`: Underlying GatewayClass (for pricing class differentiation). +- `httproute_name`: The `HTTPRoute` resource name. +- `httproute_namespace`: The `HTTPRoute` namespace. +- `project_name`: Human-readable name of the project that owns the route (see [Surfacing Signals from the Edge](#surfacing-signals-from-the-edge)). + +--- + +### Surfacing Signals from the Edge + +![HTTP Metering Signal Pipeline](./signal-pipeline.png) + +All metering signals originate from the **edge cluster**, where the Envoy +Gateway proxies (`datum-downstream-gateway`) actually serve customer traffic. 
+There is no central collection point that observes individual requests — the +proxy is the only component that sees each request, so the signal must be +captured, enriched, and emitted at the edge before being forwarded to the +central Billing System. + +The raw access log already carries everything the meters need *except* one +thing: the `route_name` field identifies the owning project only by its +control-plane namespace UID (e.g. `ns-<uid>`), not by the +human-readable project name. To populate the `project_name` dimension, three +components collaborate at the edge: + +1. **Extension Server (xDS mutation).** The NSO extension server implements + `ApplyTPPRouteConfig` in `internal/extensionserver/mutate/tpp.go`. During + each xDS route-config build, it iterates every VirtualHost owned by NSO and + calls `injectProjectNameMetadata` on every route, which writes the resolved + `project_name` string directly into + `filter_metadata["datum-gateway"]["project_name"]` on the Envoy + `RouteConfiguration` proto. This happens for every NSO-owned route + regardless of whether a `TrafficProtectionPolicy` governs it — WAF config is + an optional overlay on top of the metadata that is always stamped. The + project name is sourced from `idx.ProjectNames[dsNS]`, the + downstream-namespace → project-name mapping the operator maintains in its + policy index. + +2. **Envoy access log format.** The `EnvoyProxy` access log JSON format + includes a `project_name` field read from the xDS route metadata: + `project_name: "%METADATA(ROUTE:datum-gateway:project_name)%"`. Because the + extension server stamps the metadata into the xDS route before any request is + served, every logged request for a customer route carries the resolved + project name. (`%METADATA(ROUTE:...)%` is used because the name lives in xDS + route metadata — it is not a per-request value and does not need to travel as + a header.) + +3. 
**Vector billing collector.** The `billing-usage-collector-vector` VRL + transform reads the `project_name` field from each parsed access log line + and adds it as a dimension on all four emitted CloudEvents (requests, + ingress-bytes, egress-bytes, connection-seconds). An absent or Envoy-default + `"-"` value is normalized to an empty string so unmatched routes do not + pollute the dimension. + +This keeps the entire signal path — xDS route enrichment, log emission, +parsing, and CloudEvent forwarding — co-located on the edge cluster. + +#### Transport: how access logs reach Vector + +The access log line travels from the Envoy proxy to the +`billing-usage-collector-vector` agent via the **File sink (stdout) + +`kubernetes_logs`** approach: Envoy writes JSON to `/dev/stdout` (the `File` +sink configured on `datum-downstream-gateway`) and the node-local Vector +DaemonSet tails the container log file via its `kubernetes_logs` source. + +--- + +### Service and ServiceConfiguration Definitions + +#### `service.yaml` +```yaml +apiVersion: services.miloapis.com/v1alpha1 +kind: Service +metadata: + name: networking-datumapis-com + labels: + app.kubernetes.io/name: network-services-operator + app.kubernetes.io/managed-by: kustomize +spec: + serviceName: networking.datumapis.com + phase: Draft + displayName: Network Services + description: | + Managed HTTP/HTTPS edge proxy, routing, and traffic protection + for Datum Cloud workloads. Provides programmable ingress via + Gateway API HTTPRoute and WAF/rate-limit policies. Billed via + the networking.datumapis.com/gateway meter family. 
+ owner: + producerProjectRef: + name: datum-cloud +``` + +#### `serviceconfiguration.yaml` +```yaml +apiVersion: services.miloapis.com/v1alpha1 +kind: ServiceConfiguration +metadata: + name: networking-datumapis-com +spec: + phase: Draft + serviceRef: + name: networking-datumapis-com + monitoredResourceTypes: + - type: networking.datumapis.com/HTTPRoute + displayName: HTTP Route + description: | + A customer-defined HTTP routing configuration attached to a managed + Gateway. One HTTPRoute represents a single HTTP service surface on + the Datum Cloud edge. Usage events cover proxied request count, + egress bytes, ingress bytes, and active connection seconds. + gvk: + group: gateway.networking.k8s.io + kind: HTTPRoute + labels: + - name: region + description: Datum deployment region serving the requests. + - name: gateway + description: Name of the underlying Gateway resource. + - name: gateway_namespace + description: Namespace of the underlying Gateway resource. + - name: gateway_class + description: GatewayClass of the underlying Gateway. + - name: httproute_name + description: Name of the HTTPRoute that served the request. + - name: httproute_namespace + description: Namespace of the HTTPRoute that served the request. + metrics: + - name: networking.datumapis.com/gateway/requests + displayName: HTTP Route Requests + description: HTTP requests proxied through the route. + kind: Delta + unit: "{request}" + dimensions: + - region + - gateway + - gateway_namespace + - gateway_class + - httproute_name + - httproute_namespace + - name: networking.datumapis.com/gateway/egress-bytes + displayName: HTTP Route Egress Bytes + description: Bytes sent downstream by the route. + kind: Delta + unit: By + dimensions: + - region + - gateway + - gateway_namespace + - gateway_class + - httproute_name + - httproute_namespace + - name: networking.datumapis.com/gateway/ingress-bytes + displayName: HTTP Route Ingress Bytes + description: Bytes received from clients by the route. 
+ kind: Delta + unit: By + dimensions: + - region + - gateway + - gateway_namespace + - gateway_class + - httproute_name + - httproute_namespace + - name: networking.datumapis.com/gateway/connection-seconds + displayName: HTTP Route Connection Seconds + description: Seconds long-lived connections (e.g. WebSocket) are held open. + kind: Delta + unit: s + dimensions: + - region + - gateway + - gateway_namespace + - gateway_class + - httproute_name + - httproute_namespace + billing: + consumerDestinations: + - monitoredResourceType: networking.datumapis.com/HTTPRoute + metrics: + - networking.datumapis.com/gateway/requests + - networking.datumapis.com/gateway/egress-bytes + - networking.datumapis.com/gateway/ingress-bytes + - networking.datumapis.com/gateway/connection-seconds +``` + +### Kustomize Bundle Layout + +We package these files following the project-standard layout pattern: + +``` +config/services/ + kustomization.yaml + networking.datumapis.com/ + kustomization.yaml + service.yaml + serviceconfiguration.yaml + README.md +``` + +--- + +## Production Readiness Review Questionnaire + +### Feature Enablement and Rollback + +#### How can this feature be enabled / disabled in a live cluster? +- **Other** + - Describe the mechanism: Catalog registration is enabled by deploying the `Service` and `ServiceConfiguration` manifests. Telemetry emission is enabled by configuring the Envoy Gateway access logs via `EnvoyProxy` and updating the `billing-usage-collector-vector` DaemonSet configuration. + - Will enabling / disabling the feature require downtime of the control plane? No. + - Will enabling / disabling the feature require downtime or reprovisioning of a node? No, Envoy Gateway supports dynamic configuration updates without dropping active traffic. + +#### Does enabling the feature change any default behavior? +No, it only adds background logging and telemetric forwarding. + +#### Can the feature be disabled once it has been enabled (i.e. 
can we roll back the enablement)? +Yes, reverting the `EnvoyProxy` configuration to its previous state disables access logging. + +#### What happens if we reenable the feature if it was previously rolled back? +Logging resumes, and `billing-usage-collector-vector` resumes parsing from the end of the log stream. + +--- + +### Rollout, Upgrade and Rollback Planning + +#### How can a rollout or rollback fail? Can it impact already running workloads? +Rollouts do not affect traffic handling directly. A malformed access log format patch could cause Vector parsing errors, leading to missing billing data but leaving request routing functional. + +#### What specific metrics should inform a rollback? +- Vector agent parsing error rates (`vector_transform_errors_total`). +- Billing Ingestion Gateway event rejection rate. + +#### Were upgrade and rollback tested? Was the upgrade->downgrade->upgrade path tested? +TBD during telemetry emission implementation. + +--- + +### Monitoring Requirements + +#### How can an operator determine if the feature is in use by workloads? +By checking the presence of incoming CloudEvents at the Billing Ingestion Gateway carrying the `networking.datumapis.com` source domain. + +#### How can someone using this feature know that it is working for their instance? +By checking the customer billing dashboard or querying the billing API for resource usage statistics. + +#### What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service? +- **Metrics** + - Metric name: `vector_transform_errors_total`, `ingestion_gateway_request_count` (with status codes). + - Components exposing the metric: `billing-usage-collector-vector`, Billing Ingestion Gateway. + +--- + +### Dependencies + +#### Does this feature depend on any specific services running in the cluster? 
+- **`billing-usage-collector-vector` DaemonSet:** Tails Envoy container logs, parses JSON access logs, translates to CloudEvents, and forwards events to the Billing System. + +--- + +### Scalability + +#### Will enabling / using this feature result in any new API calls? +- Logs are written locally to stdout; there are no new Kube API calls for logging. +- `billing-usage-collector-vector` performs HTTPS batch POST requests to the Billing System. Throughput scales linearly with request volume. + +#### Will enabling / using this feature result in introducing new API types? +No new Go-level types are introduced in the operator. `Service` and `ServiceConfiguration` are existing types in the platform's service catalog. + +--- + +### Troubleshooting + +#### How does this feature react if the API server is unavailable? +Telemetry generation and log scraping are independent of the Kubernetes API server. `billing-usage-collector-vector` will continue to tail files and forward events. + +#### What are other known failure modes? +- **Vector pipeline backlog:** If the Ingestion Gateway/Billing System is slow, `billing-usage-collector-vector` buffers events locally on the node disk. +- **Log rotation race:** Very high traffic might trigger rapid log rotation, which could cause minor data loss if the collector falls too far behind. + +## Open Decisions + +The following decisions are tracked for the implementation of this enhancement: + +| ID | Question | Status | Resolution | +|----|----------|--------|------------| +| OD-1 | How does the service catalog express the monitored resource? | **Resolved** | Via `services.miloapis.com/v1alpha1/ServiceConfiguration` carrying `spec.monitoredResourceTypes[]`, `spec.metrics[]` and `spec.billing` inline. The fan-out controller produces the `MonitoredResourceType` and meter resources in the billing namespace. | +| OD-2 | Canonical `serviceName` | **Resolved** | `networking.datumapis.com`. 
| +| OD-3 | `producerProjectRef.name` | **Resolved** | `datum-cloud`. | +| OD-4 | Bundle layout | **Resolved** | Per-service-domain directory under `config/services/networking.datumapis.com/`, matching `datum-cloud/datum/config/services/<service-domain>/`. | +| OD-5 | Is the Vector Agent DaemonSet planned to run on the edge cluster nodes that host Envoy Gateway pods? | **Resolved** | Yes. The shared platform `billing-usage-collector-vector` runs as a DaemonSet in the `billing-system` namespace. Under this design, this pre-existing agent will directly tail and parse the Envoy stdout logs, avoiding the need for a separate custom log-parsing agent. | +| OD-6 | Can the network-services-operator patch the `EnvoyProxy` CR to inject access log configuration? | **Resolved** | Yes. NSO configures and manages the Envoy Gateway proxies and can patch EnvoyProxy resources to enable structured JSON stdout logging. | +| OD-7 | Is the billing SDK published as a consumable Go module? | **N/A** | We do not compile or use the Billing Go SDK for proxy traffic metering. Instead, the `billing-usage-collector-vector` DaemonSet directly parses raw Envoy stdout logs, formats them into CloudEvents, and forwards them. | +| OD-8 | Enrichment-sidecar placement: per-node alongside Vector, or central in front of the Ingestion Gateway? | **N/A** | At this stage, we will not enrich the event information with additional control-plane data. The `billing-usage-collector-vector` DaemonSet will only parse the raw properties from the JSON logs and map them directly to the CloudEvent schema. | + +--- + +## Implementation History + +- **2026-06-10:** Refactored design brief into standard enhancement proposal, reframing the goal around HTTP Traffic Metering and adding C4 and sequence diagrams. + +--- + +## Drawbacks + +- **Log Volume Overhead:** Emitting and parsing logs for every request adds disk I/O and CPU load on the node. 
+- **Complexity of Connection Seconds:** Persisting connection duration for WebSockets requires tracking state until connection termination. + +--- + +## Alternatives + +### Access Log Transport: File Sink vs OTLP Sink + +The signal-collection design above is independent of *how* the Envoy access log +line reaches the `billing-usage-collector-vector` agent. Two transports were +evaluated: + +#### Option A1: File sink (stdout) + Vector `kubernetes_logs` (baseline) + +Envoy keeps its existing `File` access log sink writing JSON to `/dev/stdout`. +The container runtime persists this to the node's container log files, and +Vector tails them via a `kubernetes_logs` source. + +- *Pros:* No new ports or network hops; reuses the standard Kubernetes log + collection pattern; the `File` sink is already present in the base + `EnvoyProxy`; logs survive on disk if Vector is briefly down (checkpointed + tailing). +- *Cons:* Requires Vector to run as a **per-node DaemonSet co-located** with the + Envoy pod, because `kubernetes_logs` can only read the node it runs on. This + holds on **edge** clusters (Vector and Envoy are both DaemonSets), but breaks + where the billing Vector runs as a **Stateless-Aggregator** (staging/prod), as + a single aggregator pod cannot tail Envoy stdout on other nodes. It also needs + a `kubernetes_logs` source plus a ClusterRole for pod metadata, pod-label + filtering to avoid ingesting unrelated containers, and a `parse_json(.message)` + step. + +#### Option A2: OpenTelemetry (OTLP) sink (implemented in draft PR) + +Envoy adds an `OpenTelemetry` access log sink alongside the existing `File` +sink, pushing access logs directly to Vector's OTLP receiver +(`opentelemetry` source, gRPC :4317). The JSON fields arrive as OTLP +log-record attributes, which the VRL transform normalizes to top-level fields. 
+ +- *Pros:* **Topology-independent** — works identically whether Vector is a + DaemonSet or an aggregator, since it targets the Vector Service DNS and lets + kube-proxy route. No `kubernetes_logs` source, no ClusterRole, no + per-container filtering, no re-parsing of a stringified message. An OTel + resource attribute (`service.name: nso-httproute-signals`) tags the stream. +- *Cons:* Adds a network sink and OTLP ports to the Vector Service; introduces a + push dependency (mitigated by keeping the `File` sink in parallel as a + fallback / for debugging). + +This transport is implemented in a draft PR: + + +PR: + + + +### Option B: Prometheus Scrape Delta Calculation +Run an operator loop that polls the Envoy `/stats` endpoint periodically. +- *Rejected because:* Loses per-request granularity and increases NSO statefulness/risk of double-counting. + +### Option C: gRPC Metrics Sink (OTel OTLP) +Configure Envoy's OTel metrics sink to push counters via gRPC OTLP to a platform-deployed OTel Collector. +- *Rejected because:* The billing pipeline design document explicitly rejects the OTLP path, since OTLP does not structurally enforce the ULID dedup key or the billing entity (`subject`). These fields would become opaque string attributes that must survive the OTLP round-trip intact. + +### Option D: WASM Filter Hook +Inject a Custom WASM Filter to accumulate and batch statistics within the proxy. +- *Rejected because:* Introduces build-time overhead (a separate Rust WASM repository/OCI image) and increases operational complexity compared to using standard Vector log parsing. 
+ diff --git a/docs/enhancements/http-metering/signal-pipeline.png b/docs/enhancements/http-metering/signal-pipeline.png new file mode 100644 index 00000000..192e4030 Binary files /dev/null and b/docs/enhancements/http-metering/signal-pipeline.png differ diff --git a/docs/enhancements/http-metering/signal-pipeline.puml b/docs/enhancements/http-metering/signal-pipeline.puml new file mode 100644 index 00000000..c3e01aae --- /dev/null +++ b/docs/enhancements/http-metering/signal-pipeline.puml @@ -0,0 +1,56 @@ +@startuml signal-pipeline +skinparam sequenceArrowThickness 1.5 +skinparam sequenceGroupBorderColor #888888 +skinparam sequenceGroupFontColor #444444 +skinparam sequenceGroupBackgroundColor #F8F8F8 +skinparam participantBackgroundColor #DDEEFF +skinparam participantBorderColor #336699 +skinparam noteBorderColor #AAAAAA +skinparam noteBackgroundColor #FFFDE7 +skinparam defaultFontName "Helvetica" +skinparam defaultFontSize 12 + +title HTTP Metering Signal Pipeline + +participant "Envoy Gateway\n(Control Plane)" as EG +participant "NSO\nExtension Server" as EXT <> +participant "Envoy Proxy\n(Data Plane)\n/dev/stdout" as ENV +participant "Vector Agent\n(DaemonSet)\nkubernetes_logs" as VEC <> +participant "Billing Gateway\n(HTTP sink)" as BG + +== xDS Build (once per HTTPRoute change) == + +EG -> EXT : PostRouteModifyHook\n(RouteConfiguration) +note right of EXT + ApplyTPPRouteConfig iterates\nevery NSO-owned VirtualHost/route + and calls injectProjectNameMetadata,\nwriting project_name into + filter_metadata["datum-gateway"] + on each Envoy route proto. +end note +EXT -> EG : mutated RouteConfiguration\n(route.metadata.filter_metadata\n["datum-gateway"]["project_name"]) +EG -> ENV : xDS push — routes now carry\nproject_name in filter_metadata + +== Per-Request (data plane) == + +?-> ENV : HTTP request +activate ENV +ENV -> ENV : Serve request,\nwrite JSON access log to stdout\n(File sink, path: /dev/stdout)\nincl. 
METADATA(ROUTE:datum-gateway:project_name) +ENV -->?: HTTP response +deactivate ENV + +VEC -> ENV : tail container log file\n/var/log/pods/.../envoy/0.log\n(kubernetes_logs, gid=0) +note right of VEC + VRL transform: + 1. parse_json(.message) → top-level fields + 2. abort if route_name !~ httproute// + 3. derive subject = "project_name" + 4. emit 4 CloudEvents: + • requests = "1" + • ingress-bytes = bytes_received + • egress-bytes = bytes_sent + • connection-secs = duration_ms / 1000 + Each event carries dimensions including project_name. +end note +VEC -> BG : POST /v1/usage/events:batchIngest\n[4 CloudEvents, JSON body] + +@enduml diff --git a/docs/enhancements/service-catalog-registration.md b/docs/enhancements/service-catalog-registration.md deleted file mode 100644 index a88bf298..00000000 --- a/docs/enhancements/service-catalog-registration.md +++ /dev/null @@ -1,701 +0,0 @@ ---- -status: draft -stage: awaiting-sign-off -created: 2026-05-11 -github_issue: https://github.com/datum-cloud/network-services-operator/issues/155 ---- - -# Design Brief: Register Network Services in the Service Catalog - -- [Executive Summary](#executive-summary) -- [Problem Statement](#problem-statement) -- [Scope](#scope) -- [Billing Signals](#billing-signals) -- [Monitored Resource](#monitored-resource) -- [Service Registration](#service-registration) -- [Kustomize Bundle Layout](#kustomize-bundle-layout) -- [Usage Pipeline Integration — Investigation Findings](#usage-pipeline-integration--investigation-findings) -- [Open Decisions](#open-decisions) -- [Acceptance Criteria](#acceptance-criteria) -- [Implementation Plan](#implementation-plan) -- [References](#references) - ---- - -## Executive Summary - -Network Services operates an Envoy Gateway-based edge proxy that routes HTTP -traffic, terminates TLS, and enforces WAF and rate-limit policies on behalf of -platform customers. 
It currently has no billing presence — no `Service` -registration, no meter definitions, and no integration with the durable usage -pipeline. - -This brief defines the scope, approach, and open decisions for registering -Network Services in the platform service catalog and putting it on a path to -usage-based billing. - -The work splits cleanly into two independently deliverable phases: - -- **Phase 1 (catalog):** Declare a `Service` resource and a companion - `ServiceConfiguration` resource (`services.miloapis.com/v1alpha1`) that - carries the monitored-resource and meter declarations inline. No Go code - changes — pure YAML, packaged in `config/services/`. The - `MonitoredResourceType` and meter resources are produced by the catalog - fan-out controller from `ServiceConfiguration`; we do not author them - directly. -- **Phase 2 (emission):** Wire the edge proxy's telemetry into the billing SDK - and emit usage events to the Vector Agent. Requires the investigation findings - below to be confirmed before implementation begins. - -This brief covers both phases but distinguishes which decisions are resolved and -which require sign-off. - ---- - -## Problem Statement - -Network Services is a billable service with measurable consumption signals. Today -those signals are captured only as operational Prometheus metrics — there is no -path from those metrics to a billing account, and there is no `Service` resource -that makes Network Services discoverable in the platform catalog. - -Addressing this before any customer-facing billing launch is materially cheaper -than retrofitting metering after the fact. The `MeterDefinition` `meterName` and -`measurement.unit` fields are immutable once the resource reaches `Published` -phase. Getting them right at `Draft` costs a YAML PR. Getting them wrong costs a -new meter family, an updated SDK, and a customer migration. - ---- - -## Scope - -### In scope - -- `services.miloapis.com/v1alpha1` `Service` resource for Network Services. 
-- `services.miloapis.com/v1alpha1` `ServiceConfiguration` resource declaring - the monitored resource (`HTTPRoute`) and the four meters inline. -- `config/services/` Kustomize bundle in this repo, following the pattern - established in `datum-cloud/datum/config/services//`. -- An investigation document (this brief) covering how to extract the four - billing signals from the Envoy Gateway proxy and emit them via the billing SDK. - -### Out of scope - -- Rates, pricing tiers, or invoice generation. -- Changes to the `MeterDefinition` schema or billing pipeline contract. -- The billing SDK implementation (owned by `datum-cloud/billing`). -- Changes to Envoy Gateway's deployed configuration before the investigation - findings are confirmed. -- Cross-project billing or shared-infrastructure cost attribution. -- Promotion of any resource from `Draft` to `Published` phase (that step is - gated on billing-team sign-off and is a separate PR). - ---- - -## Billing Signals - -These four signals are the primary consumption dimensions for an HTTP edge proxy. -They are listed in order of commercial priority. - -| Signal | Proposed meterName | Unit (UCUM) | Aggregation | -|--------|--------------------|-------------|-------------| -| Request count | `networking.datumapis.com/gateway/requests` | `{request}` | Sum | -| Egress bytes | `networking.datumapis.com/gateway/egress-bytes` | `By` | Sum | -| Ingress bytes | `networking.datumapis.com/gateway/ingress-bytes` | `By` | Sum | -| Active connection seconds | `networking.datumapis.com/gateway/connection-seconds` | `s` | Sum | - -### Rationale - -**Request count** is the standard billing unit for reverse-proxy services -(Cloudflare, Fastly, AWS CloudFront). It directly captures platform work and is -the unit customers most naturally reason about. It is also the signal least -likely to produce billing surprises. - -**Egress bytes** captures bandwidth cost. 
It is standard in the hyperscaler -billing model and maps directly to infrastructure cost. Metering ingress and -egress separately gives the platform flexibility to price them differently — or -not at all — without introducing a new meter. - -**Connection seconds** is needed for persistent connections (WebSocket, -gRPC-streaming, long-poll). A customer running a WebSocket-heavy workload has a -cost profile that request-count billing significantly underrepresents. - -### Dimensions - -Each meter carries the following optional dimensions for cost attribution and -future pricing tiers: - -- `region` — Datum deployment region serving the request. -- `gateway` — The `Gateway` resource name, enabling per-gateway cost drill-down. -- `gateway_namespace` — Namespace of the `Gateway` resource, so gateways with - the same name in different namespaces are distinguishable. -- `gateway_class` — The `GatewayClass` of the underlying gateway, so different - gateway classes can be priced or analyzed independently. -- `httproute_name` — Name of the `HTTPRoute` that served the request, so usage - is traceable back to the specific route resource. -- `httproute_namespace` — Namespace of the `HTTPRoute`, completing the - cross-namespace identifier alongside `httproute_name`. - -Dimensions are declared as optional (not required) so events from proxies that -cannot populate them are not rejected at the Ingestion Gateway. - ---- - -## Monitored Resource - -The billable Kind is the **`HTTPRoute`**, not `HTTPProxy` and not `Gateway`. -Rationale: - -- An HTTP request is what is actually being processed and billed. -- `Gateway` is the underlying implementation; advanced consumers may interact - with it directly, but it is not the natural per-request billing unit. -- `HTTPRoute` is the resource customers attach to a `Gateway` to describe the - HTTP traffic they want routed — it is the closest model of "an HTTP service - the customer has stood up." 
-- Modelling the billable resource at the route layer leaves room for separate - `TCPRoute` and `UDPRoute` monitored resources later, with distinct meter - families if TCP/UDP traffic ends up priced differently from HTTP (which is - common practice — see below). - -### Provider precedent: HTTP vs TCP/UDP billing - -A survey of the major edge / load-balancing products supports modelling HTTP -and TCP/UDP as separate billable resources with separate meter families: - -| Provider | L7 product | L7 units | L4 product | L4 units | -|---|---|---|---|---| -| AWS | ALB | hourly + LCU (max of new conns/sec, **active conns**, GB, rule evals) | NLB | hourly + NLCU (max of new conns, **active conns**, GB) — distinct thresholds per TCP/UDP/TLS | -| GCP | External Application LB | forwarding rule $/hr + $/GB | External Network LB | forwarding rule $/hr + $/GB (passthrough reduced/free) | -| Cloudflare | Workers / CDN | $/million requests + CPU-ms | Spectrum | $/GB over plan allowance | -| Azure | Application Gateway | hourly + Capacity Units (compute, **persistent connections**, throughput) | Standard LB | $/hr per rule + $0.005/GB | -| Fastly | Delivery | $/10k requests + $/GB egress | (no public L4 product) | — | - -Three patterns recur: - -1. **Separate SKUs with separate unit names.** L7 and L4 are essentially always - different products with different billing dimensions. The only mainstream - exception is AWS Global Accelerator, deliberately positioned as - protocol-agnostic anycast transport. -2. **L7 prices the work; L4 prices the pipe.** L7 SKUs expose request, rule- - evaluation, or CPU-second dimensions. L4 SKUs collapse to bytes and - connections. -3. **Active / persistent connections is a first-class billing dimension when - long-lived connections matter, but normalized differently per protocol.** - AWS uses 3 k active/LCU on ALB vs 100 k active/NLCU on NLB precisely because - L4 connections are cheaper to hold open. 
Azure App Gateway exposes - "persistent connections" as one of its Capacity Unit dimensions. - -Implications for this design: - -- HTTPRoute meters stay request- and rule-evaluation-oriented (our four - signals fit cleanly). -- Future TCPRoute / UDPRoute meters should be byte- and connection-oriented, - with no "request count" meter. -- Connection-seconds is a legitimate cross-layer signal but, if it ever feeds - pricing, should be calibrated with different thresholds per protocol. -- A single shared meter family covering HTTP and TCP/UDP is the minority - pattern and only makes sense for L4-agnostic transport — not an - Envoy-Gateway L7-aware proxy. - -Sources: AWS ELB pricing, AWS Global Accelerator pricing, GCP Cloud Load -Balancing pricing, Cloudflare Spectrum billing docs, Cloudflare Workers -pricing, Azure Application Gateway and Standard LB pricing, Azure Front Door -billing, Fastly pricing. - -**The `MonitoredResourceType` is not shipped as a standalone YAML in this -bundle.** The service catalog's `ServiceConfiguration` resource -(`services.miloapis.com/v1alpha1`) is the source of truth — it carries both -the monitored-resource declarations and the meter declarations inline. A -controller fans out from `ServiceConfiguration` to the corresponding -`billing.miloapis.com/v1alpha1/MonitoredResourceType` (and to meter resources) -in the billing namespace. - -This pattern is already in production for the `compute` service. Its -`ServiceConfiguration` carries `spec.monitoredResourceTypes[]` and -`spec.meters[]` inline, and the fan-out creates `MonitoredResourceType`s like -`compute-datumapis-com-instance`. We follow the same pattern for Network -Services. 
- -Proposed `ServiceConfiguration` shape: - -```yaml -apiVersion: services.miloapis.com/v1alpha1 -kind: ServiceConfiguration -metadata: - name: networking-datumapis-com -spec: - phase: Draft - serviceRef: - name: networking-datumapis-com - monitoredResourceTypes: - - type: networking.datumapis.com/HTTPRoute - displayName: HTTP Route - description: | - A customer-defined HTTP routing configuration attached to a managed - Gateway. One HTTPRoute represents a single HTTP service surface on - the Datum Cloud edge. Usage events cover proxied request count, - egress bytes, ingress bytes, and active connection seconds. - gvk: - group: gateway.networking.k8s.io - kind: HTTPRoute - labels: - - name: region - description: Datum deployment region serving the requests. - - name: gateway - description: Name of the underlying Gateway resource. - - name: gateway_namespace - description: Namespace of the underlying Gateway resource. - - name: gateway_class - description: GatewayClass of the underlying Gateway. - - name: httproute_name - description: Name of the HTTPRoute that served the request. - - name: httproute_namespace - description: Namespace of the HTTPRoute that served the request. - meters: - - name: networking.datumapis.com/gateway/requests - displayName: HTTP Route Requests - description: HTTP requests proxied through the route. - measurement: - aggregation: Sum - unit: "{request}" - billing: - consumedUnit: "{request}" - pricingUnit: "{request}" - monitoredResourceTypes: - - networking.datumapis.com/HTTPRoute - - name: networking.datumapis.com/gateway/egress-bytes - displayName: HTTP Route Egress Bytes - description: Bytes sent downstream by the route. - measurement: - aggregation: Sum - unit: By - billing: - consumedUnit: By - pricingUnit: GiBy - monitoredResourceTypes: - - networking.datumapis.com/HTTPRoute - - name: networking.datumapis.com/gateway/ingress-bytes - displayName: HTTP Route Ingress Bytes - description: Bytes received from clients by the route. 
- measurement: - aggregation: Sum - unit: By - billing: - consumedUnit: By - pricingUnit: GiBy - monitoredResourceTypes: - - networking.datumapis.com/HTTPRoute - - name: networking.datumapis.com/gateway/connection-seconds - displayName: HTTP Route Connection Seconds - description: Seconds long-lived connections (e.g. WebSocket) are held open. - measurement: - aggregation: Sum - unit: s - billing: - consumedUnit: s - pricingUnit: h - monitoredResourceTypes: - - networking.datumapis.com/HTTPRoute -``` - -**OD-1 resolved.** The `ServiceConfiguration` resource exists today and is the -correct shape. Phase 1 ships one `Service` and one `ServiceConfiguration` in -`config/services/`. No `config/billing/` bundle is needed — `MeterDefinition` -and `MonitoredResourceType` resources are produced by the fan-out controller -from `ServiceConfiguration.spec`. - -Note (operational, not blocking): at the time of this brief the `compute` -ServiceConfiguration on staging is reporting `BillingFanOutFailed` because of -a DNS lookup error to `billing-webhook.milo-system.svc`. This is an -environment issue, not a shape issue — it confirms that the fan-out -controller is real and is the intended mechanism. - ---- - -## Service Registration - -The canonical `serviceName` is **`networking.datumapis.com`** and the -`producerProjectRef.name` is **`datum-cloud`** (resolves former OD-2 and OD-3). - -Proposed `Service` resource: - -```yaml -apiVersion: services.miloapis.com/v1alpha1 -kind: Service -metadata: - name: networking-datumapis-com - labels: - app.kubernetes.io/name: network-services-operator - app.kubernetes.io/managed-by: kustomize -spec: - serviceName: networking.datumapis.com - phase: Draft - displayName: Network Services - description: | - Managed HTTP/HTTPS edge proxy, routing, and traffic protection - for Datum Cloud workloads. Provides programmable ingress via - Gateway API HTTPRoute and WAF/rate-limit policies. Billed via - the networking.datumapis.com/gateway meter family. 
- owner: - producerProjectRef: - name: datum-cloud -``` - -The monitored-resource declarations and meter definitions live in the -companion `ServiceConfiguration` resource (see [Monitored Resource](#monitored-resource)), -not in the `Service` spec itself. - ---- - -## Kustomize Bundle Layout - -Following the per-service-domain layout established in -`datum-cloud/datum/config/services/`: - -``` -config/services/ - kustomization.yaml - networking.datumapis.com/ - kustomization.yaml - service.yaml - serviceconfiguration.yaml - README.md -``` - -No `config/billing/` bundle is needed. Both `MonitoredResourceType` and -`MeterDefinition` resources are produced by the catalog's fan-out controller -from `ServiceConfiguration.spec`. - -These resources are **not** wired into the operator's Deployment overlay. They -are control-plane resources deployed independently via Flux into the services -namespace. Wiring them into `datum-cloud/infra` Flux kustomizations is a -separate step handled by the platform/infra team after sign-off. - -**OD-4 resolved.** Bundle layout is a per-service-domain directory under -`config/services/`, matching the existing pattern in -`datum-cloud/datum/config/services//`. Concretely: -`config/services/networking.datumapis.com/` containing the `Service`, -`ServiceConfiguration`, `kustomization.yaml`, and `README.md`. - ---- - -## Usage Pipeline Integration — Investigation Findings - -This section records the investigation into how to extract the four billing -signals from the Envoy Gateway proxy. The conclusion is that Phase 2 requires a -confirmed architectural choice before implementation begins. - -### How Envoy Gateway exposes telemetry - -Envoy Gateway exposes traffic telemetry through three primary surfaces: - -1. 
**Prometheus scrape endpoint** — the Envoy data plane exposes a `/stats` - endpoint with counters including `envoy_http_downstream_rq_total` (request - count), `envoy_http_downstream_cx_rx_bytes_total` (ingress bytes), - `envoy_http_downstream_cx_tx_bytes_total` (egress bytes), and - `envoy_http_downstream_cx_active` (active connections, from which - connection-seconds can be derived). These are cumulative counters and gauges, - not deltas (the consumer must compute deltas between scrapes), and they are - not pre-attributed to a project or billing account. Importantly, the - cluster-level variants (`envoy_cluster_upstream_*`) carry `httproute_name`, - `httproute_namespace`, and `httproute_rule_ordinal` labels already, providing - per-route attribution with no configuration changes. Counter values are held - in-memory and lost when a pod restarts; the data loss window equals the scrape - interval (typically 15–30 s). - -2. **Access logs** — Envoy can be configured via the `EnvoyProxy` CR to emit - structured JSON access logs (one line per completed request). Each line carries - request bytes, response bytes, duration, upstream cluster, and extensible - metadata. Access logs are the most natural source for per-request billing - because each log line maps directly to one billable unit. - -3. **WASM filter hooks** — Envoy embeds a WASM runtime (V8 or Wasmtime) and - executes a `.wasm` binary inside the Envoy process. The proxy-wasm ABI - exposes lifecycle hooks (`on_request_headers`, `on_response_headers`, - `on_log`, `on_done`) that fire at well-defined points in the request - lifecycle. The filter reads request metadata via `get_property()` host - function calls and dispatches HTTP calls via `dispatch_http_call()` — - property reads return immediately, and dispatched calls complete later via a - callback, so neither blocks Envoy. - Because the filter runs in-process, no sidecar is needed; on pod restart only state the filter has buffered but not yet dispatched is lost.
- -### Candidate approaches - -#### Option A: Access log scraping via Vector Agent (recommended) - -Configure Envoy Gateway via an `EnvoyProxy` CR patch to emit structured JSON -access logs to stdout. Deploy the Vector Agent DaemonSet (already planned for -the billing pipeline's tier-1 durability model) to tail those logs from the -node. Vector parses each line, constructs a `UsageEvent`, and forwards it to the -billing Ingestion Gateway as a CloudEvent. - -**Pros:** -- Per-request granularity — maps naturally to the billing SDK's event model. -- Access logs already carry bytes in/out and duration per request. -- No changes to the network-services-operator Go binary. -- Vector is already the tier-1 durability component in the billing pipeline - architecture. - -**Cons:** -- High volume at scale. A gateway processing 10 k req/s produces 10 k log lines - per second. Vector's disk buffering and batching handle this, but the Ingestion - Gateway must be sized accordingly. -- Log format must be pinned — changes to the access log schema require a - billing-side migration. -- Connection-seconds for WebSocket / long-lived connections is not directly - available in per-request access logs (connections do not emit a per-request - log line while held open). This signal requires a separate mechanism (see - below). -- Requires an `EnvoyProxy` CR patch to configure the access log format. - -#### Option B: Prometheus counter polling by a controller-side emitter - -Run a periodic loop in the network-services-operator (or a sidecar) that reads -the Envoy `/stats` endpoint, computes counter deltas, and emits one `UsageEvent` -per billing interval (e.g., every 60 seconds) per gateway. - -**Pros:** -- Dramatically lower volume — one event per gateway per metric per polling - interval regardless of request rate. -- Connection-seconds is directly derivable from - `envoy_http_downstream_cx_active` sampled at a known interval. -- No change to access log configuration. 
- -**Cons:** -- Per-request granularity is lost — provenance drill-down in the portal would - be at the gateway level, not the individual request level. -- The emitter must maintain stateful last-seen counter values across restarts - without losing data or double-counting. -- Not a natural fit for the billing SDK's event-driven `Record()` interface. - -#### Option C: gRPC metrics sink (OTel OTLP) - -Configure Envoy's OTel metrics sink to push counters via gRPC OTLP to a -platform-deployed OTel Collector, which transforms them into CloudEvents and -forwards to the Ingestion Gateway. - -**Cons (rejected):** -- The billing pipeline design document explicitly rejects the OTLP path: "OTLP - does not structurally enforce the ULID dedup key or the billing entity - (`subject`). These fields would become opaque string attributes that must - survive the OTLP round-trip intact." - (`billing/docs/enhancements/usage-pipeline.md`) -- Attribution of OTLP metric streams to billing accounts is not defined in the - current pipeline design. - -#### Option D: WASM filter emitter - -Build a small WASM binary (Rust, using `proxy-wasm-rust-sdk`) that runs inside -the Envoy process and hooks into the request lifecycle via the proxy-wasm ABI. -On `on_log()` — which fires after each request completes and on connection close -for WebSocket / long-lived connections — the filter extracts billing signals and -accumulates them in shared memory. Rather than emitting one event per request, -the filter flushes a single batched payload every N seconds (configurable, -default 60 s) via `dispatch_http_call()`. At 10 k req/s this reduces outbound -billing calls from 10,000/s down to 1/min per proxy pod — a reduction of ~600,000× -in emission volume, with no loss of per-route accuracy since signals are -aggregated by `httproute_name` + `httproute_namespace` before flushing. 
-The filter reads signal data through `get_property()` host calls: - -``` -request.size → ingress bytes for this request -response.size → egress bytes for this request -request.duration → connection duration (ms) on close -upstream.cluster_name → encodes httproute_name + httproute_namespace -response.code → HTTP status -``` - -Envoy's upstream cluster name for a route follows the pattern: -`httproute/<namespace>/<name>/rule/<rule-index>` — the filter parses this string -to extract route identity with no Kubernetes API calls. - -The binary is packaged as an OCI image and wired in via `EnvoyExtensionPolicy`: - -```yaml -apiVersion: gateway.envoyproxy.io/v1alpha1 -kind: EnvoyExtensionPolicy -metadata: - name: billing-wasm -spec: - targetRef: - group: gateway.networking.k8s.io - kind: Gateway - name: <gateway-name> - wasm: - - name: billing-emitter - rootID: billing - code: - type: Image - image: - url: oci://ghcr.io/datum-cloud/billing-wasm:latest - config: | - { "endpoint": "http://billing-usage-collector-vector.billing-system.svc.cluster.local:9880/cloudevents", "flush_interval_seconds": 60 } -``` - -**Pros:** -- Data loss on pod restart is bounded by the flush interval — only signals - accumulated in shared memory since the last flush are at risk. -- Connection-seconds for WebSocket / long-lived connections is captured natively - — `on_log()` fires on connection close with the full duration available. -- Runs inside the Envoy process — no sidecar required. -- No changes to the network-services-operator Go binary. -- Route identity (`httproute_name`, `httproute_namespace`) is available directly - from the upstream cluster name without Kubernetes API calls. - -**Cons:** -- Requires a separate Rust build pipeline and OCI image; adds a new artifact to - maintain. -- The filter cannot make synchronous network calls — billing dispatch goes - through Envoy's async `dispatch_http_call()` to a local agent. If that agent - is unavailable, events must be dropped or buffered in WASM shared memory - (limited).
-- WASM sandbox has no filesystem access; durable buffering must be delegated to - Vector or another on-node agent. -- Adds a small per-request overhead (<1 ms) for the `on_log()` hook. - -**Comparison across options:** - -| | Option A (access logs) | Option B (Prometheus) | Option D (WASM) | -|---|---|---|---| -| Data loss on pod restart | near-zero | ~1 scrape interval | ≤ 1 flush interval | -| Per-request granularity | ✓ | ✗ | ✓ | -| Connection-seconds | requires separate mechanism | via `cx_length_ms_sum` | ✓ native on close | -| NSO Go changes | none | none | none | -| New build artifact | none | none | Rust WASM binary | -| Infrastructure changes | `EnvoyProxy` CR + Vector config | none | `EnvoyExtensionPolicy` | - -### Connection-seconds handling - -For Option A and Option B, the connection-seconds signal for WebSocket and other -long-lived connections requires a separate mechanism — per-request access logs -do not emit while a connection is held open, and Prometheus counters -(`envoy_cluster_upstream_cx_length_ms_sum`) only update when a connection -closes. - -For **Option D (WASM)**, connection-seconds is handled natively: `on_log()` -fires when a connection closes, at which point `request.duration` contains the -full connection lifetime in milliseconds. No additional Go controller changes -are needed. - -### Recommendation - -**Option A** (access log scraping via Vector) is the recommended approach for -the initial implementation. At current scale, per-request events without -aggregation are sufficient — one `UsageEvent` per completed request, forwarded -directly by Vector to the Ingestion Gateway. Aggregation is a pipeline-level -concern; if it becomes necessary at scale it will be added as a platform -capability in the billing pipeline, not as Envoy-specific logic. - -Connection-seconds for persistent connections is handled by a lightweight -controller-side emitter (item 3 below). This approach requires: - -1.
An `EnvoyProxy` CR patch configuring structured JSON access logs. -2. A Vector configuration to parse the log format and construct `UsageEvent`s. -3. A small addition to the gateway controller for connection-lifecycle events. - -None of these changes require modifications to the billing pipeline contract. - -### Pipeline context enrichment - -The pipeline needs every piece of context required to attribute a -`UsageEvent` to the right billing entity (subject project, owning `HTTPRoute`, -gateway, gateway class, region) — and Envoy access logs alone do not carry -all of that. The data plane sees host headers, route names, and listener -metadata; the billing pipeline needs Kubernetes-side identifiers (namespaces, -project refs, owner references). - -A sidecar that watches the relevant control-plane resources (`HTTPRoute`, -`Gateway`, `GatewayClass`, project metadata) and performs the translation from -data-plane identifiers to billing-entity identifiers is the right shape for -this. Two placements to evaluate: - -- **Per-node, alongside Vector:** Vector calls out to the sidecar (or reads an - enriched local cache) to attach project / route / gateway-class metadata to - each event before sending it to the Ingestion Gateway. -- **Central, in front of the Ingestion Gateway:** Vector forwards the raw - events; the enrichment service joins them against a cached view of the - relevant control-plane resources and forwards enriched events to ingestion. - -The per-node placement keeps the durability story simple (Vector's local disk -buffer continues to be the tier-1 guarantee) but pushes a copy of the -control-plane index to every node. The central placement is easier to scale -and operate but introduces a stateful enrichment hop. Final placement is an -OD for Phase 2 (see OD-8). - ---- - -## Open Decisions - -The following decisions are required before work can begin on each phase. 
- -| ID | Question | Owner | Blocking | -|----|----------|-------|---------| -| OD-1 | ~~How does the service catalog express the monitored resource?~~ — resolved: via `services.miloapis.com/v1alpha1/ServiceConfiguration` carrying `spec.monitoredResourceTypes[]` and `spec.meters[]` inline. Fan-out controller produces the `MonitoredResourceType` and meter resources in the billing namespace. | — | — | -| OD-2 | ~~Canonical `serviceName`~~ — resolved: `networking.datumapis.com`. | — | — | -| OD-3 | ~~`producerProjectRef.name`~~ — resolved: `datum-cloud`. | — | — | -| OD-4 | ~~Bundle layout~~ — resolved: per-service-domain directory under `config/services/networking.datumapis.com/`, matching `datum-cloud/datum/config/services//`. | — | — | -| OD-5 | Is the Vector Agent DaemonSet planned to run on the edge cluster nodes that host Envoy Gateway pods? | Platform / infra | Phase 2 | -| OD-6 | Can the network-services-operator patch the `EnvoyProxy` CR to inject access log configuration? | Kevin | Phase 2 | -| OD-7 | Is the billing SDK published as a consumable Go module? | Billing team | Phase 2 | -| OD-8 | Enrichment-sidecar placement: per-node alongside Vector, or central in front of the Ingestion Gateway? | Billing team / platform | Phase 2 | - -All Phase 1 decisions are resolved; implementation can begin. Phase 2 is -blocked on OD-5 through OD-8. - ---- - -## Acceptance Criteria - -From issue [#155](https://github.com/datum-cloud/network-services-operator/issues/155): - -- [ ] Network Services appears in the platform-wide service catalog in - `Published` phase. (Phase 1 delivers `Draft`; promotion to `Published` is a - separate sign-off step gated on the billing team.) -- [ ] Metering configuration covers request count, egress bytes, ingress bytes, - and connection seconds, all shipped in `Draft` phase initially. -- [ ] Resources are packaged as a Kustomize bundle under - `config/services/networking.datumapis.com/`. 
-- [ ] Investigation is complete with a clear confirmed approach for collecting - and emitting usage events from the proxy to the billing pipeline. - ---- - -## Implementation Plan - -### Phase 1 — Catalog registration (no Go changes, ~1–2 days) - -1. Author `config/services/networking.datumapis.com/`: - - `service.yaml` — `Service` resource (identity). - - `serviceconfiguration.yaml` — `ServiceConfiguration` carrying - `monitoredResourceTypes[]` and `meters[]` inline. - - `kustomization.yaml` and `README.md`. -2. Add a root `config/services/kustomization.yaml` referencing the - per-service-domain directory. -3. Open PR against `network-services-operator`. No Go code changes. No - `config/billing/` bundle — fan-out controller produces those resources. -4. Platform/infra team separately wires the bundle into `datum-cloud/infra` - Flux kustomizations (out of scope for this repo). - -### Phase 2 — Emission integration (~1–2 weeks) - -1. Resolve OD-5 through OD-8. -2. Add billing SDK to `go.mod`. -3. Patch the `EnvoyProxy` CR to enable structured JSON access logs. -4. Configure Vector to parse access log entries and emit `UsageEvent`s for - request count, egress bytes, and ingress bytes. -5. Build (or wire in) the enrichment sidecar that translates data-plane - identifiers to billing-entity identifiers, per OD-8. -6. Add connection-lifecycle emission to the gateway controller for the - connection-seconds signal. -7. Write unit and integration tests covering the emission logic. 
- ---- - -## References - -- GitHub issue #155: https://github.com/datum-cloud/network-services-operator/issues/155 -- Billing usage pipeline design: `datum-cloud/billing/docs/enhancements/usage-pipeline.md` -- `MeterDefinition` API type: `datum-cloud/billing/api/v1alpha1/meterdefinition_types.go` -- `MonitoredResourceType` API type: `datum-cloud/billing/api/v1alpha1/monitoredresourcetype_types.go` -- Live reference: `kubectl get serviceconfiguration compute -o yaml` against - the platform-wide control plane — shows the production shape of `spec.monitoredResourceTypes[]` and `spec.meters[]`. -- Per-service-domain bundle layout: `datum-cloud/datum/config/services/`