diff --git a/Dockerfile b/Dockerfile index d7764d22..ab995aae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ COPY go.mod go.sum ./ RUN go mod download # Copy the go source -COPY cmd/team-operator/main.go cmd/team-operator/main.go +COPY cmd/team-operator/ cmd/team-operator/ COPY api/ api/ COPY internal/ ./internal/ @@ -29,7 +29,7 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ -ldflags="-X 'github.com/posit-dev/team-operator/internal.VersionString=${VERSION}'"\ -a \ -o team-operator \ - cmd/team-operator/main.go + ./cmd/team-operator/ # Use distroless as minimal base image to package the team-operator binary # Refer to https://github.com/GoogleContainerTools/distroless for more details diff --git a/Justfile b/Justfile index e7239cb1..ad15fc2b 100644 --- a/Justfile +++ b/Justfile @@ -28,7 +28,7 @@ deps-up: # Run team-operator directly from source run: - go run cmd/team-operator/main.go + go run ./cmd/team-operator/ # Run team-operator via the Makefile target mrun: @@ -40,7 +40,7 @@ build: -ldflags="-X 'github.com/posit-dev/team-operator/internal.VersionString={{ VERSION }}'" \ -a \ -o ./bin/team-operator \ - cmd/team-operator/main.go + ./cmd/team-operator/ # Build ./bin/team-operator via the Makefile target mbuild: diff --git a/Makefile b/Makefile index 87e5bd8d..2a374e6e 100644 --- a/Makefile +++ b/Makefile @@ -190,7 +190,7 @@ test-integration: go-test test-kind ## Run all tests (unit + integration). .PHONY: build build: copy-crds generate-all fmt vet ## Build manager binary. - go build -o bin/team-operator ./cmd/team-operator/main.go + go build -o bin/team-operator ./cmd/team-operator/ .PHONY: docker-build docker-build: build ## Build the operator Docker image. @@ -203,7 +203,7 @@ distclean: .PHONY: run run: manifests generate-all fmt vet ## Run a controller from your host. 
- go run ./cmd/team-operator/main.go + go run ./cmd/team-operator/ ##@ Deployment diff --git a/api/core/v1beta1/chronicle_types.go b/api/core/v1beta1/chronicle_types.go index faf208d7..a0d70a7b 100644 --- a/api/core/v1beta1/chronicle_types.go +++ b/api/core/v1beta1/chronicle_types.go @@ -119,10 +119,13 @@ func (c *Chronicle) KubernetesLabels() map[string]string { } func (c *Chronicle) OwnerReferencesForChildren() []metav1.OwnerReference { + // APIVersion/Kind are hardcoded because controller-runtime's client.Get + // strips TypeMeta from typed-object responses, leaving c.APIVersion and + // c.Kind empty in the reconcile path. return []metav1.OwnerReference{ { - APIVersion: c.APIVersion, - Kind: c.Kind, + APIVersion: GroupVersion.String(), + Kind: "Chronicle", Name: c.Name, UID: c.UID, }, diff --git a/api/core/v1beta1/connect_types.go b/api/core/v1beta1/connect_types.go index 300b7302..1bc90f23 100644 --- a/api/core/v1beta1/connect_types.go +++ b/api/core/v1beta1/connect_types.go @@ -275,10 +275,13 @@ func (c *Connect) GetAwsAccountId() string { } func (c *Connect) OwnerReferencesForChildren() []metav1.OwnerReference { + // APIVersion/Kind are hardcoded because controller-runtime's client.Get + // strips TypeMeta from typed-object responses, leaving c.APIVersion and + // c.Kind empty in the reconcile path. 
return []metav1.OwnerReference{ { - APIVersion: c.APIVersion, - Kind: c.Kind, + APIVersion: GroupVersion.String(), + Kind: "Connect", Name: c.Name, UID: c.UID, }, diff --git a/api/core/v1beta1/packagemanager_types.go b/api/core/v1beta1/packagemanager_types.go index 3ee2b932..5e9e16fd 100644 --- a/api/core/v1beta1/packagemanager_types.go +++ b/api/core/v1beta1/packagemanager_types.go @@ -414,10 +414,13 @@ func (pm *PackageManager) CreateSecretVolumeFactory() *product.SecretVolumeFacto } func (pm *PackageManager) OwnerReferencesForChildren() []metav1.OwnerReference { + // APIVersion/Kind are hardcoded because controller-runtime's client.Get + // strips TypeMeta from typed-object responses, leaving pm.APIVersion and + // pm.Kind empty in the reconcile path. return []metav1.OwnerReference{ { - APIVersion: pm.APIVersion, - Kind: pm.Kind, + APIVersion: GroupVersion.String(), + Kind: "PackageManager", Name: pm.Name, UID: pm.UID, }, diff --git a/api/core/v1beta1/site_types.go b/api/core/v1beta1/site_types.go index e0236e17..e2c680ff 100644 --- a/api/core/v1beta1/site_types.go +++ b/api/core/v1beta1/site_types.go @@ -729,10 +729,13 @@ func (s *Site) GetSecretType() product.SiteSecretType { } func (s *Site) OwnerReferencesForChildren() []metav1.OwnerReference { + // APIVersion/Kind are hardcoded because controller-runtime's client.Get + // strips TypeMeta from typed-object responses, leaving s.APIVersion and + // s.Kind empty in the reconcile path. 
return []metav1.OwnerReference{ { - APIVersion: s.APIVersion, - Kind: s.Kind, + APIVersion: GroupVersion.String(), + Kind: "Site", Name: s.Name, UID: s.UID, }, diff --git a/api/core/v1beta1/workbench_types.go b/api/core/v1beta1/workbench_types.go index a26e1b5d..c0b87543 100644 --- a/api/core/v1beta1/workbench_types.go +++ b/api/core/v1beta1/workbench_types.go @@ -236,10 +236,13 @@ func init() { } func (w *Workbench) OwnerReferencesForChildren() []metav1.OwnerReference { + // APIVersion/Kind are hardcoded because controller-runtime's client.Get + // strips TypeMeta from typed-object responses, leaving w.APIVersion and + // w.Kind empty in the reconcile path. return []metav1.OwnerReference{ { - APIVersion: w.APIVersion, - Kind: w.Kind, + APIVersion: GroupVersion.String(), + Kind: "Workbench", Name: w.Name, UID: w.UID, }, diff --git a/cmd/team-operator/main.go b/cmd/team-operator/main.go index 00040387..141084cf 100644 --- a/cmd/team-operator/main.go +++ b/cmd/team-operator/main.go @@ -13,6 +13,7 @@ import ( "github.com/posit-dev/team-operator/api/keycloak/v2alpha1" "github.com/posit-dev/team-operator/api/product" + "github.com/posit-dev/team-operator/internal/observability" "github.com/traefik/traefik/v3/pkg/provider/kubernetes/crd/traefikio/v1alpha1" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/metrics/server" @@ -111,6 +112,20 @@ func main() { "configurable Workbench session pod field and writes one numbered label per "+ "match onto the pod. Per-site config lives in the Workbench CR's sessionLabels field.") + var ( + obsMetricsOTLPEndpoint string + obsMetricsExportInterval time.Duration + obsClusterName string + ) + + flag.StringVar(&obsMetricsOTLPEndpoint, "observability-metrics-otlp-endpoint", "", + "gRPC OTLP endpoint for metric push (e.g. otel-collector:4317). 
"+ + "Falls back to OTEL_EXPORTER_OTLP_METRICS_ENDPOINT then OTEL_EXPORTER_OTLP_ENDPOINT.") + flag.DurationVar(&obsMetricsExportInterval, "observability-metrics-export-interval", 30*time.Second, + "Cadence for OTLP metric export and async gauge collection") + flag.StringVar(&obsClusterName, "observability-cluster-name", "", + "Value for the k8s.cluster.name resource attribute") + opts := zap.Options{Development: true} opts.BindFlags(flag.CommandLine) @@ -124,6 +139,26 @@ func main() { zl.Info("team-operator version", "version", internal.VersionString) + instanceID := os.Getenv("POD_NAME") + if instanceID == "" { + setupLog.Info("POD_NAME env var not set; service.instance.id resource attribute will be empty. " + + "Wire POD_NAME from the downward API (metadata.name) for per-pod metric aggregation.") + } + + obsProvider := observability.NewProvider(context.Background(), observability.Config{ + OTLPEndpoint: obsMetricsOTLPEndpoint, + MetricsExportInterval: obsMetricsExportInterval, + ClusterName: obsClusterName, + InstanceID: instanceID, + }) + defer func() { + shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + if err := obsProvider.Shutdown(shutdownCtx); err != nil { + setupLog.Error(err, "error shutting down observability provider") + } + }() + mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ Scheme: scheme, Metrics: server.Options{ @@ -171,62 +206,69 @@ func main() { } if err = (&corecontroller.SiteReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Log: setupLog, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: setupLog, + Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/site")), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Site") os.Exit(1) } if err = (&corecontroller.PostgresDatabaseReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Log: setupLog, + Client: 
mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: setupLog, + Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/postgres-database")), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PostgresDatabase") os.Exit(1) } if err = (&corecontroller.ConnectReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Log: setupLog, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: setupLog, + Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/connect")), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "ImplConnect") os.Exit(1) } if err = (&corecontroller.WorkbenchReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/workbench")), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Workbench") os.Exit(1) } if err = (&corecontroller.PackageManagerReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Log: setupLog, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: setupLog, + Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/package-manager")), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "PackageManager") os.Exit(1) } if err = (&corecontroller.ChronicleReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Log: setupLog, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: setupLog, + Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/chronicle")), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Chronicle") os.Exit(1) } if err = (&corecontroller.FlightdeckReconciler{ - Client: mgr.GetClient(), - 
Scheme: mgr.GetScheme(), - Log: setupLog, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Log: setupLog, + Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/flightdeck")), }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "Flightdeck") os.Exit(1) @@ -248,6 +290,14 @@ func main() { //+kubebuilder:scaffold:builder + lister := &multiKindLister{client: mgr.GetClient(), log: setupLog} + if err := observability.RegisterResourceCountGauge( + obsProvider.Meter("team-operator/resource-count"), + lister, + ); err != nil { + setupLog.Error(err, "failed to register resource count gauge; continuing without it") + } + if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { setupLog.Error(err, "unable to set up health check") os.Exit(1) diff --git a/cmd/team-operator/resource_lister.go b/cmd/team-operator/resource_lister.go new file mode 100644 index 00000000..8eb2dc98 --- /dev/null +++ b/cmd/team-operator/resource_lister.go @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2026 Posit Software, PBC + +package main + +import ( + "context" + + "github.com/go-logr/logr" + "sigs.k8s.io/controller-runtime/pkg/client" + + positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" + "github.com/posit-dev/team-operator/internal/observability" + "github.com/posit-dev/team-operator/internal/status" +) + +// multiKindLister implements observability.ResourceLister by listing all +// operator-managed CR kinds and returning per-(controller, namespace, phase) counts. +// It is wired into the async OTel gauge in main.go. +type multiKindLister struct { + client client.Client + log logr.Logger +} + +func (l *multiKindLister) List(ctx context.Context) ([]observability.ResourceCount, error) { + var counts []observability.ResourceCount + + counts = append(counts, l.listSites(ctx)...) + counts = append(counts, l.listConnects(ctx)...) 
+ counts = append(counts, l.listWorkbenches(ctx)...) + counts = append(counts, l.listPackageManagers(ctx)...) + counts = append(counts, l.listChronicles(ctx)...) + counts = append(counts, l.listFlightdecks(ctx)...) + counts = append(counts, l.listPostgresDatabases(ctx)...) + + return counts, nil +} + +// readyPhase returns "ready" or "error" based on a boolean flag. +func readyPhase(ready bool) string { + if ready { + return observability.PhaseReady + } + return observability.PhaseError +} + +func (l *multiKindLister) listSites(ctx context.Context) []observability.ResourceCount { + var list positcov1beta1.SiteList + if err := l.client.List(ctx, &list); err != nil { + l.log.V(1).Info("resource_count: list failed", "kind", "site", "err", err.Error()) + return nil + } + counts := make(map[[2]string]int64, len(list.Items)) + for i := range list.Items { + phase := readyPhase(status.IsReady(list.Items[i].Status.Conditions)) + counts[[2]string{list.Items[i].Namespace, phase}]++ + } + return mapToResourceCounts("site", counts) +} + +func (l *multiKindLister) listConnects(ctx context.Context) []observability.ResourceCount { + var list positcov1beta1.ConnectList + if err := l.client.List(ctx, &list); err != nil { + l.log.V(1).Info("resource_count: list failed", "kind", "connect", "err", err.Error()) + return nil + } + counts := make(map[[2]string]int64, len(list.Items)) + for i := range list.Items { + counts[[2]string{list.Items[i].Namespace, readyPhase(list.Items[i].Status.Ready)}]++ + } + return mapToResourceCounts("connect", counts) +} + +func (l *multiKindLister) listWorkbenches(ctx context.Context) []observability.ResourceCount { + var list positcov1beta1.WorkbenchList + if err := l.client.List(ctx, &list); err != nil { + l.log.V(1).Info("resource_count: list failed", "kind", "workbench", "err", err.Error()) + return nil + } + counts := make(map[[2]string]int64, len(list.Items)) + for i := range list.Items { + counts[[2]string{list.Items[i].Namespace, 
readyPhase(list.Items[i].Status.Ready)}]++ + } + return mapToResourceCounts("workbench", counts) +} + +func (l *multiKindLister) listPackageManagers(ctx context.Context) []observability.ResourceCount { + var list positcov1beta1.PackageManagerList + if err := l.client.List(ctx, &list); err != nil { + l.log.V(1).Info("resource_count: list failed", "kind", "package-manager", "err", err.Error()) + return nil + } + counts := make(map[[2]string]int64, len(list.Items)) + for i := range list.Items { + counts[[2]string{list.Items[i].Namespace, readyPhase(list.Items[i].Status.Ready)}]++ + } + return mapToResourceCounts("package-manager", counts) +} + +func (l *multiKindLister) listChronicles(ctx context.Context) []observability.ResourceCount { + var list positcov1beta1.ChronicleList + if err := l.client.List(ctx, &list); err != nil { + l.log.V(1).Info("resource_count: list failed", "kind", "chronicle", "err", err.Error()) + return nil + } + counts := make(map[[2]string]int64, len(list.Items)) + for i := range list.Items { + counts[[2]string{list.Items[i].Namespace, readyPhase(list.Items[i].Status.Ready)}]++ + } + return mapToResourceCounts("chronicle", counts) +} + +func (l *multiKindLister) listFlightdecks(ctx context.Context) []observability.ResourceCount { + var list positcov1beta1.FlightdeckList + if err := l.client.List(ctx, &list); err != nil { + l.log.V(1).Info("resource_count: list failed", "kind", "flightdeck", "err", err.Error()) + return nil + } + counts := make(map[[2]string]int64, len(list.Items)) + for i := range list.Items { + counts[[2]string{list.Items[i].Namespace, readyPhase(list.Items[i].Status.Ready)}]++ + } + return mapToResourceCounts("flightdeck", counts) +} + +func (l *multiKindLister) listPostgresDatabases(ctx context.Context) []observability.ResourceCount { + var list positcov1beta1.PostgresDatabaseList + if err := l.client.List(ctx, &list); err != nil { + l.log.V(1).Info("resource_count: list failed", "kind", "postgres-database", "err", 
err.Error()) + return nil + } + counts := make(map[[2]string]int64, len(list.Items)) + for i := range list.Items { + // PostgresDatabaseStatus embeds CommonProductStatus (Conditions) but has no + // direct Ready bool field; use status.IsReady on the Conditions slice. + phase := readyPhase(status.IsReady(list.Items[i].Status.Conditions)) + counts[[2]string{list.Items[i].Namespace, phase}]++ + } + return mapToResourceCounts("postgres-database", counts) +} + +// mapToResourceCounts converts a namespace/phase count map into ResourceCount observations. +func mapToResourceCounts(controller string, m map[[2]string]int64) []observability.ResourceCount { + out := make([]observability.ResourceCount, 0, len(m)) + for k, n := range m { + out = append(out, observability.ResourceCount{ + Controller: controller, + Namespace: k[0], + Phase: k[1], + Count: n, + }) + } + return out +} diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index bfbb2f4d..18e7487f 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -70,6 +70,7 @@ spec: - /team-operator args: - --leader-elect + - --observability-metrics-export-interval=30s image: controller:latest imagePullPolicy: Always name: manager @@ -101,5 +102,9 @@ spec: env: - name: WATCH_NAMESPACES value: posit-team + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name serviceAccountName: controller-manager terminationGracePeriodSeconds: 10 diff --git a/config/observability/kustomization.yaml b/config/observability/kustomization.yaml new file mode 100644 index 00000000..99c17054 --- /dev/null +++ b/config/observability/kustomization.yaml @@ -0,0 +1,13 @@ +# config/observability/kustomization.yaml +# Optional overlay: enable OTLP metric push alongside the default Prometheus endpoint. 
+# Apply on top of config/default: +# kubectl kustomize config/observability | kubectl apply -f - +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../default +patches: + - path: manager_patch.yaml + target: + kind: Deployment + name: controller-manager diff --git a/config/observability/manager_patch.yaml b/config/observability/manager_patch.yaml new file mode 100644 index 00000000..c91f93cd --- /dev/null +++ b/config/observability/manager_patch.yaml @@ -0,0 +1,21 @@ +# manager_patch.yaml — patches the manager Deployment to add the OTLP endpoint flag and the env var that supplies it. +# Replace the value of OTEL_COLLECTOR_ENDPOINT below with your collector's gRPC address. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + containers: + - name: manager + args: + - --leader-elect + - --observability-metrics-otlp-endpoint=$(OTEL_COLLECTOR_ENDPOINT) + - --observability-metrics-export-interval=30s + env: + - name: OTEL_COLLECTOR_ENDPOINT + value: "otel-collector.monitoring.svc.cluster.local:4317" + - name: OTEL_RESOURCE_ATTRIBUTES + value: "" diff --git a/dist/chart/templates/manager/manager.yaml b/dist/chart/templates/manager/manager.yaml index ee73f474..9273c24c 100644 --- a/dist/chart/templates/manager/manager.yaml +++ b/dist/chart/templates/manager/manager.yaml @@ -34,6 +34,13 @@ spec: {{- if .Values.sessionGroupLabels.enable }} - "--enable-session-group-labels" {{- end }} + {{- if .Values.observability.metrics.otlpEndpoint }} + - --observability-metrics-otlp-endpoint={{ .Values.observability.metrics.otlpEndpoint }} + {{- end }} + - --observability-metrics-export-interval={{ .Values.observability.metrics.metricsExportInterval }} + {{- if .Values.observability.clusterName }} + - --observability-cluster-name={{ .Values.observability.clusterName }} + {{- end }} command: - /team-operator {{- $tag := .Values.controllerManager.container.image.tag | default .Chart.AppVersion }} @@ -42,13 +49,15 @@ spec: {{- 
else }} image: {{ .Values.controllerManager.container.image.repository }}:{{ $tag }} {{- end }} - {{- if .Values.controllerManager.container.env }} env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name {{- range $key, $value := .Values.controllerManager.container.env }} - name: {{ $key }} value: {{ $value | quote }} {{- end }} - {{- end }} livenessProbe: {{- toYaml .Values.controllerManager.container.livenessProbe | nindent 12 }} readinessProbe: diff --git a/dist/chart/values.yaml b/dist/chart/values.yaml index 069d57e7..536a0e9c 100644 --- a/dist/chart/values.yaml +++ b/dist/chart/values.yaml @@ -116,3 +116,15 @@ networkPolicy: # under workbench.sessionLabels — see the team-operator docs for the schema. sessionGroupLabels: enable: false + +# [OBSERVABILITY]: OTel metrics configuration +observability: + metrics: + # gRPC OTLP endpoint for metric push. + # Leave empty to omit the flag; the operator then falls back to OTEL_EXPORTER_OTLP_METRICS_ENDPOINT, then OTEL_EXPORTER_OTLP_ENDPOINT. OTLP push is disabled when none of these is set. + # Example: "otel-collector.monitoring.svc.cluster.local:4317" + otlpEndpoint: "" + # Cadence for OTLP metric export and async gauge collection + metricsExportInterval: "30s" + # Optional k8s.cluster.name resource attribute value + clusterName: "" diff --git a/docs/observability.md b/docs/observability.md new file mode 100644 index 00000000..272fa0ce --- /dev/null +++ b/docs/observability.md @@ -0,0 +1,144 @@ +# Operator Observability + +The team-operator emits OpenTelemetry metrics served via the standard `/metrics` endpoint +(Prometheus exporter) and optionally pushed via OTLP gRPC. This document covers Phase 1 +(metrics) of the operator's observability rollout. + +## Metrics Endpoint + +`/metrics` serves two metric families on the same endpoint: + +1. 
**controller-runtime built-ins** — always present, no configuration required: + - `controller_runtime_reconcile_total{controller, result}` + - `controller_runtime_reconcile_time_seconds{controller}` (histogram) + - `controller_runtime_reconcile_errors_total{controller}` + - `workqueue_*` metrics + +2. **Domain-specific operator metrics** (`team_operator_*`) — described below. + +## Domain Metrics + +### `team_operator_resource_count` (Gauge) + +Labels: `controller`, `namespace`, `phase` + +How many CRs of a given type are in a given namespace and phase. Refreshed every +`--observability-metrics-export-interval` (default: 30s) by an async gauge callback. +Not on the reconcile hot path. + +**Example PromQL:** +```promql +# Workbench CRs not yet ready in any namespace: +team_operator_resource_count{controller="workbench", phase!="ready"} + +# Total CRs managed per controller: +sum by (controller) (team_operator_resource_count) +``` + +### `team_operator_status_transition_total` (Counter) + +Labels: `controller`, `namespace`, `from_phase`, `to_phase` + +Incremented each time a reconcile moves a CR between phases. Useful for detecting +flapping (repeated error→ready→error cycles) or stuck controllers. + +The `from_phase` label reflects the CR's prior stable phase, derived from the existing +`Ready` condition's reason at the start of the reconcile. On a CR's first reconcile +(no prior conditions) `from_phase=unknown`. This lets dashboards distinguish +"fresh→ready" from "error→ready (recovery)". + +**Example PromQL:** +```promql +# Rate of error transitions across all controllers: +rate(team_operator_status_transition_total{to_phase="error"}[5m]) + +# Check for Connect flapping between ready and error: +increase(team_operator_status_transition_total{controller="connect"}[1h]) +``` + +### `team_operator_dependency_check_total` (Counter) + +Labels: `controller`, `namespace`, `dependency`, `result` + +Incremented each time a dependency check runs. 
`dependency` is one of: +`postgres`, `keycloak`, `secret`, `crd`. `result` is `success` or `error`. + +**Example PromQL:** +```promql +# Postgres dependency check failure rate: +rate(team_operator_dependency_check_total{dependency="postgres", result="error"}[5m]) +``` + +### `team_operator_reconcile_requeue_total` (Counter) + +Labels: `controller`, `namespace`, `reason` + +Distinguishes requeue reasons that controller-runtime collapses into "requeue". +`reason` is one of: `deps_not_ready`, `conflict`, `retry`, `rate_limit`. + +**Example PromQL:** +```promql +# Requeues due to dependency wait: +rate(team_operator_reconcile_requeue_total{reason="deps_not_ready"}[5m]) +``` + +## Configuration + +### Flags + +| Flag | Default | Purpose | +|------|---------|---------| +| `--observability-metrics-otlp-endpoint` | `""` | OTLP gRPC push endpoint | +| `--observability-metrics-export-interval` | `30s` | OTLP export and gauge refresh cadence | +| `--observability-cluster-name` | `""` | `k8s.cluster.name` resource attribute | + +To disable all OTel instrumentation, set the environment variable `OTEL_SDK_DISABLED=true`. + +### Environment Variables + +Env vars are fallbacks for flags. Flag values take precedence. 
+ +| Variable | Purpose | +|----------|---------| +| `OTEL_EXPORTER_OTLP_ENDPOINT` | OTLP endpoint fallback (all signals) | +| `OTEL_EXPORTER_OTLP_METRICS_ENDPOINT` | OTLP endpoint fallback (metrics only) | +| `OTEL_RESOURCE_ATTRIBUTES` | Free-form resource attributes (`key=value,key=value`) | +| `OTEL_SDK_DISABLED` | Kill switch — disables all OTel instrumentation | +| `POD_NAME` | Set to `metadata.name` via Kubernetes downward API for `service.instance.id` | + +### Precedence + +`flag value > OTEL_EXPORTER_OTLP_METRICS_ENDPOINT > OTEL_EXPORTER_OTLP_ENDPOINT > default` + +## Enabling OTLP Push + +Point at an OpenTelemetry Collector or Grafana Agent: + +**Helm:** +```yaml +observability: + metrics: + otlpEndpoint: "otel-collector.monitoring.svc.cluster.local:4317" +``` + +**Kustomize** — build and apply the `config/observability/` overlay. It already includes `config/default/` as a base, so apply it instead of (not in addition to) `config/default/`. + +Both Prometheus and OTLP push can be active simultaneously. Enabling OTLP push does not +disable the `/metrics` endpoint. + +## Resource Attributes + +Every metric carries these resource attributes: + +| Attribute | Value | Source | +|-----------|-------|--------| +| `service.name` | `team-operator` | Hardcoded | +| `service.version` | Operator binary version | `internal.VersionString` | +| `service.instance.id` | Pod name | `$POD_NAME` env var | +| `k8s.cluster.name` | _(optional)_ | `--observability-cluster-name` flag | + +## Cardinality + +Worst case for a metric with a single enum label: `controllers (7) × namespaces (~50) × enum values (≤10)` ≈ 3500 series. +`team_operator_status_transition_total` carries two phase labels (`from_phase`, `to_phase`), so its bound is multiplied by the number of phases once more; with the small phase enum this stays within standard Prometheus limits. Per-CR-name labels are intentionally +excluded to prevent cardinality explosion at scale. 
diff --git a/go.mod b/go.mod index a369ad87..84e28c1c 100644 --- a/go.mod +++ b/go.mod @@ -94,7 +94,7 @@ require ( github.com/google/pprof v0.0.0-20251208000136-3d256cb9ff16 // indirect github.com/google/uuid v1.6.0 // indirect github.com/gorilla/mux v1.8.1 // indirect - github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect github.com/hashicorp/go-version v1.8.0 // indirect github.com/http-wasm/http-wasm-host-go v0.7.0 // indirect github.com/huandu/xstrings v1.5.0 // indirect @@ -118,8 +118,9 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.67.4 // indirect - github.com/prometheus/procfs v0.19.2 // indirect + github.com/prometheus/common v0.67.5 // indirect + github.com/prometheus/otlptranslator v1.0.0 // indirect + github.com/prometheus/procfs v0.20.1 // indirect github.com/rs/zerolog v1.34.0 // indirect github.com/shopspring/decimal v1.4.0 // indirect github.com/spf13/cast v1.10.0 // indirect @@ -128,37 +129,40 @@ require ( github.com/unrolled/render v1.7.0 // indirect github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/otel v1.39.0 // indirect + go.opentelemetry.io/otel v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.15.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.15.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.39.0 // indirect + go.opentelemetry.io/otel/exporters/prometheus v0.65.0 // indirect 
go.opentelemetry.io/otel/log v0.15.0 // indirect - go.opentelemetry.io/otel/metric v1.39.0 // indirect - go.opentelemetry.io/otel/sdk v1.39.0 // indirect + go.opentelemetry.io/otel/metric v1.43.0 // indirect + go.opentelemetry.io/otel/sdk v1.43.0 // indirect go.opentelemetry.io/otel/sdk/log v0.15.0 // indirect - go.opentelemetry.io/otel/trace v1.39.0 // indirect - go.opentelemetry.io/proto/otlp v1.9.0 // indirect + go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect + go.opentelemetry.io/otel/trace v1.43.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.1 // indirect - go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v2 v2.4.4 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.46.0 // indirect - golang.org/x/mod v0.31.0 // indirect - golang.org/x/net v0.48.0 // indirect - golang.org/x/oauth2 v0.34.0 // indirect - golang.org/x/sync v0.19.0 // indirect - golang.org/x/sys v0.39.0 // indirect - golang.org/x/term v0.38.0 // indirect - golang.org/x/text v0.32.0 // indirect + golang.org/x/crypto v0.49.0 // indirect + golang.org/x/mod v0.33.0 // indirect + golang.org/x/net v0.52.0 // indirect + golang.org/x/oauth2 v0.35.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/sys v0.42.0 // indirect + golang.org/x/term v0.41.0 // indirect + golang.org/x/text v0.35.0 // indirect golang.org/x/time v0.14.0 // indirect - golang.org/x/tools v0.40.0 // indirect + golang.org/x/tools v0.42.0 // indirect gomodules.xyz/jsonpatch/v2 v2.5.0 // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect - google.golang.org/grpc v1.77.0 // indirect - google.golang.org/protobuf v1.36.10 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc 
v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/grpc v1.80.0 // indirect + google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect diff --git a/go.sum b/go.sum index b4db1ef5..e1b46f72 100644 --- a/go.sum +++ b/go.sum @@ -162,8 +162,8 @@ github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3 h1:NmZ1PKzSTQbuGHw9DGPFomqkkLWMC+vZCkfs+FHv1Vg= -github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.3/go.mod h1:zQrxl1YP88HQlA6i9c63DSVPFklWpGX4OWAc9bFuaH4= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c= github.com/hashicorp/go-version v1.8.0 h1:KAkNb1HAiZd1ukkxDFGmokVZe1Xy9HG6NUp+bPle2i4= github.com/hashicorp/go-version v1.8.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/http-wasm/http-wasm-host-go v0.7.0 h1:+1KrRyOO6tWiDB24QrtSYyDmzFLBBs3jioKaUT0mq1c= @@ -296,10 +296,12 @@ github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= -github.com/prometheus/common v0.67.4 h1:yR3NqWO1/UyO1w2PhUvXlGQs/PtFmoveVO0KZ4+Lvsc= -github.com/prometheus/common v0.67.4/go.mod h1:gP0fq6YjjNCLssJCQp0yk4M8W6ikLURwkdd/YKtTbyI= 
-github.com/prometheus/procfs v0.19.2 h1:zUMhqEW66Ex7OXIiDkll3tl9a1ZdilUOd/F6ZXw4Vws= -github.com/prometheus/procfs v0.19.2/go.mod h1:M0aotyiemPhBCM0z5w87kL22CxfcH05ZpYlu+b4J7mw= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/otlptranslator v1.0.0 h1:s0LJW/iN9dkIH+EnhiD3BlkkP5QVIUVEoIwkU+A6qos= +github.com/prometheus/otlptranslator v1.0.0/go.mod h1:vRYWnXvI6aWGpsdY/mOT/cbeVRBlPWtBNDb7kGR3uKM= +github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= +github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= @@ -369,34 +371,38 @@ go.opentelemetry.io/collector/featuregate v1.41.0 h1:CL4UMsMQj35nMJC3/jUu8VvYB4M go.opentelemetry.io/collector/featuregate v1.41.0/go.mod h1:A72x92glpH3zxekaUybml1vMSv94BH6jQRn5+/htcjw= go.opentelemetry.io/collector/pdata v1.41.0 h1:2zurAaY0FkURbLa1x7f7ag6HaNZYZKSmI4wgzDegLgo= go.opentelemetry.io/collector/pdata v1.41.0/go.mod h1:h0OghaTYe4oRvLxK31Ny7gkyjJ1p8oniM5MiCzluQjc= -go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48= -go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.15.0 h1:W+m0g+/6v3pa5PgVf2xoFMi5YtNR06WtS7ve5pcvLtM= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.15.0/go.mod h1:JM31r0GGZ/GU94mX8hN4D8v6e40aFlUECSQ48HaLgHM= 
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.15.0 h1:EKpiGphOYq3CYnIe2eX9ftUkyU+Y8Dtte8OaWyHJ4+I= go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.15.0/go.mod h1:nWFP7C+T8TygkTjJ7mAyEaFaE7wNfms3nV/vexZ6qt0= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 h1:8UQVDcZxOJLtX6gxtDt3vY2WTgvZqMQRzjsqiIHQdkc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0/go.mod h1:2lmweYCiHYpEjQ/lSJBYhj9jP1zvCvQW4BqL9dnT7FQ= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0 h1:f0cb2XPmrqn4XMy9PNliTgRKJgS5WcL/u0/WRYGz4t0= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.39.0/go.mod h1:vnakAaFckOMiMtOIhFI2MNH4FYrZzXCYxmb1LlhoGz8= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0 h1:in9O8ESIOlwJAEGTkkf34DesGRAc/Pn8qJ7k3r/42LM= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.39.0/go.mod h1:Rp0EXBm5tfnv0WL+ARyO/PHBEaEAT8UUHQ6AGJcSq6c= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.39.0 h1:Ckwye2FpXkYgiHX7fyVrN1uA/UYd9ounqqTuSNAv0k4= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.39.0/go.mod h1:teIFJh5pW2y+AN7riv6IBPX2DuesS3HgP39mwOspKwU= +go.opentelemetry.io/otel/exporters/prometheus v0.65.0 h1:jOveH/b4lU9HT7y+Gfamf18BqlOuz2PWEvs8yM7Q6XE= +go.opentelemetry.io/otel/exporters/prometheus v0.65.0/go.mod h1:i1P8pcumauPtUI4YNopea1dhzEMuEqWP1xoUZDylLHo= go.opentelemetry.io/otel/log v0.15.0 h1:0VqVnc3MgyYd7QqNVIldC3dsLFKgazR6P3P3+ypkyDY= go.opentelemetry.io/otel/log v0.15.0/go.mod h1:9c/G1zbyZfgu1HmQD7Qj84QMmwTp2QCQsZH1aeoWDE4= -go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0= -go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs= -go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18= -go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE= +go.opentelemetry.io/otel/metric v1.43.0 
h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= go.opentelemetry.io/otel/sdk/log v0.15.0 h1:WgMEHOUt5gjJE93yqfqJOkRflApNif84kxoHWS9VVHE= go.opentelemetry.io/otel/sdk/log v0.15.0/go.mod h1:qDC/FlKQCXfH5hokGsNg9aUBGMJQsrUyeOiW5u+dKBQ= go.opentelemetry.io/otel/sdk/log/logtest v0.14.0 h1:Ijbtz+JKXl8T2MngiwqBlPaHqc4YCaP/i13Qrow6gAM= go.opentelemetry.io/otel/sdk/log/logtest v0.14.0/go.mod h1:dCU8aEL6q+L9cYTqcVOk8rM9Tp8WdnHOPLiBgp0SGOA= -go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= -go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= -go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI= -go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= -go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= -go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/atomic v1.3.2/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= 
go.uber.org/atomic v1.5.0/go.mod h1:sABNBOSYdrvTF6hTgEIbc7YasKWGhgEQZyfxyTvoXHQ= @@ -414,8 +420,8 @@ go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.13.0/go.mod h1:zwrFLgMcdUuIBviXEYEH1YKNaOBnKXsx2IPda5bBwHM= go.uber.org/zap v1.27.1 h1:08RqriUEv8+ArZRYSTXy1LeBScaMpVSTBhCeaZYfMYc= go.uber.org/zap v1.27.1/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= -go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= -go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= +go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -430,8 +436,8 @@ golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.20.0/go.mod h1:Xwo95rrVNIoSMx9wa1JroENMToLWn3RNVrTBpLHgZPQ= -golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= -golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= +golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= +golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= golang.org/x/exp v0.0.0-20251209150349-8475f28825e9 h1:MDfG8Cvcqlt9XXrmEiD4epKn7VJHZO84hejP9Jmp0MM= golang.org/x/exp v0.0.0-20251209150349-8475f28825e9/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod 
h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= @@ -441,8 +447,8 @@ golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI= -golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -454,19 +460,19 @@ golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= -golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= -golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= -golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= -golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= +golang.org/x/oauth2 
v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ= +golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190222072716-a9d3bda3a223/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -488,16 +494,16 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= -golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod 
h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= -golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= -golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= +golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= +golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -507,8 +513,8 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= -golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= -golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools 
v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -524,8 +530,8 @@ golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roY golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= -golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= -golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/xerrors v0.0.0-20190410155217-1f06c39b4373/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190513163551-3ee3066db522/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -534,17 +540,17 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gomodules.xyz/jsonpatch/v2 v2.5.0 h1:JELs8RLM12qJGXU4u/TO3V25KW8GreMKl9pdkk14RM0= gomodules.xyz/jsonpatch/v2 v2.5.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= -gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= -gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= -google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 
h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls= -google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto= -google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= -google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= -google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM= -google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig= -google.golang.org/protobuf v1.36.10 h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= -google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= +google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= +google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= diff --git 
a/internal/controller/core/chronicle_controller.go b/internal/controller/core/chronicle_controller.go index 75963d8f..16c6b782 100644 --- a/internal/controller/core/chronicle_controller.go +++ b/internal/controller/core/chronicle_controller.go @@ -10,6 +10,7 @@ import ( "github.com/go-logr/logr" "github.com/posit-dev/team-operator/api/product" "github.com/posit-dev/team-operator/internal" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" v1 "k8s.io/api/apps/v1" @@ -30,8 +31,9 @@ import ( // ChronicleReconciler reconciles a Chronicle object type ChronicleReconciler struct { client.Client - Scheme *runtime.Scheme - Log logr.Logger + Scheme *runtime.Scheme + Log logr.Logger + Instruments observability.Instruments } //+kubebuilder:rbac:namespace=posit-team,groups=core.posit.team,resources=chronicles,verbs=get;list;watch;create;update;patch;delete @@ -80,12 +82,16 @@ func (r *ChronicleReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( l.Info("Chronicle found; updating resources") - if res, err := r.ReconcileChronicle(ctx, req, &c); err != nil { + // Capture prior phase before any mutation so the metric reflects the real transition. 
+ priorPhase := observability.PhaseFromConditions(c.Status.Conditions) + + if res, err := r.ReconcileChronicle(ctx, req, &c, priorPhase); err != nil { l.Error(err, "error reconciling product state") + r.Instruments.RecordStatusTransition(ctx, "chronicle", req.Namespace, + priorPhase, observability.PhaseError) return res, err } - - // reconcile successful + // reconcile successful — success metric recorded inside ReconcileChronicle return ctrl.Result{}, nil } @@ -97,7 +103,7 @@ func (r *ChronicleReconciler) SetupWithManager(mgr ctrl.Manager) error { Complete(r) } -func (r *ChronicleReconciler) ReconcileChronicle(ctx context.Context, req ctrl.Request, c *positcov1beta1.Chronicle) (ctrl.Result, error) { +func (r *ChronicleReconciler) ReconcileChronicle(ctx context.Context, req ctrl.Request, c *positcov1beta1.Chronicle, priorPhase string) (ctrl.Result, error) { l := r.GetLogger(ctx).WithValues( "event", "reconcile-chronicle", "product", "chronicle", @@ -118,6 +124,8 @@ func (r *ChronicleReconciler) ReconcileChronicle(ctx context.Context, req ctrl.R l.Error(patchErr, "Error patching suspended status") return res, patchErr } + r.Instruments.RecordStatusTransition(ctx, "chronicle", c.Namespace, + priorPhase, observability.PhaseSuspended) return res, nil } @@ -161,6 +169,8 @@ func (r *ChronicleReconciler) ReconcileChronicle(ctx context.Context, req ctrl.R return ctrl.Result{}, err } + r.Instruments.RecordStatusTransition(ctx, "chronicle", c.Namespace, + priorPhase, observability.PhaseReady) return ctrl.Result{}, nil } diff --git a/internal/controller/core/chronicle_controller_test.go b/internal/controller/core/chronicle_controller_test.go index e4e2c6db..16abe47e 100644 --- a/internal/controller/core/chronicle_controller_test.go +++ b/internal/controller/core/chronicle_controller_test.go @@ -10,9 +10,12 @@ import ( "github.com/go-logr/logr" positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" "github.com/posit-dev/team-operator/api/localtest" + 
"github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" @@ -56,7 +59,7 @@ func TestChronicleReconciler_Suspended(t *testing.T) { err := cli.Create(ctx, c) require.NoError(t, err) - res, err := r.ReconcileChronicle(ctx, req, c) + res, err := r.ReconcileChronicle(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -78,3 +81,80 @@ func TestChronicleReconciler_Suspended(t *testing.T) { assert.Equal(t, metav1.ConditionFalse, progressCond.Status) assert.Equal(t, status.ReasonSuspended, progressCond.Reason) } + +// TestChronicleReconciler_Metrics verifies that a status transition metric is recorded +// when ReconcileChronicle processes a suspended Chronicle (PhaseSuspended path). 
+func TestChronicleReconciler_Metrics(t *testing.T) { + ctx := context.Background() + ns := "posit-team" + name := "chronicle-metrics" + + fakeEnv := localtest.FakeTestEnv{} + cli, scheme, log := fakeEnv.Start(loadSchemes) + + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + t.Cleanup(func() { require.NoError(t, mp.Shutdown(context.Background())) }) + + r := &ChronicleReconciler{ + Client: cli, + Scheme: scheme, + Log: log, + Instruments: observability.NewInstruments(mp.Meter("test")), + } + + ctx = logr.NewContext(ctx, log) + req := ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: ns, Name: name}, + } + + suspended := true + c := &positcov1beta1.Chronicle{ + TypeMeta: metav1.TypeMeta{ + Kind: "Chronicle", + APIVersion: "core.posit.team/v1beta1", + }, + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: name}, + Spec: positcov1beta1.ChronicleSpec{Suspended: &suspended}, + } + + err := cli.Create(ctx, c) + require.NoError(t, err) + + // ReconcileChronicle with Suspended=true exercises the PhaseSuspended recording path. 
+ _, err = r.ReconcileChronicle(ctx, req, c, observability.PhaseUnknown) + require.NoError(t, err) + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(ctx, &rm)) + var dp metricdata.DataPoint[int64] + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != observability.MetricStatusTransitionTotal { + continue + } + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64] data type") + require.Len(t, sum.DataPoints, 1, "expected exactly one data point for the single transition") + dp = sum.DataPoints[0] + found = true + break + } + if found { + break + } + } + require.True(t, found, "expected status transition metric to be emitted on suspended path") + attrs := make(map[string]string, dp.Attributes.Len()) + for _, kv := range dp.Attributes.ToSlice() { + attrs[string(kv.Key)] = kv.Value.Emit() + } + assert.Equal(t, map[string]string{ + observability.LabelController: "chronicle", + observability.LabelNamespace: ns, + observability.LabelFromPhase: observability.PhaseUnknown, + observability.LabelToPhase: observability.PhaseSuspended, + }, attrs) + assert.Equal(t, int64(1), dp.Value, "expected exactly one transition recorded") +} diff --git a/internal/controller/core/connect.go b/internal/controller/core/connect.go index 6a6aa844..319ae652 100644 --- a/internal/controller/core/connect.go +++ b/internal/controller/core/connect.go @@ -10,6 +10,7 @@ import ( "github.com/posit-dev/team-operator/api/templates" "github.com/posit-dev/team-operator/internal" "github.com/posit-dev/team-operator/internal/db" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" v1 "k8s.io/api/apps/v1" @@ -32,7 +33,7 @@ import ( //+kubebuilder:rbac:namespace=posit-team,groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete 
//+kubebuilder:rbac:namespace=posit-team,groups=secrets-store.csi.x-k8s.io,resources=secretproviderclasses,verbs=get;list;watch;create;update;patch;delete -func (r *ConnectReconciler) ReconcileConnect(ctx context.Context, req ctrl.Request, c *positcov1beta1.Connect) (ctrl.Result, error) { +func (r *ConnectReconciler) ReconcileConnect(ctx context.Context, req ctrl.Request, c *positcov1beta1.Connect, priorPhase string) (ctrl.Result, error) { l := r.GetLogger(ctx).WithValues( "event", "reconcile-connect", "product", "connect", @@ -161,6 +162,8 @@ func (r *ConnectReconciler) ReconcileConnect(ctx context.Context, req ctrl.Reque return ctrl.Result{}, err } + r.Instruments.RecordStatusTransition(ctx, "connect", req.Namespace, + priorPhase, observability.PhaseReady) return ctrl.Result{}, nil } diff --git a/internal/controller/core/connect_controller.go b/internal/controller/core/connect_controller.go index bbf3389b..1470bca3 100644 --- a/internal/controller/core/connect_controller.go +++ b/internal/controller/core/connect_controller.go @@ -18,13 +18,15 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" + "github.com/posit-dev/team-operator/internal/observability" ) // ConnectReconciler reconciles a ImplConnect object type ConnectReconciler struct { client.Client - Scheme *runtime.Scheme - Log logr.Logger + Scheme *runtime.Scheme + Log logr.Logger + Instruments observability.Instruments } //+kubebuilder:rbac:namespace=posit-team,groups=core.posit.team,resources=connects,verbs=get;list;watch;create;update;patch;delete @@ -77,11 +79,16 @@ func (r *ConnectReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct l.Info("Connect found; updating resources") - if res, err := r.ReconcileConnect(ctx, req, &c); err != nil { + // Capture prior phase before any mutation so the metric reflects the real transition. 
+ priorPhase := observability.PhaseFromConditions(c.Status.Conditions) + + if res, err := r.ReconcileConnect(ctx, req, &c, priorPhase); err != nil { l.Error(err, "error reconciling product state") + r.Instruments.RecordStatusTransition(ctx, "connect", req.Namespace, + priorPhase, observability.PhaseError) return res, err } - // reconcile successful + // reconcile successful — success metric recorded inside ReconcileConnect return ctrl.Result{}, nil } diff --git a/internal/controller/core/connect_test.go b/internal/controller/core/connect_test.go index 3f31ce26..99e68906 100644 --- a/internal/controller/core/connect_test.go +++ b/internal/controller/core/connect_test.go @@ -9,10 +9,13 @@ import ( localtest "github.com/posit-dev/team-operator/api/localtest" "github.com/posit-dev/team-operator/api/product" "github.com/posit-dev/team-operator/internal" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" apimeta "k8s.io/apimachinery/pkg/api/meta" @@ -26,6 +29,7 @@ func initConnectReconciler(t *testing.T, ctx context.Context, namespace, name st localEnv := localtest.LocalTestEnv{} cli, cliScheme, log, err := localEnv.Start(loadSchemes) require.NoError(t, err) + t.Cleanup(func() { _ = localEnv.Stop() }) r := &ConnectReconciler{ Client: cli, Scheme: cliScheme, @@ -90,6 +94,12 @@ func TestConnectReconciler_SAML(t *testing.T) { ctx, r, req, cli := initConnectReconciler(t, ctx, ns, name) + // Wire up an in-memory meter so we can assert metric recording. 
+ reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + t.Cleanup(func() { require.NoError(t, mp.Shutdown(context.Background())) }) + r.Instruments = observability.NewInstruments(mp.Meter("test")) + c := defineDefaultConnect(t, ns, name) c.Spec.Auth = positcov1beta1.AuthSpec{ Type: positcov1beta1.AuthTypeSaml, @@ -101,7 +111,7 @@ func TestConnectReconciler_SAML(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -117,6 +127,93 @@ func TestConnectReconciler_SAML(t *testing.T) { require.True(t, exists, "rstudio-connect.gcfg should exist in the ConfigMap") assert.Contains(t, config, "[Authentication]\nProvider = saml", "SAML auth should be enabled") assert.Contains(t, config, "[SAML]\nIdPMetaDataURL = https://idp.example.com/saml/metadata\nIdPAttributeProfile = default\n", "SAML section should be configured") + + // Assert that the status transition metric was emitted with the expected + // label contract. A regression that swapped from/to phases, omitted the + // namespace label, or recorded the wrong controller would change this map. 
+ var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(ctx, &rm)) + var dp metricdata.DataPoint[int64] + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != observability.MetricStatusTransitionTotal { + continue + } + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64] data type") + require.Len(t, sum.DataPoints, 1, "expected one transition per reconcile") + dp = sum.DataPoints[0] + found = true + } + } + require.True(t, found, "expected status transition metric to be emitted") + attrs := make(map[string]string, dp.Attributes.Len()) + for _, kv := range dp.Attributes.ToSlice() { + attrs[string(kv.Key)] = kv.Value.Emit() + } + assert.Equal(t, map[string]string{ + observability.LabelController: "connect", + observability.LabelNamespace: ns, + observability.LabelFromPhase: observability.PhaseUnknown, + observability.LabelToPhase: observability.PhaseReady, + }, attrs) + assert.Equal(t, int64(1), dp.Value, "expected exactly one transition recorded") +} + +// TestConnectReconciler_ErrorRecordsTransition exercises the error emission +// site in Reconcile (not ReconcileConnect), so a regression that drops the +// error metric while keeping the success metric — or vice versa — is caught. +func TestConnectReconciler_ErrorRecordsTransition(t *testing.T) { + ctx := context.Background() + ns := "posit-team" + name := "connect-err" + + ctx, r, req, _ := initConnectReconciler(t, ctx, ns, name) + + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + t.Cleanup(func() { require.NoError(t, mp.Shutdown(context.Background())) }) + r.Instruments = observability.NewInstruments(mp.Meter("test")) + + // Force ReconcileConnect to error early via the SAML mutual-exclusivity check. 
+ c := defineDefaultConnect(t, ns, name) + c.Spec.Auth = positcov1beta1.AuthSpec{ + Type: positcov1beta1.AuthTypeSaml, + SamlMetadataUrl: "https://idp.example.com/saml/metadata", + SamlIdPAttributeProfile: "custom-profile", + SamlUsernameAttribute: "http://schemas.xmlsoap.org/ws/2005/05/identity/claims/upn", + } + + require.NoError(t, internal.BasicCreateOrUpdate(ctx, r, r.GetLogger(ctx), req.NamespacedName, &positcov1beta1.Connect{}, c)) + + _, err := r.Reconcile(ctx, req) + require.Error(t, err) + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(ctx, &rm)) + var dp metricdata.DataPoint[int64] + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != observability.MetricStatusTransitionTotal { + continue + } + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64] data type") + require.Len(t, sum.DataPoints, 1, "expected one transition per reconcile") + dp = sum.DataPoints[0] + found = true + } + } + require.True(t, found, "expected status transition metric to be emitted on error") + attrs := make(map[string]string, dp.Attributes.Len()) + for _, kv := range dp.Attributes.ToSlice() { + attrs[string(kv.Key)] = kv.Value.Emit() + } + assert.Equal(t, observability.PhaseError, attrs[observability.LabelToPhase], "to_phase should be error") + assert.Equal(t, "connect", attrs[observability.LabelController], "controller should be connect") + assert.Equal(t, ns, attrs[observability.LabelNamespace], "namespace label should match") } func TestConnectReconciler_SAML_WithIdPAttributeProfile(t *testing.T) { @@ -138,7 +235,7 @@ func TestConnectReconciler_SAML_WithIdPAttributeProfile(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -178,7 +275,7 @@ func TestConnectReconciler_SAML_WithIndividualAttributes(t 
*testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -222,7 +319,7 @@ func TestConnectReconciler_SAML_PartialIndividualAttributes(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -263,7 +360,7 @@ func TestConnectReconciler_SAML_ValidationError_MutualExclusivity(t *testing.T) c = getConnect(t, cli, ns, name) - _, err = r.ReconcileConnect(ctx, req, c) + _, err = r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.Error(t, err) assert.Contains(t, err.Error(), "SAML IdPAttributeProfile cannot be specified together with individual SAML attribute mappings") } @@ -282,7 +379,7 @@ func TestConnectReconciler_DefaultDatabaseSchemas(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -319,7 +416,7 @@ func TestConnectReconciler_CustomDatabaseSchemas(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -359,7 +456,7 @@ func TestConnectReconciler_OIDC_EnableRegisterOnFirstLogin(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -396,7 +493,7 @@ func TestConnectReconciler_OIDC_DefaultRegisterOnFirstLogin(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := 
r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -430,7 +527,7 @@ func TestConnectReconciler_RegisterOnFirstLogin_IgnoredWithNoAuth(t *testing.T) c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -466,7 +563,7 @@ func TestConnectReconciler_RegisterOnFirstLogin_IgnoredWithSAML(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -505,7 +602,7 @@ func TestConnectReconciler_OIDC_DisableGroupsClaim(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -546,7 +643,7 @@ func TestConnectReconciler_Suspended(t *testing.T) { c = getConnect(t, cli, ns, name) - res, err := r.ReconcileConnect(ctx, req, c) + res, err := r.ReconcileConnect(ctx, req, c, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) diff --git a/internal/controller/core/flightdeck_controller.go b/internal/controller/core/flightdeck_controller.go index 51021687..71f2e025 100644 --- a/internal/controller/core/flightdeck_controller.go +++ b/internal/controller/core/flightdeck_controller.go @@ -9,6 +9,7 @@ import ( "github.com/go-logr/logr" positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" "github.com/posit-dev/team-operator/internal" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" appsv1 "k8s.io/api/apps/v1" @@ -29,8 +30,9 @@ import ( // FlightdeckReconciler reconciles a Flightdeck object type 
FlightdeckReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme + Log logr.Logger + Scheme *runtime.Scheme + Instruments observability.Instruments } //+kubebuilder:rbac:namespace=posit-team,groups=core.posit.team,resources=flightdecks,verbs=get;list;watch;create;update;patch;delete @@ -70,6 +72,9 @@ func (r *FlightdeckReconciler) Reconcile(ctx context.Context, req ctrl.Request) "domain", fd.Spec.Domain, ) + // Capture prior phase before any mutation so the metric reflects the real transition. + priorPhase := observability.PhaseFromConditions(fd.Status.Conditions) + // Save a copy for status patching patchBase := client.MergeFrom(fd.DeepCopy()) @@ -79,6 +84,8 @@ func (r *FlightdeckReconciler) Reconcile(ctx context.Context, req ctrl.Request) if res, err := r.reconcileFlightdeckResources(ctx, req, fd, l); err != nil { l.Error(err, "failed to reconcile flightdeck resources") + r.Instruments.RecordStatusTransition(ctx, "flightdeck", req.Namespace, + priorPhase, observability.PhaseError) if patchErr := status.PatchErrorStatus(ctx, r.Status(), fd, patchBase, &fd.Status.Conditions, fd.Generation, err); patchErr != nil { l.Error(patchErr, "Error patching error status") } @@ -105,6 +112,9 @@ func (r *FlightdeckReconciler) Reconcile(ctx context.Context, req ctrl.Request) return ctrl.Result{}, err } + r.Instruments.RecordStatusTransition(ctx, "flightdeck", req.Namespace, + priorPhase, observability.PhaseReady) + l.Info("reconciliation completed successfully", "component", fd.ComponentName(), "domain", fd.Spec.Domain, diff --git a/internal/controller/core/flightdeck_test.go b/internal/controller/core/flightdeck_test.go index 9dda54dd..600b0ac7 100644 --- a/internal/controller/core/flightdeck_test.go +++ b/internal/controller/core/flightdeck_test.go @@ -6,8 +6,11 @@ import ( "github.com/posit-dev/team-operator/api/core/v1beta1" "github.com/posit-dev/team-operator/api/localtest" + "github.com/posit-dev/team-operator/internal/observability" 
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" networkingv1 "k8s.io/api/networking/v1" @@ -437,3 +440,48 @@ func TestResolveFlightdeckImage(t *testing.T) { }) } } + +func TestFlightdeckReconciler_Metrics(t *testing.T) { + fdName := "metrics-flightdeck" + fdNamespace := "posit-team" + fd := defaultFlightdeck(fdName, fdNamespace) + + fakeClient := localtest.FakeTestEnv{} + cli, scheme, log := fakeClient.Start(loadSchemes) + + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + defer mp.Shutdown(context.Background()) + + rec := FlightdeckReconciler{ + Client: cli, + Scheme: scheme, + Log: log, + Instruments: observability.NewInstruments(mp.Meter("test")), + } + + err := cli.Create(context.TODO(), fd) + require.NoError(t, err) + + req := ctrl.Request{ + NamespacedName: types.NamespacedName{ + Namespace: fdNamespace, + Name: fdName, + }, + } + + _, err = rec.Reconcile(context.TODO(), req) + require.NoError(t, err) + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name == observability.MetricStatusTransitionTotal { + found = true + } + } + } + assert.True(t, found, "expected status transition to be recorded") +} diff --git a/internal/controller/core/package_manager.go b/internal/controller/core/package_manager.go index 730b1234..2021142f 100644 --- a/internal/controller/core/package_manager.go +++ b/internal/controller/core/package_manager.go @@ -8,6 +8,7 @@ import ( "github.com/posit-dev/team-operator/api/product" "github.com/posit-dev/team-operator/internal" "github.com/posit-dev/team-operator/internal/db" + "github.com/posit-dev/team-operator/internal/observability" 
"github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" v1 "k8s.io/api/apps/v1" @@ -96,7 +97,7 @@ func (r *PackageManagerReconciler) cleanupDeployedService(ctx context.Context, r const packageManagerConfigShaKey = "package-manager.posit.team/configmap-sha" -func (r *PackageManagerReconciler) ReconcilePackageManager(ctx context.Context, req ctrl.Request, pm *positcov1beta1.PackageManager) (ctrl.Result, error) { +func (r *PackageManagerReconciler) ReconcilePackageManager(ctx context.Context, req ctrl.Request, pm *positcov1beta1.PackageManager, priorPhase string) (ctrl.Result, error) { l := r.GetLogger(ctx).WithValues( "event", "reconcile-package-manager-service", "product", "package-manager", @@ -222,6 +223,8 @@ func (r *PackageManagerReconciler) ReconcilePackageManager(ctx context.Context, return ctrl.Result{}, err } + r.Instruments.RecordStatusTransition(ctx, "packagemanager", req.Namespace, + priorPhase, observability.PhaseReady) return ctrl.Result{}, nil } diff --git a/internal/controller/core/package_manager_controller_test.go b/internal/controller/core/package_manager_controller_test.go index bb1d64a0..19196426 100644 --- a/internal/controller/core/package_manager_controller_test.go +++ b/internal/controller/core/package_manager_controller_test.go @@ -11,9 +11,13 @@ import ( positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" "github.com/posit-dev/team-operator/api/localtest" "github.com/posit-dev/team-operator/api/product" + "github.com/posit-dev/team-operator/internal/db" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" @@ -24,6 +28,84 @@ import ( 
"sigs.k8s.io/controller-runtime/pkg/client" ) +// TestPackageManagerReconciler_Metrics verifies that a status transition metric is recorded +// when Reconcile processes a PackageManager (error path through the real reconcile loop). +func TestPackageManagerReconciler_Metrics(t *testing.T) { + ctx := context.Background() + ns := "posit-team" + name := "pm-metrics" + + fakeEnv := localtest.FakeTestEnv{} + cli, scheme, log := fakeEnv.Start(loadSchemes) + + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + t.Cleanup(func() { require.NoError(t, mp.Shutdown(context.Background())) }) + + r := &PackageManagerReconciler{ + Client: cli, + Scheme: scheme, + Log: log, + Instruments: observability.NewInstruments(mp.Meter("test")), + } + + ctx = logr.NewContext(ctx, log) + req := ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: ns, Name: name}, + } + + pm := &positcov1beta1.PackageManager{ + TypeMeta: metav1.TypeMeta{ + Kind: "PackageManager", + APIVersion: "core.posit.team/v1beta1", + }, + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: name}, + } + + err := cli.Create(ctx, pm) + require.NoError(t, err) + + // Reconcile will find the PM, call ReconcilePackageManager, which will fail + // at the DB step (fake client has no DB). The error path in Reconcile records + // the PhaseError status transition metric. 
+ _, err = r.Reconcile(ctx, req) + require.ErrorIs(t, err, db.ErrDBHostnameMissing, + "expected DB-step failure to propagate") + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(ctx, &rm)) + var dp metricdata.DataPoint[int64] + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != observability.MetricStatusTransitionTotal { + continue + } + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64] data type") + require.Len(t, sum.DataPoints, 1, "expected exactly one data point for the single transition") + dp = sum.DataPoints[0] + found = true + break + } + if found { + break + } + } + require.True(t, found, "expected status transition metric to be emitted on error") + attrs := make(map[string]string, dp.Attributes.Len()) + for _, kv := range dp.Attributes.ToSlice() { + attrs[string(kv.Key)] = kv.Value.Emit() + } + assert.Equal(t, map[string]string{ + observability.LabelController: "packagemanager", + observability.LabelNamespace: ns, + observability.LabelFromPhase: observability.PhaseUnknown, + observability.LabelToPhase: observability.PhaseError, + }, attrs) + assert.Equal(t, int64(1), dp.Value, "expected exactly one transition recorded") +} + // TestPackageManagerReconciler_Suspended verifies that when PackageManager has Suspended=true, // ReconcilePackageManager does not create a Deployment and does not apply SetProgressing. 
func TestPackageManagerReconciler_Suspended(t *testing.T) { @@ -58,7 +140,7 @@ func TestPackageManagerReconciler_Suspended(t *testing.T) { err := cli.Create(ctx, pm) require.NoError(t, err) - res, err := r.ReconcilePackageManager(ctx, req, pm) + res, err := r.ReconcilePackageManager(ctx, req, pm, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) diff --git a/internal/controller/core/packagemanager_controller.go b/internal/controller/core/packagemanager_controller.go index e4ac416d..eb2ec4b7 100644 --- a/internal/controller/core/packagemanager_controller.go +++ b/internal/controller/core/packagemanager_controller.go @@ -18,13 +18,15 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" + "github.com/posit-dev/team-operator/internal/observability" ) // PackageManagerReconciler reconciles a PackageManager object type PackageManagerReconciler struct { client.Client - Scheme *runtime.Scheme - Log logr.Logger + Scheme *runtime.Scheme + Log logr.Logger + Instruments observability.Instruments } //+kubebuilder:rbac:namespace=posit-team,groups=core.posit.team,resources=packagemanagers,verbs=get;list;watch;create;update;patch;delete @@ -71,12 +73,16 @@ func (r *PackageManagerReconciler) Reconcile(ctx context.Context, req ctrl.Reque l.Info("PackageManager found; updating resources") - if res, err := r.ReconcilePackageManager(ctx, req, &pm); err != nil { + // Capture prior phase before any mutation so the metric reflects the real transition. 
+ priorPhase := observability.PhaseFromConditions(pm.Status.Conditions) + + if res, err := r.ReconcilePackageManager(ctx, req, &pm, priorPhase); err != nil { l.Error(err, "error reconciling product state") + r.Instruments.RecordStatusTransition(ctx, "packagemanager", req.Namespace, + priorPhase, observability.PhaseError) return res, err } - - // reconcile successful + // reconcile successful — success metric recorded inside ReconcilePackageManager return ctrl.Result{}, nil } diff --git a/internal/controller/core/postgresdatabase_controller.go b/internal/controller/core/postgresdatabase_controller.go index 934b2cd7..aa6e87df 100644 --- a/internal/controller/core/postgresdatabase_controller.go +++ b/internal/controller/core/postgresdatabase_controller.go @@ -16,6 +16,7 @@ import ( "github.com/posit-dev/team-operator/api/product" "github.com/posit-dev/team-operator/internal" "github.com/posit-dev/team-operator/internal/db" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -48,8 +49,9 @@ var ( // PostgresDatabaseReconciler reconciles a PostgresDatabase object type PostgresDatabaseReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme + Log logr.Logger + Scheme *runtime.Scheme + Instruments observability.Instruments } func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { @@ -82,6 +84,9 @@ func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req l.Info("PostgresDatabase found; reconciling database") + // Capture prior phase before any mutation so the metric reflects the real transition. 
+ priorPhase := observability.PhaseFromConditions(pgd.Status.Conditions) + // Save a copy for status patching patchBase := client.MergeFrom(pgd.DeepCopy()) @@ -96,9 +101,13 @@ func (r *PostgresDatabaseReconciler) Reconcile(ctx context.Context, req ctrl.Req msg := status.TruncateMessage(createErr.Error()) status.SetReady(&pgd.Status.Conditions, pgd.Generation, metav1.ConditionFalse, status.ReasonReconcileError, msg) status.SetProgressing(&pgd.Status.Conditions, pgd.Generation, metav1.ConditionFalse, status.ReasonReconcileError, msg) + r.Instruments.RecordStatusTransition(ctx, "postgres-database", req.Namespace, + priorPhase, observability.PhaseError) } else { status.SetReady(&pgd.Status.Conditions, pgd.Generation, metav1.ConditionTrue, status.ReasonDatabaseReady, "Database provisioned successfully") status.SetProgressing(&pgd.Status.Conditions, pgd.Generation, metav1.ConditionFalse, status.ReasonReconcileComplete, "Reconciliation complete") + r.Instruments.RecordStatusTransition(ctx, "postgres-database", req.Namespace, + priorPhase, observability.PhaseDatabaseReady) } // Patch status regardless of createDatabase result @@ -237,8 +246,12 @@ func (r *PostgresDatabaseReconciler) createDatabase(ctx context.Context, req ctr mainDbUrl, specDbUrl, err := r.loadValidatedDatabaseURLs(ctx, pgd, req, pgd.Spec.Secret, pgd.Spec.SecretPasswordKey) if err != nil { l.Error(err, "failed to load validated database urls") + r.Instruments.RecordDependencyCheck(ctx, "postgres-database", req.Namespace, + observability.DependencyPostgres, observability.ResultError) return ctrl.Result{}, err } + r.Instruments.RecordDependencyCheck(ctx, "postgres-database", req.Namespace, + observability.DependencyPostgres, observability.ResultSuccess) superuserDbUrl, _ := url.Parse(specDbUrl.String()) mainDbPassword, hasPassword := mainDbUrl.User.Password() diff --git a/internal/controller/core/site_controller.go b/internal/controller/core/site_controller.go index cc4f8df3..06a78e8d 100644 --- 
a/internal/controller/core/site_controller.go +++ b/internal/controller/core/site_controller.go @@ -13,6 +13,7 @@ import ( positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" "github.com/posit-dev/team-operator/api/product" "github.com/posit-dev/team-operator/internal" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" corev1 "k8s.io/api/core/v1" @@ -38,8 +39,9 @@ func checkBool(b *bool, defaultVal bool) bool { // SiteReconciler reconciles a Site object type SiteReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme + Log logr.Logger + Scheme *runtime.Scheme + Instruments observability.Instruments } //+kubebuilder:rbac:namespace=posit-team,groups=core.posit.team,resources=sites,verbs=get;list;watch;create;update;patch;delete @@ -86,6 +88,9 @@ func (r *SiteReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. l.Info("Site found; updating resources") + // Capture prior phase before any mutation so the metric reflects the real transition. + priorPhase := observability.PhaseFromConditions(s.Status.Conditions) + // Save a copy for status patching patchBase := client.MergeFrom(s.DeepCopy()) @@ -98,18 +103,24 @@ func (r *SiteReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. // Aggregate child component status aggregateErr := r.aggregateChildStatus(ctx, req, s) - // Update status based on reconciliation result + // Update status based on reconciliation result. Capture the destination phase + // so the metric is emitted only after a successful status persist, and only + // when the phase actually changed. 
+ var toPhase string if reconcileErr != nil { msg := status.TruncateMessage(reconcileErr.Error()) status.SetReady(&s.Status.Conditions, s.Generation, metav1.ConditionFalse, status.ReasonReconcileError, msg) + toPhase = observability.PhaseError status.SetProgressing(&s.Status.Conditions, s.Generation, metav1.ConditionFalse, status.ReasonReconcileError, msg) } else { // Overall Ready is true only if all children are ready allReady := s.Status.ConnectReady && s.Status.WorkbenchReady && s.Status.PackageManagerReady && s.Status.ChronicleReady && s.Status.FlightdeckReady if allReady { status.SetReady(&s.Status.Conditions, s.Generation, metav1.ConditionTrue, status.ReasonAllComponentsReady, "All child components are ready") + toPhase = observability.PhaseComponentsReady } else { status.SetReady(&s.Status.Conditions, s.Generation, metav1.ConditionFalse, status.ReasonComponentsNotReady, "One or more child components are not ready") + toPhase = observability.PhaseProgressing } status.SetProgressing(&s.Status.Conditions, s.Generation, metav1.ConditionFalse, status.ReasonReconcileComplete, "Reconciliation complete") } @@ -123,6 +134,10 @@ func (r *SiteReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. return ctrl.Result{}, patchErr } + // Only record on actual phase transitions and after the status was persisted, + // so the counter reflects real state changes, not steady-state reconciles. 
+ r.Instruments.RecordStatusTransition(ctx, "site", req.Namespace, priorPhase, toPhase) + if reconcileErr != nil { if aggregateErr != nil { l.Error(aggregateErr, "Error aggregating child status (returning reconcile error instead)") diff --git a/internal/controller/core/site_test.go b/internal/controller/core/site_test.go index 67b8515b..7aa3a893 100644 --- a/internal/controller/core/site_test.go +++ b/internal/controller/core/site_test.go @@ -9,11 +9,14 @@ import ( "github.com/posit-dev/team-operator/api/keycloak/v2alpha1" "github.com/posit-dev/team-operator/api/localtest" "github.com/posit-dev/team-operator/api/product" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/traefik/traefik/v3/pkg/provider/kubernetes/crd/traefikio/v1alpha1" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" policyv1 "k8s.io/api/policy/v1" @@ -730,12 +733,12 @@ func TestSiteReconcileWithSA(t *testing.T) { localTestEnv := localtest.LocalTestEnv{} cli, cliScheme, log, err := localTestEnv.Start(loadSchemes) - r.NoError(err) - t.Cleanup(func() { r.NoError(localTestEnv.Stop()) }) + r.NoError(err) + site := defaultSite("test-site") site.Spec.Workbench.ExperimentalFeatures = &v1beta1.InternalWorkbenchExperimentalFeatures{ SessionServiceAccountName: "test-sa", @@ -789,6 +792,8 @@ func TestSiteReconcileWithExperimental(t *testing.T) { localTestEnv := localtest.LocalTestEnv{} cli, cliScheme, log, err := localTestEnv.Start(loadSchemes) + t.Cleanup(func() { _ = localTestEnv.Stop() }) + assert.Nil(t, err) site := defaultSite("experimental-site") @@ -844,10 +849,6 @@ func TestSiteReconcileWithExperimental(t *testing.T) { assert.NotNil(t, tmpWorkbench) assert.NotNil(t, tmpWorkbench.Spec.Config.RServer) assert.Equal(t, 
1, tmpWorkbench.Spec.Config.RServer.DatabricksEnabled) - - // stop testEnv - err = localTestEnv.Stop() - assert.Nil(t, err) } func TestSiteKeycloak(t *testing.T) { @@ -1734,7 +1735,13 @@ func TestSiteReadyWithDisabledProducts(t *testing.T) { // Use shared fake client to run multiple reconcile passes fakeClient := localtest.FakeTestEnv{} cli, scheme, log := fakeClient.Start(loadSchemes) - rec := SiteReconciler{Client: cli, Scheme: scheme, Log: log} + + // Set up in-memory meter for metric assertion + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + t.Cleanup(func() { require.NoError(t, mp.Shutdown(context.Background())) }) + + rec := SiteReconciler{Client: cli, Scheme: scheme, Log: log, Instruments: observability.NewInstruments(mp.Meter("test"))} req := ctrl.Request{NamespacedName: types.NamespacedName{Namespace: siteNamespace, Name: siteName}} // Create the Site @@ -1745,6 +1752,38 @@ func TestSiteReadyWithDisabledProducts(t *testing.T) { _, err = rec.Reconcile(context.TODO(), req) assert.NoError(t, err) + // Assert that the status transition metric was emitted with the expected label + // contract. Reconcile transitions from no prior Ready condition (PhaseUnknown) + // to PhaseComponentsReady because all required products are disabled. 
+ var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + var dp metricdata.DataPoint[int64] + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != observability.MetricStatusTransitionTotal { + continue + } + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64] data type") + require.Len(t, sum.DataPoints, 1, "expected one transition per reconcile") + dp = sum.DataPoints[0] + found = true + } + } + require.True(t, found, "expected status transition metric to be emitted") + attrs := make(map[string]string, dp.Attributes.Len()) + for _, kv := range dp.Attributes.ToSlice() { + attrs[string(kv.Key)] = kv.Value.Emit() + } + assert.Equal(t, map[string]string{ + observability.LabelController: "site", + observability.LabelNamespace: siteNamespace, + observability.LabelFromPhase: observability.PhaseUnknown, + observability.LabelToPhase: observability.PhaseComponentsReady, + }, attrs) + assert.Equal(t, int64(1), dp.Value, "expected exactly one transition recorded") + // Fetch the Site to check its status fetchedSite := &v1beta1.Site{} err = cli.Get(context.TODO(), client.ObjectKey{Name: siteName, Namespace: siteNamespace}, fetchedSite) diff --git a/internal/controller/core/workbench.go b/internal/controller/core/workbench.go index 6b7003e2..55330c30 100644 --- a/internal/controller/core/workbench.go +++ b/internal/controller/core/workbench.go @@ -15,6 +15,7 @@ import ( "github.com/posit-dev/team-operator/api/templates" "github.com/posit-dev/team-operator/internal" "github.com/posit-dev/team-operator/internal/db" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/rstudio/goex/ptr" "github.com/traefik/traefik/v3/pkg/config/dynamic" @@ -89,7 +90,7 @@ func (r *WorkbenchReconciler) FetchAndSetClientSecretForAzureDatabricks(ctx cont return nil } -func (r *WorkbenchReconciler) 
ReconcileWorkbench(ctx context.Context, req ctrl.Request, w *positcov1beta1.Workbench) (ctrl.Result, error) { +func (r *WorkbenchReconciler) ReconcileWorkbench(ctx context.Context, req ctrl.Request, w *positcov1beta1.Workbench, priorPhase string) (ctrl.Result, error) { l := r.GetLogger(ctx).WithValues( "event", "reconcile-workbench", "product", "workbench", @@ -213,6 +214,8 @@ func (r *WorkbenchReconciler) ReconcileWorkbench(ctx context.Context, req ctrl.R return ctrl.Result{}, err } + r.Instruments.RecordStatusTransition(ctx, "workbench", req.Namespace, + priorPhase, observability.PhaseReady) return ctrl.Result{}, nil } diff --git a/internal/controller/core/workbench_controller.go b/internal/controller/core/workbench_controller.go index fad5bdd6..50e3b6dd 100644 --- a/internal/controller/core/workbench_controller.go +++ b/internal/controller/core/workbench_controller.go @@ -18,13 +18,15 @@ import ( "sigs.k8s.io/controller-runtime/pkg/predicate" positcov1beta1 "github.com/posit-dev/team-operator/api/core/v1beta1" + "github.com/posit-dev/team-operator/internal/observability" ) // WorkbenchReconciler reconciles a Workbench object type WorkbenchReconciler struct { client.Client - Scheme *runtime.Scheme - Log logr.Logger + Scheme *runtime.Scheme + Log logr.Logger + Instruments observability.Instruments } //+kubebuilder:rbac:namespace=posit-team,groups=core.posit.team,resources=workbenches,verbs=get;list;watch;create;update;patch;delete @@ -79,11 +81,16 @@ func (r *WorkbenchReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( l.Info("Workbench found; updating resources") - if res, err := r.ReconcileWorkbench(ctx, req, &w); err != nil { + // Capture prior phase before any mutation so the metric reflects the real transition. 
+ priorPhase := observability.PhaseFromConditions(w.Status.Conditions) + + if res, err := r.ReconcileWorkbench(ctx, req, &w, priorPhase); err != nil { l.Error(err, "error reconciling product state") + r.Instruments.RecordStatusTransition(ctx, "workbench", req.Namespace, + priorPhase, observability.PhaseError) return res, err } - // reconcile successful + // reconcile successful — success metric recorded inside ReconcileWorkbench return ctrl.Result{}, nil } diff --git a/internal/controller/core/workbench_test.go b/internal/controller/core/workbench_test.go index c53549f4..19c9cd21 100644 --- a/internal/controller/core/workbench_test.go +++ b/internal/controller/core/workbench_test.go @@ -10,9 +10,12 @@ import ( "github.com/posit-dev/team-operator/api/product" "github.com/posit-dev/team-operator/internal" "github.com/posit-dev/team-operator/internal/db" + "github.com/posit-dev/team-operator/internal/observability" "github.com/posit-dev/team-operator/internal/status" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" networkingv1 "k8s.io/api/networking/v1" @@ -77,6 +80,7 @@ func initWorkbenchReconciler(t *testing.T, ctx context.Context, namespace, name localEnv := localtest.LocalTestEnv{} cli, cliScheme, log, err := localEnv.Start(loadSchemes) require.NoError(t, err) + t.Cleanup(func() { _ = localEnv.Stop() }) r := &WorkbenchReconciler{ Client: cli, Scheme: cliScheme, @@ -138,6 +142,12 @@ func TestWorkbenchReconciler_Basic(t *testing.T) { ctx, r, req, cli := initWorkbenchReconciler(t, ctx, ns, name) + // Wire up an in-memory meter so we can assert metric recording. 
+ reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + t.Cleanup(func() { require.NoError(t, mp.Shutdown(context.Background())) }) + r.Instruments = observability.NewInstruments(mp.Meter("test")) + wb := defineDefaultWorkbench(t, ns, name) // have to make sure the CRD _actually exists_ @@ -146,7 +156,7 @@ func TestWorkbenchReconciler_Basic(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -159,6 +169,103 @@ func TestWorkbenchReconciler_Basic(t *testing.T) { headersMiddleware := getMiddleware(t, cli, ns, r.HeadersMiddleware(wb)) require.Equal(t, headersMiddleware.Name, r.HeadersMiddleware(wb)) + + // Assert that the status transition metric was emitted with the expected + // label contract. A regression that swapped from/to phases, omitted the + // namespace label, or recorded the wrong controller would change this map. 
+ var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(ctx, &rm)) + var dp metricdata.DataPoint[int64] + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != observability.MetricStatusTransitionTotal { + continue + } + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64] data type") + require.Len(t, sum.DataPoints, 1, "expected exactly one data point for the single transition") + dp = sum.DataPoints[0] + found = true + break + } + if found { + break + } + } + require.True(t, found, "expected status transition metric to be emitted") + attrs := make(map[string]string, dp.Attributes.Len()) + for _, kv := range dp.Attributes.ToSlice() { + attrs[string(kv.Key)] = kv.Value.Emit() + } + assert.Equal(t, map[string]string{ + observability.LabelController: "workbench", + observability.LabelNamespace: ns, + observability.LabelFromPhase: observability.PhaseUnknown, + observability.LabelToPhase: observability.PhaseReady, + }, attrs) + assert.Equal(t, int64(1), dp.Value, "expected exactly one transition recorded") +} + +// TestWorkbenchReconciler_ErrorRecordsTransition exercises the error emission +// site in Reconcile (not ReconcileWorkbench), so a regression that drops the +// error metric while keeping the success metric — or vice versa — is caught. +func TestWorkbenchReconciler_ErrorRecordsTransition(t *testing.T) { + ctx := context.Background() + ns := "posit-team" + name := "workbench-err" + + ctx, r, req, _ := initWorkbenchReconciler(t, ctx, ns, name) + + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + t.Cleanup(func() { require.NoError(t, mp.Shutdown(context.Background())) }) + r.Instruments = observability.NewInstruments(mp.Meter("test")) + + // Force ReconcileWorkbench to error via the SAML missing-metadata-URL check. 
+ wb := defineDefaultWorkbench(t, ns, name) + wb.Spec.Auth = positcov1beta1.AuthSpec{ + Type: positcov1beta1.AuthTypeSaml, + UsernameClaim: "email", + // SamlMetadataUrl intentionally not set + } + + require.NoError(t, internal.BasicCreateOrUpdate(ctx, r, r.GetLogger(ctx), req.NamespacedName, &positcov1beta1.Workbench{}, wb)) + + _, err := r.Reconcile(ctx, req) + require.ErrorContains(t, err, "SAML authentication requires a metadata URL") + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(ctx, &rm)) + var dp metricdata.DataPoint[int64] + found := false + for _, sm := range rm.ScopeMetrics { + for _, m := range sm.Metrics { + if m.Name != observability.MetricStatusTransitionTotal { + continue + } + sum, ok := m.Data.(metricdata.Sum[int64]) + require.True(t, ok, "expected Sum[int64] data type") + require.Len(t, sum.DataPoints, 1, "expected exactly one data point for the single transition") + dp = sum.DataPoints[0] + found = true + break + } + if found { + break + } + } + require.True(t, found, "expected status transition metric to be emitted on error") + attrs := make(map[string]string, dp.Attributes.Len()) + for _, kv := range dp.Attributes.ToSlice() { + attrs[string(kv.Key)] = kv.Value.Emit() + } + assert.Equal(t, map[string]string{ + observability.LabelController: "workbench", + observability.LabelNamespace: ns, + observability.LabelFromPhase: observability.PhaseUnknown, + observability.LabelToPhase: observability.PhaseError, + }, attrs) } func TestWorkbenchReadinessProbePath(t *testing.T) { @@ -189,7 +296,7 @@ func TestWorkbenchReadinessProbePath(t *testing.T) { require.NoError(t, err) wb = getWorkbench(t, cli, ns, wbName) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -218,7 +325,7 @@ func TestWorkbenchConfigReload(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, 
req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -237,7 +344,7 @@ func TestWorkbenchConfigReload(t *testing.T) { // reconcile again... (have to create/update too...?) err = internal.BasicCreateOrUpdate(ctx, r, r.GetLogger(ctx), req.NamespacedName, &positcov1beta1.Workbench{}, preWb) require.NoError(t, err) - res, err = r.ReconcileWorkbench(ctx, req, preWb) + res, err = r.ReconcileWorkbench(ctx, req, preWb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -268,7 +375,7 @@ func TestWorkbenchAuthSaml(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -305,7 +412,7 @@ func TestWorkbenchAuthSamlMissingMetadata(t *testing.T) { wb = getWorkbench(t, cli, ns, name) // Should return an error when SamlMetadataUrl is not provided - _, err = r.ReconcileWorkbench(ctx, req, wb) + _, err = r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) assert.Error(t, err) assert.Contains(t, err.Error(), "SAML authentication requires a metadata URL") } @@ -328,7 +435,7 @@ func TestWorkbenchLoadBalancingInitContainer(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -387,7 +494,7 @@ func TestWorkbenchLoadBalancingDisabled(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -417,7 +524,7 @@ func TestWorkbenchPodDisruptionBudgets(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := 
r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -455,7 +562,7 @@ func TestWorkbenchReconciler_Suspended(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -503,7 +610,7 @@ func TestWorkbenchReconciler_SuspendRemovesDeployment(t *testing.T) { wb = getWorkbench(t, cli, ns, name) // Pass 1: normal reconcile — Deployment should be created - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -529,7 +636,7 @@ func TestWorkbenchReconciler_SuspendRemovesDeployment(t *testing.T) { require.NoError(t, err) wb = getWorkbench(t, cli, ns, name) - res, err = r.ReconcileWorkbench(ctx, req, wb) + res, err = r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -614,7 +721,7 @@ func TestWorkbenchSCIM_Disabled(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -656,7 +763,7 @@ func TestWorkbenchSCIM_EnabledManagedToken(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -734,7 +841,7 @@ func TestWorkbenchSCIM_BYOToken(t *testing.T) { wb = getWorkbench(t, cli, ns, name) - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, 
res.IsZero()) @@ -778,7 +885,7 @@ func TestWorkbenchSCIM_NoTokenRotation(t *testing.T) { wb = getWorkbench(t, cli, ns, name) // First reconcile — creates the managed secret. - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -791,7 +898,7 @@ func TestWorkbenchSCIM_NoTokenRotation(t *testing.T) { // Second reconcile — token must not change. wb = getWorkbench(t, cli, ns, name) - res, err = r.ReconcileWorkbench(ctx, req, wb) + res, err = r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -823,7 +930,7 @@ func TestWorkbenchSCIM_DisableAfterEnable(t *testing.T) { wb = getWorkbench(t, cli, ns, name) // First reconcile — SCIM enabled. - res, err := r.ReconcileWorkbench(ctx, req, wb) + res, err := r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -847,7 +954,7 @@ func TestWorkbenchSCIM_DisableAfterEnable(t *testing.T) { require.NoError(t, err) wb = getWorkbench(t, cli, ns, name) - res, err = r.ReconcileWorkbench(ctx, req, wb) + res, err = r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.NoError(t, err) require.True(t, res.IsZero()) @@ -896,6 +1003,6 @@ func TestWorkbenchSCIM_BYOTokenMissingKey(t *testing.T) { wb = getWorkbench(t, cli, ns, name) // Reconciliation should fail — missing "token" key is a blocking error. 
- _, err = r.ReconcileWorkbench(ctx, req, wb) + _, err = r.ReconcileWorkbench(ctx, req, wb, observability.PhaseUnknown) require.ErrorContains(t, err, `BYO SCIM token secret "my-incomplete-scim-secret" is missing required key "token"`) } diff --git a/internal/db/db.go b/internal/db/db.go index b75bc3d8..ca8d6d96 100644 --- a/internal/db/db.go +++ b/internal/db/db.go @@ -20,6 +20,10 @@ import ( ) var invalidCharacters = regexp.MustCompile("[^a-z0-9]") // do not glob, lest we lose uniqueness + +// ErrDBHostnameMissing is returned by EnsureDatabaseExists when the configured database URL has no host component. +var ErrDBHostnameMissing = errors.New("database connection hostname not provided") + func DbKey(req ctrl.Request, name string) client.ObjectKey { return client.ObjectKey{ Name: name, @@ -81,9 +85,8 @@ func EnsureDatabaseExists( fmt.Printf("Database URL: %s\n", u.String()) if u.Host == "" { - err := errors.New("database connection hostname not provided") - l.Error(err, "error creating database connection URL") - return err + l.Error(ErrDBHostnameMissing, "error creating database connection URL") + return ErrDBHostnameMissing } pgd := &v1beta1.PostgresDatabase{ diff --git a/internal/observability/metrics.go b/internal/observability/metrics.go new file mode 100644 index 00000000..cb3ea41f --- /dev/null +++ b/internal/observability/metrics.go @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2026 Posit Software, PBC + +package observability + +import ( + "context" + + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// Instruments holds pre-created OTel counters for a single controller. +// Construct once at SetupWithManager time and reuse for the lifetime of the reconciler. +// A zero-value Instruments is a safe no-op (all Record* calls are silently dropped). 
+type Instruments struct {
+	StatusTransition metric.Int64Counter // backs MetricStatusTransitionTotal
+	DependencyCheck  metric.Int64Counter // backs MetricDependencyCheckTotal
+	ReconcileRequeue metric.Int64Counter // backs MetricReconcileRequeueTotal
+}
+
+// NewInstruments creates one Int64Counter per counter metric on the given Meter.
+// Passing a nil meter returns a zero-value Instruments — all Record* methods become no-ops.
+func NewInstruments(m metric.Meter) Instruments {
+	if m == nil {
+		return Instruments{}
+	}
+	status, _ := m.Int64Counter(MetricStatusTransitionTotal, // creation error is discarded, here and for the two counters below
+		metric.WithDescription("Number of status phase transitions, partitioned by controller, namespace, from_phase, and to_phase."))
+	dep, _ := m.Int64Counter(MetricDependencyCheckTotal,
+		metric.WithDescription("Number of dependency checks, partitioned by controller, namespace, dependency type, and result."))
+	requeue, _ := m.Int64Counter(MetricReconcileRequeueTotal,
+		metric.WithDescription("Number of reconcile requeues, partitioned by controller, namespace, and reason."))
+	return Instruments{StatusTransition: status, DependencyCheck: dep, ReconcileRequeue: requeue}
+}
+
+// RecordStatusTransition increments team_operator_status_transition_total.
+// controller is the controller name (e.g. "site", "connect").
+// fromPhase and toPhase should be Phase* constants from names.go.
+// Calls where fromPhase == toPhase are no-ops: the metric tracks transitions,
+// not steady-state reconciles. Use controller_runtime_reconcile_total for
+// "how often did this controller reconcile in state X."
+func (i Instruments) RecordStatusTransition(ctx context.Context, controller, namespace, fromPhase, toPhase string) {
+	if i.StatusTransition == nil || fromPhase == toPhase { // unset counter, or no phase change: nothing to count
+		return
+	}
+	i.StatusTransition.Add(ctx, 1, // one increment per call that reaches this point
+		metric.WithAttributes(
+			attribute.String(LabelController, controller),
+			attribute.String(LabelNamespace, namespace),
+			attribute.String(LabelFromPhase, fromPhase),
+			attribute.String(LabelToPhase, toPhase),
+		),
+	)
+}
+
+// RecordDependencyCheck increments team_operator_dependency_check_total.
+// dependency should be a Dependency* constant. result should be a Result* constant.
+func (i Instruments) RecordDependencyCheck(ctx context.Context, controller, namespace, dependency, result string) {
+	if i.DependencyCheck == nil { // counter not set (zero-value Instruments): drop the call
+		return
+	}
+	i.DependencyCheck.Add(ctx, 1,
+		metric.WithAttributes(
+			attribute.String(LabelController, controller),
+			attribute.String(LabelNamespace, namespace),
+			attribute.String(LabelDependency, dependency),
+			attribute.String(LabelResult, result),
+		),
+	)
+}
+
+// RecordReconcileRequeue increments team_operator_reconcile_requeue_total.
+// reason should be a RequeueReason* constant from names.go.
+func (i Instruments) RecordReconcileRequeue(ctx context.Context, controller, namespace, reason string) {
+	if i.ReconcileRequeue == nil { // counter not set (zero-value Instruments): drop the call
+		return
+	}
+	i.ReconcileRequeue.Add(ctx, 1,
+		metric.WithAttributes(
+			attribute.String(LabelController, controller),
+			attribute.String(LabelNamespace, namespace),
+			attribute.String(LabelReason, reason),
+		),
+	)
+}
diff --git a/internal/observability/metrics_test.go b/internal/observability/metrics_test.go
new file mode 100644
index 00000000..835d3a52
--- /dev/null
+++ b/internal/observability/metrics_test.go
@@ -0,0 +1,217 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2026 Posit Software, PBC
+
+package observability_test
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/otel/attribute"
+	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
+	"go.opentelemetry.io/otel/sdk/metric/metricdata"
+
+	"github.com/posit-dev/team-operator/internal/observability"
+)
+
+func attrsToMap(s attribute.Set) map[string]string { // flattens an attribute set into key -> emitted string value for map comparison
+	out := make(map[string]string, s.Len())
+	for _, kv := range s.ToSlice() {
+		out[string(kv.Key)] = kv.Value.Emit()
+	}
+	return out
+}
+
+func TestRecordStatusTransition(t *testing.T) {
+	reader := sdkmetric.NewManualReader()
+	mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+	t.Cleanup(func() { _ = mp.Shutdown(context.Background()) })
+	inst := observability.NewInstruments(mp.Meter("test"))
+
+	inst.RecordStatusTransition(context.Background(),
+		"site", "posit-team", observability.PhaseReconciling, observability.PhaseReady)
+	inst.RecordStatusTransition(context.Background(), // same label set as the call above: expected to aggregate into one data point
+		"site", "posit-team", observability.PhaseReconciling, observability.PhaseReady)
+	inst.RecordStatusTransition(context.Background(),
+		"connect", "posit-team", observability.PhaseReconciling, observability.PhaseError)
+
+	var rm metricdata.ResourceMetrics
+	require.NoError(t, reader.Collect(context.Background(), &rm))
+
+	var found bool
+	for _, sm := range rm.ScopeMetrics {
+		for _, mm := range sm.Metrics {
+			if mm.Name != observability.MetricStatusTransitionTotal {
+				continue
+			}
+			found = true
+			sum, ok := mm.Data.(metricdata.Sum[int64])
+			require.True(t, ok, "expected Sum[int64] data type")
+			require.Len(t, sum.DataPoints, 2, "expected 2 distinct label sets")
+			for _, dp := range sum.DataPoints {
+				attrs := attrsToMap(dp.Attributes)
+				switch attrs[observability.LabelController] {
+				case "site":
+					assert.Equal(t, int64(2), dp.Value, "site->ready transition count")
+					assert.Equal(t, map[string]string{
+						observability.LabelController: "site",
+						observability.LabelNamespace:  "posit-team",
+						observability.LabelFromPhase:  observability.PhaseReconciling,
+						observability.LabelToPhase:    observability.PhaseReady,
+					}, attrs)
+				case "connect":
+					assert.Equal(t, int64(1), dp.Value, "connect->error transition count")
+					assert.Equal(t, map[string]string{
+						observability.LabelController: "connect",
+						observability.LabelNamespace:  "posit-team",
+						observability.LabelFromPhase:  observability.PhaseReconciling,
+						observability.LabelToPhase:    observability.PhaseError,
+					}, attrs)
+				default:
+					t.Fatalf("unexpected controller label %q in metric %q with attrs %v", attrs[observability.LabelController], mm.Name, attrs)
+				}
+			}
+		}
+	}
+	assert.True(t, found, "metric %s not found in output", observability.MetricStatusTransitionTotal)
+}
+
+func TestRecordDependencyCheck(t *testing.T) {
+	reader := sdkmetric.NewManualReader()
+	mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+	t.Cleanup(func() { _ = mp.Shutdown(context.Background()) })
+	inst := observability.NewInstruments(mp.Meter("test"))
+
+	inst.RecordDependencyCheck(context.Background(),
+		"connect", "posit-team", observability.DependencyPostgres, observability.ResultSuccess)
+	inst.RecordDependencyCheck(context.Background(),
+		"connect", "posit-team", observability.DependencySecret, observability.ResultError)
+
+	var rm metricdata.ResourceMetrics
+	require.NoError(t, reader.Collect(context.Background(), &rm))
+
+	var found bool
+	for _, sm := range rm.ScopeMetrics {
+		for _, mm := range sm.Metrics {
+			if mm.Name != observability.MetricDependencyCheckTotal {
+				continue
+			}
+			found = true
+			sum, ok := mm.Data.(metricdata.Sum[int64])
+			require.True(t, ok)
+			require.Len(t, sum.DataPoints, 2)
+			for _, dp := range sum.DataPoints { // only the label sets are asserted here, not dp.Value
+				attrs := attrsToMap(dp.Attributes)
+				switch attrs[observability.LabelDependency] {
+				case observability.DependencyPostgres:
+					assert.Equal(t, map[string]string{
+						observability.LabelController: "connect",
+						observability.LabelNamespace:  "posit-team",
+						observability.LabelDependency: observability.DependencyPostgres,
+						observability.LabelResult:     observability.ResultSuccess,
+					}, attrs)
+				case observability.DependencySecret:
+					assert.Equal(t, map[string]string{
+						observability.LabelController: "connect",
+						observability.LabelNamespace:  "posit-team",
+						observability.LabelDependency: observability.DependencySecret,
+						observability.LabelResult:     observability.ResultError,
+					}, attrs)
+				default:
+					t.Fatalf("unexpected dependency label %q in metric %q with attrs %v", attrs[observability.LabelDependency], mm.Name, attrs)
+				}
+			}
+		}
+	}
+	assert.True(t, found)
+}
+
+func TestRecordReconcileRequeue(t *testing.T) {
+	reader := sdkmetric.NewManualReader()
+	mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+	t.Cleanup(func() { _ = mp.Shutdown(context.Background()) })
+	inst := observability.NewInstruments(mp.Meter("test"))
+
+	inst.RecordReconcileRequeue(context.Background(),
+		"workbench", "posit-team", observability.RequeueReasonDepsNotReady)
+
+	var rm metricdata.ResourceMetrics
+	require.NoError(t, reader.Collect(context.Background(), &rm))
+
+	var found bool
+	for _, sm := range rm.ScopeMetrics {
+		for _, mm := range sm.Metrics {
+			if mm.Name != observability.MetricReconcileRequeueTotal {
+				continue
+			}
+			found = true
+			sum, ok := mm.Data.(metricdata.Sum[int64])
+			require.True(t, ok)
+			require.Len(t, sum.DataPoints, 1)
+			dp := sum.DataPoints[0]
+			assert.Equal(t, int64(1), dp.Value)
+			assert.Equal(t, map[string]string{
+				observability.LabelController: "workbench",
+				observability.LabelNamespace:  "posit-team",
+				observability.LabelReason:     observability.RequeueReasonDepsNotReady,
+			}, attrsToMap(dp.Attributes))
+		}
+	}
+	assert.True(t, found)
+}
+
+// TestRecordStatusTransition_SamePhaseIsNoOp pins the contract that the
+// transition counter only fires on actual phase changes, not on steady-state
+// reconciles. Regression test for an issue caught during AKS validation where
+// every Reconcile of a stable CR was emitting from=X to=X, drowning out
+// genuine flapping signal. Use controller_runtime_reconcile_total for
+// "how often did this controller reconcile in state X."
+func TestRecordStatusTransition_SamePhaseIsNoOp(t *testing.T) {
+	reader := sdkmetric.NewManualReader()
+	mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+	t.Cleanup(func() { _ = mp.Shutdown(context.Background()) })
+	inst := observability.NewInstruments(mp.Meter("test"))
+
+	// Same-phase calls — must not emit.
+	inst.RecordStatusTransition(context.Background(),
+		"site", "posit-team", observability.PhaseReady, observability.PhaseReady)
+	inst.RecordStatusTransition(context.Background(),
+		"chronicle", "posit-team", observability.PhaseError, observability.PhaseError)
+	inst.RecordStatusTransition(context.Background(),
+		"workbench", "posit-team", observability.PhaseUnknown, observability.PhaseUnknown)
+
+	// One real transition — must emit, proving the meter still works.
+	inst.RecordStatusTransition(context.Background(),
+		"site", "posit-team", observability.PhaseError, observability.PhaseReady)
+
+	var rm metricdata.ResourceMetrics
+	require.NoError(t, reader.Collect(context.Background(), &rm))
+
+	for _, sm := range rm.ScopeMetrics {
+		for _, mm := range sm.Metrics {
+			if mm.Name == observability.MetricStatusTransitionTotal {
+				sum, ok := mm.Data.(metricdata.Sum[int64])
+				require.True(t, ok)
+				require.Len(t, sum.DataPoints, 1, "only the genuine error->ready transition should be recorded")
+				assert.Equal(t, int64(1), sum.DataPoints[0].Value)
+				attrs := attrsToMap(sum.DataPoints[0].Attributes)
+				assert.Equal(t, observability.PhaseError, attrs[observability.LabelFromPhase])
+				assert.Equal(t, observability.PhaseReady, attrs[observability.LabelToPhase])
+				return // metric located and checked; skip the failure below
+			}
+		}
+	}
+	t.Fatal("no metric emitted at all — the genuine transition was suppressed too")
+}
+
+// TestNewInstruments_NilMeterIsNoOp verifies that a zero-value Instruments
+// (from passing nil to NewInstruments) does not panic on any Record* call.
+func TestNewInstruments_NilMeterIsNoOp(t *testing.T) {
+	inst := observability.NewInstruments(nil)
+	// None of these should panic.
+	inst.RecordStatusTransition(context.Background(), "site", "ns", observability.PhaseReconciling, observability.PhaseReady)
+	inst.RecordDependencyCheck(context.Background(), "site", "ns", observability.DependencyPostgres, observability.ResultSuccess)
+	inst.RecordReconcileRequeue(context.Background(), "site", "ns", observability.RequeueReasonDepsNotReady)
+}
diff --git a/internal/observability/names.go b/internal/observability/names.go
new file mode 100644
index 00000000..0c078419
--- /dev/null
+++ b/internal/observability/names.go
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2026 Posit Software, PBC
+
+// Package observability provides OTel-based metrics instrumentation for the team-operator.
+package observability
+
+// Metric names — all under the team_operator_* namespace.
+const (
+	MetricResourceCount         = "team_operator_resource_count" // NOTE(review): NewInstruments creates no counter for this name — confirm where it is registered
+	MetricStatusTransitionTotal = "team_operator_status_transition_total"
+	MetricDependencyCheckTotal  = "team_operator_dependency_check_total"
+	MetricReconcileRequeueTotal = "team_operator_reconcile_requeue_total"
+)
+
+// Label keys.
+const (
+	LabelController = "controller"
+	LabelNamespace  = "namespace"
+	LabelPhase      = "phase"
+	LabelFromPhase  = "from_phase"
+	LabelToPhase    = "to_phase"
+	LabelDependency = "dependency"
+	LabelResult     = "result"
+	LabelReason     = "reason"
+)
+
+// Dependency enum values for LabelDependency.
+const (
+	DependencyPostgres = "postgres"
+	DependencyKeycloak = "keycloak"
+	DependencySecret   = "secret"
+	DependencyCRD      = "crd"
+)
+
+// Result enum values for LabelResult.
+const (
+	ResultSuccess = "success"
+	ResultError   = "error"
+)
+
+// Requeue reason enum values for LabelReason.
+// Keep this small and operator-defined — never pass free-form strings.
+const (
+	RequeueReasonDepsNotReady = "deps_not_ready"
+	RequeueReasonConflict     = "conflict"
+	RequeueReasonRetry        = "retry"
+	RequeueReasonRateLimit    = "rate_limit"
+)
+
+// Phase values for LabelPhase / LabelFromPhase / LabelToPhase.
+// Where applicable these are the lowercase_underscore form of the matching
+// status.Reason* constants in internal/status/status.go. The mapping is
+// asserted by TestPhaseMatchesStatusReason in names_test.go — renaming a
+// mapped Reason, or changing its string value, will break that test.
+const (
+	PhaseReconciling     = "reconciling"          // status.ReasonReconciling
+	PhaseReady           = "ready"                // generic ready phase (not tied to a single Reason)
+	PhaseError           = "error"                // generic error phase (covers status.ReasonReconcileError)
+	PhaseSuspended       = "suspended"            // status.ReasonSuspended
+	PhaseDatabaseReady   = "database_ready"       // status.ReasonDatabaseReady
+	PhaseComponentsReady = "all_components_ready" // status.ReasonAllComponentsReady
+	PhaseProgressing     = "progressing"          // status.ReasonComponentsNotReady, ReasonDeploymentNotReady, ReasonStatefulSetNotReady (waiting on children)
+	PhaseUnknown         = "unknown"              // sentinel for an untracked previous phase or an unrecognized Reason
+)
diff --git a/internal/observability/names_test.go b/internal/observability/names_test.go
new file mode 100644
index 00000000..f3fec68e
--- /dev/null
+++ b/internal/observability/names_test.go
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2026 Posit Software, PBC
+
+package observability_test
+
+import (
+	"strings"
+	"testing"
+
+	"github.com/posit-dev/team-operator/internal/observability"
+	"github.com/posit-dev/team-operator/internal/status"
+)
+
+func TestMetricNamesHaveTeamOperatorPrefix(t *testing.T) {
+	const prefix = "team_operator_"
+	for _, name := range []string{ // a newly added Metric* constant must be listed here by hand to be checked
+		observability.MetricResourceCount,
+		observability.MetricStatusTransitionTotal,
+		observability.MetricDependencyCheckTotal,
+		observability.MetricReconcileRequeueTotal,
+	} {
+		if !strings.HasPrefix(name, prefix) {
+			t.Errorf("metric %q missing %q prefix", name, prefix)
+		}
+	}
+}
+
+func TestLabelValueEnumsHaveNoDuplicates(t *testing.T) {
+	groups := map[string][]string{ // uniqueness is checked within each group, not across groups
+		"dependency": {
+			observability.DependencyPostgres,
+			observability.DependencyKeycloak,
+			observability.DependencySecret,
+			observability.DependencyCRD,
+		},
+		"result": {
+			observability.ResultSuccess,
+			observability.ResultError,
+		},
+		"requeue_reason": {
+			observability.RequeueReasonDepsNotReady,
+			observability.RequeueReasonConflict,
+			observability.RequeueReasonRetry,
+			observability.RequeueReasonRateLimit,
+		},
+		"phase": {
+			observability.PhaseReconciling,
+			observability.PhaseReady,
+			observability.PhaseError,
+			observability.PhaseSuspended,
+			observability.PhaseDatabaseReady,
+			observability.PhaseComponentsReady,
+			observability.PhaseProgressing,
+			observability.PhaseUnknown,
+		},
+	}
+	for group, values := range groups {
+		seen := make(map[string]struct{}, len(values))
+		for _, v := range values {
+			if _, dup := seen[v]; dup {
+				t.Errorf("%s group has duplicate value %q", group, v)
+			}
+			seen[v] = struct{}{}
+		}
+	}
+}
+
+// TestPhaseMatchesStatusReason locks down phase strings that are expected to
+// be the lowercase_underscore form of a status.Reason* constant. This catches
+// the case where a Reason is renamed in the status package and dashboards
+// silently break.
+//
+// Note: this test asserts two things at once — that phase strings track the
+// matching Reason value, and that Reason values stay CamelCase. If a future
+// change in internal/status switches Reason values to a different format
+// (e.g., already-snake-cased or human-formatted strings) this test will fail
+// even though the semantic mapping is unchanged; update camelToSnake or the
+// expected phase strings accordingly.
+func TestPhaseMatchesStatusReason(t *testing.T) {
+	// Force a build error if status.ReasonReconcileError is renamed/removed.
+	// PhaseError covers this Reason but the value transform is not 1:1, so it
+	// can't be asserted via camelToSnake below.
+	_ = status.ReasonReconcileError
+
+	cases := []struct {
+		phase  string
+		reason string
+	}{
+		{observability.PhaseReconciling, status.ReasonReconciling},
+		{observability.PhaseSuspended, status.ReasonSuspended},
+		{observability.PhaseDatabaseReady, status.ReasonDatabaseReady},
+		{observability.PhaseComponentsReady, status.ReasonAllComponentsReady},
+	}
+	for _, c := range cases {
+		if got := camelToSnake(c.reason); got != c.phase {
+			t.Errorf("status.%s expected to map to phase %q, got %q", c.reason, c.phase, got) // c.reason is the Reason's string value, not its Go identifier
+		}
+	}
+}
+
+// camelToSnake converts CamelCase to lowercase_underscore. It only handles
+// one capital per word boundary (e.g., "DatabaseReady" -> "database_ready");
+// consecutive capitals from acronyms like "HTTPReady" or "OIDCReady" are not
+// supported and produce no boundary between the acronym and the following
+// word (e.g., "HTTPReady" -> "httpready", not "http_ready"). None of the
+// current status.Reason* values use acronyms; if one is added, this helper
+// must be updated alongside the new test case.
+func camelToSnake(s string) string {
+	var b strings.Builder
+	prevUpper := false // whether the previous rune was an ASCII capital
+	for i, r := range s {
+		isUpper := r >= 'A' && r <= 'Z' // ASCII capitals only
+		if i > 0 && isUpper && !prevUpper { // word boundary: a capital that follows a non-capital
+			b.WriteByte('_')
+		}
+		if isUpper {
+			r += 'a' - 'A' // shift the ASCII capital to lowercase
+		}
+		b.WriteRune(r)
+		prevUpper = isUpper
+	}
+	return b.String()
+}
diff --git a/internal/observability/phase.go b/internal/observability/phase.go
new file mode 100644
index 00000000..5c2bad69
--- /dev/null
+++ b/internal/observability/phase.go
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2026 Posit Software, PBC
+
+package observability
+
+import (
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/posit-dev/team-operator/internal/status"
+)
+
+// PhaseFromConditions returns the Phase value derived from the Reason of the
+// first Ready condition in conds. It is intended to be called early in a
+// Reconcile loop — before the controller sets Ready=Reconciling — so the
+// returned value reflects the prior stable state.
+//
+// Returns PhaseUnknown if no Ready condition is present (first reconcile, or
+// CR was just created) or if the Reason is not recognized.
+func PhaseFromConditions(conds []metav1.Condition) string {
+	for i := range conds {
+		if conds[i].Type == status.TypeReady {
+			return phaseFromReason(conds[i].Reason) // first Ready condition wins; later entries are not inspected
+		}
+	}
+	return PhaseUnknown // no Ready condition in conds
+}
+
+func phaseFromReason(reason string) string { // maps a status.Reason* value to the Phase* label value used on metrics
+	switch reason {
+	case status.ReasonReconciling:
+		return PhaseReconciling
+	case status.ReasonReconcileError:
+		return PhaseError
+	case status.ReasonReconcileComplete, status.ReasonDeploymentReady, status.ReasonStatefulSetReady:
+		return PhaseReady
+	case status.ReasonAllComponentsReady:
+		return PhaseComponentsReady
+	case status.ReasonComponentsNotReady:
+		return PhaseProgressing
+	case status.ReasonSuspended:
+		return PhaseSuspended
+	case status.ReasonDatabaseReady:
+		return PhaseDatabaseReady
+	case status.ReasonDeploymentNotReady, status.ReasonStatefulSetNotReady:
+		return PhaseProgressing // same phase as ReasonComponentsNotReady above
+	default:
+		return PhaseUnknown // Reason not covered by any case above
+	}
+}
diff --git a/internal/observability/phase_test.go b/internal/observability/phase_test.go
new file mode 100644
index 00000000..d2e4db1f
--- /dev/null
+++ b/internal/observability/phase_test.go
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2026 Posit Software, PBC
+
+package observability_test
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"github.com/posit-dev/team-operator/internal/observability"
+	"github.com/posit-dev/team-operator/internal/status"
+)
+
+func TestPhaseFromConditions(t *testing.T) {
+	cases := []struct {
+		name  string
+		conds []metav1.Condition
+		want  string
+	}{
+		{"empty conditions returns Unknown", nil, observability.PhaseUnknown},
+		{"reconciling reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonReconciling}}, observability.PhaseReconciling},
+		{"reconcile error reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonReconcileError}}, observability.PhaseError},
+		{"reconcile complete reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonReconcileComplete}}, observability.PhaseReady},
+		{"deployment ready reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonDeploymentReady}}, observability.PhaseReady},
+		{"statefulset ready reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonStatefulSetReady}}, observability.PhaseReady},
+		{"all components ready reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonAllComponentsReady}}, observability.PhaseComponentsReady},
+		{"suspended reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonSuspended}}, observability.PhaseSuspended},
+		{"database ready reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonDatabaseReady}}, observability.PhaseDatabaseReady},
+		{"deployment not ready reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonDeploymentNotReady}}, observability.PhaseProgressing},
+		{"statefulset not ready reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonStatefulSetNotReady}}, observability.PhaseProgressing},
+		{"components not ready reason", []metav1.Condition{{Type: status.TypeReady, Reason: status.ReasonComponentsNotReady}}, observability.PhaseProgressing},
+		{"unrecognized reason returns Unknown", []metav1.Condition{{Type: status.TypeReady, Reason: "SomethingElse"}}, observability.PhaseUnknown},
+		{"non-Ready condition is ignored", []metav1.Condition{{Type: status.TypeProgressing, Reason: status.ReasonReconcileComplete}}, observability.PhaseUnknown},
+	}
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) { // one subtest per table row, named after the row
+			assert.Equal(t, tc.want, observability.PhaseFromConditions(tc.conds))
+		})
+	}
+}
diff --git a/internal/observability/provider.go b/internal/observability/provider.go
new file mode 100644
index 00000000..6d800c4a
--- /dev/null
+++ b/internal/observability/provider.go
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023-2026 Posit
Software, PBC + +package observability + +import ( + "context" + "fmt" + "os" + "strings" + "time" + + "github.com/prometheus/client_golang/prometheus" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc" + promexporter "go.opentelemetry.io/otel/exporters/prometheus" + "go.opentelemetry.io/otel/metric" + "go.opentelemetry.io/otel/metric/noop" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/resource" + semconv "go.opentelemetry.io/otel/semconv/v1.27.0" + ctrl "sigs.k8s.io/controller-runtime" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/posit-dev/team-operator/internal" +) + +// Config holds all flags/env that control OTel SDK initialization. +// Flags take precedence over environment variables; defaults are applied last. +// +// Note on service.name precedence: Config sets service.name to "team-operator" +// after resource.WithFromEnv(), so the explicit attribute wins over the +// OTEL_SERVICE_NAME and OTEL_RESOURCE_ATTRIBUTES env vars by design. +// +// Kill switch: set OTEL_SDK_DISABLED=true to disable all OTel instrumentation. +// The Prometheus exporter is always enabled when the SDK is active; use +// OTEL_SDK_DISABLED to turn off the entire metrics subsystem. +type Config struct { + // PrometheusRegisterer is the Prometheus registerer the exporter binds to. + // When nil, controller-runtime's metrics.Registry is used (which is what + // the controller-runtime metrics server reads from). + // Tests should pass a fresh prometheus.NewRegistry() to avoid polluting the + // process-global default registerer. + PrometheusRegisterer prometheus.Registerer + // OTLPEndpoint is the gRPC endpoint for OTLP metric push (e.g. "otel-collector:4317"). + // Empty string means OTLP push is disabled unless OTEL_EXPORTER_OTLP_ENDPOINT is set. + OTLPEndpoint string + // OTLPInsecure forces the gRPC exporter to plaintext. Default false (TLS is used). 
+ // Set true for in-cluster collectors reachable over the pod network without TLS. + OTLPInsecure bool + // MetricsExportInterval is the cadence for OTLP metric export and async gauge collection. + MetricsExportInterval time.Duration + // ClusterName is written to the k8s.cluster.name resource attribute when non-empty. + ClusterName string + // InstanceID is service.instance.id, typically $POD_NAME. Filled from env in main.go. + InstanceID string +} + +// Provider wraps the OTel MeterProvider and exposes a Meter factory and Shutdown. +// All fields are unexported; callers interact only via Meter() and Shutdown(). +type Provider struct { + mp metric.MeterProvider +} + +var providerLog = ctrl.Log.WithName("observability") + +// NewProvider initialises the OTel metrics SDK based on cfg. +// If OTEL_SDK_DISABLED=true or SDK init fails, a noop provider is returned +// so the operator always boots. +func NewProvider(ctx context.Context, cfg Config) *Provider { + // Kill switch: OTEL_SDK_DISABLED env var (standard OTel convention). + if os.Getenv("OTEL_SDK_DISABLED") == "true" { + return &Provider{mp: noop.NewMeterProvider()} + } + + mp, err := buildMeterProvider(ctx, cfg) + if err != nil { + // Degraded mode: log warning and return noop so the operator still starts. + // Use Info (not Error) since the operator continues running normally. + providerLog.Info("SDK init failed; falling back to noop metrics", "err", err.Error()) + return &Provider{mp: noop.NewMeterProvider()} + } + + return &Provider{mp: mp} +} + +// Meter returns a named metric.Meter. name should be the controller/component name, +// e.g. "team-operator/site" or "team-operator/connect". +func (p *Provider) Meter(name string) metric.Meter { + return p.mp.Meter(name) +} + +// Shutdown flushes pending exports and releases SDK resources. +// Call this from the signal handler, after mgr.Start() returns. 
+// Returns the SDK shutdown error so callers can choose to log or ignore it; +// the operator should still exit cleanly even when shutdown errors occur. +func (p *Provider) Shutdown(ctx context.Context) error { + if sdk, ok := p.mp.(*sdkmetric.MeterProvider); ok { + return sdk.Shutdown(ctx) + } + // noop provider has no resources to release + return nil +} + +func buildMeterProvider(ctx context.Context, cfg Config) (*sdkmetric.MeterProvider, error) { + res, err := buildResource(ctx, cfg) + if err != nil { + return nil, fmt.Errorf("building OTel resource: %w", err) + } + + var opts []sdkmetric.Option + opts = append(opts, sdkmetric.WithResource(res)) + + // Prometheus exporter — registers onto a Prometheus Registerer so /metrics + // serves both controller-runtime built-ins and OTel metrics from one endpoint. + // promexporter.New() without a Registerer option creates an internal + // prometheus.NewRegistry() that no HTTP handler serves; we MUST pass + // WithRegisterer explicitly. When cfg.PrometheusRegisterer is nil we default + // to controller-runtime's metrics.Registry — which is what + // controller-runtime's metrics server reads from. (NOT + // prometheus.DefaultRegisterer; controller-runtime maintains its own + // internal *prometheus.Registry, separate from the global default.) + { + registerer := cfg.PrometheusRegisterer + if registerer == nil { + registerer = crmetrics.Registry + } + promExp, err := promexporter.New(promexporter.WithRegisterer(registerer)) + if err != nil { + return nil, fmt.Errorf("creating Prometheus exporter: %w", err) + } + opts = append(opts, sdkmetric.WithReader(promExp)) + } + + // Resolve OTLP endpoint: flag value > OTEL_EXPORTER_OTLP_METRICS_ENDPOINT > + // OTEL_EXPORTER_OTLP_ENDPOINT > unset (OTLP push disabled). + // We resolve manually because we want to gate on "is OTLP configured at all" — + // passing the resolved endpoint via WithEndpoint also lets us emit a startup + // log message identifying which endpoint was chosen. 
+ otlpEndpoint := cfg.OTLPEndpoint + if otlpEndpoint == "" { + otlpEndpoint = os.Getenv("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT") + } + if otlpEndpoint == "" { + otlpEndpoint = os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT") + } + if otlpEndpoint != "" { + grpcOpts := []otlpmetricgrpc.Option{ + otlpmetricgrpc.WithEndpoint(otlpEndpoint), + } + if cfg.OTLPInsecure { + providerLog.Info("OTLP push using insecure (plaintext) transport; ensure the collector is in-cluster or behind a service mesh", "endpoint", otlpEndpoint) + grpcOpts = append(grpcOpts, otlpmetricgrpc.WithInsecure()) + } + otlpExp, err := otlpmetricgrpc.New(ctx, grpcOpts...) + if err != nil { + return nil, fmt.Errorf("creating OTLP metric exporter: %w", err) + } + interval := cfg.MetricsExportInterval + if interval <= 0 { + interval = 30 * time.Second + } + opts = append(opts, sdkmetric.WithReader( + sdkmetric.NewPeriodicReader(otlpExp, sdkmetric.WithInterval(interval)), + )) + } + + return sdkmetric.NewMeterProvider(opts...), nil +} + +func buildResource(ctx context.Context, cfg Config) (*resource.Resource, error) { + attrs := []attribute.KeyValue{ + semconv.ServiceName("team-operator"), + semconv.ServiceVersion(internal.VersionString), + } + if cfg.InstanceID != "" { + attrs = append(attrs, semconv.ServiceInstanceID(cfg.InstanceID)) + } + if cfg.ClusterName != "" { + attrs = append(attrs, attribute.String("k8s.cluster.name", cfg.ClusterName)) + } + + // Merge with OTEL_RESOURCE_ATTRIBUTES env var (OTel SDK handles this automatically + // when we use resource.New with WithProcess or Detect, but we build manually here + // so we apply env vars via resource.WithFromEnv()). + // Order matters: WithFromEnv runs first, then WithAttributes — so explicit + // attrs (including service.name) take precedence over OTEL_SERVICE_NAME. 
+ return resource.New(ctx, + resource.WithFromEnv(), + resource.WithAttributes(attrs...), + ) +} diff --git a/internal/observability/provider_test.go b/internal/observability/provider_test.go new file mode 100644 index 00000000..9fee3434 --- /dev/null +++ b/internal/observability/provider_test.go @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2026 Posit Software, PBC + +package observability_test + +import ( + "context" + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/require" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/posit-dev/team-operator/internal/observability" +) + +func TestNewProvider_NoopWhenDisabled(t *testing.T) { + t.Setenv("OTEL_SDK_DISABLED", "true") + p := observability.NewProvider(context.Background(), observability.Config{}) + require.NotNil(t, p) + + // Meter should work without panicking (noop meter) + m := p.Meter("test") + counter, err := m.Int64Counter("test_counter") + require.NoError(t, err) + counter.Add(context.Background(), 1) // noop, should not panic + + require.NoError(t, p.Shutdown(context.Background())) +} + +func TestNewProvider_PrometheusOnly(t *testing.T) { + // Use a fresh registry so the test is idempotent across `go test -count=N` + // runs and does not pollute prometheus.DefaultRegisterer. + reg := prometheus.NewRegistry() + p := observability.NewProvider(context.Background(), observability.Config{ + PrometheusRegisterer: reg, + }) + require.NotNil(t, p) + + m := p.Meter("team-operator/site") + counter, err := m.Int64Counter("test_init_counter") + require.NoError(t, err) + counter.Add(context.Background(), 5) + + require.NoError(t, p.Shutdown(context.Background())) +} + +func TestNewProvider_PrometheusGather(t *testing.T) { + // Verify the contract that the OTel Prometheus exporter feeds the configured + // Registerer / Gatherer — i.e. recorded counters appear in /metrics output. 
+ reg := prometheus.NewRegistry() + p := observability.NewProvider(context.Background(), observability.Config{ + PrometheusRegisterer: reg, + }) + require.NotNil(t, p) + t.Cleanup(func() { _ = p.Shutdown(context.Background()) }) + + m := p.Meter("team-operator/test") + counter, err := m.Int64Counter("provider_gather_test_total") + require.NoError(t, err) + counter.Add(context.Background(), 3) + + families, err := reg.Gather() + require.NoError(t, err) + + var found bool + for _, mf := range families { + if mf.GetName() == "provider_gather_test_total" { + found = true + break + } + } + require.True(t, found, "OTel counter must appear in Prometheus gather output") +} + +// TestNewProvider_NilRegistererDefaultsToCRMetrics pins the production wiring: +// when PrometheusRegisterer is nil (as main.go calls it), the exporter must +// register onto sigs.k8s.io/controller-runtime/pkg/metrics.Registry — the +// registry that controller-runtime's metrics server actually serves /metrics +// from. NOT prometheus.DefaultRegisterer (the global default), which is a +// SEPARATE registry that controller-runtime ignores. Regression test for a +// production bug found during AKS reference cluster validation where +// team_operator_* metrics emitted into a registry no HTTP handler served. +// +// Note: this test mutates global crmetrics.Registry state. +// `go test -count > 1` will fail with a duplicate-collector registration error. +func TestNewProvider_NilRegistererDefaultsToCRMetrics(t *testing.T) { + p := observability.NewProvider(context.Background(), observability.Config{ + // PrometheusRegisterer intentionally nil — this is how main.go calls it. 
+ }) + require.NotNil(t, p) + t.Cleanup(func() { _ = p.Shutdown(context.Background()) }) + + counter, err := p.Meter("team-operator/regression").Int64Counter("crmetrics_registry_regression_total") + require.NoError(t, err) + counter.Add(context.Background(), 1) + + gatherer, ok := crmetrics.Registry.(prometheus.Gatherer) + require.True(t, ok, "controller-runtime metrics.Registry must implement prometheus.Gatherer") + families, err := gatherer.Gather() + require.NoError(t, err) + for _, mf := range families { + if mf.GetName() == "crmetrics_registry_regression_total" { + return + } + } + t.Fatalf("metric crmetrics_registry_regression_total not found in crmetrics.Registry; nil registerer did not default to controller-runtime's Registry") +} + +func TestNewProvider_OTLPEndpointSet(t *testing.T) { + // Smoke test: provider init with an OTLP endpoint set must succeed; gRPC + // connect is lazy so an unreachable collector does not fail at init time. + // Shutdown may return an error when the collector is unreachable (the SDK + // flushes pending exports), which is fine — callers tolerate the error. 
// ResourceCount holds one gauge observation: how many CRs of a given controller
// are in a given namespace and phase.
type ResourceCount struct {
	// Controller, Namespace and Phase identify the series the count belongs to.
	Controller string
	Namespace  string
	Phase      string
	// Count is the value reported for that (controller, namespace, phase) tuple.
	Count int64
}

// ResourceLister is implemented by types that can list CRs of all kinds and
// return per-(controller, namespace, phase) counts.
type ResourceLister interface {
	// List returns the current counts, or an error if they cannot be produced.
	List(ctx context.Context) ([]ResourceCount, error)
}
+func RegisterResourceCountGauge(m metric.Meter, lister ResourceLister) error { + _, err := m.Int64ObservableGauge( + MetricResourceCount, + metric.WithDescription("Number of operator-managed CRs, partitioned by controller, namespace, and phase."), + metric.WithInt64Callback(func(ctx context.Context, o metric.Int64Observer) error { + counts, err := lister.List(ctx) + if err != nil { + return nil + } + for _, c := range counts { + o.Observe(c.Count, + metric.WithAttributes( + attribute.String(LabelController, c.Controller), + attribute.String(LabelNamespace, c.Namespace), + attribute.String(LabelPhase, c.Phase), + ), + ) + } + return nil + }), + ) + return err +} diff --git a/internal/observability/resource_count_test.go b/internal/observability/resource_count_test.go new file mode 100644 index 00000000..71ae6384 --- /dev/null +++ b/internal/observability/resource_count_test.go @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023-2026 Posit Software, PBC + +package observability_test + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/attribute" + sdkmetric "go.opentelemetry.io/otel/sdk/metric" + "go.opentelemetry.io/otel/sdk/metric/metricdata" + + "github.com/posit-dev/team-operator/internal/observability" +) + +type mockResourceLister struct { + results []observability.ResourceCount +} + +func (m *mockResourceLister) List(ctx context.Context) ([]observability.ResourceCount, error) { + return m.results, nil +} + +func TestRegisterResourceCountGauge(t *testing.T) { + reader := sdkmetric.NewManualReader() + mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader)) + defer mp.Shutdown(context.Background()) + m := mp.Meter("test") + + lister := &mockResourceLister{ + results: []observability.ResourceCount{ + {Controller: "connect", Namespace: "posit-team", Phase: "ready", Count: 3}, + {Controller: "connect", Namespace: "posit-team", Phase: "error", 
Count: 1}, + }, + } + + err := observability.RegisterResourceCountGauge(m, lister) + require.NoError(t, err) + + var rm metricdata.ResourceMetrics + require.NoError(t, reader.Collect(context.Background(), &rm)) + + var found int + for _, sm := range rm.ScopeMetrics { + for _, mm := range sm.Metrics { + if mm.Name == observability.MetricResourceCount { + gauge, ok := mm.Data.(metricdata.Gauge[int64]) + require.True(t, ok) + for _, dp := range gauge.DataPoints { + found++ + controller, _ := dp.Attributes.Value(attribute.Key(observability.LabelController)) + phase, _ := dp.Attributes.Value(attribute.Key(observability.LabelPhase)) + if controller.AsString() == "connect" && phase.AsString() == "ready" { + assert.Equal(t, int64(3), dp.Value) + } + if controller.AsString() == "connect" && phase.AsString() == "error" { + assert.Equal(t, int64(1), dp.Value) + } + } + } + } + } + assert.Equal(t, 2, found, "expected 2 gauge data points") +}