Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
18fd44f
chore(deps): promote OTel metric SDK packages to direct dependencies
ian-flores May 5, 2026
3707a45
Address review findings (job 1245)
ian-flores May 5, 2026
8a1de80
feat(observability): add typed metric name and label constants
ian-flores May 5, 2026
ea93a0c
feat(observability): add OTel Provider with Prometheus and OTLP expor…
ian-flores May 5, 2026
84cbccf
Address review findings (job 1249)
ian-flores May 5, 2026
24b86e4
Address review findings (job 1252)
ian-flores May 5, 2026
0e0d20d
feat(observability): add RecordStatusTransition, RecordDependencyChec…
ian-flores May 5, 2026
2ef4560
feat(observability): add observability flags and Provider init to mai…
ian-flores May 5, 2026
f68a0e6
feat(site): instrument reconcile metrics for SiteReconciler
ian-flores May 5, 2026
db19e47
feat(connect): instrument reconcile metrics for ConnectReconciler
ian-flores May 5, 2026
1280cd0
feat(workbench): instrument reconcile metrics for WorkbenchReconciler
ian-flores May 5, 2026
54c7707
feat(package-manager): instrument reconcile metrics for PackageManage…
ian-flores May 5, 2026
22b1ea8
feat(chronicle): instrument reconcile metrics for ChronicleReconciler
ian-flores May 5, 2026
06dbd94
feat(flightdeck): instrument reconcile metrics for FlightdeckReconciler
ian-flores May 5, 2026
676a5e7
feat(postgres-database): instrument reconcile and dependency metrics …
ian-flores May 5, 2026
a5bea0a
feat(sessiongrouplabel): plumb Meter field for observability shape pa…
ian-flores May 5, 2026
7f10a06
feat(observability): add async resource count gauge with multi-kind l…
ian-flores May 5, 2026
24fd449
feat(observability): add Kustomize base flags and OTLP overlay
ian-flores May 5, 2026
8723a30
feat(observability): add Helm values and template wiring for observab…
ian-flores May 5, 2026
97ea1e5
docs(observability): add observability reference and remove metrics T…
ian-flores May 5, 2026
34dfcc7
refactor(observability): apply review feedback (cleanup + insecure-OT…
ian-flores May 5, 2026
d7b79b9
feat(observability): track prior stable phase in status transition me…
ian-flores May 5, 2026
c8aff56
style: apply gofmt to instrumentation imports
ian-flores May 5, 2026
db60e83
fix(build): copy and compile cmd/team-operator package, not just main.go
ian-flores May 5, 2026
4851a51
Address review findings (job 1251)
ian-flores May 5, 2026
260aad1
Address review findings (job 1253)
ian-flores May 5, 2026
b98b15d
Address review findings (job 1254)
ian-flores May 5, 2026
9115545
Address review findings (job 1277)
ian-flores May 5, 2026
9564996
Address review findings (job 1255)
ian-flores May 5, 2026
46b48fd
fix(observability): register Prometheus exporter onto DefaultRegisterer
ian-flores May 6, 2026
69966b4
fix(observability): register Prometheus exporter onto controller-runt…
ian-flores May 6, 2026
4d383a6
Address review findings (job 1256)
ian-flores May 6, 2026
26f2b7a
fix(observability): suppress same-phase status transition recordings
ian-flores May 6, 2026
6b99268
fix(core/test): reap envtest processes in shared init helpers
ian-flores May 8, 2026
ff75e53
Address review findings (job 1257)
ian-flores May 8, 2026
d1ce935
Address review findings (job 1258)
ian-flores May 8, 2026
5f2da7f
Address review findings (job 1286)
ian-flores May 8, 2026
71d0e73
Address review findings (job 1259)
ian-flores May 8, 2026
038211c
Address review findings (job 1288)
ian-flores May 8, 2026
9a40a27
Address review findings (job 1289)
ian-flores May 8, 2026
38e4e19
Address review findings (job 1290)
ian-flores May 8, 2026
baa1562
Address review findings (job 1260)
ian-flores May 8, 2026
8667f8f
Merge branch 'main' into observability-metrics
ian-flores May 8, 2026
d9deeaf
Merge remote-tracking branch 'origin/observability-metrics' into obse…
ian-flores May 8, 2026
9538a26
fix(core/api): hardcode GVK in OwnerReferencesForChildren
ian-flores May 8, 2026
24c7acd
Merge branch 'main' into observability-metrics
ian-flores May 26, 2026
a6d2eb2
refactor(observability): apply review fixes — drop dead code, simplif…
ian-flores May 26, 2026
d8a2e9f
revert: drop out-of-scope GVK-hardcode in OwnerReferencesForChildren
ian-flores May 26, 2026
66eb69f
Revert "revert: drop out-of-scope GVK-hardcode in OwnerReferencesForC…
ian-flores May 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ COPY go.mod go.sum ./
RUN go mod download

# Copy the go source
COPY cmd/team-operator/main.go cmd/team-operator/main.go
COPY cmd/team-operator/ cmd/team-operator/
COPY api/ api/
COPY internal/ ./internal/

Expand All @@ -29,7 +29,7 @@ RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
-ldflags="-X 'github.com/posit-dev/team-operator/internal.VersionString=${VERSION}'"\
-a \
-o team-operator \
cmd/team-operator/main.go
./cmd/team-operator/

# Use distroless as minimal base image to package the team-operator binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
Expand Down
4 changes: 2 additions & 2 deletions Justfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ deps-up:

# Run team-operator directly from source
run:
go run cmd/team-operator/main.go
go run ./cmd/team-operator/

# Run team-operator via the Makefile target
mrun:
Expand All @@ -40,7 +40,7 @@ build:
-ldflags="-X 'github.com/posit-dev/team-operator/internal.VersionString={{ VERSION }}'" \
-a \
-o ./bin/team-operator \
cmd/team-operator/main.go
./cmd/team-operator/

# Build ./bin/team-operator via the Makefile target
mbuild:
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ test-integration: go-test test-kind ## Run all tests (unit + integration).

.PHONY: build
build: copy-crds generate-all fmt vet ## Build manager binary.
go build -o bin/team-operator ./cmd/team-operator/main.go
go build -o bin/team-operator ./cmd/team-operator/

.PHONY: docker-build
docker-build: build ## Build the operator Docker image.
Expand All @@ -203,7 +203,7 @@ distclean:

.PHONY: run
run: manifests generate-all fmt vet ## Run a controller from your host.
go run ./cmd/team-operator/main.go
go run ./cmd/team-operator/

##@ Deployment

Expand Down
7 changes: 5 additions & 2 deletions api/core/v1beta1/chronicle_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,13 @@ func (c *Chronicle) KubernetesLabels() map[string]string {
}

func (c *Chronicle) OwnerReferencesForChildren() []metav1.OwnerReference {
// APIVersion/Kind are hardcoded because controller-runtime's client.Get
// strips TypeMeta from typed-object responses, leaving c.APIVersion and
// c.Kind empty in the reconcile path.
return []metav1.OwnerReference{
{
APIVersion: c.APIVersion,
Kind: c.Kind,
APIVersion: GroupVersion.String(),
Kind: "Chronicle",
Name: c.Name,
UID: c.UID,
},
Expand Down
7 changes: 5 additions & 2 deletions api/core/v1beta1/connect_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -275,10 +275,13 @@ func (c *Connect) GetAwsAccountId() string {
}

func (c *Connect) OwnerReferencesForChildren() []metav1.OwnerReference {
// APIVersion/Kind are hardcoded because controller-runtime's client.Get
// strips TypeMeta from typed-object responses, leaving c.APIVersion and
// c.Kind empty in the reconcile path.
return []metav1.OwnerReference{
{
APIVersion: c.APIVersion,
Kind: c.Kind,
APIVersion: GroupVersion.String(),
Kind: "Connect",
Name: c.Name,
UID: c.UID,
},
Expand Down
7 changes: 5 additions & 2 deletions api/core/v1beta1/packagemanager_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -414,10 +414,13 @@ func (pm *PackageManager) CreateSecretVolumeFactory() *product.SecretVolumeFacto
}

func (pm *PackageManager) OwnerReferencesForChildren() []metav1.OwnerReference {
// APIVersion/Kind are hardcoded because controller-runtime's client.Get
// strips TypeMeta from typed-object responses, leaving pm.APIVersion and
// pm.Kind empty in the reconcile path.
return []metav1.OwnerReference{
{
APIVersion: pm.APIVersion,
Kind: pm.Kind,
APIVersion: GroupVersion.String(),
Kind: "PackageManager",
Name: pm.Name,
UID: pm.UID,
},
Expand Down
7 changes: 5 additions & 2 deletions api/core/v1beta1/site_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -729,10 +729,13 @@ func (s *Site) GetSecretType() product.SiteSecretType {
}

func (s *Site) OwnerReferencesForChildren() []metav1.OwnerReference {
// APIVersion/Kind are hardcoded because controller-runtime's client.Get
// strips TypeMeta from typed-object responses, leaving s.APIVersion and
// s.Kind empty in the reconcile path.
return []metav1.OwnerReference{
{
APIVersion: s.APIVersion,
Kind: s.Kind,
APIVersion: GroupVersion.String(),
Kind: "Site",
Name: s.Name,
UID: s.UID,
},
Expand Down
7 changes: 5 additions & 2 deletions api/core/v1beta1/workbench_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,10 +236,13 @@ func init() {
}

func (w *Workbench) OwnerReferencesForChildren() []metav1.OwnerReference {
// APIVersion/Kind are hardcoded because controller-runtime's client.Get
// strips TypeMeta from typed-object responses, leaving w.APIVersion and
// w.Kind empty in the reconcile path.
return []metav1.OwnerReference{
{
APIVersion: w.APIVersion,
Kind: w.Kind,
APIVersion: GroupVersion.String(),
Kind: "Workbench",
Name: w.Name,
UID: w.UID,
},
Expand Down
90 changes: 70 additions & 20 deletions cmd/team-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/posit-dev/team-operator/api/keycloak/v2alpha1"
"github.com/posit-dev/team-operator/api/product"
"github.com/posit-dev/team-operator/internal/observability"
"github.com/traefik/traefik/v3/pkg/provider/kubernetes/crd/traefikio/v1alpha1"
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/metrics/server"
Expand Down Expand Up @@ -111,6 +112,20 @@ func main() {
"configurable Workbench session pod field and writes one numbered label per "+
"match onto the pod. Per-site config lives in the Workbench CR's sessionLabels field.")

var (
obsMetricsOTLPEndpoint string
obsMetricsExportInterval time.Duration
obsClusterName string
)

flag.StringVar(&obsMetricsOTLPEndpoint, "observability-metrics-otlp-endpoint", "",
"gRPC OTLP endpoint for metric push (e.g. otel-collector:4317). "+
"Falls back to OTEL_EXPORTER_OTLP_METRICS_ENDPOINT then OTEL_EXPORTER_OTLP_ENDPOINT.")
flag.DurationVar(&obsMetricsExportInterval, "observability-metrics-export-interval", 30*time.Second,
"Cadence for OTLP metric export and async gauge collection")
flag.StringVar(&obsClusterName, "observability-cluster-name", "",
"Value for the k8s.cluster.name resource attribute")

opts := zap.Options{Development: true}

opts.BindFlags(flag.CommandLine)
Expand All @@ -124,6 +139,26 @@ func main() {

zl.Info("team-operator version", "version", internal.VersionString)

instanceID := os.Getenv("POD_NAME")
if instanceID == "" {
setupLog.Info("POD_NAME env var not set; service.instance.id resource attribute will be empty. " +
"Wire POD_NAME from the downward API (metadata.name) for per-pod metric aggregation.")
}

obsProvider := observability.NewProvider(context.Background(), observability.Config{
OTLPEndpoint: obsMetricsOTLPEndpoint,
MetricsExportInterval: obsMetricsExportInterval,
ClusterName: obsClusterName,
InstanceID: instanceID,
})
defer func() {
shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
if err := obsProvider.Shutdown(shutdownCtx); err != nil {
setupLog.Error(err, "error shutting down observability provider")
}
}()

mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
Scheme: scheme,
Metrics: server.Options{
Expand Down Expand Up @@ -171,62 +206,69 @@ func main() {
}

if err = (&corecontroller.SiteReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/site")),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Site")
os.Exit(1)
}

if err = (&corecontroller.PostgresDatabaseReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/postgres-database")),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "PostgresDatabase")
os.Exit(1)
}

if err = (&corecontroller.ConnectReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/connect")),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "ImplConnect")
os.Exit(1)
}

if err = (&corecontroller.WorkbenchReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/workbench")),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Workbench")
os.Exit(1)
}

if err = (&corecontroller.PackageManagerReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/package-manager")),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "PackageManager")
os.Exit(1)
}

if err = (&corecontroller.ChronicleReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/chronicle")),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Chronicle")
os.Exit(1)
}

if err = (&corecontroller.FlightdeckReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Log: setupLog,
Instruments: observability.NewInstruments(obsProvider.Meter("team-operator/flightdeck")),
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Flightdeck")
os.Exit(1)
Expand All @@ -248,6 +290,14 @@ func main() {

//+kubebuilder:scaffold:builder

lister := &multiKindLister{client: mgr.GetClient(), log: setupLog}
if err := observability.RegisterResourceCountGauge(
obsProvider.Meter("team-operator/resource-count"),
lister,
); err != nil {
setupLog.Error(err, "failed to register resource count gauge; continuing without it")
}

if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
setupLog.Error(err, "unable to set up health check")
os.Exit(1)
Expand Down
Loading
Loading