Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions api/v1alpha1/agentteam_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -848,6 +848,7 @@ type PullRequestStatus struct {
// +kubebuilder:subresource:status
// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase`
// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.ready`
// +kubebuilder:printcolumn:name="Stage",type=string,JSONPath=`.status.pipeline.currentStage`,priority=1
// +kubebuilder:printcolumn:name="Tasks Done",type=integer,JSONPath=`.status.tasks.completed`
// +kubebuilder:printcolumn:name="Cost",type=string,JSONPath=`.status.estimatedCost`
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
Expand Down
4 changes: 4 additions & 0 deletions charts/kagents/crds/kagents.dev_agentteams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ spec:
- jsonPath: .status.ready
name: Ready
type: string
- jsonPath: .status.pipeline.currentStage
name: Stage
priority: 1
type: string
- jsonPath: .status.tasks.completed
name: Tasks Done
type: integer
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/kagents.dev_agentteams.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ spec:
- jsonPath: .status.ready
name: Ready
type: string
- jsonPath: .status.pipeline.currentStage
name: Stage
priority: 1
type: string
- jsonPath: .status.tasks.completed
name: Tasks Done
type: integer
Expand Down
12 changes: 11 additions & 1 deletion docs/explanation/operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,12 @@ The operator exposes Prometheus metrics, ships a Grafana dashboard, and fires we

### Prometheus metrics

The operator binary exposes `/metrics` on port 8080 by default. Eight series, all labeled by team name and (where applicable) teammate name + model:
The operator binary exposes `/metrics` on port 8080 by default. Two
prefix families are emitted in parallel: the original `claude_*` series
that have been there since v0.3.0 (kept stable to avoid breaking
existing dashboards), and a `kagents_*` family added in v0.8.0 that
covers knowledge-work observability (pipeline stages, artifacts,
delivery). Both stream from the same `/metrics` endpoint.

| Metric | Type | Description |
|--------|------|-------------|
Expand All @@ -100,6 +105,11 @@ The operator binary exposes `/metrics` on port 8080 by default. Eight series, al
| `claude_teammate_restarts_total` | counter | Pod restarts per teammate |
| `claude_team_budget_remaining_usd` | gauge | `budgetLimit - estimatedCostUsd` |
| `claude_teammate_idle_seconds` | histogram | Time between task completions per teammate |
| `kagents_team_pipeline_stage_active` | gauge | 1 while a pipeline stage is in `Running`, 0 otherwise. Labels: `team`, `namespace`, `stage` |
| `kagents_team_stage_duration_seconds` | histogram | Stage wall-clock duration observed once at the `Running → Completed` transition |
| `kagents_team_artifacts_produced_total` | counter | Artifacts appended to `status.artifacts` per teammate |
| `kagents_team_delivery_success_total` | counter | Successful `onComplete: deliver` dispatches, by target type |
| `kagents_team_delivery_failure_total` | counter | Failed deliveries by target type |

Wire them to Prometheus by enabling the chart's ServiceMonitor:

Expand Down
24 changes: 24 additions & 0 deletions internal/controller/agentteam_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1739,9 +1739,11 @@ func (r *AgentTeamReconciler) executeDelivery(ctx context.Context, team *claudev
if err := dispatcher.Send(ctx, r.Client, target, team); err != nil {
status.Success = false
status.Error = err.Error()
metrics.RecordDeliveryFailure(team.Name, team.Namespace, target.Type)
r.recordEvent(team, corev1.EventTypeWarning, "DeliveryFailed",
"Delivery %s to %s failed: %v", target.Type, status.Target, err)
} else {
metrics.RecordDeliverySuccess(team.Name, team.Namespace, target.Type)
r.recordEvent(team, corev1.EventTypeNormal, "DeliveryComplete",
"Delivery %s to %s succeeded", target.Type, status.Target)
}
Expand Down Expand Up @@ -2190,6 +2192,7 @@ func (r *AgentTeamReconciler) updatePipelineStatus(team *claudev1alpha1.AgentTea
}
}

prevPhase := ss.Phase
switch {
case anyFailed:
ss.Phase = "Failed"
Expand All @@ -2214,6 +2217,22 @@ func (r *AgentTeamReconciler) updatePipelineStatus(team *claudev1alpha1.AgentTea
ss.Phase = "Waiting"
}

// Emit observability signals for this stage. The gauge is refreshed on
// every pass; the histogram only fires on the edge into Completed:
//
// * Running: flip the stage_active gauge to 1 so dashboards
// show where each team currently is.
// * Completed: flip the gauge back to 0 and observe the stage's
// wall-clock duration (StartedAt → CompletedAt). The
// histogram observation is gated on the phase changing
// to Completed during this pass (prevPhase != "Completed")
// so re-reconciles of an already-completed stage don't
// double-count.
metrics.SetPipelineStageActive(team.Name, team.Namespace, ss.Name, ss.Phase == "Running")
if prevPhase != "Completed" && ss.Phase == "Completed" && ss.StartedAt != nil && ss.CompletedAt != nil {
metrics.ObservePipelineStageDuration(team.Name, team.Namespace, ss.Name,
ss.CompletedAt.Sub(ss.StartedAt.Time).Seconds())
}

if ss.Phase == "Completed" {
completed++
} else if currentStage == "" {
Expand Down Expand Up @@ -2254,6 +2273,10 @@ func findProducerOutputPath(team *claudev1alpha1.AgentTeam, from, artifact strin
// already present (same Name + ProducedBy) is not re-added, so calling this
// every reconcile after the producer reaches Succeeded is safe. Should be
// invoked once per teammate transition to Completed.
//
// Each newly-appended artifact also bumps the
// kagents_team_artifacts_produced_total counter — the existence check
// above is what makes the metric idempotent across reconciles.
func recordTeammateArtifacts(team *claudev1alpha1.AgentTeam, tm claudev1alpha1.TeammateSpec, at time.Time) {
if len(tm.Outputs) == 0 {
return
Expand All @@ -2274,6 +2297,7 @@ func recordTeammateArtifacts(team *claudev1alpha1.AgentTeam, tm claudev1alpha1.T
ProducedBy: tm.Name,
ProducedAt: metav1.NewTime(at),
})
metrics.RecordArtifactProduced(team.Name, team.Namespace, tm.Name)
}
}

Expand Down
15 changes: 15 additions & 0 deletions internal/dashboard/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,21 @@ var templateFuncs = template.FuncMap{
}
return claudev1alpha1.TeammateStatus{Name: name}
},

// pipelinePercent computes a 0..100 progress value for the pipeline
// progress bar. It returns 0 for a nil status or a non-positive
// StagesTotal (which also guards the division), and clamps the result
// into [0, 100] so an inconsistent status — more completed stages than
// total, or a negative count — can never yield an out-of-range CSS
// width in the template.
"pipelinePercent": func(p *claudev1alpha1.PipelineStatus) int {
	if p == nil || p.StagesTotal <= 0 {
		return 0
	}
	// Integer division truncates toward zero, so partial progress
	// rounds down (e.g. 1 of 3 stages renders as 33).
	pct := (p.StagesCompleted * 100) / p.StagesTotal
	switch {
	case pct < 0:
		return 0
	case pct > 100:
		return 100
	}
	return pct
},
}

// pageData wraps the typed payload in a render-time envelope so the layout
Expand Down
48 changes: 48 additions & 0 deletions internal/dashboard/templates/_detail_body.html
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,54 @@ <h1 class="text-2xl font-semibold">{{.Name}}</h1>
</div>
</div>

{{if .Status.Pipeline}}
<div class="bg-white rounded-lg shadow-sm overflow-hidden mb-4">
<h2 class="px-4 py-3 text-sm font-semibold border-b border-slate-200 flex items-center justify-between">
<span>Pipeline</span>
<span class="text-xs font-normal text-slate-500">{{.Status.Pipeline.StagesCompleted}} of {{.Status.Pipeline.StagesTotal}} stages complete</span>
</h2>
<div class="px-4 py-3">
{{$pct := pipelinePercent .Status.Pipeline}}
<div class="h-2 w-full bg-slate-200 rounded-full overflow-hidden mb-3">
<div class="h-2 bg-blue-500" style="width: {{$pct}}%;"></div>
</div>
<ol class="space-y-2">
{{range .Status.Pipeline.Stages}}
<li class="flex items-center gap-3 text-sm">
<span class="phase-{{.Phase}} px-2 py-0.5 rounded text-xs font-medium min-w-[5.5rem] text-center">{{or .Phase "Waiting"}}</span>
<span class="font-mono text-xs text-slate-700">{{.Name}}</span>
<span class="text-xs text-slate-500">{{.TeammatesReady}}</span>
</li>
{{end}}
</ol>
</div>
</div>
{{end}}

{{if .Status.Artifacts}}
<div class="bg-white rounded-lg shadow-sm overflow-hidden mb-4">
<h2 class="px-4 py-3 text-sm font-semibold border-b border-slate-200">Artifacts</h2>
<table class="w-full text-sm">
<thead class="bg-slate-50 text-left text-xs uppercase text-slate-600">
<tr>
<th class="px-4 py-2">Name</th>
<th class="px-4 py-2">Produced by</th>
<th class="px-4 py-2">Path</th>
</tr>
</thead>
<tbody>
{{range .Status.Artifacts}}
<tr class="border-t border-slate-100">
<td class="px-4 py-2 font-mono text-xs">{{.Name}}</td>
<td class="px-4 py-2 text-slate-600">{{.ProducedBy}}</td>
<td class="px-4 py-2 font-mono text-xs text-slate-500">{{.Path}}</td>
</tr>
{{end}}
</tbody>
</table>
</div>
{{end}}

<div class="bg-white rounded-lg shadow-sm overflow-hidden mb-4">
<h2 class="px-4 py-3 text-sm font-semibold border-b border-slate-200">Teammates</h2>
<table class="w-full text-sm">
Expand Down
56 changes: 56 additions & 0 deletions internal/dashboard/templates_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package dashboard

import (
"testing"

"github.com/stretchr/testify/assert"

claudev1alpha1 "github.com/amcheste/kagents/api/v1alpha1"
)

// pipelinePercentFn fetches the "pipelinePercent" helper out of
// templateFuncs and returns it as a typed function value. The helper
// has no named symbol of its own — it only exists as a map entry — so
// looking it up by key lets the tests exercise its boundary conditions
// directly instead of rendering a full template. The type assertion
// panics if the entry is missing or its signature changes.
func pipelinePercentFn() func(*claudev1alpha1.PipelineStatus) int {
	entry := templateFuncs["pipelinePercent"]
	return entry.(func(*claudev1alpha1.PipelineStatus) int)
}

func TestPipelinePercent_NilStatus(t *testing.T) {
t.Parallel()
assert.Equal(t, 0, pipelinePercentFn()(nil))
}

func TestPipelinePercent_ZeroTotal(t *testing.T) {
t.Parallel()
// Division-by-zero guard: a freshly initialized PipelineStatus has
// StagesTotal=0 before the reconciler runs the spec. Must not panic.
assert.Equal(t, 0, pipelinePercentFn()(&claudev1alpha1.PipelineStatus{}))
}

// TestPipelinePercent_HalfComplete checks that 2 of 4 completed stages
// is reported as 50 percent.
func TestPipelinePercent_HalfComplete(t *testing.T) {
	t.Parallel()
	percent := pipelinePercentFn()
	status := &claudev1alpha1.PipelineStatus{
		StagesCompleted: 2,
		StagesTotal:     4,
	}
	assert.Equal(t, 50, percent(status))
}

// TestPipelinePercent_AllComplete checks that a fully completed
// pipeline (3 of 3 stages) is reported as 100 percent.
func TestPipelinePercent_AllComplete(t *testing.T) {
	t.Parallel()
	percent := pipelinePercentFn()
	status := &claudev1alpha1.PipelineStatus{
		StagesCompleted: 3,
		StagesTotal:     3,
	}
	assert.Equal(t, 100, percent(status))
}

// TestPipelinePercent_OverflowClamps covers the pathological case where
// the status reports more completed stages than total stages. The
// helper must clamp to 100 rather than hand the template a >100% width
// that breaks the progress-bar layout.
func TestPipelinePercent_OverflowClamps(t *testing.T) {
	t.Parallel()
	percent := pipelinePercentFn()
	inconsistent := &claudev1alpha1.PipelineStatus{
		StagesCompleted: 99,
		StagesTotal:     3,
	}
	assert.Equal(t, 100, percent(inconsistent))
}
73 changes: 73 additions & 0 deletions internal/metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,39 @@ var (
Buckets: prometheus.ExponentialBuckets(1, 2, 10),
}, []string{"team", "teammate"})

// --- Knowledge-work observability (v0.8.0+) ---
//
// These metrics use the `kagents_` prefix to match the rebranded
// project name. Existing `claude_*` metrics above stay put for
// backwards compatibility — they'll get a synchronized rename in a
// future major; for now operators dashboard against either set.

pipelineStageActive = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "kagents_team_pipeline_stage_active",
Help: "1 when this pipeline stage is in the Running phase, 0 otherwise. Useful for stacking by stage to visualize where a team is in flight.",
}, []string{"team", "namespace", "stage"})

pipelineStageDuration = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "kagents_team_stage_duration_seconds",
Help: "Wall-clock duration from a pipeline stage's first teammate spawning to its last completing, in seconds. Observed once per (team, stage) at the Completed transition.",
Buckets: prometheus.ExponentialBuckets(30, 2, 10),
}, []string{"team", "namespace", "stage"})

teamArtifactsProduced = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "kagents_team_artifacts_produced_total",
Help: "Artifacts recorded on a team's status by a teammate completing its declared outputs.",
}, []string{"team", "namespace", "teammate"})

teamDeliverySuccess = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "kagents_team_delivery_success_total",
Help: "Successful deliveries dispatched when OnComplete=deliver, by target type.",
}, []string{"team", "namespace", "type"})

teamDeliveryFailure = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "kagents_team_delivery_failure_total",
Help: "Failed deliveries dispatched when OnComplete=deliver, by target type.",
}, []string{"team", "namespace", "type"})

collectors = []prometheus.Collector{
teamActive,
teamDuration,
Expand All @@ -59,6 +92,11 @@ var (
teammateRestarts,
teamBudgetRemaining,
teammateIdle,
pipelineStageActive,
pipelineStageDuration,
teamArtifactsProduced,
teamDeliverySuccess,
teamDeliveryFailure,
}

registerOnce sync.Once
Expand Down Expand Up @@ -140,3 +178,38 @@ func SetBudgetRemaining(team, namespace string, remaining float64) {
func SetActiveTeams(count int) {
teamActive.Set(float64(count))
}

// SetPipelineStageActive sets the kagents_team_pipeline_stage_active
// gauge for the (team, namespace, stage) label set to 1 when active is
// true and to 0 otherwise. Because the value is written
// unconditionally, calling it on every reconcile is enough for a stage
// leaving Running to drop back to 0 — no separate "stage done" event
// is required.
func SetPipelineStageActive(team, namespace, stage string, active bool) {
	gauge := pipelineStageActive.WithLabelValues(team, namespace, stage)
	if active {
		gauge.Set(1)
		return
	}
	gauge.Set(0)
}

// ObservePipelineStageDuration adds durationSec as a single
// observation to the kagents_team_stage_duration_seconds histogram for
// the (team, namespace, stage) label set. Every call is counted, so
// the caller is responsible for invoking it only once per stage
// completion.
func ObservePipelineStageDuration(team, namespace, stage string, durationSec float64) {
	hist := pipelineStageDuration.WithLabelValues(team, namespace, stage)
	hist.Observe(durationSec)
}

// RecordArtifactProduced bumps the
// kagents_team_artifacts_produced_total counter by one for the
// (team, namespace, teammate) label set. Call it once for each
// artifact appended to status.artifacts.
func RecordArtifactProduced(team, namespace, teammate string) {
	counter := teamArtifactsProduced.WithLabelValues(team, namespace, teammate)
	counter.Inc()
}

// RecordDeliverySuccess bumps the kagents_team_delivery_success_total
// counter by one; deliveryType is written to the counter's "type"
// label alongside team and namespace.
func RecordDeliverySuccess(team, namespace, deliveryType string) {
	counter := teamDeliverySuccess.WithLabelValues(team, namespace, deliveryType)
	counter.Inc()
}

// RecordDeliveryFailure bumps the kagents_team_delivery_failure_total
// counter by one; deliveryType is written to the counter's "type"
// label alongside team and namespace.
func RecordDeliveryFailure(team, namespace, deliveryType string) {
	counter := teamDeliveryFailure.WithLabelValues(team, namespace, deliveryType)
	counter.Inc()
}
Loading
Loading