From fd5453938c8e248e9261fd5d70a0a2a49c069a9c Mon Sep 17 00:00:00 2001 From: Deep Mistry Date: Mon, 2 Feb 2026 14:02:08 -0600 Subject: [PATCH 1/2] Report job success immediately after main graph completes, before post steps - Report success to users as soon as main graph completes successfully - Post steps (promotion/cleanup) now run as best-effort and don't affect job result - Add metrics tracking: main_graph_duration_seconds, post_steps_duration_seconds, time_saved_seconds - Metrics are extractable from ci-operator-metrics.json in test_platform_insights events - Prevent duplicate success reporting by tracking early-report state in options --- cmd/ci-operator/main.go | 45 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/cmd/ci-operator/main.go b/cmd/ci-operator/main.go index d3e5129730..46bf792813 100644 --- a/cmd/ci-operator/main.go +++ b/cmd/ci-operator/main.go @@ -454,6 +454,8 @@ type options struct { metricsAgent *metrics.MetricsAgent skippedImages sets.Set[string] + + successReported bool } func bindOptions(flag *flag.FlagSet) *options { @@ -919,6 +921,11 @@ func (o *options) Report(errs ...error) { } if len(errorToReport) == 0 { + // Skip reporting success if it was already reported early (before post steps) + if o.successReported { + logrus.Debug("Success was already reported early, skipping duplicate report.") + return + } reporter.Report(nil) } } @@ -1057,13 +1064,29 @@ func (o *options) Run() []error { return wrapped } - // Run each of the promotion steps concurrently + // Main graph completed successfully - report success immediately before post steps + mainGraphCompletedAt := time.Now() + mainGraphDuration := mainGraphCompletedAt.Sub(start) + eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace)) + + // Report success to users immediately (post steps are best-effort cleanup) + reporter, loadErr := 
o.resultsOptions.Reporter(o.jobSpec, o.consoleHost) + if loadErr != nil { + logrus.WithError(loadErr).Warn("Could not load result reporting options, skipping early success report.") + } else { + reporter.Report(nil) + o.successReported = true + } + + // Run each of the promotion steps concurrently (best-effort cleanup) + postStepsStart := time.Now() lenOfPromotionSteps := len(promotionSteps) detailsChan := make(chan api.CIOperatorStepDetails, lenOfPromotionSteps) errChan := make(chan error, lenOfPromotionSteps) for _, step := range promotionSteps { go runPromotionStep(ctx, step, detailsChan, errChan, o.metricsAgent) } + postStepsFailed := false for i := 0; i < lenOfPromotionSteps; i++ { select { case details := <-detailsChan: @@ -1071,12 +1094,26 @@ func (o *options) Run() []error { case err := <-errChan: errorDesc := fmt.Sprintf("post step failed while %s. with error: %v", eventJobDescription(o.jobSpec, o.namespace), err) eventRecorder.Event(runtimeObject, coreapi.EventTypeWarning, "PostStepFailed", errorDesc) - return []error{results.ForReason("executing_post").WithError(err).Unwrap()} // If any of the promotion steps fail, it is considered a failure + logrus.WithError(err).Warn("Post step failed, but job success was already reported. 
Continuing with cleanup.") + postStepsFailed = true + // Post step failures don't affect job success (already reported), but we still record them } } - eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace)) - o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metrics.Context{"duration_seconds": time.Since(start).Seconds(), "success": true})) + // Record final metrics including post steps duration + postStepsDuration := time.Since(postStepsStart) + totalDuration := time.Since(start) + metricsContext := metrics.Context{ + "duration_seconds": totalDuration.Seconds(), + "main_graph_duration_seconds": mainGraphDuration.Seconds(), + "post_steps_duration_seconds": postStepsDuration.Seconds(), + "time_saved_seconds": postStepsDuration.Seconds(), + "success": true, + } + if postStepsFailed { + metricsContext["post_steps_failed"] = true + } + o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metricsContext)) return nil }) From efdce43375bfca3023c20b7c29c32342ce5004c7 Mon Sep 17 00:00:00 2001 From: Deep Mistry Date: Mon, 2 Feb 2026 16:07:37 -0600 Subject: [PATCH 2/2] Limit early success reporting to presubmits and release controller periodics, exclude rehearsals --- cmd/ci-operator/main.go | 42 +++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/cmd/ci-operator/main.go b/cmd/ci-operator/main.go index 46bf792813..02402e8f78 100644 --- a/cmd/ci-operator/main.go +++ b/cmd/ci-operator/main.go @@ -1070,12 +1070,14 @@ func (o *options) Run() []error { eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace)) // Report success to users immediately (post steps are best-effort cleanup) - reporter, loadErr := o.resultsOptions.Reporter(o.jobSpec, o.consoleHost) - if loadErr != nil { - logrus.WithError(loadErr).Warn("Could not load result 
reporting options, skipping early success report.")
-		} else {
-			reporter.Report(nil)
-			o.successReported = true
+		if shouldReportEarly(o.jobSpec) {
+			reporter, loadErr := o.resultsOptions.Reporter(o.jobSpec, o.consoleHost)
+			if loadErr != nil {
+				logrus.WithError(loadErr).Warn("Could not load result reporting options, skipping early success report.")
+			} else {
+				reporter.Report(nil)
+				o.successReported = true
+			}
 		}
 
 		// Run each of the promotion steps concurrently (best-effort cleanup)
@@ -2139,6 +2141,34 @@ func jobSpecFromGitRef(ref string) (*api.JobSpec, error) {
 	return spec, nil
 }
 
+// shouldReportEarly reports whether job success may be published as soon as the
+// main graph completes, before any post steps run. It returns true for
+// presubmits whose job name does not start with "rehearse-", and for periodics
+// whose lower-cased job name contains "release" plus at least one of "ocp",
+// "nightly", "ci-" or "release-". A nil spec or any other job type yields false.
+// NOTE(review): the periodic match is a job-name heuristic - confirm it selects
+// only the intended release controller jobs.
+func shouldReportEarly(jobSpec *api.JobSpec) bool {
+	if jobSpec == nil {
+		return false
+	}
+	switch jobSpec.Type {
+	case prowapi.PresubmitJob:
+		return !strings.HasPrefix(jobSpec.Job, "rehearse-")
+	case prowapi.PeriodicJob:
+		name := strings.ToLower(jobSpec.Job)
+		if !strings.Contains(name, "release") {
+			return false
+		}
+		for _, marker := range []string{"ocp", "nightly", "ci-", "release-"} {
+			if strings.Contains(name, marker) {
+				return true
+			}
+		}
+	}
+	return false
+}
+
 func nodeNames(nodes []*api.StepNode) []string {
 	var names []string
 	for _, node := range nodes {