Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 71 additions & 4 deletions cmd/ci-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,8 @@ type options struct {
metricsAgent *metrics.MetricsAgent

skippedImages sets.Set[string]

successReported bool
}

func bindOptions(flag *flag.FlagSet) *options {
Expand Down Expand Up @@ -919,6 +921,11 @@ func (o *options) Report(errs ...error) {
}

if len(errorToReport) == 0 {
// Skip reporting success if it was already reported early (before post steps)
if o.successReported {
logrus.Debug("Success was already reported early, skipping duplicate report.")
return
}
reporter.Report(nil)
}
}
Expand Down Expand Up @@ -1057,26 +1064,58 @@ func (o *options) Run() []error {
return wrapped
}

// Run each of the promotion steps concurrently
// Main graph completed successfully - report success immediately before post steps
mainGraphCompletedAt := time.Now()
mainGraphDuration := mainGraphCompletedAt.Sub(start)
eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nothing is consuming this. I had plans to deprecate them all.


// Report success to users immediately (post steps are best-effort cleanup)
if shouldReportEarly(o.jobSpec) {
reporter, loadErr := o.resultsOptions.Reporter(o.jobSpec, o.consoleHost)
if loadErr != nil {
logrus.WithError(loadErr).Warn("Could not load result reporting options, skipping early success report.")
} else {
reporter.Report(nil)
o.successReported = true
Comment on lines +1078 to +1079
Copy link
Member

@petr-muller petr-muller Feb 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is the part you expect to deliver the early report, but AFAIK this does not do any actual meaningful reporting. It reports results to result-aggregator which AFAIK only powers the ci operator executions fail too much because of a $reason alerts and apparently does something related to pod scaler, too (but its not documented so who knows lol).

The actual result reporting goes strictly through exit codes: ci-operator exits, prow picks up that exit code and propagates the result accordingly (sets Prowjob status, crier reports to GitHub...). So early reporting is likely a much greater architecture change - either we invent APIs on both ci-operator and Prow layers to directly support this (report the final result while the job is still executing), or we'd allow ci-operator and therefore the prowjob to terminate with the final result, and the "remaining" parts of the workload would need to be handed over to some background manager to process and finalize them. Both are IMO non trivial and a kind of effort that needs some design work

}
}

// Run each of the promotion steps concurrently (best-effort cleanup)
postStepsStart := time.Now()
lenOfPromotionSteps := len(promotionSteps)
detailsChan := make(chan api.CIOperatorStepDetails, lenOfPromotionSteps)
errChan := make(chan error, lenOfPromotionSteps)
for _, step := range promotionSteps {
go runPromotionStep(ctx, step, detailsChan, errChan, o.metricsAgent)
}
postStepsFailed := false
for i := 0; i < lenOfPromotionSteps; i++ {
select {
case details := <-detailsChan:
graph.MergeFrom(details)
case err := <-errChan:
errorDesc := fmt.Sprintf("post step failed while %s. with error: %v", eventJobDescription(o.jobSpec, o.namespace), err)
eventRecorder.Event(runtimeObject, coreapi.EventTypeWarning, "PostStepFailed", errorDesc)
return []error{results.ForReason("executing_post").WithError(err).Unwrap()} // If any of the promotion steps fail, it is considered a failure
logrus.WithError(err).Warn("Post step failed, but job success was already reported. Continuing with cleanup.")
postStepsFailed = true
// Post step failures don't affect job success (already reported), but we still record them
}
}

eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace))
o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metrics.Context{"duration_seconds": time.Since(start).Seconds(), "success": true}))
// Record final metrics including post steps duration
postStepsDuration := time.Since(postStepsStart)
totalDuration := time.Since(start)
metricsContext := metrics.Context{
"duration_seconds": totalDuration.Seconds(),
"main_graph_duration_seconds": mainGraphDuration.Seconds(),
"post_steps_duration_seconds": postStepsDuration.Seconds(),
"time_saved_seconds": postStepsDuration.Seconds(),
"success": true,
}
if postStepsFailed {
metricsContext["post_steps_failed"] = true
}
o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metricsContext))

return nil
})
Expand Down Expand Up @@ -2102,6 +2141,34 @@ func jobSpecFromGitRef(ref string) (*api.JobSpec, error) {
return spec, nil
}

// shouldReportEarly determines if success should be reported immediately after main graph completes.
// Returns true for presubmits (except rehearsals) and release controller periodics.
func shouldReportEarly(jobSpec *api.JobSpec) bool {
if jobSpec == nil {
return false
}

if jobSpec.Type == prowapi.PresubmitJob {
if strings.HasPrefix(jobSpec.Job, "rehearse-") {
return false
}
return true
}

if jobSpec.Type == prowapi.PeriodicJob {
jobName := strings.ToLower(jobSpec.Job)
if strings.Contains(jobName, "release") {
if strings.Contains(jobName, "ocp") || strings.Contains(jobName, "nightly") ||
strings.Contains(jobName, "ci-") || strings.Contains(jobName, "release-") {
return true
}
}
return false
}

return false
}

func nodeNames(nodes []*api.StepNode) []string {
var names []string
for _, node := range nodes {
Expand Down