-
Notifications
You must be signed in to change notification settings - Fork 296
Report job success immediately after main graph completes, before post steps #4921
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -454,6 +454,8 @@ type options struct { | |
| metricsAgent *metrics.MetricsAgent | ||
|
|
||
| skippedImages sets.Set[string] | ||
|
|
||
| successReported bool | ||
| } | ||
|
|
||
| func bindOptions(flag *flag.FlagSet) *options { | ||
|
|
@@ -919,6 +921,11 @@ func (o *options) Report(errs ...error) { | |
| } | ||
|
|
||
| if len(errorToReport) == 0 { | ||
| // Skip reporting success if it was already reported early (before post steps) | ||
| if o.successReported { | ||
| logrus.Debug("Success was already reported early, skipping duplicate report.") | ||
| return | ||
| } | ||
| reporter.Report(nil) | ||
| } | ||
| } | ||
|
|
@@ -1057,26 +1064,58 @@ func (o *options) Run() []error { | |
| return wrapped | ||
| } | ||
|
|
||
| // Run each of the promotion steps concurrently | ||
| // Main graph completed successfully - report success immediately before post steps | ||
| mainGraphCompletedAt := time.Now() | ||
| mainGraphDuration := mainGraphCompletedAt.Sub(start) | ||
| eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace)) | ||
|
|
||
| // Report success to users immediately (post steps are best-effort cleanup) | ||
| if shouldReportEarly(o.jobSpec) { | ||
| reporter, loadErr := o.resultsOptions.Reporter(o.jobSpec, o.consoleHost) | ||
| if loadErr != nil { | ||
| logrus.WithError(loadErr).Warn("Could not load result reporting options, skipping early success report.") | ||
| } else { | ||
| reporter.Report(nil) | ||
| o.successReported = true | ||
|
Comment on lines
+1078
to
+1079
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is the part you expect to deliver the early report, but AFAIK this does not do any actual meaningful reporting. It reports results to result-aggregator which AFAIK only powers the The actual result reporting goes strictly through exit codes: ci-operator exits, prow picks up that exit code and propagates the result accordingly (sets Prowjob status, crier reports to GitHub...). So early reporting is likely a much greater architecture change - either we invent APIs on both ci-operator and Prow layers to directly support this (report the final result while the job is still executing), or we'd allow ci-operator and therefore the prowjob to terminate with the final result, and the "remaining" parts of the workload would need to be handed over to some background manager to process and finalize them. Both are IMO non trivial and a kind of effort that needs some design work |
||
| } | ||
| } | ||
coderabbitai[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // Run each of the promotion steps concurrently (best-effort cleanup) | ||
| postStepsStart := time.Now() | ||
| lenOfPromotionSteps := len(promotionSteps) | ||
| detailsChan := make(chan api.CIOperatorStepDetails, lenOfPromotionSteps) | ||
| errChan := make(chan error, lenOfPromotionSteps) | ||
| for _, step := range promotionSteps { | ||
| go runPromotionStep(ctx, step, detailsChan, errChan, o.metricsAgent) | ||
| } | ||
| postStepsFailed := false | ||
| for i := 0; i < lenOfPromotionSteps; i++ { | ||
| select { | ||
| case details := <-detailsChan: | ||
| graph.MergeFrom(details) | ||
| case err := <-errChan: | ||
| errorDesc := fmt.Sprintf("post step failed while %s. with error: %v", eventJobDescription(o.jobSpec, o.namespace), err) | ||
| eventRecorder.Event(runtimeObject, coreapi.EventTypeWarning, "PostStepFailed", errorDesc) | ||
| return []error{results.ForReason("executing_post").WithError(err).Unwrap()} // If any of the promotion steps fail, it is considered a failure | ||
| logrus.WithError(err).Warn("Post step failed, but job success was already reported. Continuing with cleanup.") | ||
| postStepsFailed = true | ||
| // Post step failures don't affect job success (already reported), but we still record them | ||
| } | ||
| } | ||
|
|
||
| eventRecorder.Event(runtimeObject, coreapi.EventTypeNormal, "CiJobSucceeded", eventJobDescription(o.jobSpec, o.namespace)) | ||
| o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metrics.Context{"duration_seconds": time.Since(start).Seconds(), "success": true})) | ||
| // Record final metrics including post steps duration | ||
| postStepsDuration := time.Since(postStepsStart) | ||
| totalDuration := time.Since(start) | ||
| metricsContext := metrics.Context{ | ||
| "duration_seconds": totalDuration.Seconds(), | ||
| "main_graph_duration_seconds": mainGraphDuration.Seconds(), | ||
| "post_steps_duration_seconds": postStepsDuration.Seconds(), | ||
| "time_saved_seconds": postStepsDuration.Seconds(), | ||
| "success": true, | ||
| } | ||
| if postStepsFailed { | ||
| metricsContext["post_steps_failed"] = true | ||
| } | ||
| o.metricsAgent.Record(metrics.NewInsightsEvent(metrics.InsightExecutionCompleted, metricsContext)) | ||
|
|
||
| return nil | ||
| }) | ||
|
|
@@ -2102,6 +2141,34 @@ func jobSpecFromGitRef(ref string) (*api.JobSpec, error) { | |
| return spec, nil | ||
| } | ||
|
|
||
| // shouldReportEarly determines if success should be reported immediately after main graph completes. | ||
| // Returns true for presubmits (except rehearsals) and release controller periodics. | ||
| func shouldReportEarly(jobSpec *api.JobSpec) bool { | ||
| if jobSpec == nil { | ||
| return false | ||
| } | ||
|
|
||
| if jobSpec.Type == prowapi.PresubmitJob { | ||
| if strings.HasPrefix(jobSpec.Job, "rehearse-") { | ||
| return false | ||
| } | ||
| return true | ||
| } | ||
|
|
||
| if jobSpec.Type == prowapi.PeriodicJob { | ||
| jobName := strings.ToLower(jobSpec.Job) | ||
| if strings.Contains(jobName, "release") { | ||
| if strings.Contains(jobName, "ocp") || strings.Contains(jobName, "nightly") || | ||
| strings.Contains(jobName, "ci-") || strings.Contains(jobName, "release-") { | ||
| return true | ||
| } | ||
| } | ||
| return false | ||
| } | ||
|
|
||
| return false | ||
| } | ||
|
|
||
| func nodeNames(nodes []*api.StepNode) []string { | ||
| var names []string | ||
| for _, node := range nodes { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nothing is consuming this. I had plans to deprecate them all.