Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 136 additions & 4 deletions e2e/config/azure.go
Original file line number Diff line number Diff line change
Expand Up @@ -564,8 +564,13 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi
}

if replicatedToCurrentRegion(version, location) {
toolkit.Logf(ctx, "Image version %s is already in region %s", *version.ID, location)
return nil
// Intent-to-replicate is registered in PublishingProfile.TargetRegions, but that
// does NOT mean the regional replica is actually serving traffic. The regional
// replication state can still be "Replicating" while the parent ProvisioningState
// is "Succeeded" — this is the source of GalleryImageNotFound 404s on VMSS create.
// Wait for the regional state itself before declaring success.
toolkit.Logf(ctx, "Image version %s is already a target of region %s; verifying regional replication state", *version.ID, location)
return a.waitForRegionalReplicationCompleted(ctx, image, version, location)
}
regions := make([]string, 0, len(version.Properties.PublishingProfile.TargetRegions))
for _, targetRegion := range version.Properties.PublishingProfile.TargetRegions {
Expand All @@ -575,12 +580,114 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi
toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Replicating to region %s", location)

start := time.Now() // Record the start time
err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location)
if err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location); err != nil {
return err
}

// The replicate LRO above completes when the parent resource is Succeeded, but the
// regional replica may still be in "Replicating" state for several more minutes.
// Block until that regional state hits Completed; otherwise downstream VMSS create
// will see GalleryImageNotFound and the test will appear to fail for a non-test reason.
if err := a.waitForRegionalReplicationCompleted(ctx, image, version, location); err != nil {
return err
}
elapsed := time.Since(start) // Calculate the elapsed time

toolkit.LogDuration(ctx, elapsed, 3*time.Minute, fmt.Sprintf("Replication took: %s (%s)", elapsed, *version.ID))

return err
return nil
}

// waitForRegionalReplicationCompleted polls the gallery image version with the
// ReplicationStatus expand option until the named region's RegionalReplicationStatus.State
// is Completed. Returns an error if the regional replication enters a terminal Failed state
// or the wait times out. Treats Unknown as transient (Azure occasionally emits Unknown
// briefly during state transitions).
//
// This closes a documented gap in the SIG API: the parent provisioning LRO can succeed
// before per-region replicas are actually serving traffic. Without this wait, callers see
// GalleryImageNotFound 404s on VMSS creation that look like bugs but are infra eventual
// consistency.
func (a *AzureClient) waitForRegionalReplicationCompleted(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion, location string) error {
imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions)
if err != nil {
return fmt.Errorf("create image version client for replication wait: %w", err)
}

getOpts := &armcompute.GalleryImageVersionsClientGetOptions{
Expand: to.Ptr(armcompute.ReplicationStatusTypesReplicationStatus),
}

var lastLoggedState armcompute.ReplicationState
const (
pollInterval = 15 * time.Second
pollTimeout = 20 * time.Minute
)

pollErr := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(ctx context.Context) (bool, error) {
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, *version.Name, getOpts)
if err != nil {
// Transient API errors should not abort the wait; the SDK already retries throttling.
toolkit.Logf(ctx, "transient error getting image version replication status (will retry): %v", err)
return false, nil
}
Comment on lines +628 to +633
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

waitForRegionalReplicationCompleted treats any imgVersionClient.Get error as transient and continues polling (return false, nil). This can mask permanent failures (e.g., auth/permission issues, 404 due to wrong RG/name) and will only surface as a timeout, losing the underlying cause. Consider only swallowing clearly transient conditions (e.g., 429/5xx/timeouts) and returning the error for non-retryable 4xx so callers fail fast with the real Azure error.

Copilot uses AI. Check for mistakes.

regional := findRegionalReplicationStatus(resp.GalleryImageVersion.Properties.ReplicationStatus, location)
if regional == nil || regional.State == nil {
// Region not yet present in the status summary; keep waiting.
return false, nil
}

state := *regional.State
if state != lastLoggedState {
progress := int32(0)
if regional.Progress != nil {
progress = *regional.Progress
}
toolkit.Logf(ctx, "Image version %s regional replication state in %s: %s (progress: %d%%)", *version.ID, location, state, progress)
lastLoggedState = state
}

switch state {
case armcompute.ReplicationStateCompleted:
return true, nil
case armcompute.ReplicationStateFailed:
details := ""
if regional.Details != nil {
details = *regional.Details
}
return false, fmt.Errorf("regional replication of %s to %s failed: %s", *version.ID, location, details)
case armcompute.ReplicationStateReplicating, armcompute.ReplicationStateUnknown:
return false, nil
default:
// Forward-compat: unknown future states treated as in-progress.
return false, nil
}
})

if pollErr != nil {
return fmt.Errorf("waiting for regional replication of %s to %s to complete: %w", *version.ID, location, pollErr)
}
toolkit.Logf(ctx, "Image version %s regional replication to %s confirmed Completed", *version.ID, location)
return nil
}

// findRegionalReplicationStatus finds the RegionalReplicationStatus for the given location
// (case-insensitive, ignoring spaces) within the version-wide ReplicationStatus.
func findRegionalReplicationStatus(status *armcompute.ReplicationStatus, location string) *armcompute.RegionalReplicationStatus {
if status == nil {
return nil
}
normalized := strings.ToLower(strings.ReplaceAll(location, " ", ""))
for _, regional := range status.Summary {
if regional == nil || regional.Region == nil {
continue
}
if strings.ToLower(strings.ReplaceAll(*regional.Region, " ", "")) == normalized {
return regional
}
}
return nil
}

func (a *AzureClient) waitForVersionOperationCompletion(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion) error {
Expand Down Expand Up @@ -688,6 +795,31 @@ func (a *AzureClient) EnsureSIGImageVersion(ctx context.Context, image *Image, l
return VHDResourceID(*resp.ID), nil
}

// WaitForImageVersionReplicatedToRegion is the public entrypoint for callers (e.g. the
// VMSS create retry loop) that need to confirm a previously-resolved image is actually
// serving traffic in a region before retrying a request that just failed with
// GalleryImageNotFound. It fetches the live image version with ReplicationStatus
// expanded and blocks until the regional state is Completed (or fails fast on terminal
// Failed). It does not initiate replication; callers should have already gone through
// EnsureSIGImageVersion / LatestSIGImageVersionByTag.
func (a *AzureClient) WaitForImageVersionReplicatedToRegion(ctx context.Context, image *Image, location string) error {
if image == nil {
return fmt.Errorf("nil image")
}
if image.Version == "" {
return fmt.Errorf("image %s has no resolved version; cannot check replication state", image.Name)
}
imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions)
if err != nil {
return fmt.Errorf("create image version client: %w", err)
}
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, nil)
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The doc comment for WaitForImageVersionReplicatedToRegion says it "fetches the live image version with ReplicationStatus expanded", but the initial Get call uses nil options (no Expand). Either update the comment to reflect that the expand happens inside waitForRegionalReplicationCompleted, or pass GalleryImageVersionsClientGetOptions{Expand: ReplicationStatusTypesReplicationStatus} here for consistency.

Suggested change
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, nil)
resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, &armcompute.GalleryImageVersionsClientGetOptions{
Expand: to.Ptr(armcompute.ReplicationStatusTypesReplicationStatus),
})

Copilot uses AI. Check for mistakes.
if err != nil {
return fmt.Errorf("get image version %s/%s for replication-wait: %w", image.Name, image.Version, err)
}
return a.waitForRegionalReplicationCompleted(ctx, image, &resp.GalleryImageVersion, location)
}

func DefaultRetryOpts() policy.RetryOptions {
return policy.RetryOptions{
// Use generous retry settings to survive Azure Compute Gallery throttling.
Expand Down
64 changes: 64 additions & 0 deletions e2e/config/azure_replication_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package config

import (
"testing"

"github.com/Azure/azure-sdk-for-go/sdk/azcore/to"
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
)

// TestFindRegionalReplicationStatus exercises the location-matching logic used to
// extract the per-region replication state from a SIG image version's ReplicationStatus
// summary. Region names from ARM may be in either the "WestUS 2" form or the "westus2"
// form, and may include arbitrary casing; the lookup must normalize both sides.
func TestFindRegionalReplicationStatus(t *testing.T) {
completed := armcompute.ReplicationStateCompleted
replicating := armcompute.ReplicationStateReplicating

status := &armcompute.ReplicationStatus{
Summary: []*armcompute.RegionalReplicationStatus{
{Region: to.Ptr("East US"), State: &completed, Progress: to.Ptr(int32(100))},
{Region: to.Ptr("West US 2"), State: &replicating, Progress: to.Ptr(int32(40))},
{Region: to.Ptr("uaenorth"), State: &completed, Progress: to.Ptr(int32(100))},
nil, // tolerate nil entries
{Region: nil, State: &completed},
},
}

tests := []struct {
name string
status *armcompute.ReplicationStatus
location string
wantFound bool
wantState armcompute.ReplicationState
wantRegion string
}{
{name: "nil status", status: nil, location: "westus2"},
{name: "exact lowercase normalized match (with space in summary)", status: status, location: "westus2", wantFound: true, wantState: replicating, wantRegion: "West US 2"},
{name: "uppercase input match (with space in summary)", status: status, location: "EASTUS", wantFound: true, wantState: completed, wantRegion: "East US"},
{name: "input with embedded spaces", status: status, location: "east us", wantFound: true, wantState: completed, wantRegion: "East US"},
{name: "summary already normalized", status: status, location: "uaenorth", wantFound: true, wantState: completed, wantRegion: "uaenorth"},
{name: "missing region returns nil", status: status, location: "centralus"},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := findRegionalReplicationStatus(tt.status, tt.location)
if !tt.wantFound {
if got != nil {
t.Fatalf("expected nil, got region %q state %v", *got.Region, *got.State)
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the !tt.wantFound branch, the failure message dereferences *got.Region and *got.State without checking for nil. If findRegionalReplicationStatus ever returns an entry with a nil Region or State (which this test data already includes elsewhere), the test will panic instead of producing a helpful assertion failure. Consider guarding those pointers (or formatting the struct with %#v) in the error message.

Suggested change
t.Fatalf("expected nil, got region %q state %v", *got.Region, *got.State)
t.Fatalf("expected nil, got %#v", got)

Copilot uses AI. Check for mistakes.
}
return
}
if got == nil {
t.Fatalf("expected to find region %q, got nil", tt.wantRegion)
}
if got.Region == nil || *got.Region != tt.wantRegion {
t.Errorf("region mismatch: want %q got %v", tt.wantRegion, got.Region)
}
if got.State == nil || *got.State != tt.wantState {
t.Errorf("state mismatch: want %v got %v", tt.wantState, got.State)
}
})
}
}
43 changes: 31 additions & 12 deletions e2e/vmss.go
Original file line number Diff line number Diff line change
Expand Up @@ -424,24 +424,26 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine

func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) {
delay := 5 * time.Second
retryOn := func(err error) bool {
classify := func(err error) (retry bool, isGalleryImageMissing bool) {
var respErr *azcore.ResponseError
// only retry on Azure API errors with specific error codes
if !errors.As(err, &respErr) {
return false
return false, false
}
// AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps
// It's not a quota issue
// AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps.
// It's not a quota issue.
if respErr.StatusCode == 200 && respErr.ErrorCode == "AllocationFailed" {
return true
return true, false
}
// GalleryImageNotFound can happen transiently after image replication completes
// due to Azure eventual consistency - the gallery API reports success but the
// compute fabric in the target region hasn't fully propagated the image yet
// GalleryImageNotFound: gallery image version exists and the parent provisioningState is Succeeded,
// but the regional replica is not yet serving traffic. ensureReplication SHOULD have caught this before
// we got here; if we're seeing it on VMSS create, it's either fabric-level eventual consistency (very
// short window) or a regression in the wait. Either way, classify it specially so we can fail fast on
// terminal Failed states instead of burning 10 retries.
if respErr.StatusCode == 404 && respErr.ErrorCode == "GalleryImageNotFound" {
return true
return true, true
}
return false
return false, false
}

maxAttempts := 10
Expand All @@ -454,11 +456,28 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error)
return vm, nil
}

// not a retryable error
if !retryOn(err) {
retry, isGalleryImageMissing := classify(err)
if !retry {
return vm, err
}

// For GalleryImageNotFound, defer to the canonical replication-state poller in
// the config package. This either:
// - returns quickly when the regional state is already Completed (fabric-level race;
// fall through to a normal retry), or
// - blocks until the region transitions to Completed (then retry once on success), or
// - fails fast with a clear error if the region is in a terminal Failed state.
if isGalleryImageMissing && s.VHD != nil && s.Runtime != nil && s.Runtime.Cluster != nil && s.Runtime.Cluster.Model != nil && s.Runtime.Cluster.Model.Location != nil {
location := *s.Runtime.Cluster.Model.Location
toolkit.Logf(ctx, "GalleryImageNotFound on VMSS create in %s; consulting regional replication status before retrying", location)
if waitErr := config.Azure.WaitForImageVersionReplicatedToRegion(ctx, s.VHD, location); waitErr != nil {
// Surface the precise infra cause instead of a generic "failed after N retries".
return vm, fmt.Errorf("VMSS create returned GalleryImageNotFound and regional replication is not usable: %w (original error: %v)", waitErr, err)
}
toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create immediately")
continue
}
Comment on lines +470 to +479
Copy link

Copilot AI Apr 22, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On GalleryImageNotFound, the code waits for replication to be Completed and then immediately continues, which skips the normal backoff delay. If the compute fabric is still eventually consistent even after replication is Completed, this can spin in a tight loop (up to maxAttempts) and hammer the VMSS create API without sleeping. Consider limiting this path to a single immediate retry (track a boolean), or apply a minimal delay/backoff before continuing.

Copilot uses AI. Check for mistakes.

if attempt >= maxAttempts {
return vm, fmt.Errorf("failed to create VMSS after %d retries: %w", maxAttempts, err)
}
Expand Down
Loading