diff --git a/e2e/config/azure.go b/e2e/config/azure.go index d0de6f04619..124f2d1b32b 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -564,8 +564,13 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi } if replicatedToCurrentRegion(version, location) { - toolkit.Logf(ctx, "Image version %s is already in region %s", *version.ID, location) - return nil + // Intent-to-replicate is registered in PublishingProfile.TargetRegions, but that + // does NOT mean the regional replica is actually serving traffic. The regional + // replication state can still be "Replicating" while the parent ProvisioningState + // is "Succeeded" — this is the source of GalleryImageNotFound 404s on VMSS create. + // Wait for the regional state itself before declaring success. + toolkit.Logf(ctx, "Image version %s is already a target of region %s; verifying regional replication state", *version.ID, location) + return a.waitForRegionalReplicationCompleted(ctx, image, version, location) } regions := make([]string, 0, len(version.Properties.PublishingProfile.TargetRegions)) for _, targetRegion := range version.Properties.PublishingProfile.TargetRegions { @@ -575,12 +580,114 @@ func (a *AzureClient) ensureReplication(ctx context.Context, image *Image, versi toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Replicating to region %s", location) start := time.Now() // Record the start time - err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location) + if err := a.replicateImageVersionToCurrentRegion(ctx, image, version, location); err != nil { + return err + } + + // The replicate LRO above completes when the parent resource is Succeeded, but the + // regional replica may still be in "Replicating" state for several more minutes. + // Block until that regional state hits Completed; otherwise downstream VMSS create + // will see GalleryImageNotFound and the test will appear to fail for a non-test reason. + if err := a.waitForRegionalReplicationCompleted(ctx, image, version, location); err != nil { + return err + } elapsed := time.Since(start) // Calculate the elapsed time toolkit.LogDuration(ctx, elapsed, 3*time.Minute, fmt.Sprintf("Replication took: %s (%s)", elapsed, *version.ID)) - return err + return nil +} + +// waitForRegionalReplicationCompleted polls the gallery image version with the +// ReplicationStatus expand option until the named region's RegionalReplicationStatus.State +// is Completed. Returns an error if the regional replication enters a terminal Failed state +// or the wait times out. Treats Unknown as transient (Azure occasionally emits Unknown +// briefly during state transitions). +// +// This closes a documented gap in the SIG API: the parent provisioning LRO can succeed +// before per-region replicas are actually serving traffic. Without this wait, callers see +// GalleryImageNotFound 404s on VMSS creation that look like bugs but are infra eventual +// consistency. +func (a *AzureClient) waitForRegionalReplicationCompleted(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion, location string) error { + imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) + if err != nil { + return fmt.Errorf("create image version client for replication wait: %w", err) + } + + getOpts := &armcompute.GalleryImageVersionsClientGetOptions{ + Expand: to.Ptr(armcompute.ReplicationStatusTypesReplicationStatus), + } + + var lastLoggedState armcompute.ReplicationState + const ( + pollInterval = 15 * time.Second + pollTimeout = 20 * time.Minute + ) + + pollErr := wait.PollUntilContextTimeout(ctx, pollInterval, pollTimeout, true, func(ctx context.Context) (bool, error) { + resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, *version.Name, getOpts) + if err != nil { + // Transient API errors should not abort the wait; the SDK already retries throttling. + toolkit.Logf(ctx, "transient error getting image version replication status (will retry): %v", err) + return false, nil + } + + regional := findRegionalReplicationStatus(resp.GalleryImageVersion.Properties.ReplicationStatus, location) + if regional == nil || regional.State == nil { + // Region not yet present in the status summary; keep waiting. + return false, nil + } + + state := *regional.State + if state != lastLoggedState { + progress := int32(0) + if regional.Progress != nil { + progress = *regional.Progress + } + toolkit.Logf(ctx, "Image version %s regional replication state in %s: %s (progress: %d%%)", *version.ID, location, state, progress) + lastLoggedState = state + } + + switch state { + case armcompute.ReplicationStateCompleted: + return true, nil + case armcompute.ReplicationStateFailed: + details := "" + if regional.Details != nil { + details = *regional.Details + } + return false, fmt.Errorf("regional replication of %s to %s failed: %s", *version.ID, location, details) + case armcompute.ReplicationStateReplicating, armcompute.ReplicationStateUnknown: + return false, nil + default: + // Forward-compat: unknown future states treated as in-progress. + return false, nil + } + }) + + if pollErr != nil { + return fmt.Errorf("waiting for regional replication of %s to %s to complete: %w", *version.ID, location, pollErr) + } + toolkit.Logf(ctx, "Image version %s regional replication to %s confirmed Completed", *version.ID, location) + return nil +} + +// findRegionalReplicationStatus finds the RegionalReplicationStatus for the given location +// (case-insensitive, ignoring spaces) within the version-wide ReplicationStatus. +func findRegionalReplicationStatus(status *armcompute.ReplicationStatus, location string) *armcompute.RegionalReplicationStatus { + if status == nil { + return nil + } + normalized := strings.ToLower(strings.ReplaceAll(location, " ", "")) + for _, regional := range status.Summary { + if regional == nil || regional.Region == nil { + continue + } + if strings.ToLower(strings.ReplaceAll(*regional.Region, " ", "")) == normalized { + return regional + } + } + return nil } func (a *AzureClient) waitForVersionOperationCompletion(ctx context.Context, image *Image, version *armcompute.GalleryImageVersion) error { @@ -688,6 +795,31 @@ func (a *AzureClient) EnsureSIGImageVersion(ctx context.Context, image *Image, l return VHDResourceID(*resp.ID), nil } +// WaitForImageVersionReplicatedToRegion is the public entrypoint for callers (e.g. the +// VMSS create retry loop) that need to confirm a previously-resolved image is actually +// serving traffic in a region before retrying a request that just failed with +// GalleryImageNotFound. It fetches the live image version with ReplicationStatus +// expanded and blocks until the regional state is Completed (or fails fast on terminal +// Failed). It does not initiate replication; callers should have already gone through +// EnsureSIGImageVersion / LatestSIGImageVersionByTag. +func (a *AzureClient) WaitForImageVersionReplicatedToRegion(ctx context.Context, image *Image, location string) error { + if image == nil { + return fmt.Errorf("nil image") + } + if image.Version == "" { + return fmt.Errorf("image %s has no resolved version; cannot check replication state", image.Name) + } + imgVersionClient, err := armcompute.NewGalleryImageVersionsClient(image.Gallery.SubscriptionID, a.Credential, a.ArmOptions) + if err != nil { + return fmt.Errorf("create image version client: %w", err) + } + resp, err := imgVersionClient.Get(ctx, image.Gallery.ResourceGroupName, image.Gallery.Name, image.Name, image.Version, nil) + if err != nil { + return fmt.Errorf("get image version %s/%s for replication-wait: %w", image.Name, image.Version, err) + } + return a.waitForRegionalReplicationCompleted(ctx, image, &resp.GalleryImageVersion, location) +} + func DefaultRetryOpts() policy.RetryOptions { return policy.RetryOptions{ // Use generous retry settings to survive Azure Compute Gallery throttling. diff --git a/e2e/config/azure_replication_test.go b/e2e/config/azure_replication_test.go new file mode 100644 index 00000000000..618d448c469 --- /dev/null +++ b/e2e/config/azure_replication_test.go @@ -0,0 +1,64 @@ +package config + +import ( + "testing" + + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" +) + +// TestFindRegionalReplicationStatus exercises the location-matching logic used to +// extract the per-region replication state from a SIG image version's ReplicationStatus +// summary. Region names from ARM may be in either the "WestUS 2" form or the "westus2" +// form, and may include arbitrary casing; the lookup must normalize both sides. +func TestFindRegionalReplicationStatus(t *testing.T) { + completed := armcompute.ReplicationStateCompleted + replicating := armcompute.ReplicationStateReplicating + + status := &armcompute.ReplicationStatus{ + Summary: []*armcompute.RegionalReplicationStatus{ + {Region: to.Ptr("East US"), State: &completed, Progress: to.Ptr(int32(100))}, + {Region: to.Ptr("West US 2"), State: &replicating, Progress: to.Ptr(int32(40))}, + {Region: to.Ptr("uaenorth"), State: &completed, Progress: to.Ptr(int32(100))}, + nil, // tolerate nil entries + {Region: nil, State: &completed}, + }, + } + + tests := []struct { + name string + status *armcompute.ReplicationStatus + location string + wantFound bool + wantState armcompute.ReplicationState + wantRegion string + }{ + {name: "nil status", status: nil, location: "westus2"}, + {name: "exact lowercase normalized match (with space in summary)", status: status, location: "westus2", wantFound: true, wantState: replicating, wantRegion: "West US 2"}, + {name: "uppercase input match (with space in summary)", status: status, location: "EASTUS", wantFound: true, wantState: completed, wantRegion: "East US"}, + {name: "input with embedded spaces", status: status, location: "east us", wantFound: true, wantState: completed, wantRegion: "East US"}, + {name: "summary already normalized", status: status, location: "uaenorth", wantFound: true, wantState: completed, wantRegion: "uaenorth"}, + {name: "missing region returns nil", status: status, location: "centralus"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := findRegionalReplicationStatus(tt.status, tt.location) + if !tt.wantFound { + if got != nil { + t.Fatalf("expected nil, got region %q state %v", *got.Region, *got.State) + } + return + } + if got == nil { + t.Fatalf("expected to find region %q, got nil", tt.wantRegion) + } + if got.Region == nil || *got.Region != tt.wantRegion { + t.Errorf("region mismatch: want %q got %v", tt.wantRegion, got.Region) + } + if got.State == nil || *got.State != tt.wantState { + t.Errorf("state mismatch: want %v got %v", tt.wantState, got.State) + } + }) + } +} diff --git a/e2e/vmss.go b/e2e/vmss.go index 2b5714fa1a1..9cd2d6793fd 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -424,24 +424,26 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) { delay := 5 * time.Second - retryOn := func(err error) bool { + classify := func(err error) (retry bool, isGalleryImageMissing bool) { var respErr *azcore.ResponseError // only retry on Azure API errors with specific error codes if !errors.As(err, &respErr) { - return false + return false, false } - // AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps - // It's not a quota issue + // AllocationFailed sometimes happens for exotic SKUs (new GPUs) with limited availability, sometimes retrying helps. + // It's not a quota issue. if respErr.StatusCode == 200 && respErr.ErrorCode == "AllocationFailed" { - return true + return true, false } - // GalleryImageNotFound can happen transiently after image replication completes - // due to Azure eventual consistency - the gallery API reports success but the - // compute fabric in the target region hasn't fully propagated the image yet + // GalleryImageNotFound: gallery image version exists and the parent provisioningState is Succeeded, + // but the regional replica is not yet serving traffic. ensureReplication SHOULD have caught this before + // we got here; if we're seeing it on VMSS create, it's either fabric-level eventual consistency (very + // short window) or a regression in the wait. Either way, classify it specially so we can fail fast on + // terminal Failed states instead of burning 10 retries. if respErr.StatusCode == 404 && respErr.ErrorCode == "GalleryImageNotFound" { - return true + return true, true } - return false + return false, false } maxAttempts := 10 @@ -454,11 +456,28 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) return vm, nil } - // not a retryable error - if !retryOn(err) { + retry, isGalleryImageMissing := classify(err) + if !retry { return vm, err } + // For GalleryImageNotFound, defer to the canonical replication-state poller in + // the config package. This either: + // - returns quickly when the regional state is already Completed (fabric-level race; + // fall through to a normal retry), or + // - blocks until the region transitions to Completed (then retry once on success), or + // - fails fast with a clear error if the region is in a terminal Failed state. + if isGalleryImageMissing && s.VHD != nil && s.Runtime != nil && s.Runtime.Cluster != nil && s.Runtime.Cluster.Model != nil && s.Runtime.Cluster.Model.Location != nil { + location := *s.Runtime.Cluster.Model.Location + toolkit.Logf(ctx, "GalleryImageNotFound on VMSS create in %s; consulting regional replication status before retrying", location) + if waitErr := config.Azure.WaitForImageVersionReplicatedToRegion(ctx, s.VHD, location); waitErr != nil { + // Surface the precise infra cause instead of a generic "failed after N retries". + return vm, fmt.Errorf("VMSS create returned GalleryImageNotFound and regional replication is not usable: %w (original error: %v)", waitErr, err) + } + toolkit.Logf(ctx, "Regional replication confirmed Completed; retrying VMSS create immediately") + continue + } + if attempt >= maxAttempts { return vm, fmt.Errorf("failed to create VMSS after %d retries: %w", maxAttempts, err) }