From 9aa5d43d08dfae03f8c2d38a9bf7b365db5e4970 Mon Sep 17 00:00:00 2001 From: lufreita Date: Thu, 2 Jul 2026 13:46:59 -0300 Subject: [PATCH] ROSAENG-61038 | fix: improve ready-state cluster waiting Use describe-cluster state and provisioning details when waiting for clusters to become ready so manual HCP flows fail with actionable errors instead of low-signal timeouts. Add focused waiter coverage, including the ready-state polling path used by the failing test scenario. Signed-off-by: lufreita --- tests/utils/exec/rosacli/cluster_service.go | 147 ++++++++- .../exec/rosacli/cluster_service_test.go | 281 ++++++++++++++++++ .../utils/exec/rosacli/rosacli_suite_test.go | 13 + 3 files changed, 438 insertions(+), 3 deletions(-) create mode 100644 tests/utils/exec/rosacli/cluster_service_test.go create mode 100644 tests/utils/exec/rosacli/rosacli_suite_test.go diff --git a/tests/utils/exec/rosacli/cluster_service.go b/tests/utils/exec/rosacli/cluster_service.go index bf3ae31eef..c1f61ed825 100644 --- a/tests/utils/exec/rosacli/cluster_service.go +++ b/tests/utils/exec/rosacli/cluster_service.go @@ -3,6 +3,7 @@ package rosacli import ( "bytes" "context" + "errors" "fmt" "strconv" "strings" @@ -373,12 +374,12 @@ func (c *clusterService) GetClusterName(clusterID string) (clusterName string, e func (c *clusterService) GetJSONClusterDescription(clusterID string) (*jsonData, error) { c.client.Runner.JsonFormat() + defer c.client.Runner.UnsetFormat() output, err := c.DescribeCluster(clusterID) if err != nil { log.Logger.Errorf("it met error when describeCluster in IsUsingReusableOIDCConfig is %v", err) return nil, err } - c.client.Runner.UnsetFormat() return c.client.Parser.JsonData.Input(output).Parse(), nil } @@ -453,10 +454,34 @@ func (c *clusterService) ResumeCluster(clusterID string, flags ...string) (bytes // Wait cluster to some status, the inerval and duration are using minute func (c *clusterService) WaitClusterStatus(clusterID string, status string, interval int, duration int) error { + if strings.TrimSpace(clusterID) == "" { + return fmt.Errorf("cluster ID is required when waiting for cluster status %s", status) + } + + waitInterval := time.Duration(interval) * time.Minute + if waitInterval <= 0 { + waitInterval = time.Millisecond + } + waitTimeout := time.Duration(duration) * time.Minute + if waitTimeout <= 0 { + waitTimeout = 200 * time.Millisecond + } + + if status == constants.Ready { + return waitForClusterReadyStatus( + clusterID, + waitInterval, + waitTimeout, + func() (*ClusterDescription, error) { + return c.DescribeClusterAndReflect(clusterID) + }, + ) + } + err := wait.PollUntilContextTimeout( context.Background(), - time.Duration(interval)*time.Minute, - time.Duration(duration)*time.Minute, + waitInterval, + waitTimeout, false, func(context.Context) (bool, error) { clusterListB, err := c.List() @@ -479,6 +504,122 @@ func (c *clusterService) WaitClusterStatus(clusterID string, status string, inte return err } +func waitForClusterReadyStatus( + clusterID string, + interval time.Duration, + timeout time.Duration, + getDescription func() (*ClusterDescription, error), +) error { + var lastDescription *ClusterDescription + err := wait.PollUntilContextTimeout( + context.Background(), + interval, + timeout, + true, + func(context.Context) (bool, error) { + description, err := getDescription() + if err != nil { + if isClusterNotFoundErr(clusterID, err) { + return false, fmt.Errorf( + "cluster %s not found while waiting for it to become ready: %w", + clusterID, err, + ) + } + return false, err + } + lastDescription = description + + return evaluateReadyClusterState(clusterID, description) + }, + ) + if err == nil { + return nil + } + + if errors.Is(err, context.DeadlineExceeded) { + return formatReadyClusterTimeout(clusterID, timeout, lastDescription) + } + + return err +} + +func isClusterNotFoundErr(clusterID string, err error) bool { + if err == nil { + return false + } + + message := err.Error() + return strings.Contains(message, + fmt.Sprintf("There is no cluster with identifier or name '%s'", clusterID)) || + strings.Contains(message, fmt.Sprintf("Cluster '%s' not found", clusterID)) +} + +func provisioningDetails(description *ClusterDescription) []string { + if description == nil { + return nil + } + + details := []string{} + if description.ProvisioningErrorCode != "" { + details = append(details, fmt.Sprintf("provisioning error code: %s", description.ProvisioningErrorCode)) + } + if description.ProvisioningErrorMessage != "" { + details = append(details, fmt.Sprintf("provisioning error message: %s", description.ProvisioningErrorMessage)) + } + if description.FailedInflightChecks != "" { + details = append(details, fmt.Sprintf("failed inflight checks: %s", description.FailedInflightChecks)) + } + + return details +} + +func formatReadyClusterError(clusterID string, description *ClusterDescription) error { + details := provisioningDetails(description) + if len(details) == 0 { + return fmt.Errorf("cluster %s is in %s state", clusterID, description.State) + } + + return fmt.Errorf("cluster %s is in %s state (%s)", + clusterID, description.State, strings.Join(details, "; ")) +} + +func formatReadyClusterTimeout(clusterID string, timeout time.Duration, description *ClusterDescription) error { + timeoutMinutes := int(timeout / time.Minute) + if description == nil { + return fmt.Errorf("timeout for cluster ready waiting after %d mins", timeoutMinutes) + } + + details := append([]string{fmt.Sprintf("last state: %s", description.State)}, provisioningDetails(description)...) + + return fmt.Errorf("timeout for cluster ready waiting after %d mins (%s)", + timeoutMinutes, strings.Join(details, "; ")) +} + +func evaluateReadyClusterState(clusterID string, description *ClusterDescription) (bool, error) { + if description == nil { + return false, fmt.Errorf("no cluster description found for %s", clusterID) + } + + if description.State == constants.Ready { + return true, nil + } + + if strings.Contains(description.State, constants.Uninstalling) { + return false, fmt.Errorf("cluster %s is %s now. Cannot wait for it ready", + clusterID, constants.Uninstalling) + } + + if strings.Contains(description.State, constants.Error) { + return false, formatReadyClusterError(clusterID, description) + } + + if strings.TrimSpace(description.State) == "" { + return false, fmt.Errorf("cluster %s returned an empty state", clusterID) + } + + return false, nil +} + // Wait for cluster deleted func (c *clusterService) WaitClusterDeleted(clusterID string, interval int, duration int) error { err := wait.PollUntilContextTimeout( diff --git a/tests/utils/exec/rosacli/cluster_service_test.go b/tests/utils/exec/rosacli/cluster_service_test.go new file mode 100644 index 0000000000..38cc7491ed --- /dev/null +++ b/tests/utils/exec/rosacli/cluster_service_test.go @@ -0,0 +1,281 @@ +package rosacli + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/openshift/rosa/tests/utils/constants" +) + +var _ = Describe("Cluster wait helpers", func() { + const clusterID = "1234abcd" + + It("fails fast when waiting without a cluster ID", func() { + service := &clusterService{} + + err := service.WaitClusterStatus("", constants.Ready, 1, 1) + + Expect(err).To(MatchError(ContainSubstring("cluster ID is required"))) + }) + + It("waits for ready status using describe output", func() { + tempDir, err := os.MkdirTemp("", "fake-rosa-*") + Expect(err).ToNot(HaveOccurred()) + defer os.RemoveAll(tempDir) + + textCountFile := filepath.Join(tempDir, "describe-count") + jsonCountFile := filepath.Join(tempDir, "json-describe-count") + scriptPath := filepath.Join(tempDir, "rosa") + script := fmt.Sprintf(`#!/usr/bin/env bash +set -euo pipefail +text_count_file=%q +json_count_file=%q + +if [[ "$1" == "describe" && "$2" == "cluster" ]]; then + if [[ "$*" == *"--output json"* ]]; then + count=0 + if [[ -f "$json_count_file" ]]; then + count=$(<"$json_count_file") + fi + count=$((count + 1)) + printf '%%s' "$count" > "$json_count_file" + printf '{"state":"waiting"}\n' + exit 0 + fi + + count=0 + if [[ -f "$text_count_file" ]]; then + count=$(<"$text_count_file") + fi + count=$((count + 1)) + printf '%%s' "$count" > "$text_count_file" + + if [[ "$count" -eq 1 ]]; then + cat <<'EOF' +Name: fake +ID: 1234abcd +State: resuming +EOF + else + cat <<'EOF' +Name: fake +ID: 1234abcd +State: ready +EOF + fi + exit 0 +fi + +echo "unexpected args: $*" >&2 +exit 1 +`, textCountFile, jsonCountFile) + err = os.WriteFile(scriptPath, []byte(script), 0755) + Expect(err).ToNot(HaveOccurred()) + + originalPath := os.Getenv("PATH") + err = os.Setenv("PATH", tempDir+":"+originalPath) + Expect(err).ToNot(HaveOccurred()) + defer os.Setenv("PATH", originalPath) + + client := NewClient() + client.Runner.envs = prependPathEnv(tempDir) + service := NewClusterService(client) + + err = service.WaitClusterStatus(clusterID, constants.Ready, 0, 0) + + Expect(err).ToNot(HaveOccurred()) + + jsonCalls := "0" + if content, readErr := os.ReadFile(jsonCountFile); readErr == nil { + jsonCalls = string(content) + } + Expect(jsonCalls).To(Equal("0")) + }) + + It("keeps waiting while the cluster is still in waiting state", func() { + ready, err := evaluateReadyClusterState(clusterID, &ClusterDescription{ + State: constants.Waiting, + }) + + Expect(err).ToNot(HaveOccurred()) + Expect(ready).To(BeFalse()) + }) + + It("returns an error when the cluster description is missing", func() { + ready, err := evaluateReadyClusterState(clusterID, nil) + + Expect(err).To(MatchError(ContainSubstring("no cluster description found"))) + Expect(ready).To(BeFalse()) + }) + + It("returns an error when the cluster state is empty", func() { + ready, err := evaluateReadyClusterState(clusterID, &ClusterDescription{ + State: " ", + }) + + Expect(err).To(MatchError(ContainSubstring("returned an empty state"))) + Expect(ready).To(BeFalse()) + }) + + It("returns a detailed error when the cluster enters an error state", func() { + ready, err := evaluateReadyClusterState(clusterID, &ClusterDescription{ + State: constants.Error, + ProvisioningErrorCode: "AccessDenied", + ProvisioningErrorMessage: "Missing operator role permissions", + FailedInflightChecks: "operator role validation failed", + }) + + Expect(err).To(MatchError(ContainSubstring("cluster 1234abcd is in error state"))) + Expect(err).To(MatchError(ContainSubstring("AccessDenied"))) + Expect(err).To(MatchError(ContainSubstring("Missing operator role permissions"))) + Expect(err).To(MatchError(ContainSubstring("operator role validation failed"))) + Expect(ready).To(BeFalse()) + }) + + It("returns a basic error when the cluster enters an error state without provisioning details", func() { + ready, err := evaluateReadyClusterState(clusterID, &ClusterDescription{ + State: constants.Error, + }) + + Expect(err).To(MatchError(ContainSubstring("cluster 1234abcd is in error state"))) + Expect(ready).To(BeFalse()) + }) + + It("returns an uninstalling error while waiting for ready", func() { + ready, err := evaluateReadyClusterState(clusterID, &ClusterDescription{ + State: constants.Uninstalling, + }) + + Expect(err).To(MatchError(ContainSubstring("Cannot wait for it ready"))) + Expect(ready).To(BeFalse()) + }) + + It("keeps waiting while the cluster is resuming", func() { + ready, err := evaluateReadyClusterState(clusterID, &ClusterDescription{ + State: "resuming", + }) + + Expect(err).ToNot(HaveOccurred()) + Expect(ready).To(BeFalse()) + }) + + It("waits through transient states until the cluster becomes ready", func() { + states := []string{constants.Waiting, constants.Ready} + stateIndex := 0 + + err := waitForClusterReadyStatus( + clusterID, + time.Nanosecond, + time.Millisecond, + func() (*ClusterDescription, error) { + current := states[stateIndex] + if stateIndex < len(states)-1 { + stateIndex++ + } + return &ClusterDescription{State: current}, nil + }, + ) + + Expect(err).ToNot(HaveOccurred()) + }) + + It("checks the first ready state immediately before waiting", func() { + err := waitForClusterReadyStatus( + clusterID, + time.Hour, + time.Millisecond, + func() (*ClusterDescription, error) { + return &ClusterDescription{State: constants.Ready}, nil + }, + ) + + Expect(err).ToNot(HaveOccurred()) + }) + + It("returns an explicit not-found error while waiting for ready", func() { + err := waitForClusterReadyStatus( + clusterID, + time.Nanosecond, + time.Millisecond, + func() (*ClusterDescription, error) { + return nil, fmt.Errorf("Cluster '%s' not found", clusterID) + }, + ) + + Expect(err).To(MatchError(ContainSubstring("cluster 1234abcd not found while waiting for it to become ready"))) + }) + + It("recognizes the alternate not-found error wording", func() { + err := waitForClusterReadyStatus( + clusterID, + time.Nanosecond, + time.Millisecond, + func() (*ClusterDescription, error) { + return nil, fmt.Errorf("There is no cluster with identifier or name '%s'", clusterID) + }, + ) + + Expect(err).To(MatchError(ContainSubstring("cluster 1234abcd not found while waiting for it to become ready"))) + }) + + It("includes the last known state in timeout errors", func() { + err := formatReadyClusterTimeout(clusterID, 60*time.Minute, &ClusterDescription{ + State: constants.Installing, + }) + + Expect(err).To(MatchError(ContainSubstring("timeout for cluster ready waiting after 60 mins"))) + Expect(err).To(MatchError(ContainSubstring("last state: installing"))) + }) + + It("handles timeout errors without a last description", func() { + err := formatReadyClusterTimeout(clusterID, time.Minute, nil) + + Expect(err).To(MatchError(ContainSubstring("timeout for cluster ready waiting after 1 mins"))) + }) + + It("resets runner format after JSON describe errors", func() { + tempDir, err := os.MkdirTemp("", "fake-rosa-error-*") + Expect(err).ToNot(HaveOccurred()) + defer os.RemoveAll(tempDir) + + scriptPath := filepath.Join(tempDir, "rosa") + script := `#!/usr/bin/env bash +echo "boom" >&2 +exit 1 +` + err = os.WriteFile(scriptPath, []byte(script), 0755) + Expect(err).ToNot(HaveOccurred()) + + originalPath := os.Getenv("PATH") + err = os.Setenv("PATH", tempDir+":"+originalPath) + Expect(err).ToNot(HaveOccurred()) + defer os.Setenv("PATH", originalPath) + + client := NewClient() + client.Runner.envs = prependPathEnv(tempDir) + service := NewClusterService(client) + + _, err = service.GetJSONClusterDescription(clusterID) + + Expect(err).To(HaveOccurred()) + Expect(client.Runner.runnerCfg.format).To(Equal(defaultRunnerFormat)) + }) +}) + +func prependPathEnv(prefix string) []string { + envs := os.Environ() + for index, env := range envs { + if strings.HasPrefix(env, "PATH=") { + envs[index] = "PATH=" + prefix + ":" + strings.TrimPrefix(env, "PATH=") + return envs + } + } + + return append(envs, "PATH="+prefix) +} diff --git a/tests/utils/exec/rosacli/rosacli_suite_test.go b/tests/utils/exec/rosacli/rosacli_suite_test.go new file mode 100644 index 0000000000..a47583d2a5 --- /dev/null +++ b/tests/utils/exec/rosacli/rosacli_suite_test.go @@ -0,0 +1,13 @@ +package rosacli + +import ( + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +func TestRosacli(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Rosacli Suite") +}