experimental/ssh: clarify GPU compute provisioning during ssh connect startup

TanishqDatabricks · TanishqDatabricks · commit e053cdd229fb · 2026-06-12T11:23:53.000Z
GPU_8xH100 serverless capacity takes ~10 minutes at P50 and ~30 minutes at
P90 to acquire, but `ssh connect` gave up after a hard 10-minute startup
timeout with an opaque error:

    Error: failed to ensure that ssh server is running: failed to submit and
    start ssh server job: timed out: waiting for task to start (current
    state: PENDING)

Users read this as a service outage rather than compute still being
provisioned (see the Zillow report in #remote-development-help).

- Raise the startup timeout to 40 minutes when --accelerator is set,
  keeping 10 minutes otherwise.
- Print an upfront notice that GPU provisioning can take upwards of 10
  minutes, and reflect provisioning in the spinner text.
- On startup timeout, append guidance to the error: the run ID, the run
  page URL (or where to find the run in the workspace UI if the URL cannot
  be fetched), that compute is likely still provisioning, and that the run
  was left in place so re-running the command connects once it starts.

Co-authored-by: Isaac
diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md
@@ -6,6 +6,7 @@
 
 ### CLI
 * Show a once-per-day notice after a command when a newer CLI release is available, with a link to the release and the upgrade command for the detected install method. Suppressed for non-interactive/CI runs, JSON output, the Databricks Runtime, and development builds, and can be disabled with `DATABRICKS_CLI_DISABLE_UPDATE_CHECK` ([#5470](https://github.com/databricks/cli/pull/5470)).
+* `ssh connect`: Increase the SSH server startup timeout from 10 to 40 minutes for GPU accelerators, show "Waiting for compute to start" while compute spins up (with a notice for GPU accelerators that provisioning can take upwards of 10 minutes), and explain on timeout that the job run was left in place so re-running the command connects once compute is available.
 
 ### Bundles
 * Remove API enum values and types that are still in development from the `databricks-bundles` Python package; these were never accepted by the backend ([#5484](https://github.com/databricks/cli/pull/5484)).
diff --git a/experimental/ssh/cmd/connect.go b/experimental/ssh/cmd/connect.go
@@ -90,6 +90,10 @@ Connect to a dedicated cluster:
 		if connectionName == "" && clusterID == "" && !proxyMode {
 			connectionName = client.GenerateDefaultConnectionName(wsClient.Config.Host, accelerator)
 		}
+		startupTimeout := taskStartupTimeout
+		if accelerator != "" {
+			startupTimeout = gpuTaskStartupTimeout
+		}
 		opts := client.ClientOptions{
 			Profile:              wsClient.Config.Profile,
 			ClusterID:            clusterID,
@@ -103,7 +107,7 @@ Connect to a dedicated cluster:
 			HandoverTimeout:      handoverTimeout,
 			ReleasesDir:          releasesDir,
 			ServerTimeout:        max(serverTimeout, shutdownDelay),
-			TaskStartupTimeout:   taskStartupTimeout,
+			TaskStartupTimeout:   startupTimeout,
 			AutoStartCluster:     autoStartCluster,
 			ClientPublicKeyName:  clientPublicKeyName,
 			ClientPrivateKeyName: clientPrivateKeyName,
diff --git a/experimental/ssh/cmd/constants.go b/experimental/ssh/cmd/constants.go
@@ -9,12 +9,16 @@ const (
 	defaultHandoverTimeout    = 30 * time.Minute
 	defaultEnvironmentVersion = 4
 
-	serverTimeout        = 24 * time.Hour
-	taskStartupTimeout   = 10 * time.Minute
-	serverPortRange      = 100
-	serverConfigDir      = ".ssh-tunnel"
-	serverPrivateKeyName = "server-private-key"
-	serverPublicKeyName  = "server-public-key"
-	clientPrivateKeyName = "client-private-key"
-	clientPublicKeyName  = "client-public-key"
+	serverTimeout      = 24 * time.Hour
+	taskStartupTimeout = 10 * time.Minute
+	// Serverless GPU capacity is acquired on demand: launch latency for GPU_8xH100 is
+	// ~10 minutes at P50 and ~30 minutes at P90, so GPU sessions need a much longer
+	// startup timeout than the default to avoid giving up on runs that would succeed.
+	gpuTaskStartupTimeout = 40 * time.Minute
+	serverPortRange       = 100
+	serverConfigDir       = ".ssh-tunnel"
+	serverPrivateKeyName  = "server-private-key"
+	serverPublicKeyName   = "server-public-key"
+	clientPrivateKeyName  = "client-private-key"
+	clientPublicKeyName   = "client-public-key"
 )
diff --git a/experimental/ssh/internal/client/client.go b/experimental/ssh/internal/client/client.go
@@ -578,7 +578,7 @@ func submitSSHTunnelJob(ctx context.Context, client *databricks.WorkspaceClient,
 	cmdio.LogString(ctx, fmt.Sprintf("Job submitted successfully with run ID: %d", waiter.RunId))
 
 	// Return the run ID even on error so callers can fetch the run's failure details.
-	return waiter.RunId, waitForJobToStart(ctx, client, waiter.RunId, opts.TaskStartupTimeout)
+	return waiter.RunId, waitForJobToStart(ctx, client, waiter.RunId, opts)
 }
 
 func spawnSSHClient(ctx context.Context, userName, privateKeyPath string, serverPort int, clusterID string, opts ClientOptions) error {
@@ -642,7 +642,7 @@ func checkClusterState(ctx context.Context, client *databricks.WorkspaceClient,
 	sp := cmdio.NewSpinner(ctx, cmdio.WithElapsedTime())
 	defer sp.Close()
 	if autoStart {
-		sp.Update("Ensuring the cluster is running...")
+		sp.Update("Waiting for compute to start...")
 		err := client.Clusters.EnsureClusterIsRunning(ctx, clusterID)
 		if err != nil {
 			return fmt.Errorf("failed to ensure that the cluster is running: %w", err)
@@ -662,13 +662,21 @@ func checkClusterState(ctx context.Context, client *databricks.WorkspaceClient,
 
 // waitForJobToStart polls the task status until the SSH server task is in RUNNING state or terminates.
 // Returns an error if the task fails to start or if polling times out.
-func waitForJobToStart(ctx context.Context, client *databricks.WorkspaceClient, runID int64, taskStartupTimeout time.Duration) error {
+func waitForJobToStart(ctx context.Context, client *databricks.WorkspaceClient, runID int64, opts ClientOptions) error {
+	waitingMessage := "Waiting for compute to start..."
+	if opts.Accelerator != "" {
+		// GPU capacity is acquired on demand and routinely takes 10+ minutes; without
+		// this notice users assume a long PENDING wait means the service is down.
+		cmdio.LogString(ctx, fmt.Sprintf("Waiting for %s compute to be provisioned. This can take upwards of 10 minutes depending on capacity...", opts.Accelerator))
+		waitingMessage = fmt.Sprintf("Waiting for %s compute to be provisioned...", opts.Accelerator)
+	}
+
 	sp := cmdio.NewSpinner(ctx, cmdio.WithElapsedTime())
 	defer sp.Close()
-	sp.Update("Starting SSH server...")
+	sp.Update(waitingMessage)
 	var prevState jobs.RunLifecycleStateV2State
 
-	_, err := retries.Poll(ctx, taskStartupTimeout, func() (*jobs.RunTask, *retries.Err) {
+	_, err := retries.Poll(ctx, opts.TaskStartupTimeout, func() (*jobs.RunTask, *retries.Err) {
 		run, err := client.Jobs.GetRun(ctx, jobs.GetRunRequest{
 			RunId: runID,
 		})
@@ -697,7 +705,7 @@ func waitForJobToStart(ctx context.Context, client *databricks.WorkspaceClient,
 
 		// Update spinner if state changed
 		if currentState != prevState {
-			sp.Update(fmt.Sprintf("Starting SSH server... (task: %s)", currentState))
+			sp.Update(fmt.Sprintf("%s (task: %s)", waitingMessage, currentState))
 			prevState = currentState
 		}
 
@@ -716,9 +724,33 @@ func waitForJobToStart(ctx context.Context, client *databricks.WorkspaceClient,
 		return nil, retries.Continues(fmt.Sprintf("waiting for task to start (current state: %s)", currentState))
 	})
 
+	// A startup timeout almost always means compute is still being provisioned (the task
+	// never left PENDING), not an outage. The run is intentionally not cancelled: if
+	// capacity arrives later the server starts, and re-running the command connects to it.
+	if _, ok := errors.AsType[*retries.ErrTimedOut](err); ok {
+		return fmt.Errorf("%w\n%s", err, describeStartupTimeout(ctx, client, runID, opts))
+	}
 	return err
 }
 
+// describeStartupTimeout builds the multi-line guidance appended to a startup-timeout error:
+// run ID, configured timeout, an accelerator note when one is set, and where to track the run.
+// Best-effort: if GetRun fails or has no run page URL, it points at the workspace UI instead.
+func describeStartupTimeout(ctx context.Context, client *databricks.WorkspaceClient, runID int64, opts ClientOptions) string {
+	var b strings.Builder
+	fmt.Fprintf(&b, "  The SSH server job (run ID: %d) did not start within %s; its compute is most likely still being provisioned.\n", runID, opts.TaskStartupTimeout)
+	if opts.Accelerator != "" {
+		fmt.Fprintf(&b, "  %s capacity can take longer than this to acquire when demand is high.\n", opts.Accelerator)
+	}
+	runLocation := "in the workspace UI (Jobs & Pipelines > Job Runs)" // fallback when the run page URL is unavailable
+	if run, err := client.Jobs.GetRun(ctx, jobs.GetRunRequest{RunId: runID}); err == nil && run.RunPageUrl != "" {
+		runLocation = "at " + run.RunPageUrl
+	}
+	fmt.Fprintf(&b, "  The run was left in place and may still start: track it %s,\n", runLocation)
+	fmt.Fprintf(&b, "  then re-run this command to connect once the run is running, or cancel the run to give up.") // last line: no trailing newline
+	return b.String()
+}
+
 // maxRunFailureTraceBytes bounds how much of a failed run's error trace we print to the
 // terminal; the full output is always available via the run page URL.
 const maxRunFailureTraceBytes = 2000
diff --git a/experimental/ssh/internal/client/client_internal_test.go b/experimental/ssh/internal/client/client_internal_test.go
@@ -110,8 +110,48 @@ func TestWaitForJobToStartSurfacesFailure(t *testing.T) {
 	api.EXPECT().GetRunOutput(mock.Anything, jobs.GetRunOutputRequest{RunId: 99}).Return(
 		&jobs.RunOutput{}, nil)
 
-	err := waitForJobToStart(ctx, m.WorkspaceClient, 1, 30*time.Second)
+	err := waitForJobToStart(ctx, m.WorkspaceClient, 1, ClientOptions{TaskStartupTimeout: 30 * time.Second})
 	require.Error(t, err)
 	assert.Contains(t, err.Error(), "ssh server bootstrap job failed")
 	assert.Contains(t, err.Error(), "Could not reach driver of cluster 0605-x.")
 }
+
+func TestWaitForJobToStartTimeoutExplainsPendingCompute(t *testing.T) {
+	ctx := cmdio.MockDiscard(t.Context())
+	m := mocks.NewMockWorkspaceClient(t)
+	api := m.GetMockJobsAPI()
+	// The run stays PENDING for the whole (tiny) startup timeout; the same response also
+	// serves the post-timeout lookup of the run page URL.
+	api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return(&jobs.Run{
+		RunId:      1,
+		RunPageUrl: "https://example.test/run/1",
+		Tasks: []jobs.RunTask{{
+			TaskKey: sshServerTaskKey,
+			Status:  &jobs.RunStatus{State: jobs.RunLifecycleStateV2StatePending},
+		}},
+	}, nil)
+
+	err := waitForJobToStart(ctx, m.WorkspaceClient, 1, ClientOptions{
+		TaskStartupTimeout: 10 * time.Millisecond,
+		Accelerator:        "GPU_8xH100",
+	})
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "current state: PENDING") // the original poll error text is still present
+	assert.Contains(t, err.Error(), "did not start within 10ms")
+	assert.Contains(t, err.Error(), "still being provisioned")
+	assert.Contains(t, err.Error(), "GPU_8xH100 capacity can take longer")
+	assert.Contains(t, err.Error(), "https://example.test/run/1") // run page URL taken from the mocked GetRun response
+	assert.Contains(t, err.Error(), "re-run this command")
+}
+
+func TestDescribeStartupTimeoutWithoutRunPageURL(t *testing.T) {
+	ctx := cmdio.MockDiscard(t.Context())
+	m := mocks.NewMockWorkspaceClient(t)
+	api := m.GetMockJobsAPI()
+	api.EXPECT().GetRun(mock.Anything, jobs.GetRunRequest{RunId: 1}).Return(nil, assert.AnError) // GetRun fails, so no run page URL is available
+
+	out := describeStartupTimeout(ctx, m.WorkspaceClient, 1, ClientOptions{TaskStartupTimeout: 10 * time.Minute})
+	assert.Contains(t, out, "run ID: 1")
+	assert.Contains(t, out, "did not start within 10m0s")
+	assert.Contains(t, out, "in the workspace UI") // falls back to the generic workspace UI location
+}