From 9cb3411b96841a508ce82dfdfa56f3aefb55018d Mon Sep 17 00:00:00 2001 From: Jan Rose Date: Mon, 8 Jun 2026 17:27:22 +0200 Subject: [PATCH 1/6] acceptance: normalize vector search index schema_json in test server The Vector Search backend canonicalizes the SQL type aliases in a DIRECT_ACCESS index's schema_json on create (e.g. "int" -> "integer") and returns the normalized form on read. The fake server echoed the request verbatim, so the create -> get round-trip didn't match the real API and couldn't reproduce the schema drift this normalization causes. Fold the aliases to their canonical spelling, matching brickindex-common/src/utils/ColumnSpec.scala. HTML escaping is disabled when re-serializing so array<...> keeps its angle brackets. Co-authored-by: Isaac --- libs/testserver/vector_search_indexes.go | 57 +++++++++++++++++++ libs/testserver/vector_search_indexes_test.go | 51 +++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 libs/testserver/vector_search_indexes_test.go diff --git a/libs/testserver/vector_search_indexes.go b/libs/testserver/vector_search_indexes.go index 15105682cf7..626d6194579 100644 --- a/libs/testserver/vector_search_indexes.go +++ b/libs/testserver/vector_search_indexes.go @@ -1,6 +1,7 @@ package testserver import ( + "bytes" "encoding/json" "fmt" "net/http" @@ -70,6 +71,13 @@ func (s *FakeWorkspace) VectorSearchIndexCreate(req Request) Response { indexSubtype = vectorsearch.IndexSubtypeHybrid } + // The backend canonicalizes the column type aliases in schema_json on create + // (e.g. "int" -> "integer") and returns the normalized form on read. Mirror + // that here so the create -> get round-trip matches the real API. + if createReq.DirectAccessIndexSpec != nil { + createReq.DirectAccessIndexSpec.SchemaJson = normalizeSchemaJSON(createReq.DirectAccessIndexSpec.SchemaJson) + } + index := fakeVectorSearchIndex{ VectorIndex: vectorsearch.VectorIndex{ Creator: s.CurrentUser().UserName, @@ -110,6 +118,55 @@ func isValidIndexName(name string) bool { return true } +// normalizeSchemaJSON rewrites the column types in a schema_json document to +// the backend's canonical spelling. Returns the input unchanged when it isn't +// the expected {"column":"type"} JSON object. +func normalizeSchemaJSON(schemaJSON string) string { + if schemaJSON == "" { + return schemaJSON + } + var schema map[string]string + if err := json.Unmarshal([]byte(schemaJSON), &schema); err != nil { + return schemaJSON + } + for column, columnType := range schema { + schema[column] = normalizeColumnType(columnType) + } + // Disable HTML escaping so array<...> keeps its angle brackets verbatim + // rather than being rewritten to < / >. + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + enc.SetEscapeHTML(false) + if err := enc.Encode(schema); err != nil { + return schemaJSON + } + return strings.TrimRight(buf.String(), "\n") +} + +// normalizeColumnType folds the SQL type aliases the Vector Search backend +// accepts to the canonical form it stores and returns, recursing into array +// element types. Mirrors brickindex-common/src/utils/ColumnSpec.scala +// (the columnType field); types not listed there pass through unchanged. +func normalizeColumnType(columnType string) string { + if inner, ok := strings.CutPrefix(columnType, "array<"); ok { + if elem, ok := strings.CutSuffix(inner, ">"); ok { + return "array<" + normalizeColumnType(elem) + ">" + } + } + switch columnType { + case "int": + return "integer" + case "bigint": + return "long" + case "smallint": + return "short" + case "tinyint": + return "byte" + default: + return columnType + } +} + // remapDeltaSyncSpec converts a request spec to a response spec. func remapDeltaSyncSpec(req *vectorsearch.DeltaSyncVectorIndexSpecRequest) *vectorsearch.DeltaSyncVectorIndexSpecResponse { if req == nil { diff --git a/libs/testserver/vector_search_indexes_test.go b/libs/testserver/vector_search_indexes_test.go new file mode 100644 index 00000000000..b3196b2cb89 --- /dev/null +++ b/libs/testserver/vector_search_indexes_test.go @@ -0,0 +1,51 @@ +package testserver + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestNormalizeSchemaJSON(t *testing.T) { + tests := []struct { + name string + in string + want string + }{ + { + name: "alias folded to canonical type", + in: `{"id":"int","vector":"array"}`, + want: `{"id":"integer","vector":"array"}`, + }, + { + name: "all integer-family aliases", + in: `{"a":"bigint","b":"smallint","c":"tinyint"}`, + want: `{"a":"long","b":"short","c":"byte"}`, + }, + { + name: "array element type is normalized", + in: `{"tags":"array"}`, + want: `{"tags":"array"}`, + }, + { + name: "canonical types pass through and keys are sorted", + in: `{"y":"float","x":"string","z":"integer"}`, + want: `{"x":"string","y":"float","z":"integer"}`, + }, + { + name: "empty input", + in: "", + want: "", + }, + { + name: "non-object input is returned unchanged", + in: "not json", + want: "not json", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, normalizeSchemaJSON(tt.in)) + }) + } +} From b63b466e0953370254dbdb225c346e41a3d55c9b Mon Sep 17 00:00:00 2001 From: Jan Rose Date: Mon, 8 Jun 2026 17:30:59 +0200 Subject: [PATCH 2/6] direct: treat vector search schema_json type aliases as equal A DIRECT_ACCESS index's direct_access_index_spec is immutable, so any diff against it plans a destructive recreate. The Vector Search backend canonicalizes the SQL type aliases in schema_json (e.g. "int" -> "integer") and returns the normalized spelling, so a config that uses an alias drifts against the remote schema on every redeploy and silently recreates the index, dropping all upserted vectors. Fold the aliases to their canonical form (matching brickindex-common/src/utils/ColumnSpec.scala) in OverrideChangeDesc and skip the recreate when the two schemas differ only by aliases. A genuine schema change still recreates. Co-authored-by: Isaac --- .../direct/dresources/vector_search_index.go | 85 +++++++++++++++++-- .../dresources/vector_search_index_test.go | 75 ++++++++++++++++ 2 files changed, 155 insertions(+), 5 deletions(-) diff --git a/bundle/direct/dresources/vector_search_index.go b/bundle/direct/dresources/vector_search_index.go index 48ee6f0f968..2af28d8175e 100644 --- a/bundle/direct/dresources/vector_search_index.go +++ b/bundle/direct/dresources/vector_search_index.go @@ -2,8 +2,11 @@ package dresources import ( "context" + "encoding/json" "errors" "fmt" + "maps" + "strings" "time" "github.com/databricks/cli/bundle/config/resources" @@ -192,11 +195,19 @@ func (r *ResourceVectorSearchIndex) WaitAfterDelete(ctx context.Context, id stri return err } -// OverrideChangeDesc classifies endpoint_uuid drift: Recreate when the saved -// UUID differs from what's currently attached to the endpoint name, Skip -// otherwise. endpoint_uuid is never present in config, so without Skip a -// synthetic diff between empty newState and populated saved state would -// otherwise leak into the plan. +// OverrideChangeDesc suppresses two synthetic diffs the built-in classifiers +// can't express; every other field is left untouched. +// +// schema_json: the backend canonicalizes SQL type aliases (e.g. "int" -> +// "integer") and returns the normalized spelling, so an otherwise unchanged +// config looks like a change to the immutable direct_access_index_spec and +// would trigger a destructive recreate. Skip when the two schemas differ only +// by those aliases. Mirrors brickindex-common/src/utils/ColumnSpec.scala. +// +// endpoint_uuid: Recreate when the saved UUID differs from what's currently +// attached to the endpoint name, Skip otherwise. endpoint_uuid is never present +// in config, so without Skip a synthetic diff between empty newState and +// populated saved state would otherwise leak into the plan. // // Unlike vector_search_endpoint, this intentionally does NOT require // remoteUuid != "". An empty remoteUuid here is the orphan signal: the index @@ -205,6 +216,19 @@ func (r *ResourceVectorSearchIndex) WaitAfterDelete(ctx context.Context, id stri // (propagated through DoRead/DoCreate), so reaching this branch with empty // remoteUuid unambiguously means the endpoint is gone. func (*ResourceVectorSearchIndex) OverrideChangeDesc(_ context.Context, path *structpath.PathNode, change *ChangeDesc, remote *VectorSearchIndexRemote) error { + if path.String() == "direct_access_index_spec.schema_json" { + if change.Action == deployplan.Skip { + return nil + } + newSchema, newOk := change.New.(string) + remoteSchema, remoteOk := change.Remote.(string) + if newOk && remoteOk && schemaTypesEqual(newSchema, remoteSchema) { + change.Action = deployplan.Skip + change.Reason = deployplan.ReasonAlias + } + return nil + } + if path.String() != "endpoint_uuid" { return nil } @@ -240,3 +264,54 @@ func (r *ResourceVectorSearchIndex) lookupEndpointUuid(ctx context.Context, endp } return info.Id, nil } + +// schemaTypesEqual reports whether two schema_json documents describe the same +// columns and types once SQL type aliases are folded to their canonical form +// (e.g. "int" == "integer"). Malformed input compares unequal so the caller +// falls back to the default recreate. +func schemaTypesEqual(a, b string) bool { + typesA, err := parseSchemaTypes(a) + if err != nil { + return false + } + typesB, err := parseSchemaTypes(b) + if err != nil { + return false + } + return maps.Equal(typesA, typesB) +} + +func parseSchemaTypes(schemaJSON string) (map[string]string, error) { + var schema map[string]string + if err := json.Unmarshal([]byte(schemaJSON), &schema); err != nil { + return nil, err + } + for column, columnType := range schema { + schema[column] = normalizeColumnType(columnType) + } + return schema, nil +} + +// normalizeColumnType folds the SQL type aliases the Vector Search backend +// accepts to the canonical form it stores and returns, recursing into array +// element types. Mirrors brickindex-common/src/utils/ColumnSpec.scala +// (the columnType field); types not listed there pass through unchanged. +func normalizeColumnType(columnType string) string { + if inner, ok := strings.CutPrefix(columnType, "array<"); ok { + if elem, ok := strings.CutSuffix(inner, ">"); ok { + return "array<" + normalizeColumnType(elem) + ">" + } + } + switch columnType { + case "int": + return "integer" + case "bigint": + return "long" + case "smallint": + return "short" + case "tinyint": + return "byte" + default: + return columnType + } +} diff --git a/bundle/direct/dresources/vector_search_index_test.go b/bundle/direct/dresources/vector_search_index_test.go index e3e2b4fcfdf..463a866b6b8 100644 --- a/bundle/direct/dresources/vector_search_index_test.go +++ b/bundle/direct/dresources/vector_search_index_test.go @@ -5,6 +5,8 @@ import ( "strings" "testing" + "github.com/databricks/cli/bundle/deployplan" + "github.com/databricks/cli/libs/structs/structpath" "github.com/databricks/databricks-sdk-go/service/vectorsearch" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -43,3 +45,76 @@ func TestVectorSearchIndexAllSDKFieldsAreClassified(t *testing.T) { ) } } + +func TestNormalizeColumnType(t *testing.T) { + tests := []struct { + in string + want string + }{ + {"int", "integer"}, + {"integer", "integer"}, + {"bigint", "long"}, + {"long", "long"}, + {"smallint", "short"}, + {"tinyint", "byte"}, + {"float", "float"}, + {"string", "string"}, + {"array", "array"}, + {"array", "array"}, + {"array>", "array>"}, + } + for _, tt := range tests { + t.Run(tt.in, func(t *testing.T) { + assert.Equal(t, tt.want, normalizeColumnType(tt.in)) + }) + } +} + +func TestSchemaTypesEqual(t *testing.T) { + tests := []struct { + name string + a string + b string + want bool + }{ + {"alias equals canonical", `{"id":"int"}`, `{"id":"integer"}`, true}, + {"key order is irrelevant", `{"a":"int","b":"string"}`, `{"b":"string","a":"integer"}`, true}, + {"array alias equals canonical", `{"v":"array"}`, `{"v":"array"}`, true}, + {"different type", `{"id":"int"}`, `{"id":"string"}`, false}, + {"different columns", `{"id":"int"}`, `{"id":"int","x":"string"}`, false}, + {"malformed input", `not json`, `{"id":"int"}`, false}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + assert.Equal(t, tt.want, schemaTypesEqual(tt.a, tt.b)) + }) + } +} + +func TestVectorSearchIndexOverrideChangeDescSchemaJSON(t *testing.T) { + r := &ResourceVectorSearchIndex{} + path, err := structpath.ParsePath("direct_access_index_spec.schema_json") + require.NoError(t, err) + + // Alias-only difference between config and the normalized remote schema: + // the recreate is suppressed. + change := &ChangeDesc{ + Action: deployplan.Recreate, + Reason: "immutable", + New: `{"id":"integer","vector":"array"}`, + Remote: `{"id":"int","vector":"array"}`, + } + require.NoError(t, r.OverrideChangeDesc(t.Context(), path, change, nil)) + assert.Equal(t, deployplan.Skip, change.Action) + assert.Equal(t, deployplan.ReasonAlias, change.Reason) + + // A genuine schema change still recreates. + change = &ChangeDesc{ + Action: deployplan.Recreate, + Reason: "immutable", + New: `{"id":"string"}`, + Remote: `{"id":"int"}`, + } + require.NoError(t, r.OverrideChangeDesc(t.Context(), path, change, nil)) + assert.Equal(t, deployplan.Recreate, change.Action) +} From 49097b3454d3f7892bdf543ac6bc200e87a82e50 Mon Sep 17 00:00:00 2001 From: Jan Rose Date: Mon, 8 Jun 2026 17:35:03 +0200 Subject: [PATCH 3/6] acceptance: add local test for vector search schema_json type aliases Create a DIRECT_ACCESS index whose schema_json uses the SQL type aliases the backend canonicalizes (int, bigint, smallint, tinyint, array), alongside types that pass through unchanged (float, string, array). The test server returns the normalized schema and the redeploy plan reports no changes, exercising the alias round-trip end-to-end without a live workspace. Pinned to local: a live workspace returns a different spelling than the test server's canonical form, so only the hermetic output is stable. Co-authored-by: Isaac --- .../schema_normalization/databricks.yml.tmpl | 26 ++++++++++++++++++ .../schema_normalization/out.test.toml | 5 ++++ .../schema_normalization/output.txt | 27 +++++++++++++++++++ .../schema_normalization/script | 18 +++++++++++++ .../schema_normalization/test.toml | 5 ++++ 5 files changed, 81 insertions(+) create mode 100644 acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl create mode 100644 acceptance/bundle/resources/vector_search_indexes/schema_normalization/out.test.toml create mode 100644 acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt create mode 100644 acceptance/bundle/resources/vector_search_indexes/schema_normalization/script create mode 100644 acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl new file mode 100644 index 00000000000..114d23b08f4 --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl @@ -0,0 +1,26 @@ +bundle: + name: vs-index-schema-$UNIQUE_NAME + +sync: + paths: [] + +resources: + vector_search_endpoints: + my_endpoint: + name: vs-endpoint-$UNIQUE_NAME + endpoint_type: STANDARD + vector_search_indexes: + my_index: + name: main.default.vs_index_$UNIQUE_NAME + endpoint_name: ${resources.vector_search_endpoints.my_endpoint.name} + primary_key: id + index_type: DIRECT_ACCESS + direct_access_index_spec: + # Column types use the SQL aliases the backend canonicalizes on create: + # int->integer, bigint->long, smallint->short, tinyint->byte, and the + # same recursively inside array<...>. A redeploy must not see the + # normalized remote schema as drift on this immutable field. + schema_json: '{"id":"int","count":"bigint","small":"smallint","tiny":"tinyint","tags":"array","score":"float","label":"string","vector":"array"}' + embedding_vector_columns: + - name: vector + embedding_dimension: 768 diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/out.test.toml b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/out.test.toml new file mode 100644 index 00000000000..48203e833cd --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = true +CloudSlow = true +RequiresUnityCatalog = true +EnvMatrix.DATABRICKS_BUNDLE_ENGINE = ["direct"] diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt new file mode 100644 index 00000000000..10826d85ef6 --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt @@ -0,0 +1,27 @@ + +>>> [CLI] bundle deploy +Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/vs-index-schema-[UNIQUE_NAME]/default/files... +Deploying resources... +Updating deployment state... +Deployment complete! + +>>> [CLI] vector-search-indexes get-index main.default.vs_index_[UNIQUE_NAME] +{"count":"long","id":"integer","label":"string","score":"float","small":"short","tags":"array","tiny":"byte","vector":"array"} + +>>> [CLI] bundle plan +Plan: 0 to add, 0 to change, 0 to delete, 2 unchanged + +>>> [CLI] bundle destroy --auto-approve +The following resources will be deleted: + delete resources.vector_search_endpoints.my_endpoint + delete resources.vector_search_indexes.my_index + +This action will result in the deletion of the following Vector Search indexes. +For Delta Sync indexes, the source Delta Table is preserved but the embedding pipeline is removed. +For Direct Access indexes, all upserted vectors are permanently lost: + delete resources.vector_search_indexes.my_index + +All files and directories at the following location will be deleted: /Workspace/Users/[USERNAME]/.bundle/vs-index-schema-[UNIQUE_NAME]/default + +Deleting files... +Destroy complete! diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script new file mode 100644 index 00000000000..9e6ca7c0d05 --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script @@ -0,0 +1,18 @@ +envsubst < databricks.yml.tmpl > databricks.yml + +cleanup() { + trace $CLI bundle destroy --auto-approve + rm -f out.requests.txt +} +trap cleanup EXIT + +trace $CLI bundle deploy + +# The backend (and the test server) canonicalize the column type aliases, so +# get-index returns the normalized schema even though the config used aliases. +index_name="main.default.vs_index_${UNIQUE_NAME}" +trace $CLI vector-search-indexes get-index "${index_name}" | jq -r '.direct_access_index_spec.schema_json' + +# Re-plan must be a no-op: the CLI treats the config aliases and the backend's +# canonical types as equal, so the immutable schema_json does not recreate. +trace $CLI bundle plan diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml new file mode 100644 index 00000000000..5314bca7df6 --- /dev/null +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml @@ -0,0 +1,5 @@ +# The test server returns schema_json column types in the backend's canonical +# form (int -> integer, ...), which differs from the spelling a live workspace +# returns, so the recorded output only matches locally. The point of this test +# is the alias round-trip, which the test server reproduces hermetically. +Cloud = false From f032963b442afb34709b6d2d080cf524dbc4fe16 Mon Sep 17 00:00:00 2001 From: Jan Rose Date: Fri, 12 Jun 2026 12:37:35 +0200 Subject: [PATCH 4/6] direct: ignore remote rewrites of vector search schema_json The previous approach (b63b466e0) had the backend behavior backwards: the user provides user-facing type names ("integer", "long", "short", "byte") in a DIRECT_ACCESS index's schema_json and Unity Catalog stores them as Spark type names ("int", "bigint", "smallint", "tinyint"). GET returns the Spark names - and the columns in sorted key order, which alias folding alone could not account for. Replace the OverrideChangeDesc alias comparison with ignore_remote_changes on direct_access_index_spec.schema_json: remote rewrites of the field are ignored while a genuine local schema edit still recreates via the parent spec's recreate_on_changes rule. Flip the test server normalization to match the real API (user-facing -> Spark names, sorted keys) and update the schema_normalization acceptance test to cover the rewrite round-trip, including deliberately unsorted keys. The bind fixture switches to the Spark spelling: bind seeds state from the remote, so the user-facing spelling would read as a local edit after bind and plan a recreate of the adopted index. Co-authored-by: Isaac --- .../vector_search_index/databricks.yml.tmpl | 6 +- .../bind/vector_search_index/output.txt | 2 +- .../bind/vector_search_index/script | 2 +- .../recreate/with_endpoint/output.txt | 9 +- .../schema_normalization/databricks.yml.tmpl | 12 +-- .../schema_normalization/output.txt | 2 +- .../schema_normalization/script | 8 +- .../schema_normalization/test.toml | 8 +- bundle/direct/dresources/resources.yml | 10 +++ .../direct/dresources/vector_search_index.go | 85 ++----------------- .../dresources/vector_search_index_test.go | 75 ---------------- libs/testserver/vector_search_indexes.go | 40 +++++---- libs/testserver/vector_search_indexes_test.go | 24 +++--- 13 files changed, 80 insertions(+), 203 deletions(-) diff --git a/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl b/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl index 29692b4450c..51e61dc31cb 100644 --- a/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl +++ b/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl @@ -12,7 +12,11 @@ resources: primary_key: id index_type: DIRECT_ACCESS direct_access_index_spec: - schema_json: '{"id":"integer","vector":"array"}' + # Spark type spelling ("int", not "integer"): the backend stores and + # returns Spark type names, and bind seeds state from the remote, so + # any other spelling would read as a local schema edit after bind and + # plan a recreate of the adopted index. + schema_json: '{"id":"int","vector":"array"}' embedding_vector_columns: - name: vector embedding_dimension: 768 diff --git a/acceptance/bundle/deployment/bind/vector_search_index/output.txt b/acceptance/bundle/deployment/bind/vector_search_index/output.txt index f1b79186906..a91168a5137 100644 --- a/acceptance/bundle/deployment/bind/vector_search_index/output.txt +++ b/acceptance/bundle/deployment/bind/vector_search_index/output.txt @@ -5,7 +5,7 @@ "endpoint_type": "STANDARD" } ->>> [CLI] vector-search-indexes create-index --json {"name":"main.default.test_vs_index_[UNIQUE_NAME]","endpoint_name":"test-vs-endpoint-[UNIQUE_NAME]","primary_key":"id","index_type":"DIRECT_ACCESS","direct_access_index_spec":{"schema_json":"{\"id\":\"integer\",\"vector\":\"array\"}","embedding_vector_columns":[{"name":"vector","embedding_dimension":768}]}} +>>> [CLI] vector-search-indexes create-index --json {"name":"main.default.test_vs_index_[UNIQUE_NAME]","endpoint_name":"test-vs-endpoint-[UNIQUE_NAME]","primary_key":"id","index_type":"DIRECT_ACCESS","direct_access_index_spec":{"schema_json":"{\"id\":\"int\",\"vector\":\"array\"}","embedding_vector_columns":[{"name":"vector","embedding_dimension":768}]}} { "name": "main.default.test_vs_index_[UNIQUE_NAME]", "endpoint_name": "test-vs-endpoint-[UNIQUE_NAME]", diff --git a/acceptance/bundle/deployment/bind/vector_search_index/script b/acceptance/bundle/deployment/bind/vector_search_index/script index 3d07efacf93..16f36a496fd 100644 --- a/acceptance/bundle/deployment/bind/vector_search_index/script +++ b/acceptance/bundle/deployment/bind/vector_search_index/script @@ -11,7 +11,7 @@ trap cleanup EXIT trace $CLI vector-search-endpoints create-endpoint "${ENDPOINT_NAME}" STANDARD | jq '{name, endpoint_type}' -trace $CLI vector-search-indexes create-index --json "{\"name\":\"${INDEX_NAME}\",\"endpoint_name\":\"${ENDPOINT_NAME}\",\"primary_key\":\"id\",\"index_type\":\"DIRECT_ACCESS\",\"direct_access_index_spec\":{\"schema_json\":\"{\\\"id\\\":\\\"integer\\\",\\\"vector\\\":\\\"array\\\"}\",\"embedding_vector_columns\":[{\"name\":\"vector\",\"embedding_dimension\":768}]}}" | jq '{name, endpoint_name, index_type, primary_key}' +trace $CLI vector-search-indexes create-index --json "{\"name\":\"${INDEX_NAME}\",\"endpoint_name\":\"${ENDPOINT_NAME}\",\"primary_key\":\"id\",\"index_type\":\"DIRECT_ACCESS\",\"direct_access_index_spec\":{\"schema_json\":\"{\\\"id\\\":\\\"int\\\",\\\"vector\\\":\\\"array\\\"}\",\"embedding_vector_columns\":[{\"name\":\"vector\",\"embedding_dimension\":768}]}}" | jq '{name, endpoint_name, index_type, primary_key}' trace $CLI bundle deployment bind index1 "${INDEX_NAME}" --auto-approve diff --git a/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt b/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt index 20c139225ed..deb3c925c09 100644 --- a/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt +++ b/acceptance/bundle/resources/vector_search_indexes/recreate/with_endpoint/output.txt @@ -69,7 +69,7 @@ Plan: 1 to add, 0 to change, 1 to delete, 1 unchanged "name": "vector" } ], - "schema_json": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}" + "schema_json": "{\"id\":\"int\",\"vector\":\"array\u003cfloat\u003e\"}" }, "endpoint_name": "vs-endpoint-[UNIQUE_NAME]", "endpoint_uuid": "[UUID]", @@ -82,6 +82,13 @@ Plan: 1 to add, 0 to change, 1 to delete, 1 unchanged } }, "changes": { + "direct_access_index_spec.schema_json": { + "action": "skip", + "reason": "normalized_by_backend", + "old": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}", + "new": "{\"id\":\"integer\",\"vector\":\"array\u003cfloat\u003e\"}", + "remote": "{\"id\":\"int\",\"vector\":\"array\u003cfloat\u003e\"}" + }, "endpoint_uuid": { "action": "skip", "reason": "state-only field", diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl index 114d23b08f4..6c5df545b74 100644 --- a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl @@ -16,11 +16,13 @@ resources: primary_key: id index_type: DIRECT_ACCESS direct_access_index_spec: - # Column types use the SQL aliases the backend canonicalizes on create: - # int->integer, bigint->long, smallint->short, tinyint->byte, and the - # same recursively inside array<...>. A redeploy must not see the - # normalized remote schema as drift on this immutable field. - schema_json: '{"id":"int","count":"bigint","small":"smallint","tiny":"tinyint","tags":"array","score":"float","label":"string","vector":"array"}' + # The backend stores these user-facing type names as Spark type names + # (integer->int, long->bigint, short->smallint, byte->tinyint, and the + # same recursively inside array<...>) and returns the columns in + # sorted key order, so GET never echoes this literal string. The keys + # here are deliberately not sorted. A redeploy must not see the + # rewritten remote schema as drift on this immutable field. + schema_json: '{"id":"integer","count":"long","small":"short","tiny":"byte","tags":"array","score":"float","label":"string","vector":"array"}' embedding_vector_columns: - name: vector embedding_dimension: 768 diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt index 10826d85ef6..6e46bc6c8bf 100644 --- a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/output.txt @@ -6,7 +6,7 @@ Updating deployment state... Deployment complete! >>> [CLI] vector-search-indexes get-index main.default.vs_index_[UNIQUE_NAME] -{"count":"long","id":"integer","label":"string","score":"float","small":"short","tags":"array","tiny":"byte","vector":"array"} +{"count":"bigint","id":"int","label":"string","score":"float","small":"smallint","tags":"array","tiny":"tinyint","vector":"array"} >>> [CLI] bundle plan Plan: 0 to add, 0 to change, 0 to delete, 2 unchanged diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script index 9e6ca7c0d05..d7a56a8ff22 100644 --- a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/script @@ -8,11 +8,11 @@ trap cleanup EXIT trace $CLI bundle deploy -# The backend (and the test server) canonicalize the column type aliases, so -# get-index returns the normalized schema even though the config used aliases. +# The backend (and the test server) rewrite the schema on create, so +# get-index returns Spark type names and sorted keys, not the config literal. index_name="main.default.vs_index_${UNIQUE_NAME}" trace $CLI vector-search-indexes get-index "${index_name}" | jq -r '.direct_access_index_spec.schema_json' -# Re-plan must be a no-op: the CLI treats the config aliases and the backend's -# canonical types as equal, so the immutable schema_json does not recreate. +# Re-plan must be a no-op: remote changes to schema_json are ignored +# (ignore_remote_changes), so the immutable spec does not plan a recreate. trace $CLI bundle plan diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml index 5314bca7df6..1c7cadc4bca 100644 --- a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml @@ -1,5 +1,5 @@ -# The test server returns schema_json column types in the backend's canonical -# form (int -> integer, ...), which differs from the spelling a live workspace -# returns, so the recorded output only matches locally. The point of this test -# is the alias round-trip, which the test server reproduces hermetically. +# The test pins the exact schema_json string get-index returns. The test +# server's rewrite (user-facing -> Spark type names, sorted keys) matches the +# real backend's behavior, but the byte-for-byte JSON serialization of a live +# workspace isn't guaranteed to match, so the recorded output is local-only. Cloud = false diff --git a/bundle/direct/dresources/resources.yml b/bundle/direct/dresources/resources.yml index 75a60e0dc62..cb32df38800 100644 --- a/bundle/direct/dresources/resources.yml +++ b/bundle/direct/dresources/resources.yml @@ -608,6 +608,16 @@ resources: reason: immutable - field: direct_access_index_spec reason: immutable + ignore_remote_changes: + # The backend rewrites schema_json on create: user-facing type names + # ("integer", "long", "short", "byte") are stored in Unity Catalog as + # Spark type names ("int", "bigint", "smallint", "tinyint") and the + # columns come back in sorted key order, so GET never echoes the user's + # literal input. Without this rule the rewrite reads as a change to the + # immutable direct_access_index_spec and plans a destructive recreate + # that drops all upserted vectors. + - field: direct_access_index_spec.schema_json + reason: normalized_by_backend backend_defaults: # The Vector Search API assigns index_subtype when the config omits it - field: index_subtype diff --git a/bundle/direct/dresources/vector_search_index.go b/bundle/direct/dresources/vector_search_index.go index 2af28d8175e..48ee6f0f968 100644 --- a/bundle/direct/dresources/vector_search_index.go +++ b/bundle/direct/dresources/vector_search_index.go @@ -2,11 +2,8 @@ package dresources import ( "context" - "encoding/json" "errors" "fmt" - "maps" - "strings" "time" "github.com/databricks/cli/bundle/config/resources" @@ -195,19 +192,11 @@ func (r *ResourceVectorSearchIndex) WaitAfterDelete(ctx context.Context, id stri return err } -// OverrideChangeDesc suppresses two synthetic diffs the built-in classifiers -// can't express; every other field is left untouched. -// -// schema_json: the backend canonicalizes SQL type aliases (e.g. "int" -> -// "integer") and returns the normalized spelling, so an otherwise unchanged -// config looks like a change to the immutable direct_access_index_spec and -// would trigger a destructive recreate. Skip when the two schemas differ only -// by those aliases. Mirrors brickindex-common/src/utils/ColumnSpec.scala. -// -// endpoint_uuid: Recreate when the saved UUID differs from what's currently -// attached to the endpoint name, Skip otherwise. endpoint_uuid is never present -// in config, so without Skip a synthetic diff between empty newState and -// populated saved state would otherwise leak into the plan. +// OverrideChangeDesc classifies endpoint_uuid drift: Recreate when the saved +// UUID differs from what's currently attached to the endpoint name, Skip +// otherwise. endpoint_uuid is never present in config, so without Skip a +// synthetic diff between empty newState and populated saved state would +// otherwise leak into the plan. // // Unlike vector_search_endpoint, this intentionally does NOT require // remoteUuid != "". An empty remoteUuid here is the orphan signal: the index @@ -216,19 +205,6 @@ func (r *ResourceVectorSearchIndex) WaitAfterDelete(ctx context.Context, id stri // (propagated through DoRead/DoCreate), so reaching this branch with empty // remoteUuid unambiguously means the endpoint is gone. func (*ResourceVectorSearchIndex) OverrideChangeDesc(_ context.Context, path *structpath.PathNode, change *ChangeDesc, remote *VectorSearchIndexRemote) error { - if path.String() == "direct_access_index_spec.schema_json" { - if change.Action == deployplan.Skip { - return nil - } - newSchema, newOk := change.New.(string) - remoteSchema, remoteOk := change.Remote.(string) - if newOk && remoteOk && schemaTypesEqual(newSchema, remoteSchema) { - change.Action = deployplan.Skip - change.Reason = deployplan.ReasonAlias - } - return nil - } - if path.String() != "endpoint_uuid" { return nil } @@ -264,54 +240,3 @@ func (r *ResourceVectorSearchIndex) lookupEndpointUuid(ctx context.Context, endp } return info.Id, nil } - -// schemaTypesEqual reports whether two schema_json documents describe the same -// columns and types once SQL type aliases are folded to their canonical form -// (e.g. "int" == "integer"). Malformed input compares unequal so the caller -// falls back to the default recreate. -func schemaTypesEqual(a, b string) bool { - typesA, err := parseSchemaTypes(a) - if err != nil { - return false - } - typesB, err := parseSchemaTypes(b) - if err != nil { - return false - } - return maps.Equal(typesA, typesB) -} - -func parseSchemaTypes(schemaJSON string) (map[string]string, error) { - var schema map[string]string - if err := json.Unmarshal([]byte(schemaJSON), &schema); err != nil { - return nil, err - } - for column, columnType := range schema { - schema[column] = normalizeColumnType(columnType) - } - return schema, nil -} - -// normalizeColumnType folds the SQL type aliases the Vector Search backend -// accepts to the canonical form it stores and returns, recursing into array -// element types. Mirrors brickindex-common/src/utils/ColumnSpec.scala -// (the columnType field); types not listed there pass through unchanged. -func normalizeColumnType(columnType string) string { - if inner, ok := strings.CutPrefix(columnType, "array<"); ok { - if elem, ok := strings.CutSuffix(inner, ">"); ok { - return "array<" + normalizeColumnType(elem) + ">" - } - } - switch columnType { - case "int": - return "integer" - case "bigint": - return "long" - case "smallint": - return "short" - case "tinyint": - return "byte" - default: - return columnType - } -} diff --git a/bundle/direct/dresources/vector_search_index_test.go b/bundle/direct/dresources/vector_search_index_test.go index 463a866b6b8..e3e2b4fcfdf 100644 --- a/bundle/direct/dresources/vector_search_index_test.go +++ b/bundle/direct/dresources/vector_search_index_test.go @@ -5,8 +5,6 @@ import ( "strings" "testing" - "github.com/databricks/cli/bundle/deployplan" - "github.com/databricks/cli/libs/structs/structpath" "github.com/databricks/databricks-sdk-go/service/vectorsearch" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -45,76 +43,3 @@ func TestVectorSearchIndexAllSDKFieldsAreClassified(t *testing.T) { ) } } - -func TestNormalizeColumnType(t *testing.T) { - tests := []struct { - in string - want string - }{ - {"int", "integer"}, - {"integer", "integer"}, - {"bigint", "long"}, - {"long", "long"}, - {"smallint", "short"}, - {"tinyint", "byte"}, - {"float", "float"}, - {"string", "string"}, - {"array", "array"}, - {"array", "array"}, - {"array>", "array>"}, - } - for _, tt := range tests { - t.Run(tt.in, func(t *testing.T) { - assert.Equal(t, tt.want, normalizeColumnType(tt.in)) - }) - } -} - -func TestSchemaTypesEqual(t *testing.T) { - tests := []struct { - name string - a string - b string - want bool - }{ - {"alias equals canonical", `{"id":"int"}`, `{"id":"integer"}`, true}, - {"key order is irrelevant", `{"a":"int","b":"string"}`, `{"b":"string","a":"integer"}`, true}, - {"array alias equals canonical", `{"v":"array"}`, `{"v":"array"}`, true}, - {"different type", `{"id":"int"}`, `{"id":"string"}`, false}, - {"different columns", `{"id":"int"}`, `{"id":"int","x":"string"}`, false}, - {"malformed input", `not json`, `{"id":"int"}`, false}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - assert.Equal(t, tt.want, schemaTypesEqual(tt.a, tt.b)) - }) - } -} - -func TestVectorSearchIndexOverrideChangeDescSchemaJSON(t *testing.T) { - r := &ResourceVectorSearchIndex{} - path, err := structpath.ParsePath("direct_access_index_spec.schema_json") - require.NoError(t, err) - - // Alias-only difference between config and the normalized remote schema: - // the recreate is suppressed. - change := &ChangeDesc{ - Action: deployplan.Recreate, - Reason: "immutable", - New: `{"id":"integer","vector":"array"}`, - Remote: `{"id":"int","vector":"array"}`, - } - require.NoError(t, r.OverrideChangeDesc(t.Context(), path, change, nil)) - assert.Equal(t, deployplan.Skip, change.Action) - assert.Equal(t, deployplan.ReasonAlias, change.Reason) - - // A genuine schema change still recreates. - change = &ChangeDesc{ - Action: deployplan.Recreate, - Reason: "immutable", - New: `{"id":"string"}`, - Remote: `{"id":"int"}`, - } - require.NoError(t, r.OverrideChangeDesc(t.Context(), path, change, nil)) - assert.Equal(t, deployplan.Recreate, change.Action) -} diff --git a/libs/testserver/vector_search_indexes.go b/libs/testserver/vector_search_indexes.go index 626d6194579..abf90080200 100644 --- a/libs/testserver/vector_search_indexes.go +++ b/libs/testserver/vector_search_indexes.go @@ -71,9 +71,10 @@ func (s *FakeWorkspace) VectorSearchIndexCreate(req Request) Response { indexSubtype = vectorsearch.IndexSubtypeHybrid } - // The backend canonicalizes the column type aliases in schema_json on create - // (e.g. "int" -> "integer") and returns the normalized form on read. Mirror - // that here so the create -> get round-trip matches the real API. + // The backend rewrites schema_json on create: user-facing type names are + // stored as Spark type names (e.g. "integer" -> "int") and the columns are + // returned in sorted key order rather than the user's original order. + // Mirror that here so the create -> get round-trip matches the real API. if createReq.DirectAccessIndexSpec != nil { createReq.DirectAccessIndexSpec.SchemaJson = normalizeSchemaJSON(createReq.DirectAccessIndexSpec.SchemaJson) } @@ -118,9 +119,11 @@ func isValidIndexName(name string) bool { return true } -// normalizeSchemaJSON rewrites the column types in a schema_json document to -// the backend's canonical spelling. Returns the input unchanged when it isn't -// the expected {"column":"type"} JSON object. +// normalizeSchemaJSON rewrites a schema_json document the way the backend +// stores it: user-facing column type names are folded to Spark type names and +// the columns are re-serialized in sorted key order (encoding/json sorts map +// keys, matching the backend). Returns the input unchanged when it isn't the +// expected {"column":"type"} JSON object. func normalizeSchemaJSON(schemaJSON string) string { if schemaJSON == "" { return schemaJSON @@ -143,10 +146,11 @@ func normalizeSchemaJSON(schemaJSON string) string { return strings.TrimRight(buf.String(), "\n") } -// normalizeColumnType folds the SQL type aliases the Vector Search backend -// accepts to the canonical form it stores and returns, recursing into array -// element types. Mirrors brickindex-common/src/utils/ColumnSpec.scala -// (the columnType field); types not listed there pass through unchanged. +// normalizeColumnType maps the user-facing column type names the Vector +// Search API accepts ("integer", "long", "short", "byte") to the Spark type +// names Unity Catalog stores and GET returns, recursing into array element +// types. Types whose user-facing and Spark spellings coincide ("float", +// "string", ...) pass through unchanged. func normalizeColumnType(columnType string) string { if inner, ok := strings.CutPrefix(columnType, "array<"); ok { if elem, ok := strings.CutSuffix(inner, ">"); ok { @@ -154,14 +158,14 @@ func normalizeColumnType(columnType string) string { } } switch columnType { - case "int": - return "integer" - case "bigint": - return "long" - case "smallint": - return "short" - case "tinyint": - return "byte" + case "integer": + return "int" + case "long": + return "bigint" + case "short": + return "smallint" + case "byte": + return "tinyint" default: return columnType } diff --git a/libs/testserver/vector_search_indexes_test.go b/libs/testserver/vector_search_indexes_test.go index b3196b2cb89..6df73895fde 100644 --- a/libs/testserver/vector_search_indexes_test.go +++ b/libs/testserver/vector_search_indexes_test.go @@ -13,24 +13,24 @@ func TestNormalizeSchemaJSON(t *testing.T) { want string }{ { - name: "alias folded to canonical type", - in: `{"id":"int","vector":"array"}`, - want: `{"id":"integer","vector":"array"}`, + name: "user-facing type stored as Spark type name", + in: `{"id":"integer","vector":"array"}`, + want: `{"id":"int","vector":"array"}`, }, { - name: "all integer-family aliases", - in: `{"a":"bigint","b":"smallint","c":"tinyint"}`, - want: `{"a":"long","b":"short","c":"byte"}`, + name: "all integer-family names", + in: `{"a":"long","b":"short","c":"byte"}`, + want: `{"a":"bigint","b":"smallint","c":"tinyint"}`, }, { - name: "array element type is normalized", - in: `{"tags":"array"}`, - want: `{"tags":"array"}`, + name: "array element type is mapped", + in: `{"tags":"array"}`, + want: `{"tags":"array"}`, }, { - name: "canonical types pass through and keys are sorted", - in: `{"y":"float","x":"string","z":"integer"}`, - want: `{"x":"string","y":"float","z":"integer"}`, + name: "matching spellings pass through and keys are sorted", + in: `{"y":"float","x":"string","z":"int"}`, + want: `{"x":"string","y":"float","z":"int"}`, }, { name: "empty input", From 797889e7fa9219a5c6779a11b6fc86d02d499e6d Mon Sep 17 00:00:00 2001 From: Jan Rose Date: Fri, 12 Jun 2026 13:33:42 +0200 Subject: [PATCH 5/6] Changelog --- NEXT_CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 90b65ec3e86..acdf23473ea 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -9,6 +9,7 @@ ### Bundles * Set the default `data_security_mode` to `DATA_SECURITY_MODE_AUTO` in bundle templates ([#5452](https://github.com/databricks/cli/pull/5452)). * Mark vector search index index_subtype as backend_default to prevent drift after deployment ([#5454](https://github.com/databricks/cli/pull/5454)). +* Ignore remote changes for vector search direct_access_index_spec.schema_json to prevent drift when the backend normalizes the schema ([#5481](https://github.com/databricks/cli/pull/5481)). ### Dependency updates From 08b792ba628da4eec5e5ca05d122bfe4cb7051b4 Mon Sep 17 00:00:00 2001 From: Jan Rose Date: Fri, 12 Jun 2026 13:52:05 +0200 Subject: [PATCH 6/6] Remove comments from acceptance tests --- .../deployment/bind/vector_search_index/databricks.yml.tmpl | 4 ---- .../schema_normalization/databricks.yml.tmpl | 6 ------ .../vector_search_indexes/schema_normalization/test.toml | 4 ---- 3 files changed, 14 deletions(-) diff --git a/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl b/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl index 51e61dc31cb..e057c575ed8 100644 --- a/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl +++ b/acceptance/bundle/deployment/bind/vector_search_index/databricks.yml.tmpl @@ -12,10 +12,6 @@ resources: primary_key: id index_type: DIRECT_ACCESS direct_access_index_spec: - # Spark type spelling ("int", not "integer"): the backend stores and - # returns Spark type names, and bind seeds state from the remote, so - # any other spelling would read as a local schema edit after bind and - # plan a recreate of the adopted index. schema_json: '{"id":"int","vector":"array"}' embedding_vector_columns: - name: vector diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl index 6c5df545b74..d4725692991 100644 --- a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/databricks.yml.tmpl @@ -16,12 +16,6 @@ resources: primary_key: id index_type: DIRECT_ACCESS direct_access_index_spec: - # The backend stores these user-facing type names as Spark type names - # (integer->int, long->bigint, short->smallint, byte->tinyint, and the - # same recursively inside array<...>) and returns the columns in - # sorted key order, so GET never echoes this literal string. The keys - # here are deliberately not sorted. A redeploy must not see the - # rewritten remote schema as drift on this immutable field. schema_json: '{"id":"integer","count":"long","small":"short","tiny":"byte","tags":"array","score":"float","label":"string","vector":"array"}' embedding_vector_columns: - name: vector diff --git a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml index 1c7cadc4bca..18b1a88417e 100644 --- a/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml +++ b/acceptance/bundle/resources/vector_search_indexes/schema_normalization/test.toml @@ -1,5 +1 @@ -# The test pins the exact schema_json string get-index returns. The test -# server's rewrite (user-facing -> Spark type names, sorted keys) matches the -# real backend's behavior, but the byte-for-byte JSON serialization of a live -# workspace isn't guaranteed to match, so the recorded output is local-only. Cloud = false