-
Notifications
You must be signed in to change notification settings - Fork 2.1k
fix(profiler): N+1 / missing-index regression on /tables/.../columns?fields=profile (#3488) #27746
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,19 @@ | ||||||||||||||||||||||||||||||||||||||
-- Restore a composite index on profiler_data_time_series(entityFQNHash, extension, timestamp).
--
-- Root cause: the 1.9.9 postDataMigrationSQLScript.sql created several temporary indexes for
-- the data migration and then dropped ALL of them at the end, including the only index that
-- covered entityFQNHash. After that migration the table retains only the unique constraint
-- (entityFQNHash, extension, operation, timestamp) where `operation` sits between `extension`
-- and `timestamp`. Queries of the form
--
--   SELECT entityFQNHash, MAX(timestamp) FROM profiler_data_time_series
--   WHERE entityFQNHash IN (...) AND extension = 'table.columnProfile'
--   GROUP BY entityFQNHash
--
-- cannot use that index efficiently for MAX(timestamp) because `operation` (nullable) breaks
-- the prefix. On a large table (millions of profile rows) this causes a full table scan and
-- 100+ second response times on the columns API when `fields=profile` is requested.
--
-- The new index covers the exact predicate pattern used by getLatestExtensionsBatch().
--
-- NOTE: MySQL (unlike MariaDB and PostgreSQL) does not support `CREATE INDEX IF NOT EXISTS`
-- in any version, so idempotency is emulated: probe information_schema.statistics for the
-- index and only issue the CREATE INDEX through a prepared statement when it is absent.
-- This keeps the migration safe to re-run.
SET @sql = (
    SELECT IF(
        EXISTS(
            SELECT 1
            FROM information_schema.statistics
            WHERE table_schema = DATABASE()
              AND table_name = 'profiler_data_time_series'
              AND index_name = 'idx_pdts_fqnhash_ext_ts'
        ),
        'SELECT 1',
        'CREATE INDEX idx_pdts_fqnhash_ext_ts ON profiler_data_time_series (entityFQNHash, extension, timestamp)'
    )
);
PREPARE statement FROM @sql;
EXECUTE statement;
DEALLOCATE PREPARE statement;
|
Comment on lines +18 to +19
||||||||||||||||||||||||||||||||||||||
| CREATE INDEX IF NOT EXISTS idx_pdts_fqnhash_ext_ts | |
| ON profiler_data_time_series (entityFQNHash, extension, timestamp); | |
| SET @sql = ( | |
| SELECT IF( | |
| EXISTS( | |
| SELECT 1 | |
| FROM information_schema.statistics | |
| WHERE table_schema = DATABASE() | |
| AND table_name = 'profiler_data_time_series' | |
| AND index_name = 'idx_pdts_fqnhash_ext_ts' | |
| ), | |
| 'SELECT 1', | |
| 'CREATE INDEX idx_pdts_fqnhash_ext_ts ON profiler_data_time_series (entityFQNHash, extension, timestamp)' | |
| ) | |
| ); | |
| PREPARE statement FROM @sql; | |
| EXECUTE statement; | |
| DEALLOCATE PREPARE statement; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
-- Recreate the composite index profiler_data_time_series(entityFQNHash, extension, timestamp)
-- that was lost during the 1.9.9 migrations.
--
-- How the index disappeared:
--   * schemaChanges.sql (1.9.9) dropped the unique constraint
--     profiler_data_time_series_unique_hash_extension_ts
--     (entityFQNHash, extension, operation, timestamp) in order to redefine the
--     `operation` generated-column expression, and never recreated it.
--   * postDataMigrationSQLScript.sql (1.9.9) built temporary helper indexes
--     (idx_pdts_entityFQNHash, idx_pdts_composite, ...) for its bulk UPDATE pass
--     and dropped every one of them afterwards.
--
-- Net result: the only remaining index is
--   profiler_data_time_series_combined_id_ts (extension, timestamp)
-- whose leading column is not entityFQNHash, so it cannot serve queries such as
--
--   SELECT entityFQNHash, MAX(timestamp) FROM profiler_data_time_series
--   WHERE entityFQNHash IN (...) AND extension = 'table.columnProfile'
--   GROUP BY entityFQNHash
--
-- issued by getLatestExtensionsBatch(). Without this index those queries degrade to a
-- full table scan, producing 100+ second responses on the columns API when
-- `fields=profile` is requested.
--
-- PostgreSQL supports CREATE INDEX IF NOT EXISTS in all supported versions, so the
-- statement below is idempotent as written.
CREATE INDEX IF NOT EXISTS idx_pdts_fqnhash_ext_ts
ON profiler_data_time_series (entityFQNHash, extension, timestamp);
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5900,4 +5900,138 @@ | |
| assertFalse(table.getTags().isEmpty(), "Table tags should not be empty"); | ||
| } | ||
| } | ||
|
|
||
| // =================================================================== | ||
| // REGRESSION TEST - columns API with fields=profile (collate#3488) | ||
| // =================================================================== | ||
|
|
||
| @Test | ||
| @Execution(ExecutionMode.SAME_THREAD) | ||
| void test_getColumnsWithProfileField_correctnessAndNoBatchRegression(TestNamespace ns) { | ||
| OpenMetadataClient client = SdkClients.adminClient(); | ||
|
|
||
| DatabaseService service = DatabaseServiceTestFactory.createPostgres(ns); | ||
| DatabaseSchema schema = DatabaseSchemaTestFactory.createSimple(ns, service); | ||
|
|
||
| CreateClassification createClassification = | ||
| new CreateClassification() | ||
| .withName(ns.prefix("profile_test_cls")) | ||
| .withDescription("Classification for profile regression test"); | ||
| Classification cls = client.classifications().create(createClassification); | ||
|
|
||
| CreateTag createTag = | ||
| new CreateTag() | ||
| .withName(ns.prefix("profile_test_tag")) | ||
| .withDescription("Tag for profile regression test") | ||
| .withClassification(cls.getName()); | ||
| Tag tag = client.tags().create(createTag); | ||
|
|
||
| TagLabel tagLabel = | ||
| new TagLabel() | ||
| .withTagFQN(tag.getFullyQualifiedName()) | ||
| .withSource(TagLabel.TagSource.CLASSIFICATION); | ||
|
|
||
| Column idCol = ColumnBuilder.of("id", "BIGINT").primaryKey().notNull().build(); | ||
| idCol.setTags(List.of(tagLabel)); | ||
| Column emailCol = ColumnBuilder.of("email", "VARCHAR").dataLength(255).build(); | ||
| emailCol.setTags(List.of(tagLabel)); | ||
| Column nameCol = ColumnBuilder.of("name", "VARCHAR").dataLength(255).build(); | ||
|
|
||
| CreateTable createRequest = createRequest(ns.prefix("profile_regression_table"), ns); | ||
| createRequest.setDatabaseSchema(schema.getFullyQualifiedName()); | ||
| createRequest.setColumns(List.of(idCol, emailCol, nameCol)); | ||
| Table table = client.tables().create(createRequest); | ||
|
Check failure on line 5943 in openmetadata-integration-tests/src/test/java/org/openmetadata/it/tests/TableResourceIT.java
|
||
|
|
||
| Long timestamp = System.currentTimeMillis(); | ||
| ColumnProfile idProfile = | ||
| new ColumnProfile() | ||
| .withName("id") | ||
| .withMin(1.0) | ||
| .withMax(999.0) | ||
| .withUniqueCount(100.0) | ||
| .withTimestamp(timestamp); | ||
| ColumnProfile emailProfile = | ||
| new ColumnProfile() | ||
| .withName("email") | ||
| .withNullCount(5.0) | ||
| .withNullProportion(0.05) | ||
| .withTimestamp(timestamp); | ||
|
|
||
| TableProfile tableProfile = | ||
| new TableProfile().withRowCount(100.0).withColumnCount(3.0).withTimestamp(timestamp); | ||
|
|
||
| CreateTableProfile createProfile = | ||
| new CreateTableProfile() | ||
| .withTableProfile(tableProfile) | ||
| .withColumnProfile(List.of(idProfile, emailProfile)); | ||
| client.tables().updateTableProfile(table.getId(), createProfile); | ||
|
|
||
| // Verify all four field combinations don't regress: | ||
| // (a) fields=profile — profile data returned, no full-table-scan on profiler_data_time_series | ||
| TableColumnList withProfile = | ||
| assertTimeout( | ||
| Duration.ofSeconds(30), | ||
| () -> client.tables().getColumns(table.getId(), "profile"), | ||
| "columns?fields=profile should complete within 30s"); | ||
|
Comment on lines +5969 to +5975
|
||
|
|
||
| assertEquals(3, withProfile.getData().size()); | ||
| Column returnedId = | ||
| withProfile.getData().stream() | ||
| .filter(c -> "id".equals(c.getName())) | ||
| .findFirst() | ||
| .orElse(null); | ||
| Column returnedName = | ||
| withProfile.getData().stream() | ||
| .filter(c -> "name".equals(c.getName())) | ||
| .findFirst() | ||
| .orElse(null); | ||
| assertNotNull(returnedId, "id column should be present"); | ||
| assertNotNull(returnedId.getProfile(), "id column should have profile data"); | ||
| assertEquals(1.0, returnedId.getProfile().getMin(), "id column min should match"); | ||
| assertEquals(999.0, returnedId.getProfile().getMax(), "id column max should match"); | ||
| assertNotNull(returnedName, "name column should be present"); | ||
| assertNull(returnedName.getProfile(), "name column has no profile, should be null"); | ||
|
|
||
| // (b) fields=tags,customMetrics,extension,profile — the exact production query | ||
| TableColumnList withAllFields = | ||
| assertTimeout( | ||
| Duration.ofSeconds(30), | ||
| () -> client.tables().getColumns(table.getId(), "tags,customMetrics,extension,profile"), | ||
| "columns?fields=tags,customMetrics,extension,profile should complete within 30s"); | ||
|
|
||
| assertEquals(3, withAllFields.getData().size()); | ||
|
|
||
| Column idResult = | ||
| withAllFields.getData().stream() | ||
| .filter(c -> "id".equals(c.getName())) | ||
| .findFirst() | ||
| .orElse(null); | ||
| assertNotNull(idResult, "id column must be present"); | ||
| assertNotNull(idResult.getProfile(), "id column must have profile"); | ||
| assertNotNull(idResult.getTags(), "id column must have tags"); | ||
| assertFalse(idResult.getTags().isEmpty(), "id column tags must not be empty"); | ||
| assertTrue( | ||
| idResult.getTags().stream() | ||
| .anyMatch(t -> tag.getFullyQualifiedName().equals(t.getTagFQN())), | ||
| "id column should carry the test tag"); | ||
|
|
||
| // (c) fields=tags,profile — duplicate populateEntityFieldTags must not run twice | ||
| TableColumnList withTagsAndProfile = | ||
| assertTimeout( | ||
| Duration.ofSeconds(30), | ||
| () -> client.tables().getColumns(table.getId(), "tags,profile"), | ||
| "columns?fields=tags,profile should complete within 30s"); | ||
|
Comment on lines +6018 to +6023
|
||
|
|
||
| assertEquals(3, withTagsAndProfile.getData().size()); | ||
| Column idTagsProfile = | ||
| withTagsAndProfile.getData().stream() | ||
| .filter(c -> "id".equals(c.getName())) | ||
| .findFirst() | ||
| .orElse(null); | ||
| assertNotNull(idTagsProfile); | ||
| assertNotNull(idTagsProfile.getTags()); | ||
| assertFalse( | ||
| idTagsProfile.getTags().isEmpty(), "Tags must be present even when profile requested"); | ||
| assertNotNull(idTagsProfile.getProfile(), "Profile must be present when profile requested"); | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -2891,8 +2891,21 @@ private ResultList<Column> getTableColumnsInternal( | |||||||
| } | ||||||||
|
|
||||||||
| if (fieldsParam != null && fieldsParam.contains("customMetrics")) { | ||||||||
| List<ExtensionRecord> allColumnMetricRecords = | ||||||||
| daoCollection | ||||||||
| .entityExtensionDAO() | ||||||||
| .getExtensions(table.getId(), CUSTOM_METRICS_EXTENSION + TABLE_COLUMN_EXTENSION); | ||||||||
| Map<String, List<CustomMetric>> metricsByColumn = new HashMap<>(); | ||||||||
| for (ExtensionRecord record : allColumnMetricRecords) { | ||||||||
| CustomMetric metric = JsonUtils.readValue(record.extensionJson(), CustomMetric.class); | ||||||||
| if (metric != null && metric.getColumnName() != null) { | ||||||||
| metricsByColumn | ||||||||
| .computeIfAbsent(metric.getColumnName(), k -> new ArrayList<>()) | ||||||||
| .add(metric); | ||||||||
| } | ||||||||
| } | ||||||||
| for (Column column : paginatedColumns) { | ||||||||
| column.setCustomMetrics(getCustomMetrics(table, column.getName())); | ||||||||
| column.setCustomMetrics(metricsByColumn.getOrDefault(column.getName(), new ArrayList<>())); | ||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Quality: Unnecessary ArrayList allocation per column without metrics
Suggested fix: Was this helpful? React with 👍 / 👎 | Reply
|
||||||||
| column.setCustomMetrics(metricsByColumn.getOrDefault(column.getName(), new ArrayList<>())); | |
| column.setCustomMetrics( | |
| metricsByColumn.getOrDefault(column.getName(), Collections.emptyList())); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🚨 Bug: MySQL migration uses unsupported `IF NOT EXISTS` for `CREATE INDEX`

The MySQL migration uses `CREATE INDEX IF NOT EXISTS`, but MySQL does not support the `IF NOT EXISTS` clause on `CREATE INDEX` in any released version — it is a MariaDB extension. No other MySQL migration in the project uses this pattern: all 148 other `CREATE INDEX` statements in `mysql/*.sql` files use plain `CREATE INDEX` without `IF NOT EXISTS`. On MySQL this statement fails with a syntax error, causing the migration to abort and blocking the upgrade. To make the statement idempotent, check `information_schema.statistics` for the index and run the `CREATE INDEX` via a prepared statement only when it is absent.

PostgreSQL supports `CREATE INDEX IF NOT EXISTS` in all supported versions, so the Postgres file is fine.

Suggested fix:
Was this helpful? React with 👍 / 👎 | Reply
gitar fixto apply this suggestion