From fd6ea9b84ffbb8a03bc6cbc1400aef38ba35f610 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Mon, 6 Apr 2026 20:37:31 -0300 Subject: [PATCH 01/18] feat: add native Elasticsearch vector search support - Add ElasticSearchVectorService mirroring OpenSearchVectorService using Rest5Client - Add vector_search_index_es_native.json with dense_vector/dims/cosine mappings for en/jp/ru/zh locales - Add VectorSearchQueryBuilder.buildNativeESQuery() for ES 8.x/9.x top-level knn query format - Add SemanticSearchQueryBuilder for Elasticsearch (mirrors OpenSearch equivalent) - Fix ElasticSearchIndexManager.extractMappingsJson() to extract mappings sub-object for putMapping - Fix reformatVectorIndexWithDimension() to handle both "dims" (ES) and "dimension" (OpenSearch) keys - Wire ElasticSearchVectorService into SearchRepository and ElasticSearchBulkSink - Extend VectorSearchQueryBuilderTest and ElasticSearchIndexManagerTest with new coverage Co-Authored-By: Claude Sonnet 4.6 --- .../mcp/tools/SemanticSearchTool.java | 4 +- .../searchIndex/ElasticSearchBulkSink.java | 185 ++++++ .../search/VectorSearchResource.java | 62 +- .../search/RecreateWithEmbeddings.java | 31 +- .../service/search/SearchRepository.java | 41 +- .../ElasticSearchIndexManager.java | 26 +- .../SemanticSearchQueryBuilder.java | 153 +++++ .../vector/ElasticSearchVectorService.java | 543 ++++++++++++++++++ .../vector/VectorSearchQueryBuilder.java | 62 +- .../ElasticSearchBulkSinkSimpleTest.java | 14 + .../ElasticSearchIndexManagerTest.java | 15 + .../ElasticSearchVectorServiceTest.java | 340 +++++++++++ .../vector/VectorSearchQueryBuilderTest.java | 180 +++++- .../en/vector_search_index_es_native.json | 293 ++++++++++ .../jp/vector_search_index_es_native.json | 293 ++++++++++ .../ru/vector_search_index_es_native.json | 410 +++++++++++++ .../zh/vector_search_index_es_native.json | 293 ++++++++++ 17 files changed, 2926 insertions(+), 19 deletions(-) create mode 100644 
openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java create mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java create mode 100644 openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java create mode 100644 openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json create mode 100644 openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json create mode 100644 openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json create mode 100644 openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json diff --git a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SemanticSearchTool.java b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SemanticSearchTool.java index 6f9e12def868..4ec10b65d2d6 100644 --- a/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SemanticSearchTool.java +++ b/openmetadata-mcp/src/main/java/org/openmetadata/mcp/tools/SemanticSearchTool.java @@ -10,7 +10,7 @@ import org.openmetadata.schema.utils.JsonUtils; import org.openmetadata.service.Entity; import org.openmetadata.service.limits.Limits; -import org.openmetadata.service.search.vector.OpenSearchVectorService; +import org.openmetadata.service.search.vector.VectorIndexService; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; import org.openmetadata.service.security.Authorizer; import org.openmetadata.service.security.auth.CatalogSecurityContext; @@ -42,7 +42,7 @@ public Map execute( "Semantic search is not enabled. 
Configure vector embeddings in the OpenMetadata server settings."); } - OpenSearchVectorService vectorService = OpenSearchVectorService.getInstance(); + VectorIndexService vectorService = Entity.getSearchRepository().getVectorIndexService(); if (vectorService == null) { return errorResponse("Vector search service is not initialized"); } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java index f1e7c990b690..71101df2c944 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java @@ -21,12 +21,16 @@ import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedDeque; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingQueue; +import java.util.concurrent.Phaser; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.Semaphore; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Lock; @@ -51,12 +55,16 @@ import org.openmetadata.service.search.elasticsearch.ElasticSearchClient; import org.openmetadata.service.search.elasticsearch.EsUtils; import org.openmetadata.service.search.indexes.ColumnSearchIndex; +import org.openmetadata.service.search.vector.VectorDocBuilder; +import org.openmetadata.service.search.vector.VectorIndexService; +import 
org.openmetadata.service.search.vector.utils.AvailableEntityTypes; /** * Elasticsearch implementation using new Java API client with custom bulk handler */ @Slf4j public class ElasticSearchBulkSink implements BulkSink { + private static final int MAX_VECTOR_THREADS = 10; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final JacksonJsonpMapper JACKSON_JSONP_MAPPER = new JacksonJsonpMapper(OBJECT_MAPPER); @@ -125,6 +133,13 @@ public static synchronized void resetDocBuildPoolSize() { private final ConcurrentLinkedDeque> pendingColumnFutures = new ConcurrentLinkedDeque<>(); + // Vector embedding fields + private final ExecutorService vectorExecutor; + private final Phaser phaser; + private final CopyOnWriteArrayList pendingThreads; + private final AtomicLong vectorSuccess = new AtomicLong(0); + private final AtomicLong vectorFailed = new AtomicLong(0); + public ElasticSearchBulkSink( SearchRepository searchRepository, int batchSize, @@ -136,6 +151,10 @@ public ElasticSearchBulkSink( this.batchSize = batchSize; this.maxConcurrentRequests = maxConcurrentRequests; this.maxPayloadSizeBytes = maxPayloadSizeBytes; + this.vectorExecutor = + Executors.newFixedThreadPool(MAX_VECTOR_THREADS, Thread.ofVirtual().factory()); + this.phaser = new Phaser(1); + this.pendingThreads = new CopyOnWriteArrayList<>(); // Initialize stats stats.withTotalRecords(0).withSuccessRecords(0).withFailedRecords(0); @@ -270,6 +289,11 @@ public void write(List entities, Map contextData) throws Exce } pendingColumnFutures.removeIf(CompletableFuture::isDone); } + + if (isVectorEmbeddingEnabledForEntity(entityType)) { + addEntitiesToVectorIndexBatch( + bulkProcessor, entityInterfaces, recreateIndex, reindexContext, tracker); + } } } catch (Exception e) { LOG.error("Failed to write {} entities of type {}", entities.size(), entityType, e); @@ -642,6 +666,8 @@ public StepStats getProcessStats() { @Override public void close() { try { + awaitVectorCompletion(60); + 
bulkProcessor.flush(); // Wait for in-flight column doc-build tasks before flushing the column processor @@ -658,6 +684,8 @@ public void close() { LOG.warn("Column bulk processor did not terminate within timeout"); } + vectorExecutor.shutdown(); + updateStats(); LOG.info( @@ -749,6 +777,163 @@ public void updateConcurrentRequests(int concurrentRequests) { LOG.info("Concurrent requests updated to: {}", concurrentRequests); } + boolean isVectorEmbeddingEnabledForEntity(String entityType) { + return searchRepository.isVectorEmbeddingEnabled() + && searchRepository.getVectorIndexService() != null + && AvailableEntityTypes.isVectorIndexable(entityType); + } + + void addEntitiesToVectorIndexBatch( + CustomBulkProcessor bulkProcessor, + List entities, + boolean recreateIndex, + ReindexContext reindexContext, + StageStatsTracker tracker) { + if (entities.isEmpty()) { + return; + } + + VectorIndexService vectorService = searchRepository.getVectorIndexService(); + if (vectorService == null) { + return; + } + + String entityType = entities.getFirst().getEntityReference().getType(); + if (!AvailableEntityTypes.isVectorIndexable(entityType)) { + return; + } + + String canonicalIndex = VectorIndexService.getClusteredIndexName(); + String finalTargetIndex = canonicalIndex; + String finalSourceIndex = null; + + if (reindexContext != null) { + String stagedIndex = + reindexContext.getStagedIndex(VectorIndexService.VECTOR_INDEX_KEY).orElse(null); + if (stagedIndex != null) { + finalSourceIndex = canonicalIndex; + finalTargetIndex = stagedIndex; + } + } + + String srcIdx = finalSourceIndex; + String tgtIdx = finalTargetIndex; + + Map existingFingerprints = Map.of(); + if (srcIdx != null) { + List parentIds = new ArrayList<>(entities.size()); + for (EntityInterface entity : entities) { + parentIds.add(entity.getId().toString()); + } + existingFingerprints = vectorService.getExistingFingerprintsBatch(srcIdx, parentIds); + } + + for (EntityInterface entity : entities) { + String 
parentId = entity.getId().toString(); + String existingFp = existingFingerprints.get(parentId); + String currentFp = VectorDocBuilder.computeFingerprintForEntity(entity); + + if (existingFp != null && existingFp.equals(currentFp) && srcIdx != null) { + submitVectorTask( + () -> + processMigration( + vectorService, srcIdx, tgtIdx, parentId, currentFp, entity, tracker)); + } else { + submitVectorTask(() -> processEmbedding(vectorService, entity, tgtIdx, tracker)); + } + } + } + + private void processMigration( + VectorIndexService vectorService, + String sourceIndex, + String targetIndex, + String parentId, + String fingerprint, + EntityInterface entity, + StageStatsTracker tracker) { + try { + if (vectorService.copyExistingVectorDocuments( + sourceIndex, targetIndex, parentId, fingerprint)) { + vectorSuccess.incrementAndGet(); + if (tracker != null) { + tracker.recordVector(StatsResult.SUCCESS); + } + } else { + processEmbedding(vectorService, entity, targetIndex, tracker); + } + } catch (Exception e) { + LOG.warn( + "Vector migration failed for parent_id={}, falling back to recomputation: {}", + parentId, + e.getMessage()); + processEmbedding(vectorService, entity, targetIndex, tracker); + } + } + + private void processEmbedding( + VectorIndexService vectorService, + EntityInterface entity, + String targetIndex, + StageStatsTracker tracker) { + try { + vectorService.updateVectorEmbeddings(entity, targetIndex); + vectorSuccess.incrementAndGet(); + if (tracker != null) { + tracker.recordVector(StatsResult.SUCCESS); + } + } catch (Exception e) { + vectorFailed.incrementAndGet(); + if (tracker != null) { + tracker.recordVector(StatsResult.FAILED); + } + LOG.error("Vector embedding failed for entity {}: {}", entity.getId(), e.getMessage(), e); + } + } + + private void submitVectorTask(Runnable task) { + phaser.register(); + vectorExecutor.submit( + () -> { + Thread current = Thread.currentThread(); + pendingThreads.add(current); + try { + task.run(); + } finally { + 
pendingThreads.remove(current); + phaser.arriveAndDeregister(); + } + }); + } + + @Override + public boolean awaitVectorCompletion(int timeoutSeconds) { + try { + int phase = phaser.arrive(); + phaser.awaitAdvanceInterruptibly(phase, timeoutSeconds, TimeUnit.SECONDS); + return true; + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return false; + } catch (TimeoutException e) { + LOG.warn("Timeout waiting for vector completion after {}s", timeoutSeconds); + return false; + } + } + + @Override + public int getPendingVectorTaskCount() { + return Math.max(0, phaser.getUnarrivedParties() - 1); + } + + @Override + public StepStats getVectorStats() { + return new StepStats() + .withTotalRecords((int) (vectorSuccess.get() + vectorFailed.get())) + .withSuccessRecords((int) vectorSuccess.get()) + .withFailedRecords((int) vectorFailed.get()); + } + public static class CustomBulkProcessor { private final ElasticsearchAsyncClient asyncClient; private final List buffer = new ArrayList<>(); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java b/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java index 06678c6662e3..6de437a5c59d 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java @@ -1,23 +1,28 @@ package org.openmetadata.service.resources.search; import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.Parameter; import io.swagger.v3.oas.annotations.media.Content; import io.swagger.v3.oas.annotations.media.Schema; import io.swagger.v3.oas.annotations.responses.ApiResponse; import io.swagger.v3.oas.annotations.tags.Tag; import jakarta.ws.rs.Consumes; +import jakarta.ws.rs.GET; import jakarta.ws.rs.POST; import jakarta.ws.rs.Path; import 
jakarta.ws.rs.Produces; +import jakarta.ws.rs.QueryParam; import jakarta.ws.rs.core.Context; import jakarta.ws.rs.core.MediaType; import jakarta.ws.rs.core.Response; import jakarta.ws.rs.core.SecurityContext; import java.util.Collections; +import java.util.UUID; import lombok.extern.slf4j.Slf4j; import org.openmetadata.service.Entity; import org.openmetadata.service.resources.Collection; -import org.openmetadata.service.search.vector.OpenSearchVectorService; +import org.openmetadata.service.search.vector.VectorIndexService; +import org.openmetadata.service.search.vector.utils.DTOs.FingerprintResponse; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchRequest; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; import org.openmetadata.service.security.Authorizer; @@ -70,7 +75,7 @@ public Response vectorSearchPost( .build(); } - OpenSearchVectorService vectorService = OpenSearchVectorService.getInstance(); + VectorIndexService vectorService = Entity.getSearchRepository().getVectorIndexService(); if (vectorService == null) { return Response.status(Response.Status.SERVICE_UNAVAILABLE) .entity("{\"error\":\"Vector search service is not initialized\"}") @@ -97,4 +102,57 @@ public Response vectorSearchPost( .build(); } } + + @GET + @Path("/fingerprint") + @Operation( + operationId = "getFingerprint", + summary = "Get vector fingerprint", + description = "Returns the existing fingerprint for a given entity.") + public Response getFingerprint( + @Context SecurityContext securityContext, + @Parameter(description = "Parent entity ID", required = true) @QueryParam("parentId") + String parentId) { + DefaultAuthorizer.getSubjectContext(securityContext); + + if (!Entity.getSearchRepository().isVectorEmbeddingEnabled()) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"error\":\"Vector search is not enabled\"}") + .build(); + } + + VectorIndexService vectorService = 
Entity.getSearchRepository().getVectorIndexService(); + if (vectorService == null) { + return Response.status(Response.Status.SERVICE_UNAVAILABLE) + .entity("{\"error\":\"Vector search service is not initialized\"}") + .build(); + } + + if (parentId == null || parentId.isBlank()) { + return Response.status(Response.Status.BAD_REQUEST) + .entity("{\"error\":\"parentId is required\"}") + .build(); + } + try { + UUID.fromString(parentId); + } catch (IllegalArgumentException e) { + return Response.status(Response.Status.BAD_REQUEST) + .entity("{\"error\":\"Invalid parentId format\"}") + .build(); + } + + try { + String indexName = vectorService.getIndexName(); + String fingerprint = vectorService.getExistingFingerprint(indexName, parentId); + FingerprintResponse response = + new FingerprintResponse( + parentId, indexName, fingerprint, fingerprint != null ? "Found" : "Not found"); + return Response.ok(response).build(); + } catch (Exception e) { + LOG.error("Failed to get fingerprint for {}: {}", parentId, e.getMessage(), e); + return Response.status(Response.Status.INTERNAL_SERVER_ERROR) + .entity("{\"error\":\"An internal error occurred\"}") + .build(); + } + } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java index 98781b51a272..512c1b67155b 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java @@ -1,8 +1,11 @@ package org.openmetadata.service.search; +import java.util.HashSet; import java.util.Set; import lombok.extern.slf4j.Slf4j; +import org.openmetadata.search.IndexMapping; import org.openmetadata.service.Entity; +import org.openmetadata.service.search.vector.VectorIndexService; @Slf4j public class RecreateWithEmbeddings extends DefaultRecreateHandler { @@ 
-11,7 +14,33 @@ public class RecreateWithEmbeddings extends DefaultRecreateHandler { public ReindexContext reCreateIndexes(Set entities) { SearchRepository searchRepository = Entity.getSearchRepository(); searchRepository.initializeVectorSearchService(); - return super.reCreateIndexes(entities); + + Set allEntities = new HashSet<>(entities); + if (searchRepository.getVectorIndexService() != null) { + allEntities.add(VectorIndexService.VECTOR_INDEX_KEY); + } + + return super.reCreateIndexes(allEntities); + } + + @Override + protected void recreateIndexFromMapping( + ReindexContext context, IndexMapping indexMapping, String entityType) { + if (VectorIndexService.VECTOR_INDEX_KEY.equals(entityType) + && Entity.getSearchRepository().getVectorIndexService() == null) { + LOG.info("Skipping vector index recreation - vector service not initialized"); + return; + } + super.recreateIndexFromMapping(context, indexMapping, entityType); + } + + @Override + public void promoteEntityIndex(EntityReindexContext context, boolean reindexSuccess) { + if (VectorIndexService.VECTOR_INDEX_KEY.equals(context.getEntityType()) + && Entity.getSearchRepository().getVectorIndexService() == null) { + return; + } + super.promoteEntityIndex(context, reindexSuccess); } @Override diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java index 9028199b2b36..4b3964290be8 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java @@ -135,6 +135,7 @@ import org.openmetadata.service.search.nlq.NLQService; import org.openmetadata.service.search.nlq.NLQServiceFactory; import org.openmetadata.service.search.opensearch.OpenSearchClient; +import org.openmetadata.service.search.vector.ElasticSearchVectorService; import 
org.openmetadata.service.search.vector.OpenSearchVectorService; import org.openmetadata.service.search.vector.VectorEmbeddingHandler; import org.openmetadata.service.search.vector.VectorIndexService; @@ -416,9 +417,10 @@ public synchronized void initializeVectorSearchService() { OpenSearchVectorService.init(osClient, embeddingClient); this.vectorIndexService = OpenSearchVectorService.getInstance(); } else { - LOG.warn( - "Vector embedding is only supported with OpenSearch. Elasticsearch support is planned."); - return; + es.co.elastic.clients.elasticsearch.ElasticsearchClient esClient = + ((ElasticSearchClient) getSearchClient()).getNewClient(); + ElasticSearchVectorService.init(esClient, embeddingClient, language); + this.vectorIndexService = ElasticSearchVectorService.getInstance(); } this.vectorEmbeddingHandler = new VectorEmbeddingHandler(vectorIndexService); @@ -588,10 +590,14 @@ public void deleteIndex(IndexMapping indexMapping) { } private String getIndexMapping(IndexMapping indexMapping) { + String mappingFile = indexMapping.getIndexMappingFile(); + boolean isOpenSearch = getSearchType() == ElasticSearchConfiguration.SearchType.OPENSEARCH; + if (!isOpenSearch && mappingFile != null && mappingFile.contains("vector_search_index.json")) { + mappingFile = + mappingFile.replace("vector_search_index.json", "vector_search_index_es_native.json"); + } try (InputStream in = - getClass() - .getResourceAsStream( - String.format(indexMapping.getIndexMappingFile(), language.toLowerCase()))) { + getClass().getResourceAsStream(String.format(mappingFile, language.toLowerCase()))) { assert in != null; return new String(in.readAllBytes()); } catch (Exception e) { @@ -3042,6 +3048,18 @@ private String reformatVectorIndexWithDimension(String mapping, int dimension) { JsonNode root = mapper.readTree(mapping); if (root.has("mappings")) { JsonNode mappings = root.get("mappings"); + if (mappings.has("properties")) { + JsonNode properties = mappings.get("properties"); + if 
(properties.has("embedding")) { + com.fasterxml.jackson.databind.node.ObjectNode embeddingNode = + (com.fasterxml.jackson.databind.node.ObjectNode) properties.get("embedding"); + if (embeddingNode.has("dims")) { + embeddingNode.put("dims", dimension); + } else { + embeddingNode.put("dimension", dimension); + } + } + } com.fasterxml.jackson.databind.node.ObjectNode meta = ((com.fasterxml.jackson.databind.node.ObjectNode) mappings).putObject("_meta"); meta.put( @@ -3051,8 +3069,15 @@ private String reformatVectorIndexWithDimension(String mapping, int dimension) { } return mapper.writeValueAsString(root); } catch (Exception e) { - LOG.warn("Failed to set embedding _meta in mapping JSON", e); - return mapping; + LOG.warn( + "Failed to parse mapping JSON for dimension patching, falling back to string replace"); + return mapping + .replace("\"dimension\": 768", "\"dimension\": " + dimension) + .replace("\"dimension\":768", "\"dimension\":" + dimension) + .replace("\"dimension\": 512", "\"dimension\": " + dimension) + .replace("\"dimension\":512", "\"dimension\":" + dimension) + .replace("\"dims\": 512", "\"dims\": " + dimension) + .replace("\"dims\":512", "\"dims\":" + dimension); } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java index dcd56dd53d5b..33c44c614034 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java @@ -1,5 +1,7 @@ package org.openmetadata.service.search.elasticsearch; +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import es.co.elastic.clients.elasticsearch.ElasticsearchClient; import 
es.co.elastic.clients.elasticsearch._types.ElasticsearchException; import es.co.elastic.clients.elasticsearch.indices.CreateIndexRequest; @@ -31,6 +33,7 @@ */ @Slf4j public class ElasticSearchIndexManager implements IndexManagementClient { + private static final ObjectMapper MAPPER = new ObjectMapper(); private final ElasticsearchClient client; private final String clusterAlias; private final boolean isClientAvailable; @@ -83,12 +86,13 @@ public void updateIndex(IndexMapping indexMapping, String indexMappingContent) { try { String indexName = indexMapping.getIndexName(clusterAlias); + String mappingsJson = extractMappingsJson(indexMappingContent); PutMappingRequest request = PutMappingRequest.of( builder -> { builder.index(indexName); - if (indexMappingContent != null) { - builder.withJson(new StringReader(indexMappingContent)); + if (mappingsJson != null) { + builder.withJson(new StringReader(mappingsJson)); } return builder; }); @@ -141,6 +145,24 @@ public void createIndex(String indexName, String indexMappingContent) { } } + private String extractMappingsJson(String indexMappingContent) { + if (indexMappingContent == null) { + return null; + } + try { + JsonNode root = MAPPER.readTree(indexMappingContent); + JsonNode mappings = root.get("mappings"); + if (mappings != null) { + return MAPPER.writeValueAsString(mappings); + } + return indexMappingContent; + } catch (IOException e) { + LOG.warn( + "Failed to extract mappings from index content, using full content: {}", e.getMessage()); + return indexMappingContent; + } + } + private void createIndexInternal(String indexName, String indexMappingContent) throws IOException { CreateIndexRequest request = diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java new file mode 100644 index 000000000000..751ac113aaa3 --- /dev/null +++ 
b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java @@ -0,0 +1,153 @@ +package org.openmetadata.service.search.elasticsearch; + +import es.co.elastic.clients.elasticsearch._types.Script; +import es.co.elastic.clients.elasticsearch._types.ScriptLanguage; +import es.co.elastic.clients.elasticsearch._types.query_dsl.FunctionBoostMode; +import es.co.elastic.clients.elasticsearch._types.query_dsl.FunctionScoreMode; +import es.co.elastic.clients.elasticsearch._types.query_dsl.Query; +import es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType; +import es.co.elastic.clients.json.JsonData; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.search.SearchRequest; +import org.openmetadata.schema.utils.JsonUtils; +import org.openmetadata.service.rdf.semantic.EmbeddingService; + +/** + * Builds semantic search queries for Elasticsearch that combine: + * 1. Vector similarity search using k-NN (dense_vector) + * 2. Traditional text search with BM25 + * 3. 
RDF context boosting + */ +@Slf4j +public class SemanticSearchQueryBuilder { + + private static final String KNN_FIELD = "embedding"; + private static final String RDF_CONTEXT_FIELD = "rdfContext"; + + private final EmbeddingService embeddingService; + + public SemanticSearchQueryBuilder() { + this.embeddingService = EmbeddingService.getInstance(); + } + + public Query buildSemanticQuery(SearchRequest request) { + String queryText = request.getQuery(); + if (!isSemanticSearchEnabled(request)) { + return null; + } + float[] queryEmbedding = embeddingService.generateEmbedding(queryText); + + Query knnQuery = buildKnnQuery(queryEmbedding); + Query textQuery = buildTextQuery(queryText, request); + + Query hybridQuery = + Query.of( + q -> + q.bool( + b -> + b.should(s -> s.constantScore(cs -> cs.filter(knnQuery).boost(0.7f))) + .should(s -> s.constantScore(cs -> cs.filter(textQuery).boost(0.3f))))); + + return Query.of( + q -> + q.functionScore( + fs -> + fs.query(hybridQuery) + .functions(f -> f.scriptScore(ss -> ss.script(buildRdfBoostScript()))) + .scoreMode(FunctionScoreMode.Sum) + .boostMode(FunctionBoostMode.Multiply))); + } + + private Query buildKnnQuery(float[] queryEmbedding) { + Map params = new HashMap<>(); + List vectorList = new ArrayList<>(); + for (float v : queryEmbedding) { + vectorList.add((double) v); + } + params.put("query_vector", vectorList); + + return Query.of( + q -> + q.scriptScore( + ss -> + ss.query(mq -> mq.matchAll(m -> m)) + .script( + Script.of( + s -> + s.source( + src -> + src.scriptString( + "cosineSimilarity(params.query_vector, '" + + KNN_FIELD + + "') + 1.0")) + .lang(ScriptLanguage.Painless) + .params(convertToJsonDataMap(params)))))); + } + + private Query buildTextQuery(String queryText, SearchRequest request) { + List fields = new ArrayList<>(); + fields.add("name^5"); + fields.add("displayName^4"); + fields.add("description^2"); + fields.add("tags.tagFQN^3"); + + if ("table".equalsIgnoreCase(request.getIndex())) { + 
fields.add("columns.name^3"); + fields.add("columns.description"); + } + + return Query.of( + q -> + q.multiMatch( + m -> + m.query(queryText) + .fields(fields) + .type(TextQueryType.BestFields) + .fuzziness("AUTO"))); + } + + private Script buildRdfBoostScript() { + String scriptSource = + """ + double boost = 1.0; + + if (doc.containsKey('rdfContext.upstreamCount')) { + int upstreamCount = doc['rdfContext.upstreamCount'].value; + boost += Math.min(upstreamCount * 0.01, 0.2); + } + + if (doc.containsKey('rdfContext.downstreamCount')) { + int downstreamCount = doc['rdfContext.downstreamCount'].value; + boost += Math.min(downstreamCount * 0.02, 0.3); + } + + if (doc.containsKey('rdfContext.semanticTypes')) { + int typeCount = doc['rdfContext.semanticTypes'].size(); + boost += Math.min(typeCount * 0.05, 0.2); + } + + return boost; + """; + + return Script.of( + s -> + s.source(src -> src.scriptString(scriptSource)) + .lang(ScriptLanguage.Painless) + .params(Map.of())); + } + + private boolean isSemanticSearchEnabled(SearchRequest request) { + return request.getSemanticSearch() != null && request.getSemanticSearch(); + } + + private Map convertToJsonDataMap(Map map) { + return JsonUtils.getMap(map).entrySet().stream() + .filter(entry -> entry.getValue() != null) + .collect(Collectors.toMap(Map.Entry::getKey, entry -> JsonData.of(entry.getValue()))); + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java new file mode 100644 index 000000000000..14a60295a7c1 --- /dev/null +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -0,0 +1,543 @@ +package org.openmetadata.service.search.vector; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import 
es.co.elastic.clients.elasticsearch.ElasticsearchClient; +import es.co.elastic.clients.elasticsearch._types.Refresh; +import es.co.elastic.clients.elasticsearch._types.mapping.TypeMapping; +import es.co.elastic.clients.elasticsearch.core.BulkRequest; +import es.co.elastic.clients.elasticsearch.core.BulkResponse; +import es.co.elastic.clients.elasticsearch.core.bulk.BulkOperation; +import es.co.elastic.clients.elasticsearch.core.bulk.BulkResponseItem; +import es.co.elastic.clients.elasticsearch.indices.CreateIndexRequest; +import es.co.elastic.clients.elasticsearch.indices.ExistsRequest; +import es.co.elastic.clients.elasticsearch.indices.IndexSettings; +import es.co.elastic.clients.transport.rest5_client.Rest5ClientTransport; +import es.co.elastic.clients.transport.rest5_client.low_level.Request; +import es.co.elastic.clients.transport.rest5_client.low_level.Response; +import es.co.elastic.clients.transport.rest5_client.low_level.Rest5Client; +import jakarta.json.stream.JsonParser; +import java.io.InputStream; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; +import org.openmetadata.schema.EntityInterface; +import org.openmetadata.service.events.lifecycle.EntityLifecycleEventDispatcher; +import org.openmetadata.service.search.vector.client.EmbeddingClient; +import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; + +@Slf4j +public class ElasticSearchVectorService implements VectorIndexService { + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final int OVER_FETCH_MULTIPLIER = 2; + + private static volatile ElasticSearchVectorService instance; + + private final ElasticsearchClient client; + private final Rest5Client restClient; + @Getter private final EmbeddingClient 
embeddingClient; + private final String language; + + public ElasticSearchVectorService( + ElasticsearchClient client, EmbeddingClient embeddingClient, String language) { + this.client = client; + this.restClient = extractRestClient(client); + this.embeddingClient = embeddingClient; + this.language = language != null ? language.toLowerCase() : "en"; + } + + public ElasticSearchVectorService(ElasticsearchClient client, EmbeddingClient embeddingClient) { + this(client, embeddingClient, "en"); + } + + private static Rest5Client extractRestClient(ElasticsearchClient client) { + Rest5ClientTransport transport = (Rest5ClientTransport) client._transport(); + return transport.restClient(); + } + + public static synchronized void init( + ElasticsearchClient client, EmbeddingClient embeddingClient, String language) { + if (instance != null) { + LOG.warn("ElasticSearchVectorService already initialized, reinitializing"); + } + instance = new ElasticSearchVectorService(client, embeddingClient, language); + instance.registerVectorEmbeddingHandler(); + LOG.info( + "ElasticSearchVectorService initialized with model={}, dimension={}", + embeddingClient.getModelId(), + embeddingClient.getDimension()); + } + + public static ElasticSearchVectorService getInstance() { + return instance; + } + + private void registerVectorEmbeddingHandler() { + try { + VectorEmbeddingHandler handler = new VectorEmbeddingHandler(this); + EntityLifecycleEventDispatcher.getInstance().registerHandler(handler); + LOG.info("Registered VectorEmbeddingHandler for entity lifecycle events"); + } catch (Exception e) { + LOG.error("Failed to register VectorEmbeddingHandler", e); + } + } + + @Override + @SuppressWarnings("unchecked") + public VectorSearchResponse search( + String query, Map> filters, int size, int k, double threshold) { + long start = System.currentTimeMillis(); + try { + float[] queryVector = embeddingClient.embed(query); + int overFetchSize = size * OVER_FETCH_MULTIPLIER; + + String queryJson = + 
VectorSearchQueryBuilder.buildNativeESQuery(queryVector, overFetchSize, k, filters); + String indexName = getClusteredIndexName(); + String responseBody = executeGenericRequest("POST", "/" + indexName + "/_search", queryJson); + + JsonNode root = MAPPER.readTree(responseBody); + JsonNode hitsNode = root.path("hits").path("hits"); + + LinkedHashMap>> byParent = new LinkedHashMap<>(); + for (JsonNode hit : hitsNode) { + double score = hit.path("_score").asDouble(0.0); + if (score < threshold) { + continue; + } + + Map hitMap = MAPPER.convertValue(hit.path("_source"), Map.class); + hitMap.put("_score", score); + + String parentId = (String) hitMap.get("parent_id"); + if (parentId != null) { + byParent.computeIfAbsent(parentId, kVal -> new ArrayList<>()).add(hitMap); + } + } + + List> results = new ArrayList<>(); + int parentCount = 0; + for (List> chunks : byParent.values()) { + if (parentCount >= size) { + break; + } + results.addAll(chunks); + parentCount++; + } + + long tookMillis = System.currentTimeMillis() - start; + return new VectorSearchResponse(tookMillis, results); + } catch (Exception e) { + LOG.error("Vector search failed: {}", e.getMessage(), e); + throw new RuntimeException("Vector search failed", e); + } + } + + String executeGenericRequest(String method, String endpoint, String body) { + try { + Request request = new Request(method, endpoint); + if (body != null) { + request.setJsonEntity(body); + } + Response response = restClient.performRequest(request); + try (InputStream is = response.getEntity().getContent()) { + return new String(is.readAllBytes(), StandardCharsets.UTF_8); + } + } catch (Exception e) { + LOG.error("Generic request failed: {} {}", method, endpoint, e); + throw new RuntimeException("Elasticsearch generic request failed", e); + } + } + + @Override + public void updateVectorEmbeddings(EntityInterface entity, String targetIndex) { + try { + String parentId = entity.getId().toString(); + String existingFingerprint = 
getExistingFingerprint(targetIndex, parentId); + String currentFingerprint = VectorDocBuilder.computeFingerprintForEntity(entity); + + if (currentFingerprint.equals(existingFingerprint)) { + LOG.debug("Skipping entity {} - fingerprint unchanged", parentId); + return; + } + + List> docs = VectorDocBuilder.fromEntity(entity, embeddingClient); + deleteByParentId(targetIndex, parentId); + bulkIndex(docs, targetIndex); + } catch (Exception e) { + LOG.error( + "Failed to update vector embeddings for entity {}: {}", + entity.getId(), + e.getMessage(), + e); + } + } + + @Override + public void updateVectorEmbeddingsWithMigration( + EntityInterface entity, String targetIndex, String sourceIndex) { + try { + String parentId = entity.getId().toString(); + String currentFingerprint = VectorDocBuilder.computeFingerprintForEntity(entity); + + if (sourceIndex != null) { + try { + String existingFingerprint = getExistingFingerprint(sourceIndex, parentId); + if (currentFingerprint.equals(existingFingerprint)) { + if (copyExistingVectorDocuments( + sourceIndex, targetIndex, parentId, currentFingerprint)) { + return; + } + } + } catch (Exception ex) { + LOG.warn( + "Migration copy failed for entity {}, falling back to recomputation: {}", + parentId, + ex.getMessage()); + } + } + + List> docs = VectorDocBuilder.fromEntity(entity, embeddingClient); + bulkIndex(docs, targetIndex); + } catch (Exception e) { + LOG.error( + "Failed to update vector embeddings with migration for entity {}: {}", + entity.getId(), + e.getMessage(), + e); + } + } + + @Override + public String getExistingFingerprint(String indexName, String parentId) { + try { + String query = + "{\"size\":1,\"_source\":[\"fingerprint\"]," + + "\"query\":{\"term\":{\"parent_id\":\"" + + VectorSearchQueryBuilder.escape(parentId) + + "\"}}}"; + String response = executeGenericRequest("POST", "/" + indexName + "/_search", query); + JsonNode root = MAPPER.readTree(response); + JsonNode hits = root.path("hits").path("hits"); + if 
(hits.isArray() && !hits.isEmpty()) { + return hits.get(0).path("_source").path("fingerprint").asText(null); + } + } catch (Exception e) { + LOG.debug( + "Failed to get fingerprint for parent_id={} in index={}: {}", + parentId, + indexName, + e.getMessage()); + } + return null; + } + + @Override + public Map getExistingFingerprintsBatch( + String indexName, List parentIds) { + if (parentIds == null || parentIds.isEmpty()) { + return Collections.emptyMap(); + } + try { + StringBuilder termsArray = new StringBuilder("["); + for (int i = 0; i < parentIds.size(); i++) { + if (i > 0) termsArray.append(','); + termsArray + .append("\"") + .append(VectorSearchQueryBuilder.escape(parentIds.get(i))) + .append("\""); + } + termsArray.append("]"); + + String query = + "{\"size\":" + + parentIds.size() + + ",\"_source\":[\"parent_id\",\"fingerprint\"]" + + ",\"query\":{\"terms\":{\"parent_id\":" + + termsArray + + "}}" + + ",\"collapse\":{\"field\":\"parent_id\"}}"; + + String response = executeGenericRequest("POST", "/" + indexName + "/_search", query); + JsonNode root = MAPPER.readTree(response); + JsonNode hits = root.path("hits").path("hits"); + + Map result = new HashMap<>(); + for (JsonNode hit : hits) { + String pid = hit.path("_source").path("parent_id").asText(); + String fp = hit.path("_source").path("fingerprint").asText(null); + if (pid != null && fp != null) { + result.put(pid, fp); + } + } + return result; + } catch (Exception e) { + LOG.error("Failed to batch get fingerprints in index={}: {}", indexName, e.getMessage(), e); + return Collections.emptyMap(); + } + } + + @Override + @SuppressWarnings("unchecked") + public boolean copyExistingVectorDocuments( + String sourceIndex, String targetIndex, String parentId, String fingerprint) { + try { + String searchQuery = + "{\"size\":1000,\"query\":{\"term\":{\"parent_id\":\"" + + VectorSearchQueryBuilder.escape(parentId) + + "\"}}}"; + String response = executeGenericRequest("POST", "/" + sourceIndex + "/_search", 
searchQuery); + JsonNode root = MAPPER.readTree(response); + JsonNode hits = root.path("hits").path("hits"); + + if (!hits.isArray() || hits.isEmpty()) { + return false; + } + + List> docs = new ArrayList<>(); + for (JsonNode hit : hits) { + Map source = MAPPER.convertValue(hit.path("_source"), Map.class); + source.put("fingerprint", fingerprint); + docs.add(source); + } + bulkIndex(docs, targetIndex); + return true; + } catch (Exception e) { + LOG.error( + "Failed to copy vector documents from {} to {} for parent_id={}: {}", + sourceIndex, + targetIndex, + parentId, + e.getMessage(), + e); + return false; + } + } + + @Override + public void softDeleteEmbeddings(EntityInterface entity) { + try { + String parentId = entity.getId().toString(); + String indexName = getClusteredIndexName(); + String script = + "{\"script\":{\"source\":\"ctx._source.deleted = true\"}," + + "\"query\":{\"term\":{\"parent_id\":\"" + + VectorSearchQueryBuilder.escape(parentId) + + "\"}}}"; + executeGenericRequest("POST", "/" + indexName + "/_update_by_query", script); + } catch (Exception e) { + LOG.error( + "Failed to soft delete embeddings for entity {}: {}", entity.getId(), e.getMessage(), e); + } + } + + @Override + public void hardDeleteEmbeddings(EntityInterface entity) { + try { + String parentId = entity.getId().toString(); + String indexName = getClusteredIndexName(); + deleteByParentId(indexName, parentId); + } catch (Exception e) { + LOG.error( + "Failed to hard delete embeddings for entity {}: {}", entity.getId(), e.getMessage(), e); + } + } + + @Override + public void restoreEmbeddings(EntityInterface entity) { + try { + String parentId = entity.getId().toString(); + String indexName = getClusteredIndexName(); + String script = + "{\"script\":{\"source\":\"ctx._source.deleted = false\"}," + + "\"query\":{\"term\":{\"parent_id\":\"" + + VectorSearchQueryBuilder.escape(parentId) + + "\"}}}"; + executeGenericRequest("POST", "/" + indexName + "/_update_by_query", script); + } 
catch (Exception e) { + LOG.error( + "Failed to restore embeddings for entity {}: {}", entity.getId(), e.getMessage(), e); + } + } + + private void deleteByParentId(String indexName, String parentId) { + try { + String query = + "{\"query\":{\"term\":{\"parent_id\":\"" + + VectorSearchQueryBuilder.escape(parentId) + + "\"}}}"; + executeGenericRequest("POST", "/" + indexName + "/_delete_by_query", query); + } catch (Exception e) { + LOG.error( + "Failed to delete by parent_id={} in index={}: {}", + parentId, + indexName, + e.getMessage(), + e); + } + } + + private static String getClusteredIndexName() { + return VectorIndexService.getClusteredIndexName(); + } + + @Override + public void createOrUpdateIndex(int dimension) { + try { + if (indexExists()) { + LOG.info("Vector index {} already exists", VECTOR_INDEX_NAME); + return; + } + + String mappingJson = loadIndexMapping(dimension); + JsonNode rootNode = MAPPER.readTree(mappingJson); + JsonNode mappingsNode = rootNode.get("mappings"); + JsonNode settingsNode = rootNode.get("settings"); + + CreateIndexRequest request = + CreateIndexRequest.of( + builder -> { + builder.index(getClusteredIndexName()); + + if (mappingsNode != null && !mappingsNode.isNull()) { + TypeMapping typeMapping = parseTypeMapping(mappingsNode); + builder.mappings(typeMapping); + } + + if (settingsNode != null && !settingsNode.isNull()) { + IndexSettings settings = parseIndexSettings(settingsNode); + builder.settings(settings); + } + + return builder; + }); + client.indices().create(request); + + LOG.info("Created vector index {} with dimension {}", getClusteredIndexName(), dimension); + } catch (Exception e) { + LOG.error("Failed to create vector index: {}", e.getMessage(), e); + } + } + + @Override + public boolean indexExists() { + try { + ExistsRequest request = ExistsRequest.of(b -> b.index(getClusteredIndexName())); + return client.indices().exists(request).value(); + } catch (Exception e) { + LOG.error("Failed to check if vector index 
exists: {}", e.getMessage(), e); + return false; + } + } + + @Override + public String getIndexName() { + return getClusteredIndexName(); + } + + @Override + @SuppressWarnings("unchecked") + public void bulkIndex(List> documents, String targetIndex) { + if (documents == null || documents.isEmpty()) { + return; + } + + try { + List operations = new ArrayList<>(); + for (int i = 0; i < documents.size(); i++) { + Map doc = documents.get(i); + String parentId = (String) doc.get("parent_id"); + int chunkIndex = doc.containsKey("chunk_index") ? (int) doc.get("chunk_index") : i; + String docId = parentId + "-" + chunkIndex; + + operations.add( + BulkOperation.of( + op -> op.index(idx -> idx.index(targetIndex).id(docId).document(doc)))); + } + + BulkRequest bulkRequest = + BulkRequest.of(b -> b.operations(operations).refresh(Refresh.False)); + BulkResponse response = client.bulk(bulkRequest); + + if (response.errors()) { + long errorCount = 0; + for (BulkResponseItem item : response.items()) { + if (item.error() != null) { + errorCount++; + LOG.warn( + "Bulk vector indexing error for document [{}] in [{}]: type={}, reason={}", + item.id(), + targetIndex, + item.error().type(), + item.error().reason()); + } + } + LOG.warn( + "Bulk vector indexing completed with {}/{} errors in {}", + errorCount, + documents.size(), + targetIndex); + } else { + LOG.debug( + "Successfully bulk indexed {} vector documents in {}", documents.size(), targetIndex); + } + } catch (Exception e) { + LOG.error("Bulk vector indexing failed in {}: {}", targetIndex, e.getMessage(), e); + } + } + + private TypeMapping parseTypeMapping(JsonNode mappingsNode) { + JsonParser parser = + client + ._transport() + .jsonpMapper() + .jsonProvider() + .createParser(new StringReader(mappingsNode.toString())); + return TypeMapping._DESERIALIZER.deserialize(parser, client._transport().jsonpMapper()); + } + + private IndexSettings parseIndexSettings(JsonNode settingsNode) { + JsonParser parser = + client + 
._transport() + .jsonpMapper() + .jsonProvider() + .createParser(new StringReader(settingsNode.toString())); + return IndexSettings._DESERIALIZER.deserialize(parser, client._transport().jsonpMapper()); + } + + private String loadIndexMapping(int dimension) { + String resourcePath = "elasticsearch/" + language + "/vector_search_index_es_native.json"; + try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream(resourcePath)) { + if (inputStream == null) { + throw new IllegalStateException("Could not find " + resourcePath + " in classpath"); + } + String template = new String(inputStream.readAllBytes(), StandardCharsets.UTF_8); + String result = template.replace("\"dims\": 512", "\"dims\": " + dimension); + if (result.equals(template) && dimension != 512) { + throw new IllegalStateException( + "Failed to replace dimension placeholder in vector index mapping template"); + } + return result; + } catch (Exception e) { + throw new RuntimeException("Failed to load vector search index mapping", e); + } + } + + public void close() { + try { + if (client != null && client._transport() != null) { + client._transport().close(); + } + } catch (Exception e) { + LOG.warn("Error closing Elasticsearch transport: {}", e.getMessage()); + } + } +} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java index b731d7062eee..6617e294f136 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java @@ -66,10 +66,41 @@ private static void appendKnnQuery( // Build filter inside knn for efficient k-NN filtering sb.append(",\"filter\":{\"bool\":{\"must\":["); + appendFilterMustClauses(sb, filters); + sb.append("]}}"); // close must array and bool 
+ + sb.append("}}}}"); // close embedding, knn, query + } + + public static String buildNativeESQuery( + float[] vector, int size, int k, Map> filters) { + int numCandidates = Math.max(k, 100); + StringBuilder sb = + new StringBuilder(512) + .append("{\"size\":") + .append(size) + .append(",\"_source\":{\"excludes\":[\"embedding\"]}") + .append(",\"knn\":{") + .append("\"field\":\"embedding\"") + .append(",\"query_vector\":") + .append(Arrays.toString(vector)) + .append(",\"k\":") + .append(k) + .append(",\"num_candidates\":") + .append(numCandidates); + sb.append(",\"filter\":{\"bool\":{\"must\":["); + appendFilterMustClauses(sb, filters); + sb.append("]}}"); // close must array and bool + + sb.append("}}"); // close knn object + return sb.toString(); + } + + private static void appendFilterMustClauses(StringBuilder sb, Map> filters) { // Only include documents where deleted=false sb.append("{\"term\":{\"deleted\":false}}"); - + // Then add user-specified filters for (var e : filters.entrySet()) { String field = e.getKey(); @@ -126,10 +157,35 @@ private static void appendKnnQuery( } } } + } - sb.append("]}}"); // close must array and bool + private static void appendNested(StringBuilder sb, String path, String field, List vals) { + sb.append("{\"nested\":{\"path\":\"").append(path).append("\",\"query\":"); + if (vals.size() == 1) { + appendOneNestedQuery(sb, field, vals.get(0)); + } else { + sb.append("{\"bool\":{\"should\":["); + for (int i = 0; i < vals.size(); i++) { + if (i > 0) sb.append(','); + appendOneNestedQuery(sb, field, vals.get(i)); + } + sb.append("]}}"); + } + sb.append("}}"); + } - sb.append("}}}"); // close embedding, knn, wrapper + private static void appendOneNestedQuery(StringBuilder sb, String field, String val) { + switch (val) { + case ANY -> sb.append("{\"exists\":{\"field\":\"").append(field).append("\"}}"); + case NONE -> sb.append("{\"bool\":{\"must_not\":{\"exists\":{\"field\":\"") + .append(field) + .append("\"}}}}"); + default -> 
sb.append("{\"term\":{\"") + .append(field) + .append("\":\"") + .append(escape(val)) + .append("\"}}"); + } } private static void appendFlat(StringBuilder sb, String field, List vals) { diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java index 110dac14b6da..043ec4983ef1 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java @@ -5,6 +5,7 @@ import static org.mockito.Mockito.lenient; import es.co.elastic.clients.elasticsearch.ElasticsearchClient; +import java.util.Collections; import java.util.HashMap; import java.util.Map; import org.junit.jupiter.api.BeforeEach; @@ -80,4 +81,17 @@ void testContextDataHandling() { recreateIndex = (Boolean) contextData.getOrDefault("recreateIndex", false); assertEquals(false, recreateIndex); } + + @Test + void testIsVectorEmbeddingEnabledForEntity() { + assertEquals(false, elasticSearchBulkSink.isVectorEmbeddingEnabledForEntity("table")); + assertEquals(false, elasticSearchBulkSink.isVectorEmbeddingEnabledForEntity("user")); + assertEquals(false, elasticSearchBulkSink.isVectorEmbeddingEnabledForEntity("dashboard")); + } + + @Test + void testAddEntitiesToVectorIndexBatch() { + elasticSearchBulkSink.addEntitiesToVectorIndexBatch( + null, Collections.emptyList(), true, null, null); + } } diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManagerTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManagerTest.java index c95512062059..8ee009fe20ed 100644 --- 
a/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManagerTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManagerTest.java @@ -327,6 +327,21 @@ void testUpdateIndex_HandlesInvalidJson() { verifyNoInteractions(indicesClient); } + @Test + void testUpdateIndex_ExtractsMappingsFromFullIndexJson() throws IOException { + // putMapping only accepts the mappings sub-object, not a full index JSON with settings/aliases + String fullIndexJson = + "{\"settings\":{\"number_of_shards\":1}," + + "\"mappings\":{\"properties\":{\"field1\":{\"type\":\"text\"}}}," + + "\"aliases\":{}}"; + when(indexMapping.getIndexName(CLUSTER_ALIAS)).thenReturn(TEST_INDEX); + when(indicesClient.putMapping(any(PutMappingRequest.class))).thenReturn(putMappingResponse); + + assertDoesNotThrow(() -> indexManager.updateIndex(indexMapping, fullIndexJson)); + + verify(indicesClient).putMapping(any(PutMappingRequest.class)); + } + @Test void testCreateIndex_ClientNotAvailable() { ElasticSearchIndexManager managerWithNullClient = diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java new file mode 100644 index 000000000000..bc95a9ec3257 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -0,0 +1,340 @@ +package org.openmetadata.service.search.vector; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; + +import es.co.elastic.clients.elasticsearch.ElasticsearchClient; +import 
es.co.elastic.clients.transport.rest5_client.Rest5ClientTransport; +import es.co.elastic.clients.transport.rest5_client.low_level.Request; +import es.co.elastic.clients.transport.rest5_client.low_level.Response; +import es.co.elastic.clients.transport.rest5_client.low_level.Rest5Client; +import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; +import java.util.Map; +import org.apache.hc.core5.http.HttpEntity; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.openmetadata.service.search.vector.client.EmbeddingClient; +import org.openmetadata.service.search.vector.utils.DTOs; + +class ElasticSearchVectorServiceTest { + + private ElasticSearchVectorService vectorService; + private Rest5Client mockRestClient; + private EmbeddingClient mockEmbeddingClient; + + @BeforeEach + void setup() throws Exception { + ElasticsearchClient mockClient = mock(ElasticsearchClient.class); + Rest5ClientTransport mockTransport = mock(Rest5ClientTransport.class); + mockRestClient = mock(Rest5Client.class); + + when(mockClient._transport()).thenReturn(mockTransport); + when(mockTransport.restClient()).thenReturn(mockRestClient); + + mockEmbeddingClient = mock(EmbeddingClient.class); + when(mockEmbeddingClient.embed(any(String.class))).thenReturn(new float[] {0.1f, 0.2f, 0.3f}); + + vectorService = new ElasticSearchVectorService(mockClient, mockEmbeddingClient); + } + + @Test + void testThresholdFilteringRemovesLowScoreResults() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 4}, + "hits": [ + { + "_score": 0.9, + "_source": { + "parent_id": "parent1", + "chunk_index": 0, + "text": "High score chunk" + } + }, + { + "_score": 0.7, + "_source": { + "parent_id": "parent2", + "chunk_index": 0, + "text": "Medium score chunk" + } + }, + { + "_score": 0.4, + "_source": { + "parent_id": "parent3", + "chunk_index": 0, + "text": "Low score chunk" + } + }, + { + "_score": 0.2, + "_source": { + 
"parent_id": "parent4", + "chunk_index": 0, + "text": "Very low score chunk" + } + } + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.5); + + assertNotNull(results); + assertEquals(2, results.hits.size(), "Should return 2 results (scores 0.9 and 0.7)"); + for (Map result : results.hits) { + double score = (double) result.get("_score"); + assertTrue(score >= 0.5, "All results should have score >= 0.5, got: " + score); + } + } + + @Test + void testScoreFieldIncludedInResults() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 1}, + "hits": [ + { + "_score": 0.85, + "_source": { + "parent_id": "parent1", + "chunk_index": 0, + "text": "Test chunk" + } + } + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + + assertEquals(1, results.hits.size()); + assertTrue(results.hits.get(0).containsKey("_score"), "Result should contain _score field"); + assertEquals(0.85, (double) results.hits.get(0).get("_score"), 0.001); + } + + @Test + void testParentGroupingLimitsDistinctParents() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 8}, + "hits": [ + {"_score": 0.9, "_source": {"parent_id": "parent1", "chunk_index": 0}}, + {"_score": 0.88, "_source": {"parent_id": "parent1", "chunk_index": 1}}, + {"_score": 0.85, "_source": {"parent_id": "parent1", "chunk_index": 2}}, + {"_score": 0.8, "_source": {"parent_id": "parent2", "chunk_index": 0}}, + {"_score": 0.78, "_source": {"parent_id": "parent2", "chunk_index": 1}}, + {"_score": 0.7, "_source": {"parent_id": "parent3", "chunk_index": 0}}, + {"_score": 0.68, "_source": {"parent_id": "parent3", "chunk_index": 1}}, + {"_score": 0.6, "_source": {"parent_id": "parent4", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + 
DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 2, 100, 0.0); + + assertEquals(5, results.hits.size(), "Should return all chunks from first 2 parents (3+2=5)"); + long distinctParents = results.hits.stream().map(r -> r.get("parent_id")).distinct().count(); + assertEquals(2, distinctParents, "Should have chunks from exactly 2 distinct parents"); + } + + @Test + void testZeroThresholdReturnsAllResults() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 3}, + "hits": [ + {"_score": 0.9, "_source": {"parent_id": "p1", "chunk_index": 0}}, + {"_score": 0.5, "_source": {"parent_id": "p2", "chunk_index": 0}}, + {"_score": 0.1, "_source": {"parent_id": "p3", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + + assertEquals(3, results.hits.size(), "With threshold 0.0, should return all 3 results"); + } + + @Test + void testHighThresholdFiltersAllResults() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 3}, + "hits": [ + {"_score": 0.5, "_source": {"parent_id": "p1", "chunk_index": 0}}, + {"_score": 0.3, "_source": {"parent_id": "p2", "chunk_index": 0}}, + {"_score": 0.1, "_source": {"parent_id": "p3", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.9); + + assertEquals(0, results.hits.size(), "With threshold 0.9, all results should be filtered out"); + } + + @Test + void testChunksWithoutParentIdAreSkipped() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 3}, + "hits": [ + {"_score": 0.9, "_source": {"parent_id": "p1", "chunk_index": 0}}, + {"_score": 0.8, "_source": {"chunk_index": 0, "text": "orphan chunk"}}, + {"_score": 0.7, "_source": {"parent_id": "p2", "chunk_index": 0}} + ] + } + 
} + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + + assertEquals(2, results.hits.size(), "Chunks without parent_id should be skipped"); + } + + @Test + void testRequestedSizeLimitsDistinctParents() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 10}, + "hits": [ + {"_score": 0.9, "_source": {"parent_id": "p1", "chunk_index": 0}}, + {"_score": 0.8, "_source": {"parent_id": "p2", "chunk_index": 0}}, + {"_score": 0.7, "_source": {"parent_id": "p3", "chunk_index": 0}}, + {"_score": 0.6, "_source": {"parent_id": "p4", "chunk_index": 0}}, + {"_score": 0.5, "_source": {"parent_id": "p5", "chunk_index": 0}}, + {"_score": 0.4, "_source": {"parent_id": "p6", "chunk_index": 0}}, + {"_score": 0.3, "_source": {"parent_id": "p7", "chunk_index": 0}}, + {"_score": 0.2, "_source": {"parent_id": "p8", "chunk_index": 0}}, + {"_score": 0.15, "_source": {"parent_id": "p9", "chunk_index": 0}}, + {"_score": 0.1, "_source": {"parent_id": "p10", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 3, 100, 0.0); + + assertEquals(3, results.hits.size(), "Should limit to 3 distinct parents"); + long distinctParents = results.hits.stream().map(r -> r.get("parent_id")).distinct().count(); + assertEquals(3, distinctParents, "Should have exactly 3 distinct parents"); + } + + @Test + void testEmptyHitsResponseReturnsEmptyList() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 0}, + "hits": [] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + + assertNotNull(results); + assertTrue(results.hits.isEmpty(), "Empty hits should return empty list"); + } + + @Test + void testGetExistingFingerprintReturnsNullWhenNotFound() 
throws Exception { + String esResponse = """ + {"hits":{"total":{"value":0},"hits":[]}} + """; + + mockRestClientResponse(esResponse); + + String fingerprint = vectorService.getExistingFingerprint("vector_search_index", "unknown-id"); + + assertTrue(fingerprint == null, "Should return null when no fingerprint found"); + } + + @Test + void testGetExistingFingerprintReturnsValueWhenFound() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 1}, + "hits": [ + {"_source": {"fingerprint": "abc123"}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + String fingerprint = + vectorService.getExistingFingerprint("vector_search_index", "some-entity-id"); + + assertEquals("abc123", fingerprint); + } + + @Test + void testGetExistingFingerprintsBatchReturnsEmptyForNullInput() { + Map result = vectorService.getExistingFingerprintsBatch("index", null); + assertTrue(result.isEmpty()); + } + + @Test + void testGetExistingFingerprintsBatchReturnsEmptyForEmptyInput() { + Map result = + vectorService.getExistingFingerprintsBatch("index", java.util.List.of()); + assertTrue(result.isEmpty()); + } + + private void mockRestClientResponse(String responseJson) throws Exception { + Response mockResponse = mock(Response.class); + HttpEntity mockEntity = mock(HttpEntity.class); + + when(mockRestClient.performRequest(any(Request.class))).thenReturn(mockResponse); + when(mockResponse.getEntity()).thenReturn(mockEntity); + when(mockEntity.getContent()) + .thenReturn(new ByteArrayInputStream(responseJson.getBytes(StandardCharsets.UTF_8))); + } +} diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java index 52d46d853d52..81df86f0f721 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java +++ 
b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java @@ -698,9 +698,187 @@ void testIgnoresOnlyUnrecognizedFilterKeys() throws Exception { JsonNode root = MAPPER.readTree(query); JsonNode mustFilters = root.get("query").get("knn").get("embedding").get("filter").get("bool").get("must"); - + // Should have only 1 filter: deleted=false assertEquals(1, mustFilters.size()); assertFalse(mustFilters.get(0).get("term").get("deleted").asBoolean()); } + + // ------------------------------------------------------------------------- + // buildNativeESQuery tests (Elasticsearch 8.x/9.x top-level knn format) + // ------------------------------------------------------------------------- + + @Test + void testNativeESQueryTopLevelKnnStructure() throws Exception { + float[] vector = {0.1f, 0.2f, 0.3f}; + int size = 10; + int k = 100; + + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, size, k, Map.of()); + + JsonNode root = MAPPER.readTree(query); + assertEquals(size, root.get("size").asInt()); + + // Must have top-level "knn", NOT "query" + assertTrue(root.has("knn"), "ES native query must have top-level 'knn'"); + assertTrue(!root.has("query"), "ES native query must not have 'query' key"); + + JsonNode knn = root.get("knn"); + assertEquals("embedding", knn.get("field").asText()); + assertEquals(k, knn.get("k").asInt()); + assertNotNull(knn.get("query_vector")); + assertTrue(knn.get("query_vector").isArray()); + assertEquals(3, knn.get("query_vector").size()); + } + + @Test + void testNativeESQueryNumCandidates() throws Exception { + float[] vector = {0.1f}; + + // k < 100 → num_candidates should be 100 + String query1 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 50, Map.of()); + JsonNode root1 = MAPPER.readTree(query1); + assertEquals(100, root1.get("knn").get("num_candidates").asInt()); + + // k > 100 → num_candidates should equal k + String query2 = 
VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 200, Map.of()); + JsonNode root2 = MAPPER.readTree(query2); + assertEquals(200, root2.get("knn").get("num_candidates").asInt()); + } + + @Test + void testNativeESQueryAlwaysHasDeletedFilter() throws Exception { + float[] vector = {0.1f, 0.2f}; + + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, Map.of()); + + JsonNode root = MAPPER.readTree(query); + JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); + + assertNotNull(mustFilters); + assertTrue(mustFilters.isArray()); + assertTrue(mustFilters.size() >= 1); + assertEquals(false, mustFilters.get(0).get("term").get("deleted").asBoolean()); + } + + @Test + void testNativeESQueryWithEntityTypeFilter() throws Exception { + float[] vector = {0.5f}; + Map> filters = Map.of("entityType", List.of("table", "dashboard")); + + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 5, 50, filters); + + JsonNode root = MAPPER.readTree(query); + JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); + + assertEquals(2, mustFilters.size()); + JsonNode entityTypeFilter = mustFilters.get(1); + assertTrue(entityTypeFilter.has("terms")); + JsonNode entityTypes = entityTypeFilter.get("terms").get("entityType"); + assertEquals(2, entityTypes.size()); + assertEquals("table", entityTypes.get(0).asText()); + assertEquals("dashboard", entityTypes.get(1).asText()); + } + + @Test + void testNativeESQueryWithOwnersFilter() throws Exception { + float[] vector = {0.1f}; + Map> filters = Map.of("owners", List.of("user1", "team2")); + + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + + JsonNode root = MAPPER.readTree(query); + JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); + + assertEquals(2, mustFilters.size()); + JsonNode ownersFilter = mustFilters.get(1); + assertTrue(ownersFilter.has("bool")); + JsonNode shouldClauses = 
ownersFilter.get("bool").get("should"); + assertNotNull(shouldClauses); + assertEquals(2, shouldClauses.size()); + + String ownersJson = shouldClauses.toString(); + assertTrue(ownersJson.contains("user1")); + assertTrue(ownersJson.contains("team2")); + } + + @Test + void testNativeESQueryWithTagsFilter() throws Exception { + float[] vector = {0.1f, 0.2f}; + Map> filters = Map.of("tags", List.of("PII.Sensitive")); + + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + + JsonNode root = MAPPER.readTree(query); + JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); + + assertEquals(2, mustFilters.size()); + JsonNode tagsFilter = mustFilters.get(1); + assertTrue(tagsFilter.has("nested")); + assertEquals("tags", tagsFilter.get("nested").get("path").asText()); + } + + @Test + void testNativeESQueryWithMultipleFilters() throws Exception { + float[] vector = {0.1f, 0.2f}; + Map> filters = + Map.of( + "entityType", List.of("table"), + "tier", List.of("Tier.Tier1"), + "serviceType", List.of("BigQuery")); + + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + + JsonNode root = MAPPER.readTree(query); + JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); + + assertEquals(4, mustFilters.size(), "deleted=false + 3 user filters"); + String filtersJson = mustFilters.toString(); + assertTrue(filtersJson.contains("entityType")); + assertTrue(filtersJson.contains("tier")); + assertTrue(filtersJson.contains("serviceType")); + } + + @Test + void testNativeESQuerySourceExcludesEmbedding() throws Exception { + float[] vector = {0.1f}; + + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, Map.of()); + + JsonNode root = MAPPER.readTree(query); + JsonNode excludes = root.get("_source").get("excludes"); + assertNotNull(excludes); + assertTrue(excludes.isArray()); + assertEquals("embedding", excludes.get(0).asText()); + } + + @Test + void 
testNativeESQueryAndOpenSearchQueryProduceSameFilters() throws Exception { + float[] vector = {0.1f, 0.2f}; + Map> filters = + Map.of( + "entityType", List.of("table"), + "owners", List.of("alice"), + "tier", List.of("Tier.Gold")); + + String osQuery = VectorSearchQueryBuilder.build(vector, 10, 100, filters); + String esQuery = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + + JsonNode osFilters = + MAPPER + .readTree(osQuery) + .get("query") + .get("knn") + .get("embedding") + .get("filter") + .get("bool") + .get("must"); + JsonNode esFilters = MAPPER.readTree(esQuery).get("knn").get("filter").get("bool").get("must"); + + assertEquals( + osFilters.size(), + esFilters.size(), + "Both queries should produce the same number of filter clauses"); + assertEquals(osFilters.toString(), esFilters.toString(), "Filter clauses should be identical"); + } } diff --git a/openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json new file mode 100644 index 000000000000..1de030834943 --- /dev/null +++ b/openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json @@ -0,0 +1,293 @@ +{ + "settings": { + "analysis": { + "normalizer": { + "lowercase_normalizer": { + "type": "custom", + "filter": [ + "lowercase" + ] + } + }, + "filter": { + "om_stemmer": { + "type": "stemmer", + "name": "english" + }, + "word_delimiter_filter": { + "type": "word_delimiter", + "preserve_original": "true" + } + }, + "analyzer": { + "om_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "word_delimiter_filter", + "om_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "embedding": { + "type": "dense_vector", + "dims": 512, + "index": true, + "similarity": "cosine" + }, + "text_to_embed": { + "type": "text" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + }, 
+ "fullyQualifiedName": { + "type": "keyword" + }, + "entityType": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer", + "ignore_above": 256 + } + } + }, + "serviceType": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "parent_id": { + "type": "keyword" + }, + "chunk_index": { + "type": "integer" + }, + "chunk_count": { + "type": "integer" + }, + "tags": { + "type": "nested", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "tier": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "certification": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "domains": { + "type": "object", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "displayName": { + "type": "text" + } + } + }, + "owners": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text" + } + } + }, + "customProperties": { + "type": "object" + }, + "sourceId": { + "type": "keyword" + }, + "deleted": { + "type": "boolean" + }, + "fingerprint": { + "type": "keyword" + }, + "upVotes": { + "type": "integer" 
+ }, + "downVotes": { + "type": "integer" + }, + "totalVotes": { + "type": "integer" + }, + "followersCount": { + "type": "integer" + }, + "usageSummary": { + "type": "object", + "properties": { + "dailyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + } + } + }, + "weeklyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + "type": "double" + } + } + }, + "monthlyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + "type": "double" + } + } + } + } + }, + "synonyms": { + "type": "keyword" + }, + "relatedTerms": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text" + }, + "fullyQualifiedName": { + "type": "keyword" + } + } + }, + "metricExpression": { + "type": "object", + "properties": { + "language": { + "type": "keyword" + }, + "code": { + "type": "text", + "analyzer": "om_analyzer" + } + } + }, + "metricType": { + "type": "keyword" + }, + "unitOfMeasurement": { + "type": "keyword" + }, + "customUnitOfMeasurement": { + "type": "keyword" + }, + "granularity": { + "type": "keyword" + }, + "relatedMetrics": { + "type": "keyword" + } + } + } +} diff --git a/openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json new file mode 100644 index 000000000000..606bdc0a916c --- /dev/null +++ b/openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json @@ -0,0 +1,293 @@ +{ + "settings": { + "analysis": { + "normalizer": { + "lowercase_normalizer": { + "type": "custom", + "filter": [ + "lowercase" + ] + } + }, + "filter": { + "om_stemmer": { + "type": "stemmer", + "name": "english" + }, + "word_delimiter_filter": { + "type": "word_delimiter", + 
"preserve_original": "true" + } + }, + "analyzer": { + "om_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "word_delimiter_filter", + "om_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "embedding": { + "type": "dense_vector", + "dims": 512, + "index": true, + "similarity": "cosine" + }, + "text_to_embed": { + "type": "text" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + }, + "fullyQualifiedName": { + "type": "keyword" + }, + "entityType": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer", + "ignore_above": 256 + } + } + }, + "serviceType": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "parent_id": { + "type": "keyword" + }, + "chunk_index": { + "type": "integer" + }, + "chunk_count": { + "type": "integer" + }, + "tags": { + "type": "nested", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "tier": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "certification": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "domains": { + "type": "object", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + 
"displayName": { + "type": "text" + } + } + }, + "owners": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text" + } + } + }, + "customProperties": { + "type": "object" + }, + "sourceId": { + "type": "keyword" + }, + "deleted": { + "type": "boolean" + }, + "fingerprint": { + "type": "keyword" + }, + "upVotes": { + "type": "integer" + }, + "downVotes": { + "type": "integer" + }, + "totalVotes": { + "type": "integer" + }, + "followersCount": { + "type": "integer" + }, + "synonyms": { + "type": "keyword" + }, + "relatedTerms": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text" + }, + "fullyQualifiedName": { + "type": "keyword" + } + } + }, + "usageSummary": { + "type": "object", + "properties": { + "dailyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + } + } + }, + "weeklyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + "type": "double" + } + } + }, + "monthlyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + "type": "double" + } + } + } + } + }, + "metricExpression": { + "type": "object", + "properties": { + "language": { + "type": "keyword" + }, + "code": { + "type": "text", + "analyzer": "om_analyzer" + } + } + }, + "metricType": { + "type": "keyword" + }, + "unitOfMeasurement": { + "type": "keyword" + }, + "customUnitOfMeasurement": { + "type": "keyword" + }, + "granularity": { + "type": "keyword" + }, + "relatedMetrics": { + "type": "keyword" + } + } + } +} diff --git a/openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json new 
file mode 100644 index 000000000000..6f621f1fdb80 --- /dev/null +++ b/openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json @@ -0,0 +1,410 @@ +{ + "settings": { + "index": { + "max_ngram_diff": 17 + }, + "analysis": { + "tokenizer": { + "n_gram_tokenizer": { + "type": "ngram", + "min_gram": 3, + "max_gram": 20, + "token_chars": [ + "letter", + "digit" + ] + } + }, + "normalizer": { + "lowercase_normalizer": { + "type": "custom", + "filter": [ + "lowercase", + "asciifolding" + ] + } + }, + "filter": { + "word_delimiter_filter": { + "type": "word_delimiter", + "preserve_original": true + }, + "compound_word_delimiter_graph": { + "type": "word_delimiter_graph", + "generate_word_parts": true, + "generate_number_parts": true, + "split_on_case_change": true, + "split_on_numerics": true, + "catenate_words": false, + "catenate_numbers": false, + "catenate_all": false, + "preserve_original": true, + "stem_english_possessive": true + }, + "russian_stop": { + "type": "stop", + "stopwords": "_russian_" + }, + "english_stop": { + "type": "stop", + "stopwords": "_english_" + }, + "russian_snowball": { + "name": "russian", + "type": "stemmer" + }, + "om_kstem": { + "type": "kstem" + }, + "asciifolding": { + "type": "asciifolding" + } + }, + "analyzer": { + "om_analyzer": { + "tokenizer": "standard", + "filter": [ + "word_delimiter_filter", + "lowercase", + "asciifolding", + "russian_stop", + "russian_snowball", + "english_stop", + "om_kstem" + ] + }, + "om_ngram": { + "type": "custom", + "tokenizer": "n_gram_tokenizer", + "filter": [ + "lowercase" + ] + }, + "om_compound_analyzer": { + "tokenizer": "standard", + "filter": [ + "compound_word_delimiter_graph", + "lowercase", + "flatten_graph" + ] + } + } + } + }, + "mappings": { + "properties": { + "embedding": { + "type": "dense_vector", + "dims": 512, + "index": true, + "similarity": "cosine" + }, + "text_to_embed": { + "type": "text" + }, + "name": { + "type": "text", + "analyzer": 
"om_analyzer", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "ngram": { + "type": "text", + "analyzer": "om_ngram" + }, + "compound": { + "type": "text", + "analyzer": "om_compound_analyzer" + } + } + }, + "fullyQualifiedName": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "entityType": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer", + "ignore_above": 256 + } + } + }, + "serviceType": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "parent_id": { + "type": "keyword" + }, + "chunk_index": { + "type": "integer" + }, + "chunk_count": { + "type": "integer" + }, + "tags": { + "type": "nested", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "tier": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "certification": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "domains": { + "type": "object", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "displayName": { + "type": "text", + "analyzer": "om_analyzer", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "ngram": { 
+ "type": "text", + "analyzer": "om_ngram" + }, + "compound": { + "type": "text", + "analyzer": "om_compound_analyzer" + } + } + } + } + }, + "owners": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text", + "analyzer": "om_analyzer", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "ngram": { + "type": "text", + "analyzer": "om_ngram" + }, + "compound": { + "type": "text", + "analyzer": "om_compound_analyzer" + } + } + } + } + }, + "customProperties": { + "type": "object" + }, + "sourceId": { + "type": "keyword" + }, + "deleted": { + "type": "boolean" + }, + "fingerprint": { + "type": "keyword" + }, + "upVotes": { + "type": "integer" + }, + "downVotes": { + "type": "integer" + }, + "totalVotes": { + "type": "integer" + }, + "followersCount": { + "type": "integer" + }, + "synonyms": { + "type": "keyword" + }, + "relatedTerms": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text", + "analyzer": "om_analyzer", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "ngram": { + "type": "text", + "analyzer": "om_ngram" + }, + "compound": { + "type": "text", + "analyzer": "om_compound_analyzer" + } + } + }, + "fullyQualifiedName": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + } + } + }, + "usageSummary": { + "type": "object", + "properties": { + "dailyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + } + } + }, + "weeklyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + "type": "double" + } + } + }, + "monthlyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + 
"type": "double" + } + } + } + } + }, + "metricExpression": { + "type": "object", + "properties": { + "language": { + "type": "keyword" + }, + "code": { + "type": "text", + "analyzer": "om_analyzer" + } + } + }, + "metricType": { + "type": "keyword" + }, + "unitOfMeasurement": { + "type": "keyword" + }, + "customUnitOfMeasurement": { + "type": "keyword" + }, + "granularity": { + "type": "keyword" + }, + "relatedMetrics": { + "type": "keyword" + } + } + } +} diff --git a/openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json new file mode 100644 index 000000000000..606bdc0a916c --- /dev/null +++ b/openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json @@ -0,0 +1,293 @@ +{ + "settings": { + "analysis": { + "normalizer": { + "lowercase_normalizer": { + "type": "custom", + "filter": [ + "lowercase" + ] + } + }, + "filter": { + "om_stemmer": { + "type": "stemmer", + "name": "english" + }, + "word_delimiter_filter": { + "type": "word_delimiter", + "preserve_original": "true" + } + }, + "analyzer": { + "om_analyzer": { + "tokenizer": "standard", + "filter": [ + "lowercase", + "word_delimiter_filter", + "om_stemmer" + ] + } + } + } + }, + "mappings": { + "properties": { + "embedding": { + "type": "dense_vector", + "dims": 512, + "index": true, + "similarity": "cosine" + }, + "text_to_embed": { + "type": "text" + }, + "name": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword" + } + } + }, + "fullyQualifiedName": { + "type": "keyword" + }, + "entityType": { + "type": "keyword", + "fields": { + "keyword": { + "type": "keyword", + "normalizer": "lowercase_normalizer", + "ignore_above": 256 + } + } + }, + "serviceType": { + "type": "keyword", + "normalizer": "lowercase_normalizer" + }, + "parent_id": { + "type": "keyword" + }, + "chunk_index": { + "type": "integer" + }, + "chunk_count": { + "type": 
"integer" + }, + "tags": { + "type": "nested", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "tier": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "certification": { + "type": "object", + "properties": { + "tagFQN": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "labelType": { + "type": "keyword" + }, + "description": { + "type": "text", + "analyzer": "om_analyzer" + }, + "source": { + "type": "keyword" + }, + "state": { + "type": "keyword" + } + } + }, + "domains": { + "type": "object", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "displayName": { + "type": "text" + } + } + }, + "owners": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text" + } + } + }, + "customProperties": { + "type": "object" + }, + "sourceId": { + "type": "keyword" + }, + "deleted": { + "type": "boolean" + }, + "fingerprint": { + "type": "keyword" + }, + "upVotes": { + "type": "integer" + }, + "downVotes": { + "type": "integer" + }, + "totalVotes": { + "type": "integer" + }, + "followersCount": { + "type": "integer" + }, + "synonyms": { + "type": "keyword" + }, + "relatedTerms": { + "type": "nested", + "properties": { + "id": { + "type": "keyword" + }, + "name": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "displayName": { + "type": "text" + }, + "fullyQualifiedName": { + 
"type": "keyword" + } + } + }, + "usageSummary": { + "type": "object", + "properties": { + "dailyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + } + } + }, + "weeklyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + "type": "double" + } + } + }, + "monthlyStats": { + "type": "object", + "properties": { + "count": { + "type": "integer" + }, + "percentileRank": { + "type": "double" + } + } + } + } + }, + "metricExpression": { + "type": "object", + "properties": { + "language": { + "type": "keyword" + }, + "code": { + "type": "text", + "analyzer": "om_analyzer" + } + } + }, + "metricType": { + "type": "keyword" + }, + "unitOfMeasurement": { + "type": "keyword" + }, + "customUnitOfMeasurement": { + "type": "keyword" + }, + "granularity": { + "type": "keyword" + }, + "relatedMetrics": { + "type": "keyword" + } + } + } +} From e0801732c7ece773b7e2fd4e9264fed419c3b46d Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:28:47 -0300 Subject: [PATCH 02/18] fix: correct VectorSearchQueryBuilder.build() call in test to pass all 6 required args Co-Authored-By: Claude Sonnet 4.6 --- .../service/search/vector/VectorSearchQueryBuilderTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java index 81df86f0f721..a7f7bbfcbcf6 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java @@ -861,7 +861,7 @@ void testNativeESQueryAndOpenSearchQueryProduceSameFilters() throws Exception { "owners", List.of("alice"), "tier", 
List.of("Tier.Gold")); - String osQuery = VectorSearchQueryBuilder.build(vector, 10, 100, filters); + String osQuery = VectorSearchQueryBuilder.build(vector, 10, 0, 100, filters, 0.0); String esQuery = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); JsonNode osFilters = From a0cf7e5f6e6d4c55caa9fd416501cf8a2f2d2dbe Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:15:19 -0300 Subject: [PATCH 03/18] fix: align ElasticSearchVectorService with VectorIndexService interface and fix compilation - Expand VectorIndexService interface to declare getExistingFingerprint, getExistingFingerprintsBatch, executeGenericRequest, and VECTOR_INDEX_KEY constant so callers (VectorSearchResource, ElasticSearchBulkSink) can invoke these through the interface type - Add default getIndexAlias() to interface, removing duplicate private getSearchAlias() in OpenSearchVectorService - Fix ElasticSearchVectorService: add generateEmbeddingFields/ updateEntityEmbedding implementations, correct search() signature to include 'from' param, remove spurious @Override annotations - Add 'from' parameter to buildNativeESQuery (valid for ES KNN pagination) - DRY appendFilterMustClauses with nestedTags boolean: ES-native index maps tags as nested type, OpenSearch entity indices use flat object - Add semanticSearch field to searchRequest.json (required by SemanticSearchQueryBuilder) - Fix test: update all search() and buildNativeESQuery() call sites to pass the new 'from' parameter Co-Authored-By: Claude Sonnet 4.6 --- .../searchIndex/ElasticSearchBulkSink.java | 7 +-- .../search/VectorSearchResource.java | 2 +- .../vector/ElasticSearchVectorService.java | 52 ++++++++++--------- .../vector/OpenSearchVectorService.java | 18 ++----- .../search/vector/VectorIndexService.java | 20 +++++++ .../vector/VectorSearchQueryBuilder.java | 27 +++++++--- .../ElasticSearchVectorServiceTest.java | 16 +++--- 
.../vector/VectorSearchQueryBuilderTest.java | 21 ++++---- .../json/schema/search/searchRequest.json | 5 ++ 9 files changed, 101 insertions(+), 67 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java index 71101df2c944..4fd8b0158767 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java @@ -55,6 +55,7 @@ import org.openmetadata.service.search.elasticsearch.ElasticSearchClient; import org.openmetadata.service.search.elasticsearch.EsUtils; import org.openmetadata.service.search.indexes.ColumnSearchIndex; +import org.openmetadata.service.search.vector.ElasticSearchVectorService; import org.openmetadata.service.search.vector.VectorDocBuilder; import org.openmetadata.service.search.vector.VectorIndexService; import org.openmetadata.service.search.vector.utils.AvailableEntityTypes; @@ -803,7 +804,7 @@ void addEntitiesToVectorIndexBatch( return; } - String canonicalIndex = VectorIndexService.getClusteredIndexName(); + String canonicalIndex = vectorService.getIndexAlias(); String finalTargetIndex = canonicalIndex; String finalSourceIndex = null; @@ -853,7 +854,7 @@ private void processMigration( EntityInterface entity, StageStatsTracker tracker) { try { - if (vectorService.copyExistingVectorDocuments( + if (((ElasticSearchVectorService) vectorService).copyExistingVectorDocuments( sourceIndex, targetIndex, parentId, fingerprint)) { vectorSuccess.incrementAndGet(); if (tracker != null) { @@ -877,7 +878,7 @@ private void processEmbedding( String targetIndex, StageStatsTracker tracker) { try { - vectorService.updateVectorEmbeddings(entity, targetIndex); + 
vectorService.updateEntityEmbedding(entity, targetIndex); vectorSuccess.incrementAndGet(); if (tracker != null) { tracker.recordVector(StatsResult.SUCCESS); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java b/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java index 6de437a5c59d..7d18d03c4108 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java @@ -142,7 +142,7 @@ public Response getFingerprint( } try { - String indexName = vectorService.getIndexName(); + String indexName = vectorService.getIndexAlias(); String fingerprint = vectorService.getExistingFingerprint(indexName, parentId); FingerprintResponse response = new FingerprintResponse( diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index 14a60295a7c1..2df305a00037 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -29,6 +29,7 @@ import lombok.Getter; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.EntityInterface; +import org.openmetadata.service.Entity; import org.openmetadata.service.events.lifecycle.EntityLifecycleEventDispatcher; import org.openmetadata.service.search.vector.client.EmbeddingClient; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; @@ -92,15 +93,20 @@ private void registerVectorEmbeddingHandler() { @Override @SuppressWarnings("unchecked") public VectorSearchResponse search( - String query, Map> filters, int 
size, int k, double threshold) { + String query, + Map> filters, + int size, + int from, + int k, + double threshold) { long start = System.currentTimeMillis(); try { float[] queryVector = embeddingClient.embed(query); int overFetchSize = size * OVER_FETCH_MULTIPLIER; String queryJson = - VectorSearchQueryBuilder.buildNativeESQuery(queryVector, overFetchSize, k, filters); - String indexName = getClusteredIndexName(); + VectorSearchQueryBuilder.buildNativeESQuery(queryVector, overFetchSize, from, k, filters); + String indexName = getIndexAlias(); String responseBody = executeGenericRequest("POST", "/" + indexName + "/_search", queryJson); JsonNode root = MAPPER.readTree(responseBody); @@ -140,7 +146,8 @@ public VectorSearchResponse search( } } - String executeGenericRequest(String method, String endpoint, String body) { + @Override + public String executeGenericRequest(String method, String endpoint, String body) { try { Request request = new Request(method, endpoint); if (body != null) { @@ -157,6 +164,15 @@ String executeGenericRequest(String method, String endpoint, String body) { } @Override + public Map generateEmbeddingFields(EntityInterface entity) { + return VectorDocBuilder.buildEmbeddingFields(entity, embeddingClient); + } + + @Override + public void updateEntityEmbedding(EntityInterface entity, String entityIndexName) { + updateVectorEmbeddings(entity, entityIndexName); + } + public void updateVectorEmbeddings(EntityInterface entity, String targetIndex) { try { String parentId = entity.getId().toString(); @@ -180,7 +196,6 @@ public void updateVectorEmbeddings(EntityInterface entity, String targetIndex) { } } - @Override public void updateVectorEmbeddingsWithMigration( EntityInterface entity, String targetIndex, String sourceIndex) { try { @@ -284,7 +299,6 @@ public Map getExistingFingerprintsBatch( } } - @Override @SuppressWarnings("unchecked") public boolean copyExistingVectorDocuments( String sourceIndex, String targetIndex, String parentId, String 
fingerprint) { @@ -321,11 +335,10 @@ public boolean copyExistingVectorDocuments( } } - @Override public void softDeleteEmbeddings(EntityInterface entity) { try { String parentId = entity.getId().toString(); - String indexName = getClusteredIndexName(); + String indexName = getIndexAlias(); String script = "{\"script\":{\"source\":\"ctx._source.deleted = true\"}," + "\"query\":{\"term\":{\"parent_id\":\"" @@ -338,11 +351,10 @@ public void softDeleteEmbeddings(EntityInterface entity) { } } - @Override public void hardDeleteEmbeddings(EntityInterface entity) { try { String parentId = entity.getId().toString(); - String indexName = getClusteredIndexName(); + String indexName = getIndexAlias(); deleteByParentId(indexName, parentId); } catch (Exception e) { LOG.error( @@ -350,11 +362,10 @@ public void hardDeleteEmbeddings(EntityInterface entity) { } } - @Override public void restoreEmbeddings(EntityInterface entity) { try { String parentId = entity.getId().toString(); - String indexName = getClusteredIndexName(); + String indexName = getIndexAlias(); String script = "{\"script\":{\"source\":\"ctx._source.deleted = false\"}," + "\"query\":{\"term\":{\"parent_id\":\"" @@ -384,15 +395,11 @@ private void deleteByParentId(String indexName, String parentId) { } } - private static String getClusteredIndexName() { - return VectorIndexService.getClusteredIndexName(); - } - @Override public void createOrUpdateIndex(int dimension) { try { if (indexExists()) { - LOG.info("Vector index {} already exists", VECTOR_INDEX_NAME); + LOG.info("Vector index {} already exists", getIndexAlias()); return; } @@ -404,7 +411,7 @@ public void createOrUpdateIndex(int dimension) { CreateIndexRequest request = CreateIndexRequest.of( builder -> { - builder.index(getClusteredIndexName()); + builder.index(getIndexAlias()); if (mappingsNode != null && !mappingsNode.isNull()) { TypeMapping typeMapping = parseTypeMapping(mappingsNode); @@ -420,16 +427,15 @@ public void createOrUpdateIndex(int dimension) { 
}); client.indices().create(request); - LOG.info("Created vector index {} with dimension {}", getClusteredIndexName(), dimension); + LOG.info("Created vector index {} with dimension {}", getIndexAlias(), dimension); } catch (Exception e) { LOG.error("Failed to create vector index: {}", e.getMessage(), e); } } - @Override public boolean indexExists() { try { - ExistsRequest request = ExistsRequest.of(b -> b.index(getClusteredIndexName())); + ExistsRequest request = ExistsRequest.of(b -> b.index(getIndexAlias())); return client.indices().exists(request).value(); } catch (Exception e) { LOG.error("Failed to check if vector index exists: {}", e.getMessage(), e); @@ -437,12 +443,10 @@ public boolean indexExists() { } } - @Override public String getIndexName() { - return getClusteredIndexName(); + return getIndexAlias(); } - @Override @SuppressWarnings("unchecked") public void bulkIndex(List> documents, String targetIndex) { if (documents == null || documents.isEmpty()) { diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java index da311770d44f..ca3753f16438 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java @@ -198,7 +198,7 @@ public VectorSearchResponse search( overFetchSize = Math.min(overFetchSize, k); } - String aliasName = getSearchAlias(); + String aliasName = getIndexAlias(); while (!exhausted && byParent.size() < requestedParents) { String queryJson = VectorSearchQueryBuilder.build( @@ -280,6 +280,7 @@ private static long extractTotalHits(JsonNode root) { return -1L; } + @Override public String getExistingFingerprint(String indexName, String entityId) { try { String query = @@ -303,6 +304,7 @@ public String 
getExistingFingerprint(String indexName, String entityId) { return null; } + @Override public Map getExistingFingerprintsBatch( String indexName, List entityIds) { if (entityIds == null || entityIds.isEmpty()) { @@ -359,7 +361,8 @@ public void partialUpdateEntity( } } - String executeGenericRequest(String method, String endpoint, String body) { + @Override + public String executeGenericRequest(String method, String endpoint, String body) { try { OpenSearchGenericClient genericClient = client.generic(); var request = Requests.builder().endpoint(endpoint).method(method).json(body).build(); @@ -387,15 +390,4 @@ String executeGenericRequest(String method, String endpoint, String body) { } } - private String getSearchAlias() { - try { - String clusterAlias = Entity.getSearchRepository().getClusterAlias(); - if (clusterAlias == null || clusterAlias.isEmpty()) { - return VECTOR_EMBEDDING_ALIAS; - } - return clusterAlias + "_" + VECTOR_EMBEDDING_ALIAS; - } catch (Exception ex) { - return VECTOR_EMBEDDING_ALIAS; - } - } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java index b0b2c6e72625..eb3e870fffe1 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java @@ -3,11 +3,13 @@ import java.util.List; import java.util.Map; import org.openmetadata.schema.EntityInterface; +import org.openmetadata.service.Entity; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; public interface VectorIndexService { String VECTOR_EMBEDDING_ALIAS = "dataAssetEmbeddings"; + String VECTOR_INDEX_KEY = "vectorEmbeddings"; Map generateEmbeddingFields(EntityInterface entity); @@ -15,4 +17,22 @@ public interface VectorIndexService { VectorSearchResponse search( 
String query, Map> filters, int size, int from, int k, double threshold); + + String getExistingFingerprint(String indexName, String entityId); + + Map getExistingFingerprintsBatch(String indexName, List entityIds); + + String executeGenericRequest(String method, String endpoint, String body); + + default String getIndexAlias() { + try { + String clusterAlias = Entity.getSearchRepository().getClusterAlias(); + if (clusterAlias == null || clusterAlias.isEmpty()) { + return VECTOR_EMBEDDING_ALIAS; + } + return clusterAlias + "_" + VECTOR_EMBEDDING_ALIAS; + } catch (Exception ex) { + return VECTOR_EMBEDDING_ALIAS; + } + } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java index 6617e294f136..f40dfa300911 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java @@ -73,12 +73,14 @@ private static void appendKnnQuery( } public static String buildNativeESQuery( - float[] vector, int size, int k, Map> filters) { + float[] vector, int size, int from, int k, Map> filters) { int numCandidates = Math.max(k, 100); StringBuilder sb = new StringBuilder(512) .append("{\"size\":") .append(size) + .append(",\"from\":") + .append(from) .append(",\"_source\":{\"excludes\":[\"embedding\"]}") .append(",\"knn\":{") .append("\"field\":\"embedding\"") @@ -90,7 +92,7 @@ public static String buildNativeESQuery( .append(numCandidates); sb.append(",\"filter\":{\"bool\":{\"must\":["); - appendFilterMustClauses(sb, filters); + appendFilterMustClauses(sb, filters, true); sb.append("]}}"); // close must array and bool sb.append("}}"); // close knn object @@ -98,16 +100,21 @@ public static String buildNativeESQuery( } private static void 
appendFilterMustClauses(StringBuilder sb, Map> filters) { - // Only include documents where deleted=false + appendFilterMustClauses(sb, filters, false); + } + + /** + * @param nestedTags when true, emits a nested query for "tags" (required by the ES-native vector + * index where tags is mapped as nested); when false, emits a flat terms query (used by + * OpenSearch, which queries the regular entity indices where tags is a plain object). + */ + private static void appendFilterMustClauses( + StringBuilder sb, Map> filters, boolean nestedTags) { sb.append("{\"term\":{\"deleted\":false}}"); - - // Then add user-specified filters for (var e : filters.entrySet()) { String field = e.getKey(); List values = e.getValue(); if (values == null || values.isEmpty()) continue; - - // Handle custom properties that will come with "customProperties." if (field.startsWith("customProperties.")) { sb.append(','); appendCustomPropertiesFilter(sb, field, values); @@ -119,7 +126,11 @@ private static void appendFilterMustClauses(StringBuilder sb, Map { sb.append(','); - appendFlat(sb, "tags.tagFQN", values); + if (nestedTags) { + appendNested(sb, "tags", "tags.tagFQN", values); + } else { + appendFlat(sb, "tags.tagFQN", values); + } } case "domains" -> { sb.append(','); diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java index bc95a9ec3257..f7608bc9c2eb 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -89,7 +89,7 @@ void testThresholdFilteringRemovesLowScoreResults() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.5); + 
DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.5); assertNotNull(results); assertEquals(2, results.hits.size(), "Should return 2 results (scores 0.9 and 0.7)"); @@ -122,7 +122,7 @@ void testScoreFieldIncludedInResults() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); assertEquals(1, results.hits.size()); assertTrue(results.hits.get(0).containsKey("_score"), "Result should contain _score field"); @@ -152,7 +152,7 @@ void testParentGroupingLimitsDistinctParents() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 2, 100, 0.0); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 2, 0, 100, 0.0); assertEquals(5, results.hits.size(), "Should return all chunks from first 2 parents (3+2=5)"); long distinctParents = results.hits.stream().map(r -> r.get("parent_id")).distinct().count(); @@ -177,7 +177,7 @@ void testZeroThresholdReturnsAllResults() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); assertEquals(3, results.hits.size(), "With threshold 0.0, should return all 3 results"); } @@ -200,7 +200,7 @@ void testHighThresholdFiltersAllResults() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.9); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.9); assertEquals(0, results.hits.size(), "With threshold 0.9, all results should be filtered out"); } @@ -223,7 +223,7 
@@ void testChunksWithoutParentIdAreSkipped() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); assertEquals(2, results.hits.size(), "Chunks without parent_id should be skipped"); } @@ -253,7 +253,7 @@ void testRequestedSizeLimitsDistinctParents() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 3, 100, 0.0); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 3, 0, 100, 0.0); assertEquals(3, results.hits.size(), "Should limit to 3 distinct parents"); long distinctParents = results.hits.stream().map(r -> r.get("parent_id")).distinct().count(); @@ -274,7 +274,7 @@ void testEmptyHitsResponseReturnsEmptyList() throws Exception { mockRestClientResponse(esResponse); - DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 100, 0.0); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); assertNotNull(results); assertTrue(results.hits.isEmpty(), "Empty hits should return empty list"); diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java index a7f7bbfcbcf6..0dfa14264c66 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java @@ -714,7 +714,8 @@ void testNativeESQueryTopLevelKnnStructure() throws Exception { int size = 10; int k = 100; - String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, size, k, Map.of()); + + 
String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, size, 0, k, Map.of()); JsonNode root = MAPPER.readTree(query); assertEquals(size, root.get("size").asInt()); @@ -736,12 +737,12 @@ void testNativeESQueryNumCandidates() throws Exception { float[] vector = {0.1f}; // k < 100 → num_candidates should be 100 - String query1 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 50, Map.of()); + String query1 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 50, Map.of()); JsonNode root1 = MAPPER.readTree(query1); assertEquals(100, root1.get("knn").get("num_candidates").asInt()); // k > 100 → num_candidates should equal k - String query2 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 200, Map.of()); + String query2 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 200, Map.of()); JsonNode root2 = MAPPER.readTree(query2); assertEquals(200, root2.get("knn").get("num_candidates").asInt()); } @@ -750,7 +751,7 @@ void testNativeESQueryNumCandidates() throws Exception { void testNativeESQueryAlwaysHasDeletedFilter() throws Exception { float[] vector = {0.1f, 0.2f}; - String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, Map.of()); + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, Map.of()); JsonNode root = MAPPER.readTree(query); JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); @@ -766,7 +767,7 @@ void testNativeESQueryWithEntityTypeFilter() throws Exception { float[] vector = {0.5f}; Map> filters = Map.of("entityType", List.of("table", "dashboard")); - String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 5, 50, filters); + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 5, 0, 50, filters); JsonNode root = MAPPER.readTree(query); JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); @@ -785,7 +786,7 @@ void testNativeESQueryWithOwnersFilter() throws Exception { float[] vector = {0.1f}; 
Map> filters = Map.of("owners", List.of("user1", "team2")); - String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, filters); JsonNode root = MAPPER.readTree(query); JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); @@ -807,7 +808,7 @@ void testNativeESQueryWithTagsFilter() throws Exception { float[] vector = {0.1f, 0.2f}; Map> filters = Map.of("tags", List.of("PII.Sensitive")); - String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, filters); JsonNode root = MAPPER.readTree(query); JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); @@ -827,7 +828,7 @@ void testNativeESQueryWithMultipleFilters() throws Exception { "tier", List.of("Tier.Tier1"), "serviceType", List.of("BigQuery")); - String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, filters); JsonNode root = MAPPER.readTree(query); JsonNode mustFilters = root.get("knn").get("filter").get("bool").get("must"); @@ -843,7 +844,7 @@ void testNativeESQueryWithMultipleFilters() throws Exception { void testNativeESQuerySourceExcludesEmbedding() throws Exception { float[] vector = {0.1f}; - String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, Map.of()); + String query = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, Map.of()); JsonNode root = MAPPER.readTree(query); JsonNode excludes = root.get("_source").get("excludes"); @@ -862,7 +863,7 @@ void testNativeESQueryAndOpenSearchQueryProduceSameFilters() throws Exception { "tier", List.of("Tier.Gold")); String osQuery = VectorSearchQueryBuilder.build(vector, 10, 0, 100, filters, 0.0); - String esQuery = 
VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 100, filters); + String esQuery = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, filters); JsonNode osFilters = MAPPER diff --git a/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json b/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json index 470503f3c3f3..b9fb3d87ad6c 100644 --- a/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json +++ b/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json @@ -109,6 +109,11 @@ "description": "Include aggregations in the search response. Defaults to true. Set to false to skip aggregations for faster response times when only search results are needed.", "type": "boolean", "default": true + }, + "semanticSearch": { + "description": "If true, use semantic (vector) search instead of keyword search.", + "type": "boolean", + "default": false } }, "additionalProperties": false From f97048b1ed2ce85f1ab19c942637b6b8d0580e97 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:30:06 -0300 Subject: [PATCH 04/18] fix: replace brittle string-replace in loadIndexMapping with ObjectMapper Use Jackson ObjectMapper to patch the 'dims' field in the vector index mapping template instead of exact string matching, which was fragile against whitespace variations. Extract into package-private patchDimension() and add 3 unit tests covering dimension replacement, preservation of other fields, and the no-space JSON variant. 
Co-Authored-By: Claude Sonnet 4.6 --- .../vector/ElasticSearchVectorService.java | 25 ++++++++--- .../ElasticSearchVectorServiceTest.java | 41 +++++++++++++++++++ 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index 2df305a00037..92cc5ab04694 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; import es.co.elastic.clients.elasticsearch.ElasticsearchClient; import es.co.elastic.clients.elasticsearch._types.Refresh; import es.co.elastic.clients.elasticsearch._types.mapping.TypeMapping; @@ -524,17 +525,29 @@ private String loadIndexMapping(int dimension) { throw new IllegalStateException("Could not find " + resourcePath + " in classpath"); } String template = new String(inputStream.readAllBytes(), StandardCharsets.UTF_8); - String result = template.replace("\"dims\": 512", "\"dims\": " + dimension); - if (result.equals(template) && dimension != 512) { - throw new IllegalStateException( - "Failed to replace dimension placeholder in vector index mapping template"); - } - return result; + return patchDimension(template, dimension); } catch (Exception e) { throw new RuntimeException("Failed to load vector search index mapping", e); } } + static String patchDimension(String mappingJson, int dimension) { + try { + JsonNode root = MAPPER.readTree(mappingJson); + JsonNode properties = root.path("mappings").path("properties"); + if (!properties.isMissingNode() && properties.has("embedding")) { + ObjectNode 
embeddingNode = (ObjectNode) properties.get("embedding"); + if (embeddingNode.has("dims")) { + embeddingNode.put("dims", dimension); + } + } + return MAPPER.writeValueAsString(root); + } catch (Exception e) { + throw new IllegalStateException( + "Failed to patch dimension in vector index mapping template", e); + } + } + public void close() { try { if (client != null && client._transport() != null) { diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java index f7608bc9c2eb..d68961b5f94c 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -328,6 +328,47 @@ void testGetExistingFingerprintsBatchReturnsEmptyForEmptyInput() { assertTrue(result.isEmpty()); } + @Test + void testPatchDimensionReplacesDims() throws Exception { + String mapping = + """ + {"mappings":{"properties":{"embedding":{"type":"dense_vector","dims":512}}}} + """; + String patched = ElasticSearchVectorService.patchDimension(mapping, 1536); + com.fasterxml.jackson.databind.JsonNode root = + new com.fasterxml.jackson.databind.ObjectMapper().readTree(patched); + int dims = root.path("mappings").path("properties").path("embedding").path("dims").asInt(); + assertEquals(1536, dims); + } + + @Test + void testPatchDimensionLeavesOtherFieldsUntouched() throws Exception { + String mapping = + """ + {"mappings":{"properties":{"embedding":{"type":"dense_vector","dims":512,"similarity":"cosine"}}}} + """; + String patched = ElasticSearchVectorService.patchDimension(mapping, 768); + com.fasterxml.jackson.databind.JsonNode root = + new com.fasterxml.jackson.databind.ObjectMapper().readTree(patched); + com.fasterxml.jackson.databind.JsonNode embedding = + 
root.path("mappings").path("properties").path("embedding"); + assertEquals(768, embedding.path("dims").asInt()); + assertEquals("dense_vector", embedding.path("type").asText()); + assertEquals("cosine", embedding.path("similarity").asText()); + } + + @Test + void testPatchDimensionHandlesNoSpaceVariant() throws Exception { + String mapping = + """ + {"mappings":{"properties":{"embedding":{"type":"dense_vector","dims":512}}}} + """; + String patched = ElasticSearchVectorService.patchDimension(mapping, 384); + com.fasterxml.jackson.databind.JsonNode root = + new com.fasterxml.jackson.databind.ObjectMapper().readTree(patched); + assertEquals(384, root.path("mappings").path("properties").path("embedding").path("dims").asInt()); + } + private void mockRestClientResponse(String responseJson) throws Exception { Response mockResponse = mock(Response.class); HttpEntity mockEntity = mock(HttpEntity.class); From 420e028e0939493c8f95face2716c1aa87f18347 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 10 Apr 2026 15:49:50 -0300 Subject: [PATCH 05/18] fix: fully initialize ElasticSearchVectorService before publishing instance Assign to the volatile 'instance' field only after registerVectorEmbeddingHandler() completes, so a concurrent caller via getInstance() cannot observe a partially-initialized service. 
Co-Authored-By: Claude Sonnet 4.6 --- .../service/search/vector/ElasticSearchVectorService.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index 92cc5ab04694..cd807d6ecddb 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -69,8 +69,9 @@ public static synchronized void init( if (instance != null) { LOG.warn("ElasticSearchVectorService already initialized, reinitializing"); } - instance = new ElasticSearchVectorService(client, embeddingClient, language); - instance.registerVectorEmbeddingHandler(); + ElasticSearchVectorService svc = new ElasticSearchVectorService(client, embeddingClient, language); + svc.registerVectorEmbeddingHandler(); + instance = svc; LOG.info( "ElasticSearchVectorService initialized with model={}, dimension={}", embeddingClient.getModelId(), From 493e0cb4bfa1ae8e0f6a32c193f10ea497be7a6e Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 10 Apr 2026 16:14:45 -0300 Subject: [PATCH 06/18] fix: mirror OpenSearch pagination logic in ElasticSearchVectorService.search() ES search() was passing 'from' directly to the KNN query, which skips raw chunks rather than parent entities. Mirror the OpenSearch approach: - Loop with rawOffset to collect from + size + 1 distinct parents - Skip 'from' parents in application code after collection - Return 4-arg VectorSearchResponse with totalHits and hasMore populated Extract collectSearchHits() and extractTotalHits() private helpers (same pattern as OpenSearchVectorService). 
Update tests to use parentId (camelCase, matching VectorDocBuilder), use thenAnswer for fresh mock streams on each loop iteration, and add sequence mock for multi-page termination tests. Co-Authored-By: Claude Sonnet 4.6 --- .../vector/ElasticSearchVectorService.java | 84 ++++-- .../ElasticSearchVectorServiceTest.java | 265 +++++++++++++----- 2 files changed, 251 insertions(+), 98 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index cd807d6ecddb..5f518b9bd5d5 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -104,35 +104,45 @@ public VectorSearchResponse search( long start = System.currentTimeMillis(); try { float[] queryVector = embeddingClient.embed(query); - int overFetchSize = size * OVER_FETCH_MULTIPLIER; + LinkedHashMap>> byParent = new LinkedHashMap<>(); + int rawOffset = 0; + long totalHits = -1L; + boolean exhausted = false; + int requestedParents = from + size + 1; // one extra to determine hasMore + int overFetchSize = Math.max(requestedParents * OVER_FETCH_MULTIPLIER, OVER_FETCH_MULTIPLIER); + if (threshold <= 0.0) { + overFetchSize = Math.min(overFetchSize, k); + } - String queryJson = - VectorSearchQueryBuilder.buildNativeESQuery(queryVector, overFetchSize, from, k, filters); String indexName = getIndexAlias(); - String responseBody = executeGenericRequest("POST", "/" + indexName + "/_search", queryJson); - - JsonNode root = MAPPER.readTree(responseBody); - JsonNode hitsNode = root.path("hits").path("hits"); - - LinkedHashMap>> byParent = new LinkedHashMap<>(); - for (JsonNode hit : hitsNode) { - double score = hit.path("_score").asDouble(0.0); - if (score < threshold) { - 
continue; + while (!exhausted && byParent.size() < requestedParents) { + String queryJson = + VectorSearchQueryBuilder.buildNativeESQuery( + queryVector, overFetchSize, rawOffset, k, filters); + String responseBody = executeGenericRequest("POST", "/" + indexName + "/_search", queryJson); + + JsonNode root = MAPPER.readTree(responseBody); + JsonNode hitsNode = root.path("hits").path("hits"); + totalHits = extractTotalHits(root); + + int pageHitCount = collectSearchHits(hitsNode, threshold, byParent); + if (pageHitCount == 0) { + exhausted = true; + break; } - Map hitMap = MAPPER.convertValue(hit.path("_source"), Map.class); - hitMap.put("_score", score); - - String parentId = (String) hitMap.get("parent_id"); - if (parentId != null) { - byParent.computeIfAbsent(parentId, kVal -> new ArrayList<>()).add(hitMap); - } + rawOffset += pageHitCount; + exhausted = totalHits >= 0 ? rawOffset >= totalHits : pageHitCount < overFetchSize; } List> results = new ArrayList<>(); int parentCount = 0; + int skipped = 0; for (List> chunks : byParent.values()) { + if (skipped < from) { + skipped++; + continue; + } if (parentCount >= size) { break; } @@ -140,14 +150,46 @@ public VectorSearchResponse search( parentCount++; } + boolean hasMore = byParent.size() > (from + parentCount); long tookMillis = System.currentTimeMillis() - start; - return new VectorSearchResponse(tookMillis, results); + return new VectorSearchResponse( + tookMillis, results, totalHits >= 0 ? 
totalHits : null, hasMore); } catch (Exception e) { LOG.error("Vector search failed: {}", e.getMessage(), e); throw new RuntimeException("Vector search failed", e); } } + private static int collectSearchHits( + JsonNode hitsNode, + double threshold, + LinkedHashMap>> byParent) { + int pageHitCount = 0; + for (JsonNode hit : hitsNode) { + pageHitCount++; + double score = hit.path("_score").asDouble(0.0); + if (score < threshold) { + continue; + } + Map hitMap = MAPPER.convertValue(hit.path("_source"), Map.class); + hitMap.put("_score", score); + String parentId = (String) hitMap.getOrDefault("parentId", hit.path("_id").asText()); + byParent.computeIfAbsent(parentId, ignored -> new ArrayList<>()).add(hitMap); + } + return pageHitCount; + } + + private static long extractTotalHits(JsonNode root) { + JsonNode totalNode = root.path("hits").path("total"); + if (totalNode.isIntegralNumber()) { + return totalNode.asLong(-1L); + } + if (totalNode.isObject()) { + return totalNode.path("value").asLong(-1L); + } + return -1L; + } + @Override public String executeGenericRequest(String method, String endpoint, String body) { try { diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java index d68961b5f94c..13404e5b8e5b 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -1,7 +1,9 @@ package org.openmetadata.service.search.vector; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static 
org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.mock; @@ -15,6 +17,7 @@ import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; import org.apache.hc.core5.http.HttpEntity; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -23,6 +26,9 @@ class ElasticSearchVectorServiceTest { + private static final String EMPTY_HITS_RESPONSE = + "{\"hits\":{\"total\":{\"value\":0},\"hits\":[]}}"; + private ElasticSearchVectorService vectorService; private Rest5Client mockRestClient; private EmbeddingClient mockEmbeddingClient; @@ -50,38 +56,10 @@ void testThresholdFilteringRemovesLowScoreResults() throws Exception { "hits": { "total": {"value": 4}, "hits": [ - { - "_score": 0.9, - "_source": { - "parent_id": "parent1", - "chunk_index": 0, - "text": "High score chunk" - } - }, - { - "_score": 0.7, - "_source": { - "parent_id": "parent2", - "chunk_index": 0, - "text": "Medium score chunk" - } - }, - { - "_score": 0.4, - "_source": { - "parent_id": "parent3", - "chunk_index": 0, - "text": "Low score chunk" - } - }, - { - "_score": 0.2, - "_source": { - "parent_id": "parent4", - "chunk_index": 0, - "text": "Very low score chunk" - } - } + {"_score": 0.9, "_source": {"parentId": "parent1", "chunk_index": 0, "text": "High score chunk"}}, + {"_score": 0.7, "_source": {"parentId": "parent2", "chunk_index": 0, "text": "Medium score chunk"}}, + {"_score": 0.4, "_source": {"parentId": "parent3", "chunk_index": 0, "text": "Low score chunk"}}, + {"_score": 0.2, "_source": {"parentId": "parent4", "chunk_index": 0, "text": "Very low score chunk"}} ] } } @@ -107,14 +85,7 @@ void testScoreFieldIncludedInResults() throws Exception { "hits": { "total": {"value": 1}, "hits": [ - { - "_score": 0.85, - "_source": { - "parent_id": "parent1", - "chunk_index": 0, - "text": "Test chunk" - } - } + {"_score": 
0.85, "_source": {"parentId": "parent1", "chunk_index": 0, "text": "Test chunk"}} ] } } @@ -131,20 +102,21 @@ void testScoreFieldIncludedInResults() throws Exception { @Test void testParentGroupingLimitsDistinctParents() throws Exception { + // size=2 → requestedParents=3; 4 distinct parents in response causes loop to exit after 1 page String esResponse = """ { "hits": { "total": {"value": 8}, "hits": [ - {"_score": 0.9, "_source": {"parent_id": "parent1", "chunk_index": 0}}, - {"_score": 0.88, "_source": {"parent_id": "parent1", "chunk_index": 1}}, - {"_score": 0.85, "_source": {"parent_id": "parent1", "chunk_index": 2}}, - {"_score": 0.8, "_source": {"parent_id": "parent2", "chunk_index": 0}}, - {"_score": 0.78, "_source": {"parent_id": "parent2", "chunk_index": 1}}, - {"_score": 0.7, "_source": {"parent_id": "parent3", "chunk_index": 0}}, - {"_score": 0.68, "_source": {"parent_id": "parent3", "chunk_index": 1}}, - {"_score": 0.6, "_source": {"parent_id": "parent4", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "parent1", "chunk_index": 0}}, + {"_score": 0.88, "_source": {"parentId": "parent1", "chunk_index": 1}}, + {"_score": 0.85, "_source": {"parentId": "parent1", "chunk_index": 2}}, + {"_score": 0.8, "_source": {"parentId": "parent2", "chunk_index": 0}}, + {"_score": 0.78, "_source": {"parentId": "parent2", "chunk_index": 1}}, + {"_score": 0.7, "_source": {"parentId": "parent3", "chunk_index": 0}}, + {"_score": 0.68, "_source": {"parentId": "parent3", "chunk_index": 1}}, + {"_score": 0.6, "_source": {"parentId": "parent4", "chunk_index": 0}} ] } } @@ -155,7 +127,7 @@ void testParentGroupingLimitsDistinctParents() throws Exception { DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 2, 0, 100, 0.0); assertEquals(5, results.hits.size(), "Should return all chunks from first 2 parents (3+2=5)"); - long distinctParents = results.hits.stream().map(r -> r.get("parent_id")).distinct().count(); + long distinctParents = 
results.hits.stream().map(r -> r.get("parentId")).distinct().count(); assertEquals(2, distinctParents, "Should have chunks from exactly 2 distinct parents"); } @@ -167,9 +139,9 @@ void testZeroThresholdReturnsAllResults() throws Exception { "hits": { "total": {"value": 3}, "hits": [ - {"_score": 0.9, "_source": {"parent_id": "p1", "chunk_index": 0}}, - {"_score": 0.5, "_source": {"parent_id": "p2", "chunk_index": 0}}, - {"_score": 0.1, "_source": {"parent_id": "p3", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, + {"_score": 0.5, "_source": {"parentId": "p2", "chunk_index": 0}}, + {"_score": 0.1, "_source": {"parentId": "p3", "chunk_index": 0}} ] } } @@ -190,9 +162,9 @@ void testHighThresholdFiltersAllResults() throws Exception { "hits": { "total": {"value": 3}, "hits": [ - {"_score": 0.5, "_source": {"parent_id": "p1", "chunk_index": 0}}, - {"_score": 0.3, "_source": {"parent_id": "p2", "chunk_index": 0}}, - {"_score": 0.1, "_source": {"parent_id": "p3", "chunk_index": 0}} + {"_score": 0.5, "_source": {"parentId": "p1", "chunk_index": 0}}, + {"_score": 0.3, "_source": {"parentId": "p2", "chunk_index": 0}}, + {"_score": 0.1, "_source": {"parentId": "p3", "chunk_index": 0}} ] } } @@ -206,16 +178,17 @@ void testHighThresholdFiltersAllResults() throws Exception { } @Test - void testChunksWithoutParentIdAreSkipped() throws Exception { + void testChunksWithoutParentIdGroupedByDocumentId() throws Exception { + // Chunks without parentId in _source fall back to the document's _id field String esResponse = """ { "hits": { "total": {"value": 3}, "hits": [ - {"_score": 0.9, "_source": {"parent_id": "p1", "chunk_index": 0}}, - {"_score": 0.8, "_source": {"chunk_index": 0, "text": "orphan chunk"}}, - {"_score": 0.7, "_source": {"parent_id": "p2", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, + {"_id": "orphan-123", "_score": 0.8, "_source": {"chunk_index": 0, "text": "orphan chunk"}}, + {"_score": 
0.7, "_source": {"parentId": "p2", "chunk_index": 0}} ] } } @@ -225,27 +198,28 @@ void testChunksWithoutParentIdAreSkipped() throws Exception { DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); - assertEquals(2, results.hits.size(), "Chunks without parent_id should be skipped"); + assertEquals(3, results.hits.size(), "Orphan chunk should be included, grouped by document _id"); } @Test void testRequestedSizeLimitsDistinctParents() throws Exception { + // size=3 → requestedParents=4; 10 distinct parents in response exits loop immediately String esResponse = """ { "hits": { "total": {"value": 10}, "hits": [ - {"_score": 0.9, "_source": {"parent_id": "p1", "chunk_index": 0}}, - {"_score": 0.8, "_source": {"parent_id": "p2", "chunk_index": 0}}, - {"_score": 0.7, "_source": {"parent_id": "p3", "chunk_index": 0}}, - {"_score": 0.6, "_source": {"parent_id": "p4", "chunk_index": 0}}, - {"_score": 0.5, "_source": {"parent_id": "p5", "chunk_index": 0}}, - {"_score": 0.4, "_source": {"parent_id": "p6", "chunk_index": 0}}, - {"_score": 0.3, "_source": {"parent_id": "p7", "chunk_index": 0}}, - {"_score": 0.2, "_source": {"parent_id": "p8", "chunk_index": 0}}, - {"_score": 0.15, "_source": {"parent_id": "p9", "chunk_index": 0}}, - {"_score": 0.1, "_source": {"parent_id": "p10", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}}, + {"_score": 0.6, "_source": {"parentId": "p4", "chunk_index": 0}}, + {"_score": 0.5, "_source": {"parentId": "p5", "chunk_index": 0}}, + {"_score": 0.4, "_source": {"parentId": "p6", "chunk_index": 0}}, + {"_score": 0.3, "_source": {"parentId": "p7", "chunk_index": 0}}, + {"_score": 0.2, "_source": {"parentId": "p8", "chunk_index": 0}}, + {"_score": 0.15, "_source": {"parentId": "p9", "chunk_index": 0}}, + {"_score": 0.1, "_source": 
{"parentId": "p10", "chunk_index": 0}} ] } } @@ -256,7 +230,7 @@ void testRequestedSizeLimitsDistinctParents() throws Exception { DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 3, 0, 100, 0.0); assertEquals(3, results.hits.size(), "Should limit to 3 distinct parents"); - long distinctParents = results.hits.stream().map(r -> r.get("parent_id")).distinct().count(); + long distinctParents = results.hits.stream().map(r -> r.get("parentId")).distinct().count(); assertEquals(3, distinctParents, "Should have exactly 3 distinct parents"); } @@ -281,16 +255,136 @@ void testEmptyHitsResponseReturnsEmptyList() throws Exception { } @Test - void testGetExistingFingerprintReturnsNullWhenNotFound() throws Exception { - String esResponse = """ - {"hits":{"total":{"value":0},"hits":[]}} + void testFromSkipsParentsNotChunks() throws Exception { + // from=1 should skip 1 parent (p1), not 1 raw chunk + // size=2, from=1 → requestedParents=4; 4 distinct parents in response exits loop after 1 page + String esResponse = + """ + { + "hits": { + "total": {"value": 4}, + "hits": [ + {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}}, + {"_score": 0.6, "_source": {"parentId": "p4", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 2, 1, 100, 0.0); + + assertEquals(2, results.hits.size(), "Should return 2 parents after skipping 1"); + assertEquals("p2", results.hits.get(0).get("parentId")); + assertEquals("p3", results.hits.get(1).get("parentId")); + } + + @Test + void testHasMoreTrueWhenExtraParentFetched() throws Exception { + // size=2, from=0 → requestedParents=3; 4 parents fetched → hasMore=true + String esResponse = + """ + { + "hits": { + "total": {"value": 4}, + "hits": [ + {"_score": 0.9, 
"_source": {"parentId": "p1", "chunk_index": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}}, + {"_score": 0.6, "_source": {"parentId": "p4", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 2, 0, 100, 0.0); + + assertEquals(2, results.hits.size()); + assertTrue(results.hasMore, "hasMore should be true when extra parent was fetched"); + } + + @Test + void testHasMoreFalseWhenNoExtraParent() throws Exception { + // size=10, from=0 → requestedParents=11; only 2 parents available → hasMore=false + String esResponse = + """ + { + "hits": { + "total": {"value": 2}, + "hits": [ + {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponse(esResponse); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); + + assertEquals(2, results.hits.size()); + assertFalse(results.hasMore, "hasMore should be false when fewer parents than requested"); + } + + @Test + void testTotalHitsPopulatedFromResponse() throws Exception { + String esResponse = + """ + { + "hits": { + "total": {"value": 3}, + "hits": [ + {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}} + ] + } + } """; mockRestClientResponse(esResponse); + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); + + assertEquals(Long.valueOf(3), results.totalHits); + } + + @Test + void testTotalHitsNullWhenMissingFromResponse() throws Exception { + // No "total" field in first response; second call returns empty to terminate the loop + String esResponse = + """ + { + "hits": 
{ + "hits": [ + {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}} + ] + } + } + """; + + mockRestClientResponseSequence(esResponse, EMPTY_HITS_RESPONSE); + + DTOs.VectorSearchResponse results = vectorService.search("test query", Map.of(), 10, 0, 100, 0.0); + + assertNull(results.totalHits, "totalHits should be null when not present in response"); + } + + @Test + void testGetExistingFingerprintReturnsNullWhenNotFound() throws Exception { + String esResponse = "{\"hits\":{\"total\":{\"value\":0},\"hits\":[]}}"; + + mockRestClientResponse(esResponse); + String fingerprint = vectorService.getExistingFingerprint("vector_search_index", "unknown-id"); - assertTrue(fingerprint == null, "Should return null when no fingerprint found"); + assertNull(fingerprint, "Should return null when no fingerprint found"); } @Test @@ -366,16 +460,33 @@ void testPatchDimensionHandlesNoSpaceVariant() throws Exception { String patched = ElasticSearchVectorService.patchDimension(mapping, 384); com.fasterxml.jackson.databind.JsonNode root = new com.fasterxml.jackson.databind.ObjectMapper().readTree(patched); - assertEquals(384, root.path("mappings").path("properties").path("embedding").path("dims").asInt()); + assertEquals( + 384, root.path("mappings").path("properties").path("embedding").path("dims").asInt()); } + /** Returns a fresh stream on every call — safe for multi-iteration loops. */ private void mockRestClientResponse(String responseJson) throws Exception { Response mockResponse = mock(Response.class); HttpEntity mockEntity = mock(HttpEntity.class); + when(mockRestClient.performRequest(any(Request.class))).thenReturn(mockResponse); + when(mockResponse.getEntity()).thenReturn(mockEntity); + when(mockEntity.getContent()) + .thenAnswer( + inv -> new ByteArrayInputStream(responseJson.getBytes(StandardCharsets.UTF_8))); + } + /** Returns each response in sequence; repeats the last one if more calls are made. */ + private void mockRestClientResponseSequence(String... 
responses) throws Exception { + Response mockResponse = mock(Response.class); + HttpEntity mockEntity = mock(HttpEntity.class); + AtomicInteger callCount = new AtomicInteger(0); when(mockRestClient.performRequest(any(Request.class))).thenReturn(mockResponse); when(mockResponse.getEntity()).thenReturn(mockEntity); when(mockEntity.getContent()) - .thenReturn(new ByteArrayInputStream(responseJson.getBytes(StandardCharsets.UTF_8))); + .thenAnswer( + inv -> { + int idx = Math.min(callCount.getAndIncrement(), responses.length - 1); + return new ByteArrayInputStream(responses[idx].getBytes(StandardCharsets.UTF_8)); + }); } } From 4b6ef833fcf5a4dc7b96084e6d3a6bb7f38cb77c Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 10 Apr 2026 16:24:42 -0300 Subject: [PATCH 07/18] fix: use parentId/chunkIndex (camelCase) in bulkIndex to match VectorDocBuilder bulkIndex() was reading 'parent_id' and 'chunk_index' (snake_case) to build document IDs, but VectorDocBuilder stores both as camelCase ('parentId', 'chunkIndex'). This caused doc IDs to always be null-N or parentId-N (using loop index instead of actual chunk index). Add test that captures the BulkRequest and verifies the doc ID is parentId-chunkIndex. 
Co-Authored-By: Claude Sonnet 4.6 --- .../vector/ElasticSearchVectorService.java | 4 +- .../ElasticSearchVectorServiceTest.java | 126 +++++++++++------- 2 files changed, 79 insertions(+), 51 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index 5f518b9bd5d5..da5e573cb975 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -501,8 +501,8 @@ public void bulkIndex(List> documents, String targetIndex) { List operations = new ArrayList<>(); for (int i = 0; i < documents.size(); i++) { Map doc = documents.get(i); - String parentId = (String) doc.get("parent_id"); - int chunkIndex = doc.containsKey("chunk_index") ? (int) doc.get("chunk_index") : i; + String parentId = (String) doc.get("parentId"); + int chunkIndex = doc.containsKey("chunkIndex") ? 
(int) doc.get("chunkIndex") : i; String docId = parentId + "-" + chunkIndex; operations.add( diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java index 13404e5b8e5b..f657d62164dc 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -9,6 +9,12 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; +import es.co.elastic.clients.elasticsearch.core.BulkRequest; +import es.co.elastic.clients.elasticsearch.core.BulkResponse; +import java.util.HashMap; +import java.util.List; +import org.mockito.ArgumentCaptor; + import es.co.elastic.clients.elasticsearch.ElasticsearchClient; import es.co.elastic.clients.transport.rest5_client.Rest5ClientTransport; import es.co.elastic.clients.transport.rest5_client.low_level.Request; @@ -30,22 +36,23 @@ class ElasticSearchVectorServiceTest { "{\"hits\":{\"total\":{\"value\":0},\"hits\":[]}}"; private ElasticSearchVectorService vectorService; + private ElasticsearchClient mockEsClient; private Rest5Client mockRestClient; private EmbeddingClient mockEmbeddingClient; @BeforeEach void setup() throws Exception { - ElasticsearchClient mockClient = mock(ElasticsearchClient.class); + mockEsClient = mock(ElasticsearchClient.class); Rest5ClientTransport mockTransport = mock(Rest5ClientTransport.class); mockRestClient = mock(Rest5Client.class); - when(mockClient._transport()).thenReturn(mockTransport); + when(mockEsClient._transport()).thenReturn(mockTransport); when(mockTransport.restClient()).thenReturn(mockRestClient); mockEmbeddingClient = mock(EmbeddingClient.class); when(mockEmbeddingClient.embed(any(String.class))).thenReturn(new float[] {0.1f, 0.2f, 
0.3f}); - vectorService = new ElasticSearchVectorService(mockClient, mockEmbeddingClient); + vectorService = new ElasticSearchVectorService(mockEsClient, mockEmbeddingClient); } @Test @@ -56,10 +63,10 @@ void testThresholdFilteringRemovesLowScoreResults() throws Exception { "hits": { "total": {"value": 4}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "parent1", "chunk_index": 0, "text": "High score chunk"}}, - {"_score": 0.7, "_source": {"parentId": "parent2", "chunk_index": 0, "text": "Medium score chunk"}}, - {"_score": 0.4, "_source": {"parentId": "parent3", "chunk_index": 0, "text": "Low score chunk"}}, - {"_score": 0.2, "_source": {"parentId": "parent4", "chunk_index": 0, "text": "Very low score chunk"}} + {"_score": 0.9, "_source": {"parentId": "parent1", "chunkIndex": 0, "text": "High score chunk"}}, + {"_score": 0.7, "_source": {"parentId": "parent2", "chunkIndex": 0, "text": "Medium score chunk"}}, + {"_score": 0.4, "_source": {"parentId": "parent3", "chunkIndex": 0, "text": "Low score chunk"}}, + {"_score": 0.2, "_source": {"parentId": "parent4", "chunkIndex": 0, "text": "Very low score chunk"}} ] } } @@ -85,7 +92,7 @@ void testScoreFieldIncludedInResults() throws Exception { "hits": { "total": {"value": 1}, "hits": [ - {"_score": 0.85, "_source": {"parentId": "parent1", "chunk_index": 0, "text": "Test chunk"}} + {"_score": 0.85, "_source": {"parentId": "parent1", "chunkIndex": 0, "text": "Test chunk"}} ] } } @@ -109,14 +116,14 @@ void testParentGroupingLimitsDistinctParents() throws Exception { "hits": { "total": {"value": 8}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "parent1", "chunk_index": 0}}, - {"_score": 0.88, "_source": {"parentId": "parent1", "chunk_index": 1}}, - {"_score": 0.85, "_source": {"parentId": "parent1", "chunk_index": 2}}, - {"_score": 0.8, "_source": {"parentId": "parent2", "chunk_index": 0}}, - {"_score": 0.78, "_source": {"parentId": "parent2", "chunk_index": 1}}, - {"_score": 0.7, "_source": {"parentId": 
"parent3", "chunk_index": 0}}, - {"_score": 0.68, "_source": {"parentId": "parent3", "chunk_index": 1}}, - {"_score": 0.6, "_source": {"parentId": "parent4", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "parent1", "chunkIndex": 0}}, + {"_score": 0.88, "_source": {"parentId": "parent1", "chunkIndex": 1}}, + {"_score": 0.85, "_source": {"parentId": "parent1", "chunkIndex": 2}}, + {"_score": 0.8, "_source": {"parentId": "parent2", "chunkIndex": 0}}, + {"_score": 0.78, "_source": {"parentId": "parent2", "chunkIndex": 1}}, + {"_score": 0.7, "_source": {"parentId": "parent3", "chunkIndex": 0}}, + {"_score": 0.68, "_source": {"parentId": "parent3", "chunkIndex": 1}}, + {"_score": 0.6, "_source": {"parentId": "parent4", "chunkIndex": 0}} ] } } @@ -139,9 +146,9 @@ void testZeroThresholdReturnsAllResults() throws Exception { "hits": { "total": {"value": 3}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, - {"_score": 0.5, "_source": {"parentId": "p2", "chunk_index": 0}}, - {"_score": 0.1, "_source": {"parentId": "p3", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_score": 0.5, "_source": {"parentId": "p2", "chunkIndex": 0}}, + {"_score": 0.1, "_source": {"parentId": "p3", "chunkIndex": 0}} ] } } @@ -162,9 +169,9 @@ void testHighThresholdFiltersAllResults() throws Exception { "hits": { "total": {"value": 3}, "hits": [ - {"_score": 0.5, "_source": {"parentId": "p1", "chunk_index": 0}}, - {"_score": 0.3, "_source": {"parentId": "p2", "chunk_index": 0}}, - {"_score": 0.1, "_source": {"parentId": "p3", "chunk_index": 0}} + {"_score": 0.5, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_score": 0.3, "_source": {"parentId": "p2", "chunkIndex": 0}}, + {"_score": 0.1, "_source": {"parentId": "p3", "chunkIndex": 0}} ] } } @@ -186,9 +193,9 @@ void testChunksWithoutParentIdGroupedByDocumentId() throws Exception { "hits": { "total": {"value": 3}, "hits": [ - {"_score": 0.9, "_source": 
{"parentId": "p1", "chunk_index": 0}}, - {"_id": "orphan-123", "_score": 0.8, "_source": {"chunk_index": 0, "text": "orphan chunk"}}, - {"_score": 0.7, "_source": {"parentId": "p2", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_id": "orphan-123", "_score": 0.8, "_source": {"chunkIndex": 0, "text": "orphan chunk"}}, + {"_score": 0.7, "_source": {"parentId": "p2", "chunkIndex": 0}} ] } } @@ -210,16 +217,16 @@ void testRequestedSizeLimitsDistinctParents() throws Exception { "hits": { "total": {"value": 10}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, - {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, - {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}}, - {"_score": 0.6, "_source": {"parentId": "p4", "chunk_index": 0}}, - {"_score": 0.5, "_source": {"parentId": "p5", "chunk_index": 0}}, - {"_score": 0.4, "_source": {"parentId": "p6", "chunk_index": 0}}, - {"_score": 0.3, "_source": {"parentId": "p7", "chunk_index": 0}}, - {"_score": 0.2, "_source": {"parentId": "p8", "chunk_index": 0}}, - {"_score": 0.15, "_source": {"parentId": "p9", "chunk_index": 0}}, - {"_score": 0.1, "_source": {"parentId": "p10", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunkIndex": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunkIndex": 0}}, + {"_score": 0.6, "_source": {"parentId": "p4", "chunkIndex": 0}}, + {"_score": 0.5, "_source": {"parentId": "p5", "chunkIndex": 0}}, + {"_score": 0.4, "_source": {"parentId": "p6", "chunkIndex": 0}}, + {"_score": 0.3, "_source": {"parentId": "p7", "chunkIndex": 0}}, + {"_score": 0.2, "_source": {"parentId": "p8", "chunkIndex": 0}}, + {"_score": 0.15, "_source": {"parentId": "p9", "chunkIndex": 0}}, + {"_score": 0.1, "_source": {"parentId": "p10", "chunkIndex": 0}} ] } } @@ -264,10 +271,10 @@ void testFromSkipsParentsNotChunks() throws Exception 
{ "hits": { "total": {"value": 4}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, - {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, - {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}}, - {"_score": 0.6, "_source": {"parentId": "p4", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunkIndex": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunkIndex": 0}}, + {"_score": 0.6, "_source": {"parentId": "p4", "chunkIndex": 0}} ] } } @@ -291,10 +298,10 @@ void testHasMoreTrueWhenExtraParentFetched() throws Exception { "hits": { "total": {"value": 4}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, - {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, - {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}}, - {"_score": 0.6, "_source": {"parentId": "p4", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunkIndex": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunkIndex": 0}}, + {"_score": 0.6, "_source": {"parentId": "p4", "chunkIndex": 0}} ] } } @@ -317,8 +324,8 @@ void testHasMoreFalseWhenNoExtraParent() throws Exception { "hits": { "total": {"value": 2}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, - {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunkIndex": 0}} ] } } @@ -340,9 +347,9 @@ void testTotalHitsPopulatedFromResponse() throws Exception { "hits": { "total": {"value": 3}, "hits": [ - {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}}, - {"_score": 0.8, "_source": {"parentId": "p2", "chunk_index": 0}}, - {"_score": 0.7, "_source": {"parentId": "p3", "chunk_index": 0}} + {"_score": 
0.9, "_source": {"parentId": "p1", "chunkIndex": 0}}, + {"_score": 0.8, "_source": {"parentId": "p2", "chunkIndex": 0}}, + {"_score": 0.7, "_source": {"parentId": "p3", "chunkIndex": 0}} ] } } @@ -363,7 +370,7 @@ void testTotalHitsNullWhenMissingFromResponse() throws Exception { { "hits": { "hits": [ - {"_score": 0.9, "_source": {"parentId": "p1", "chunk_index": 0}} + {"_score": 0.9, "_source": {"parentId": "p1", "chunkIndex": 0}} ] } } @@ -376,6 +383,27 @@ void testTotalHitsNullWhenMissingFromResponse() throws Exception { assertNull(results.totalHits, "totalHits should be null when not present in response"); } + @Test + void testBulkIndexUsesParentIdAndChunkIndexForDocumentId() throws Exception { + BulkResponse mockBulkResponse = mock(BulkResponse.class); + when(mockBulkResponse.errors()).thenReturn(false); + when(mockBulkResponse.items()).thenReturn(List.of()); + ArgumentCaptor captor = ArgumentCaptor.forClass(BulkRequest.class); + when(mockEsClient.bulk(captor.capture())).thenReturn(mockBulkResponse); + + Map doc = new HashMap<>(); + doc.put("parentId", "entity-abc"); + doc.put("chunkIndex", 3); + doc.put("embedding", new float[] {0.1f, 0.2f}); + + vectorService.bulkIndex(List.of(doc), "test-index"); + + BulkRequest captured = captor.getValue(); + assertEquals(1, captured.operations().size()); + assertEquals("entity-abc-3", captured.operations().get(0).index().id(), + "Doc ID must be parentId-chunkIndex using camelCase field names from VectorDocBuilder"); + } + @Test void testGetExistingFingerprintReturnsNullWhenNotFound() throws Exception { String esResponse = "{\"hits\":{\"total\":{\"value\":0},\"hits\":[]}}"; From 0d78e3b27c6ff6967bb15fd9e38b9be7539cc93c Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 10 Apr 2026 16:28:04 -0300 Subject: [PATCH 08/18] fix: move copyExistingVectorDocuments to VectorIndexService interface ElasticSearchBulkSink was casting VectorIndexService to ElasticSearchVectorService to 
call copyExistingVectorDocuments(), breaking the interface abstraction with a potential ClassCastException. Add the method to the interface with a default no-op (returns false), and remove the cast and import. Co-Authored-By: Claude Sonnet 4.6 --- .../apps/bundles/searchIndex/ElasticSearchBulkSink.java | 3 +-- .../service/search/vector/ElasticSearchVectorService.java | 1 + .../service/search/vector/VectorIndexService.java | 5 +++++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java index 4fd8b0158767..69f3c4fddb8b 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java @@ -55,7 +55,6 @@ import org.openmetadata.service.search.elasticsearch.ElasticSearchClient; import org.openmetadata.service.search.elasticsearch.EsUtils; import org.openmetadata.service.search.indexes.ColumnSearchIndex; -import org.openmetadata.service.search.vector.ElasticSearchVectorService; import org.openmetadata.service.search.vector.VectorDocBuilder; import org.openmetadata.service.search.vector.VectorIndexService; import org.openmetadata.service.search.vector.utils.AvailableEntityTypes; @@ -854,7 +853,7 @@ private void processMigration( EntityInterface entity, StageStatsTracker tracker) { try { - if (((ElasticSearchVectorService) vectorService).copyExistingVectorDocuments( + if (vectorService.copyExistingVectorDocuments( sourceIndex, targetIndex, parentId, fingerprint)) { vectorSuccess.incrementAndGet(); if (tracker != null) { diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java 
b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index da5e573cb975..97f67144965b 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -343,6 +343,7 @@ public Map getExistingFingerprintsBatch( } } + @Override @SuppressWarnings("unchecked") public boolean copyExistingVectorDocuments( String sourceIndex, String targetIndex, String parentId, String fingerprint) { diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java index eb3e870fffe1..5b15ad22bb5e 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java @@ -24,6 +24,11 @@ VectorSearchResponse search( String executeGenericRequest(String method, String endpoint, String body); + default boolean copyExistingVectorDocuments( + String sourceIndex, String targetIndex, String parentId, String fingerprint) { + return false; + } + default String getIndexAlias() { try { String clusterAlias = Entity.getSearchRepository().getClusterAlias(); From b557756291c5fd008d0f20d8c8fd930109ed7051 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 10 Apr 2026 16:35:33 -0300 Subject: [PATCH 09/18] fix: use instanceof pattern match instead of interface method for copyExistingVectorDocuments Remove copyExistingVectorDocuments from VectorIndexService (it is ES-specific and has no meaningful default for other implementations). 
Use Java 21 pattern matching instanceof in ElasticSearchBulkSink so the call is explicit and safe without introducing a no-op default into the interface. Co-Authored-By: Claude Sonnet 4.6 --- .../apps/bundles/searchIndex/ElasticSearchBulkSink.java | 8 ++++++-- .../service/search/vector/ElasticSearchVectorService.java | 1 - .../service/search/vector/VectorIndexService.java | 5 ----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java index 69f3c4fddb8b..453fd695c75c 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java @@ -55,6 +55,7 @@ import org.openmetadata.service.search.elasticsearch.ElasticSearchClient; import org.openmetadata.service.search.elasticsearch.EsUtils; import org.openmetadata.service.search.indexes.ColumnSearchIndex; +import org.openmetadata.service.search.vector.ElasticSearchVectorService; import org.openmetadata.service.search.vector.VectorDocBuilder; import org.openmetadata.service.search.vector.VectorIndexService; import org.openmetadata.service.search.vector.utils.AvailableEntityTypes; @@ -853,8 +854,11 @@ private void processMigration( EntityInterface entity, StageStatsTracker tracker) { try { - if (vectorService.copyExistingVectorDocuments( - sourceIndex, targetIndex, parentId, fingerprint)) { + boolean copied = + vectorService instanceof ElasticSearchVectorService esService + && esService.copyExistingVectorDocuments( + sourceIndex, targetIndex, parentId, fingerprint); + if (copied) { vectorSuccess.incrementAndGet(); if (tracker != null) { tracker.recordVector(StatsResult.SUCCESS); diff --git 
a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index 97f67144965b..da5e573cb975 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -343,7 +343,6 @@ public Map getExistingFingerprintsBatch( } } - @Override @SuppressWarnings("unchecked") public boolean copyExistingVectorDocuments( String sourceIndex, String targetIndex, String parentId, String fingerprint) { diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java index 5b15ad22bb5e..eb3e870fffe1 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorIndexService.java @@ -24,11 +24,6 @@ VectorSearchResponse search( String executeGenericRequest(String method, String endpoint, String body); - default boolean copyExistingVectorDocuments( - String sourceIndex, String targetIndex, String parentId, String fingerprint) { - return false; - } - default String getIndexAlias() { try { String clusterAlias = Entity.getSearchRepository().getClusterAlias(); From f81a21e91729a8525bfcc00468b258fbdba994b2 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 24 Apr 2026 22:13:48 -0300 Subject: [PATCH 10/18] refactor: make Elasticsearch store embeddings inline like OpenSearch - Remove separate vector_search_index_es_native.json files (en/jp/ru/zh) - Add EsUtils.enrichIndexMappingForElasticsearch() that injects dense_vector mapping into entity indices 
when fingerprint field is present (mirrors OsUtils.addKnnVectorSettings() pattern) - Call enrichment from ElasticSearchIndexManager.createIndexInternal() and updateIndex() so all entity indices get the embedding field at index creation - Rewrite ElasticSearchVectorService to mirror OpenSearchVectorService: fingerprint lookups by document _id in entity index, partialUpdateEntity writes to entity index (not separate vector index) - Refactor ElasticSearchBulkSink to inline embedding enrichment before bulk indexing (enrichWithEmbedding + fetchExistingFingerprints batch pre-fetch), removing async vectorExecutor/Phaser approach - Remove SearchRepository reformatVectorIndexWithDimension() and brittle vector_search_index.json string-swap for ES - Simplify RecreateWithEmbeddings by removing VECTOR_INDEX_KEY synthetic entity type (no separate vector index to recreate) - Remove dead SemanticSearchQueryBuilder and its semanticSearch schema field - Fix missing Executors import in ElasticSearchBulkSink - Remove tests for deleted methods (addEntitiesToVectorIndexBatch, bulkIndex, patchDimension) Co-Authored-By: Claude Sonnet 4.6 --- .../searchIndex/ElasticSearchBulkSink.java | 222 +++------- .../search/RecreateWithEmbeddings.java | 45 +- .../service/search/SearchRepository.java | 51 +-- .../ElasticSearchIndexManager.java | 14 +- .../service/search/elasticsearch/EsUtils.java | 52 +++ .../SemanticSearchQueryBuilder.java | 153 ------- .../vector/ElasticSearchVectorService.java | 365 ++-------------- .../vector/OpenSearchVectorService.java | 6 +- .../ElasticSearchBulkSinkSimpleTest.java | 6 - .../ElasticSearchVectorServiceTest.java | 66 --- .../en/vector_search_index_es_native.json | 293 ------------- .../jp/vector_search_index_es_native.json | 293 ------------- .../ru/vector_search_index_es_native.json | 410 ------------------ .../zh/vector_search_index_es_native.json | 293 ------------- .../json/schema/search/searchRequest.json | 5 - 15 files changed, 183 insertions(+), 2091 
deletions(-) delete mode 100644 openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java delete mode 100644 openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json delete mode 100644 openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json delete mode 100644 openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json delete mode 100644 openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java index 453fd695c75c..3377f0ec1057 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSink.java @@ -15,22 +15,19 @@ import java.io.StringWriter; import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedDeque; -import java.util.concurrent.CopyOnWriteArrayList; -import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.Phaser; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.Semaphore; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; import 
java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.Lock; @@ -57,7 +54,6 @@ import org.openmetadata.service.search.indexes.ColumnSearchIndex; import org.openmetadata.service.search.vector.ElasticSearchVectorService; import org.openmetadata.service.search.vector.VectorDocBuilder; -import org.openmetadata.service.search.vector.VectorIndexService; import org.openmetadata.service.search.vector.utils.AvailableEntityTypes; /** @@ -65,7 +61,6 @@ */ @Slf4j public class ElasticSearchBulkSink implements BulkSink { - private static final int MAX_VECTOR_THREADS = 10; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final JacksonJsonpMapper JACKSON_JSONP_MAPPER = new JacksonJsonpMapper(OBJECT_MAPPER); @@ -134,10 +129,7 @@ public static synchronized void resetDocBuildPoolSize() { private final ConcurrentLinkedDeque> pendingColumnFutures = new ConcurrentLinkedDeque<>(); - // Vector embedding fields - private final ExecutorService vectorExecutor; - private final Phaser phaser; - private final CopyOnWriteArrayList pendingThreads; + // Vector embedding stats (incremented inline during addEntity) private final AtomicLong vectorSuccess = new AtomicLong(0); private final AtomicLong vectorFailed = new AtomicLong(0); @@ -152,10 +144,6 @@ public ElasticSearchBulkSink( this.batchSize = batchSize; this.maxConcurrentRequests = maxConcurrentRequests; this.maxPayloadSizeBytes = maxPayloadSizeBytes; - this.vectorExecutor = - Executors.newFixedThreadPool(MAX_VECTOR_THREADS, Thread.ofVirtual().factory()); - this.phaser = new Phaser(1); - this.pendingThreads = new CopyOnWriteArrayList<>(); // Initialize stats stats.withTotalRecords(0).withSuccessRecords(0).withFailedRecords(0); @@ -263,13 +251,28 @@ public void write(List entities, Map contextData) throws Exce } else { List entityInterfaces = (List) entities; - // Add entities to search index in parallel + boolean embeddingsEnabled = isVectorEmbeddingEnabledForEntity(entityType); + + Map 
existingFingerprints = Collections.emptyMap(); + if (embeddingsEnabled && !recreateIndex) { + existingFingerprints = fetchExistingFingerprints(entityInterfaces, indexName); + } + + Map finalFingerprints = existingFingerprints; List> futures = entityInterfaces.stream() .map( entity -> CompletableFuture.runAsync( - () -> addEntity(entity, indexName, recreateIndex, tracker), + () -> + addEntity( + entity, + indexName, + recreateIndex, + reindexContext, + tracker, + embeddingsEnabled, + finalFingerprints), DOC_BUILD_EXECUTOR)) .toList(); CompletableFuture.allOf(futures.toArray(CompletableFuture[]::new)).join(); @@ -290,11 +293,6 @@ public void write(List entities, Map contextData) throws Exce } pendingColumnFutures.removeIf(CompletableFuture::isDone); } - - if (isVectorEmbeddingEnabledForEntity(entityType)) { - addEntitiesToVectorIndexBatch( - bulkProcessor, entityInterfaces, recreateIndex, reindexContext, tracker); - } } } catch (Exception e) { LOG.error("Failed to write {} entities of type {}", entities.size(), entityType, e); @@ -325,11 +323,22 @@ protected StageStatsTracker extractTracker(Map contextData) { private static final int BULK_OPERATION_METADATA_OVERHEAD = 150; private void addEntity( - EntityInterface entity, String indexName, boolean recreateIndex, StageStatsTracker tracker) { + EntityInterface entity, + String indexName, + boolean recreateIndex, + ReindexContext reindexContext, + StageStatsTracker tracker, + boolean embeddingsEnabled, + Map existingFingerprints) { try { String entityType = Entity.getEntityTypeFromObject(entity); Object searchIndexDoc = Entity.buildSearchIndex(entityType, entity).buildSearchIndexDoc(); String json = JsonUtils.pojoToJson(searchIndexDoc); + + if (embeddingsEnabled) { + json = enrichWithEmbedding(entity, json, recreateIndex, existingFingerprints, tracker); + } + String docId = entity.getId().toString(); long rawDocSize = (long) json.getBytes(StandardCharsets.UTF_8).length; long estimatedSize = rawDocSize + 
BULK_OPERATION_METADATA_OVERHEAD; @@ -667,8 +676,6 @@ public StepStats getProcessStats() { @Override public void close() { try { - awaitVectorCompletion(60); - bulkProcessor.flush(); // Wait for in-flight column doc-build tasks before flushing the column processor @@ -685,8 +692,6 @@ public void close() { LOG.warn("Column bulk processor did not terminate within timeout"); } - vectorExecutor.shutdown(); - updateStats(); LOG.info( @@ -780,156 +785,73 @@ public void updateConcurrentRequests(int concurrentRequests) { boolean isVectorEmbeddingEnabledForEntity(String entityType) { return searchRepository.isVectorEmbeddingEnabled() - && searchRepository.getVectorIndexService() != null + && ElasticSearchVectorService.getInstance() != null && AvailableEntityTypes.isVectorIndexable(entityType); } - void addEntitiesToVectorIndexBatch( - CustomBulkProcessor bulkProcessor, - List entities, + @SuppressWarnings("unchecked") + private String enrichWithEmbedding( + EntityInterface entity, + String json, boolean recreateIndex, - ReindexContext reindexContext, + Map existingFingerprints, StageStatsTracker tracker) { - if (entities.isEmpty()) { - return; - } - - VectorIndexService vectorService = searchRepository.getVectorIndexService(); - if (vectorService == null) { - return; - } - - String entityType = entities.getFirst().getEntityReference().getType(); - if (!AvailableEntityTypes.isVectorIndexable(entityType)) { - return; - } - - String canonicalIndex = vectorService.getIndexAlias(); - String finalTargetIndex = canonicalIndex; - String finalSourceIndex = null; - - if (reindexContext != null) { - String stagedIndex = - reindexContext.getStagedIndex(VectorIndexService.VECTOR_INDEX_KEY).orElse(null); - if (stagedIndex != null) { - finalSourceIndex = canonicalIndex; - finalTargetIndex = stagedIndex; - } - } - - String srcIdx = finalSourceIndex; - String tgtIdx = finalTargetIndex; - - Map existingFingerprints = Map.of(); - if (srcIdx != null) { - List parentIds = new 
ArrayList<>(entities.size()); - for (EntityInterface entity : entities) { - parentIds.add(entity.getId().toString()); - } - existingFingerprints = vectorService.getExistingFingerprintsBatch(srcIdx, parentIds); - } - - for (EntityInterface entity : entities) { - String parentId = entity.getId().toString(); - String existingFp = existingFingerprints.get(parentId); - String currentFp = VectorDocBuilder.computeFingerprintForEntity(entity); - - if (existingFp != null && existingFp.equals(currentFp) && srcIdx != null) { - submitVectorTask( - () -> - processMigration( - vectorService, srcIdx, tgtIdx, parentId, currentFp, entity, tracker)); - } else { - submitVectorTask(() -> processEmbedding(vectorService, entity, tgtIdx, tracker)); + try { + ElasticSearchVectorService vectorService = ElasticSearchVectorService.getInstance(); + if (vectorService == null) { + return json; } - } - } - private void processMigration( - VectorIndexService vectorService, - String sourceIndex, - String targetIndex, - String parentId, - String fingerprint, - EntityInterface entity, - StageStatsTracker tracker) { - try { - boolean copied = - vectorService instanceof ElasticSearchVectorService esService - && esService.copyExistingVectorDocuments( - sourceIndex, targetIndex, parentId, fingerprint); - if (copied) { - vectorSuccess.incrementAndGet(); - if (tracker != null) { - tracker.recordVector(StatsResult.SUCCESS); + if (!recreateIndex) { + String currentFp = VectorDocBuilder.computeFingerprintForEntity(entity); + String existingFp = existingFingerprints.get(entity.getId().toString()); + if (existingFp != null && existingFp.equals(currentFp)) { + vectorSuccess.incrementAndGet(); + if (tracker != null) { + tracker.recordVector(StatsResult.SUCCESS); + } + return json; } - } else { - processEmbedding(vectorService, entity, targetIndex, tracker); } - } catch (Exception e) { - LOG.warn( - "Vector migration failed for parent_id={}, falling back to recomputation: {}", - parentId, - e.getMessage()); - 
processEmbedding(vectorService, entity, targetIndex, tracker); - } - } - private void processEmbedding( - VectorIndexService vectorService, - EntityInterface entity, - String targetIndex, - StageStatsTracker tracker) { - try { - vectorService.updateEntityEmbedding(entity, targetIndex); + Map embeddingFields = vectorService.generateEmbeddingFields(entity); + Map docMap = OBJECT_MAPPER.readValue(json, Map.class); + docMap.putAll(embeddingFields); + vectorSuccess.incrementAndGet(); if (tracker != null) { tracker.recordVector(StatsResult.SUCCESS); } + return OBJECT_MAPPER.writeValueAsString(docMap); } catch (Exception e) { + LOG.warn( + "Failed to generate embeddings for entity {}: {}", entity.getId(), e.getMessage(), e); vectorFailed.incrementAndGet(); if (tracker != null) { tracker.recordVector(StatsResult.FAILED); } - LOG.error("Vector embedding failed for entity {}: {}", entity.getId(), e.getMessage(), e); + return json; } } - private void submitVectorTask(Runnable task) { - phaser.register(); - vectorExecutor.submit( - () -> { - Thread current = Thread.currentThread(); - pendingThreads.add(current); - try { - task.run(); - } finally { - pendingThreads.remove(current); - phaser.arriveAndDeregister(); - } - }); - } - - @Override - public boolean awaitVectorCompletion(int timeoutSeconds) { + private Map fetchExistingFingerprints( + List entities, String indexName) { try { - int phase = phaser.arrive(); - phaser.awaitAdvanceInterruptibly(phase, timeoutSeconds, TimeUnit.SECONDS); - return true; - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - return false; - } catch (TimeoutException e) { - LOG.warn("Timeout waiting for vector completion after {}s", timeoutSeconds); - return false; + ElasticSearchVectorService vectorService = ElasticSearchVectorService.getInstance(); + if (vectorService == null) { + return Collections.emptyMap(); + } + List entityIds = new ArrayList<>(entities.size()); + for (EntityInterface entity : entities) { + 
entityIds.add(entity.getId().toString()); + } + return vectorService.getExistingFingerprintsBatch(indexName, entityIds); + } catch (Exception e) { + LOG.warn("Failed to fetch existing fingerprints: {}", e.getMessage()); + return Collections.emptyMap(); } } - @Override - public int getPendingVectorTaskCount() { - return Math.max(0, phaser.getUnarrivedParties() - 1); - } - @Override public StepStats getVectorStats() { return new StepStats() diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java index 512c1b67155b..8c041f6e3d2c 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/RecreateWithEmbeddings.java @@ -1,59 +1,26 @@ package org.openmetadata.service.search; -import java.util.HashSet; import java.util.Set; import lombok.extern.slf4j.Slf4j; -import org.openmetadata.search.IndexMapping; import org.openmetadata.service.Entity; -import org.openmetadata.service.search.vector.VectorIndexService; @Slf4j public class RecreateWithEmbeddings extends DefaultRecreateHandler { @Override public ReindexContext reCreateIndexes(Set entities) { - SearchRepository searchRepository = Entity.getSearchRepository(); - searchRepository.initializeVectorSearchService(); - - Set allEntities = new HashSet<>(entities); - if (searchRepository.getVectorIndexService() != null) { - allEntities.add(VectorIndexService.VECTOR_INDEX_KEY); - } - - return super.reCreateIndexes(allEntities); - } - - @Override - protected void recreateIndexFromMapping( - ReindexContext context, IndexMapping indexMapping, String entityType) { - if (VectorIndexService.VECTOR_INDEX_KEY.equals(entityType) - && Entity.getSearchRepository().getVectorIndexService() == null) { - LOG.info("Skipping vector index recreation - vector service not 
initialized"); - return; - } - super.recreateIndexFromMapping(context, indexMapping, entityType); - } - - @Override - public void promoteEntityIndex(EntityReindexContext context, boolean reindexSuccess) { - if (VectorIndexService.VECTOR_INDEX_KEY.equals(context.getEntityType()) - && Entity.getSearchRepository().getVectorIndexService() == null) { - return; - } - super.promoteEntityIndex(context, reindexSuccess); + Entity.getSearchRepository().initializeVectorSearchService(); + return super.reCreateIndexes(entities); } @Override public void finalizeReindex(EntityReindexContext context, boolean reindexSuccess) { super.finalizeReindex(context, reindexSuccess); - if (reindexSuccess) { - SearchRepository searchRepository = Entity.getSearchRepository(); - if (searchRepository.isVectorEmbeddingEnabled()) { - LOG.info( - "Reindex finalized for entity type '{}' with vector embeddings enabled", - context.getEntityType()); - } + if (reindexSuccess && Entity.getSearchRepository().isVectorEmbeddingEnabled()) { + LOG.info( + "Reindex finalized for entity type '{}' with vector embeddings enabled", + context.getEntityType()); } } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java index 4b3964290be8..1dee23f14b83 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java @@ -591,11 +591,6 @@ public void deleteIndex(IndexMapping indexMapping) { private String getIndexMapping(IndexMapping indexMapping) { String mappingFile = indexMapping.getIndexMappingFile(); - boolean isOpenSearch = getSearchType() == ElasticSearchConfiguration.SearchType.OPENSEARCH; - if (!isOpenSearch && mappingFile != null && mappingFile.contains("vector_search_index.json")) { - mappingFile = - 
mappingFile.replace("vector_search_index.json", "vector_search_index_es_native.json"); - } try (InputStream in = getClass().getResourceAsStream(String.format(mappingFile, language.toLowerCase()))) { assert in != null; @@ -607,11 +602,7 @@ private String getIndexMapping(IndexMapping indexMapping) { } public String readIndexMapping(IndexMapping indexMapping) { - String mapping = getIndexMapping(indexMapping); - if (isVectorEmbeddingEnabled() && embeddingClient != null && mapping != null) { - mapping = reformatVectorIndexWithDimension(mapping, embeddingClient.getDimension()); - } - return mapping; + return getIndexMapping(indexMapping); } /** @@ -3041,46 +3032,6 @@ private static List copyWithInheritedFlag(List return inheritedReferences; } - private String reformatVectorIndexWithDimension(String mapping, int dimension) { - try { - com.fasterxml.jackson.databind.ObjectMapper mapper = - new com.fasterxml.jackson.databind.ObjectMapper(); - JsonNode root = mapper.readTree(mapping); - if (root.has("mappings")) { - JsonNode mappings = root.get("mappings"); - if (mappings.has("properties")) { - JsonNode properties = mappings.get("properties"); - if (properties.has("embedding")) { - com.fasterxml.jackson.databind.node.ObjectNode embeddingNode = - (com.fasterxml.jackson.databind.node.ObjectNode) properties.get("embedding"); - if (embeddingNode.has("dims")) { - embeddingNode.put("dims", dimension); - } else { - embeddingNode.put("dimension", dimension); - } - } - } - com.fasterxml.jackson.databind.node.ObjectNode meta = - ((com.fasterxml.jackson.databind.node.ObjectNode) mappings).putObject("_meta"); - meta.put( - "embedding_model", - embeddingClient != null ? 
embeddingClient.getModelId() : "unknown") - .put("embedding_dimension", dimension); - } - return mapper.writeValueAsString(root); - } catch (Exception e) { - LOG.warn( - "Failed to parse mapping JSON for dimension patching, falling back to string replace"); - return mapping - .replace("\"dimension\": 768", "\"dimension\": " + dimension) - .replace("\"dimension\":768", "\"dimension\":" + dimension) - .replace("\"dimension\": 512", "\"dimension\": " + dimension) - .replace("\"dimension\":512", "\"dimension\":" + dimension) - .replace("\"dims\": 512", "\"dims\": " + dimension) - .replace("\"dims\":512", "\"dims\":" + dimension); - } - } - protected EmbeddingClient createEmbeddingClient(ElasticSearchConfiguration esConfig) { NaturalLanguageSearchConfiguration config = esConfig.getNaturalLanguageSearch(); String provider = diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java index 33c44c614034..dfa42955bb9a 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/ElasticSearchIndexManager.java @@ -86,7 +86,11 @@ public void updateIndex(IndexMapping indexMapping, String indexMappingContent) { try { String indexName = indexMapping.getIndexName(clusterAlias); - String mappingsJson = extractMappingsJson(indexMappingContent); + String transformedContent = + (indexMappingContent != null && !indexMappingContent.isEmpty()) + ? 
EsUtils.enrichIndexMappingForElasticsearch(indexMappingContent) + : indexMappingContent; + String mappingsJson = extractMappingsJson(transformedContent); PutMappingRequest request = PutMappingRequest.of( builder -> { @@ -165,12 +169,16 @@ private String extractMappingsJson(String indexMappingContent) { private void createIndexInternal(String indexName, String indexMappingContent) throws IOException { + String enrichedContent = + (indexMappingContent != null && !indexMappingContent.isEmpty()) + ? EsUtils.enrichIndexMappingForElasticsearch(indexMappingContent) + : indexMappingContent; CreateIndexRequest request = CreateIndexRequest.of( builder -> { builder.index(indexName); - if (indexMappingContent != null) { - builder.withJson(new StringReader(indexMappingContent)); + if (enrichedContent != null) { + builder.withJson(new StringReader(enrichedContent)); } return builder; }); diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/EsUtils.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/EsUtils.java index 8cdb92dbef79..6098fdad1c9e 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/EsUtils.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/EsUtils.java @@ -524,4 +524,56 @@ private static void buildSearchSourceFilter( } } } + + /** + * Enriches an Elasticsearch index mapping with vector search support. When the mapping contains + * a {@code fingerprint} field (the signal that this index stores embedded entity docs), injects a + * {@code dense_vector} embedding field and records {@code _meta} with the model ID and dimension. + * + *

<p>The embedding dimension is resolved from the active {@link + * org.openmetadata.service.search.vector.client.EmbeddingClient}. If embeddings are disabled or + * the client is unavailable the mapping is returned unchanged. + */ + public static String enrichIndexMappingForElasticsearch(String indexMappingContent) { + if (nullOrEmpty(indexMappingContent)) { + throw new IllegalArgumentException("Empty Index Mapping Content."); + } + JsonNode rootNode = JsonUtils.readTree(indexMappingContent); + addDenseVectorSettings(rootNode); + return rootNode.toString(); + } + + static void addDenseVectorSettings(JsonNode rootNode) { + JsonNode properties = rootNode.path("mappings").path("properties"); + if (properties.isMissingNode() || !properties.has("fingerprint")) { + return; + } + + org.openmetadata.service.search.SearchRepository searchRepository = + org.openmetadata.service.Entity.getSearchRepository(); + if (searchRepository == null + || !searchRepository.isVectorEmbeddingEnabled() + || searchRepository.getEmbeddingClient() == null) { + return; + } + + int dimension = searchRepository.getEmbeddingClient().getDimension(); + + com.fasterxml.jackson.databind.node.ObjectNode embeddingNode = mapper.createObjectNode(); + embeddingNode.put("type", "dense_vector"); + embeddingNode.put("dims", dimension); + embeddingNode.put("index", true); + embeddingNode.put("similarity", "cosine"); + ((com.fasterxml.jackson.databind.node.ObjectNode) properties).set("embedding", embeddingNode); + + JsonNode mappings = rootNode.path("mappings"); + if (!mappings.isMissingNode()) { + com.fasterxml.jackson.databind.node.ObjectNode meta = + ((com.fasterxml.jackson.databind.node.ObjectNode) mappings).putObject("_meta"); + meta.put( + "embedding_model", + searchRepository.getEmbeddingClient().getModelId()) + .put("embedding_dimension", dimension); + } + } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java
b/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java deleted file mode 100644 index 751ac113aaa3..000000000000 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/elasticsearch/SemanticSearchQueryBuilder.java +++ /dev/null @@ -1,153 +0,0 @@ -package org.openmetadata.service.search.elasticsearch; - -import es.co.elastic.clients.elasticsearch._types.Script; -import es.co.elastic.clients.elasticsearch._types.ScriptLanguage; -import es.co.elastic.clients.elasticsearch._types.query_dsl.FunctionBoostMode; -import es.co.elastic.clients.elasticsearch._types.query_dsl.FunctionScoreMode; -import es.co.elastic.clients.elasticsearch._types.query_dsl.Query; -import es.co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType; -import es.co.elastic.clients.json.JsonData; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; -import lombok.extern.slf4j.Slf4j; -import org.openmetadata.schema.search.SearchRequest; -import org.openmetadata.schema.utils.JsonUtils; -import org.openmetadata.service.rdf.semantic.EmbeddingService; - -/** - * Builds semantic search queries for Elasticsearch that combine: - * 1. Vector similarity search using k-NN (dense_vector) - * 2. Traditional text search with BM25 - * 3. 
RDF context boosting - */ -@Slf4j -public class SemanticSearchQueryBuilder { - - private static final String KNN_FIELD = "embedding"; - private static final String RDF_CONTEXT_FIELD = "rdfContext"; - - private final EmbeddingService embeddingService; - - public SemanticSearchQueryBuilder() { - this.embeddingService = EmbeddingService.getInstance(); - } - - public Query buildSemanticQuery(SearchRequest request) { - String queryText = request.getQuery(); - if (!isSemanticSearchEnabled(request)) { - return null; - } - float[] queryEmbedding = embeddingService.generateEmbedding(queryText); - - Query knnQuery = buildKnnQuery(queryEmbedding); - Query textQuery = buildTextQuery(queryText, request); - - Query hybridQuery = - Query.of( - q -> - q.bool( - b -> - b.should(s -> s.constantScore(cs -> cs.filter(knnQuery).boost(0.7f))) - .should(s -> s.constantScore(cs -> cs.filter(textQuery).boost(0.3f))))); - - return Query.of( - q -> - q.functionScore( - fs -> - fs.query(hybridQuery) - .functions(f -> f.scriptScore(ss -> ss.script(buildRdfBoostScript()))) - .scoreMode(FunctionScoreMode.Sum) - .boostMode(FunctionBoostMode.Multiply))); - } - - private Query buildKnnQuery(float[] queryEmbedding) { - Map params = new HashMap<>(); - List vectorList = new ArrayList<>(); - for (float v : queryEmbedding) { - vectorList.add((double) v); - } - params.put("query_vector", vectorList); - - return Query.of( - q -> - q.scriptScore( - ss -> - ss.query(mq -> mq.matchAll(m -> m)) - .script( - Script.of( - s -> - s.source( - src -> - src.scriptString( - "cosineSimilarity(params.query_vector, '" - + KNN_FIELD - + "') + 1.0")) - .lang(ScriptLanguage.Painless) - .params(convertToJsonDataMap(params)))))); - } - - private Query buildTextQuery(String queryText, SearchRequest request) { - List fields = new ArrayList<>(); - fields.add("name^5"); - fields.add("displayName^4"); - fields.add("description^2"); - fields.add("tags.tagFQN^3"); - - if ("table".equalsIgnoreCase(request.getIndex())) { - 
fields.add("columns.name^3"); - fields.add("columns.description"); - } - - return Query.of( - q -> - q.multiMatch( - m -> - m.query(queryText) - .fields(fields) - .type(TextQueryType.BestFields) - .fuzziness("AUTO"))); - } - - private Script buildRdfBoostScript() { - String scriptSource = - """ - double boost = 1.0; - - if (doc.containsKey('rdfContext.upstreamCount')) { - int upstreamCount = doc['rdfContext.upstreamCount'].value; - boost += Math.min(upstreamCount * 0.01, 0.2); - } - - if (doc.containsKey('rdfContext.downstreamCount')) { - int downstreamCount = doc['rdfContext.downstreamCount'].value; - boost += Math.min(downstreamCount * 0.02, 0.3); - } - - if (doc.containsKey('rdfContext.semanticTypes')) { - int typeCount = doc['rdfContext.semanticTypes'].size(); - boost += Math.min(typeCount * 0.05, 0.2); - } - - return boost; - """; - - return Script.of( - s -> - s.source(src -> src.scriptString(scriptSource)) - .lang(ScriptLanguage.Painless) - .params(Map.of())); - } - - private boolean isSemanticSearchEnabled(SearchRequest request) { - return request.getSemanticSearch() != null && request.getSemanticSearch(); - } - - private Map convertToJsonDataMap(Map map) { - return JsonUtils.getMap(map).entrySet().stream() - .filter(entry -> entry.getValue() != null) - .collect(Collectors.toMap(Map.Entry::getKey, entry -> JsonData.of(entry.getValue()))); - } -} diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index da5e573cb975..1865636f7aeb 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -2,24 +2,12 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; -import 
com.fasterxml.jackson.databind.node.ObjectNode; import es.co.elastic.clients.elasticsearch.ElasticsearchClient; -import es.co.elastic.clients.elasticsearch._types.Refresh; -import es.co.elastic.clients.elasticsearch._types.mapping.TypeMapping; -import es.co.elastic.clients.elasticsearch.core.BulkRequest; -import es.co.elastic.clients.elasticsearch.core.BulkResponse; -import es.co.elastic.clients.elasticsearch.core.bulk.BulkOperation; -import es.co.elastic.clients.elasticsearch.core.bulk.BulkResponseItem; -import es.co.elastic.clients.elasticsearch.indices.CreateIndexRequest; -import es.co.elastic.clients.elasticsearch.indices.ExistsRequest; -import es.co.elastic.clients.elasticsearch.indices.IndexSettings; import es.co.elastic.clients.transport.rest5_client.Rest5ClientTransport; import es.co.elastic.clients.transport.rest5_client.low_level.Request; import es.co.elastic.clients.transport.rest5_client.low_level.Response; import es.co.elastic.clients.transport.rest5_client.low_level.Rest5Client; -import jakarta.json.stream.JsonParser; import java.io.InputStream; -import java.io.StringReader; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; @@ -30,7 +18,6 @@ import lombok.Getter; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.EntityInterface; -import org.openmetadata.service.Entity; import org.openmetadata.service.events.lifecycle.EntityLifecycleEventDispatcher; import org.openmetadata.service.search.vector.client.EmbeddingClient; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; @@ -52,7 +39,7 @@ public ElasticSearchVectorService( this.client = client; this.restClient = extractRestClient(client); this.embeddingClient = embeddingClient; - this.language = language != null ? language.toLowerCase() : "en"; + this.language = language != null ? 
language.toLowerCase(java.util.Locale.ROOT) : "en"; } public ElasticSearchVectorService(ElasticsearchClient client, EmbeddingClient embeddingClient) { @@ -68,6 +55,7 @@ public static synchronized void init( ElasticsearchClient client, EmbeddingClient embeddingClient, String language) { if (instance != null) { LOG.warn("ElasticSearchVectorService already initialized, reinitializing"); + EntityLifecycleEventDispatcher.getInstance().unregisterHandler("VectorEmbeddingHandler"); } ElasticSearchVectorService svc = new ElasticSearchVectorService(client, embeddingClient, language); svc.registerVectorEmbeddingHandler(); @@ -108,7 +96,7 @@ public VectorSearchResponse search( int rawOffset = 0; long totalHits = -1L; boolean exhausted = false; - int requestedParents = from + size + 1; // one extra to determine hasMore + int requestedParents = from + size + 1; int overFetchSize = Math.max(requestedParents * OVER_FETCH_MULTIPLIER, OVER_FETCH_MULTIPLIER); if (threshold <= 0.0) { overFetchSize = Math.min(overFetchSize, k); @@ -214,73 +202,30 @@ public Map generateEmbeddingFields(EntityInterface entity) { @Override public void updateEntityEmbedding(EntityInterface entity, String entityIndexName) { - updateVectorEmbeddings(entity, entityIndexName); - } - - public void updateVectorEmbeddings(EntityInterface entity, String targetIndex) { try { - String parentId = entity.getId().toString(); - String existingFingerprint = getExistingFingerprint(targetIndex, parentId); + String entityId = entity.getId().toString(); + String existingFingerprint = getExistingFingerprint(entityIndexName, entityId); String currentFingerprint = VectorDocBuilder.computeFingerprintForEntity(entity); if (currentFingerprint.equals(existingFingerprint)) { - LOG.debug("Skipping entity {} - fingerprint unchanged", parentId); + LOG.debug("Skipping entity {} - fingerprint unchanged", entityId); return; } - List> docs = VectorDocBuilder.fromEntity(entity, embeddingClient); - deleteByParentId(targetIndex, parentId); - 
bulkIndex(docs, targetIndex); - } catch (Exception e) { - LOG.error( - "Failed to update vector embeddings for entity {}: {}", - entity.getId(), - e.getMessage(), - e); - } - } - - public void updateVectorEmbeddingsWithMigration( - EntityInterface entity, String targetIndex, String sourceIndex) { - try { - String parentId = entity.getId().toString(); - String currentFingerprint = VectorDocBuilder.computeFingerprintForEntity(entity); - - if (sourceIndex != null) { - try { - String existingFingerprint = getExistingFingerprint(sourceIndex, parentId); - if (currentFingerprint.equals(existingFingerprint)) { - if (copyExistingVectorDocuments( - sourceIndex, targetIndex, parentId, currentFingerprint)) { - return; - } - } - } catch (Exception ex) { - LOG.warn( - "Migration copy failed for entity {}, falling back to recomputation: {}", - parentId, - ex.getMessage()); - } - } - - List> docs = VectorDocBuilder.fromEntity(entity, embeddingClient); - bulkIndex(docs, targetIndex); + Map embeddingFields = generateEmbeddingFields(entity); + partialUpdateEntity(entityIndexName, entityId, embeddingFields); } catch (Exception e) { - LOG.error( - "Failed to update vector embeddings with migration for entity {}: {}", - entity.getId(), - e.getMessage(), - e); + LOG.error("Failed to update embedding for entity {}: {}", entity.getId(), e.getMessage(), e); } } @Override - public String getExistingFingerprint(String indexName, String parentId) { + public String getExistingFingerprint(String indexName, String entityId) { try { String query = "{\"size\":1,\"_source\":[\"fingerprint\"]," - + "\"query\":{\"term\":{\"parent_id\":\"" - + VectorSearchQueryBuilder.escape(parentId) + + "\"query\":{\"term\":{\"_id\":\"" + + VectorSearchQueryBuilder.escape(entityId) + "\"}}}"; String response = executeGenericRequest("POST", "/" + indexName + "/_search", query); JsonNode root = MAPPER.readTree(response); @@ -290,8 +235,8 @@ public String getExistingFingerprint(String indexName, String parentId) { } } 
catch (Exception e) { LOG.debug( - "Failed to get fingerprint for parent_id={} in index={}: {}", - parentId, + "Failed to get fingerprint for entityId={} in index={}: {}", + entityId, indexName, e.getMessage()); } @@ -300,29 +245,28 @@ public String getExistingFingerprint(String indexName, String parentId) { @Override public Map getExistingFingerprintsBatch( - String indexName, List parentIds) { - if (parentIds == null || parentIds.isEmpty()) { + String indexName, List entityIds) { + if (entityIds == null || entityIds.isEmpty()) { return Collections.emptyMap(); } try { - StringBuilder termsArray = new StringBuilder("["); - for (int i = 0; i < parentIds.size(); i++) { - if (i > 0) termsArray.append(','); - termsArray + StringBuilder idsArray = new StringBuilder("["); + for (int i = 0; i < entityIds.size(); i++) { + if (i > 0) idsArray.append(','); + idsArray .append("\"") - .append(VectorSearchQueryBuilder.escape(parentIds.get(i))) + .append(VectorSearchQueryBuilder.escape(entityIds.get(i))) .append("\""); } - termsArray.append("]"); + idsArray.append("]"); String query = "{\"size\":" - + parentIds.size() - + ",\"_source\":[\"parent_id\",\"fingerprint\"]" - + ",\"query\":{\"terms\":{\"parent_id\":" - + termsArray - + "}}" - + ",\"collapse\":{\"field\":\"parent_id\"}}"; + + entityIds.size() + + ",\"_source\":[\"fingerprint\"]" + + ",\"query\":{\"ids\":{\"values\":" + + idsArray + + "}}}"; String response = executeGenericRequest("POST", "/" + indexName + "/_search", query); JsonNode root = MAPPER.readTree(response); @@ -330,10 +274,10 @@ public Map getExistingFingerprintsBatch( Map result = new HashMap<>(); for (JsonNode hit : hits) { - String pid = hit.path("_source").path("parent_id").asText(); + String id = hit.path("_id").asText(); String fp = hit.path("_source").path("fingerprint").asText(null); - if (pid != null && fp != null) { - result.put(pid, fp); + if (id != null && fp != null) { + result.put(id, fp); } } return result; @@ -343,251 +287,16 @@ public Map 
getExistingFingerprintsBatch( } } - @SuppressWarnings("unchecked") - public boolean copyExistingVectorDocuments( - String sourceIndex, String targetIndex, String parentId, String fingerprint) { + public void partialUpdateEntity( + String indexName, String entityId, Map embeddingFields) { try { - String searchQuery = - "{\"size\":1000,\"query\":{\"term\":{\"parent_id\":\"" - + VectorSearchQueryBuilder.escape(parentId) - + "\"}}}"; - String response = executeGenericRequest("POST", "/" + sourceIndex + "/_search", searchQuery); - JsonNode root = MAPPER.readTree(response); - JsonNode hits = root.path("hits").path("hits"); - - if (!hits.isArray() || hits.isEmpty()) { - return false; - } - - List> docs = new ArrayList<>(); - for (JsonNode hit : hits) { - Map source = MAPPER.convertValue(hit.path("_source"), Map.class); - source.put("fingerprint", fingerprint); - docs.add(source); - } - bulkIndex(docs, targetIndex); - return true; - } catch (Exception e) { - LOG.error( - "Failed to copy vector documents from {} to {} for parent_id={}: {}", - sourceIndex, - targetIndex, - parentId, - e.getMessage(), - e); - return false; - } - } - - public void softDeleteEmbeddings(EntityInterface entity) { - try { - String parentId = entity.getId().toString(); - String indexName = getIndexAlias(); - String script = - "{\"script\":{\"source\":\"ctx._source.deleted = true\"}," - + "\"query\":{\"term\":{\"parent_id\":\"" - + VectorSearchQueryBuilder.escape(parentId) - + "\"}}}"; - executeGenericRequest("POST", "/" + indexName + "/_update_by_query", script); - } catch (Exception e) { - LOG.error( - "Failed to soft delete embeddings for entity {}: {}", entity.getId(), e.getMessage(), e); - } - } - - public void hardDeleteEmbeddings(EntityInterface entity) { - try { - String parentId = entity.getId().toString(); - String indexName = getIndexAlias(); - deleteByParentId(indexName, parentId); - } catch (Exception e) { - LOG.error( - "Failed to hard delete embeddings for entity {}: {}", 
entity.getId(), e.getMessage(), e); - } - } - - public void restoreEmbeddings(EntityInterface entity) { - try { - String parentId = entity.getId().toString(); - String indexName = getIndexAlias(); - String script = - "{\"script\":{\"source\":\"ctx._source.deleted = false\"}," - + "\"query\":{\"term\":{\"parent_id\":\"" - + VectorSearchQueryBuilder.escape(parentId) - + "\"}}}"; - executeGenericRequest("POST", "/" + indexName + "/_update_by_query", script); - } catch (Exception e) { - LOG.error( - "Failed to restore embeddings for entity {}: {}", entity.getId(), e.getMessage(), e); - } - } - - private void deleteByParentId(String indexName, String parentId) { - try { - String query = - "{\"query\":{\"term\":{\"parent_id\":\"" - + VectorSearchQueryBuilder.escape(parentId) - + "\"}}}"; - executeGenericRequest("POST", "/" + indexName + "/_delete_by_query", query); + String docJson = MAPPER.writeValueAsString(embeddingFields); + String updateBody = "{\"doc\":" + docJson + "}"; + executeGenericRequest( + "POST", "/" + indexName + "/_update/" + entityId + "?retry_on_conflict=3", updateBody); } catch (Exception e) { LOG.error( - "Failed to delete by parent_id={} in index={}: {}", - parentId, - indexName, - e.getMessage(), - e); - } - } - - - public void createOrUpdateIndex(int dimension) { - try { - if (indexExists()) { - LOG.info("Vector index {} already exists", getIndexAlias()); - return; - } - - String mappingJson = loadIndexMapping(dimension); - JsonNode rootNode = MAPPER.readTree(mappingJson); - JsonNode mappingsNode = rootNode.get("mappings"); - JsonNode settingsNode = rootNode.get("settings"); - - CreateIndexRequest request = - CreateIndexRequest.of( - builder -> { - builder.index(getIndexAlias()); - - if (mappingsNode != null && !mappingsNode.isNull()) { - TypeMapping typeMapping = parseTypeMapping(mappingsNode); - builder.mappings(typeMapping); - } - - if (settingsNode != null && !settingsNode.isNull()) { - IndexSettings settings = 
parseIndexSettings(settingsNode); - builder.settings(settings); - } - - return builder; - }); - client.indices().create(request); - - LOG.info("Created vector index {} with dimension {}", getIndexAlias(), dimension); - } catch (Exception e) { - LOG.error("Failed to create vector index: {}", e.getMessage(), e); - } - } - - public boolean indexExists() { - try { - ExistsRequest request = ExistsRequest.of(b -> b.index(getIndexAlias())); - return client.indices().exists(request).value(); - } catch (Exception e) { - LOG.error("Failed to check if vector index exists: {}", e.getMessage(), e); - return false; - } - } - - public String getIndexName() { - return getIndexAlias(); - } - - @SuppressWarnings("unchecked") - public void bulkIndex(List> documents, String targetIndex) { - if (documents == null || documents.isEmpty()) { - return; - } - - try { - List operations = new ArrayList<>(); - for (int i = 0; i < documents.size(); i++) { - Map doc = documents.get(i); - String parentId = (String) doc.get("parentId"); - int chunkIndex = doc.containsKey("chunkIndex") ? 
(int) doc.get("chunkIndex") : i; - String docId = parentId + "-" + chunkIndex; - - operations.add( - BulkOperation.of( - op -> op.index(idx -> idx.index(targetIndex).id(docId).document(doc)))); - } - - BulkRequest bulkRequest = - BulkRequest.of(b -> b.operations(operations).refresh(Refresh.False)); - BulkResponse response = client.bulk(bulkRequest); - - if (response.errors()) { - long errorCount = 0; - for (BulkResponseItem item : response.items()) { - if (item.error() != null) { - errorCount++; - LOG.warn( - "Bulk vector indexing error for document [{}] in [{}]: type={}, reason={}", - item.id(), - targetIndex, - item.error().type(), - item.error().reason()); - } - } - LOG.warn( - "Bulk vector indexing completed with {}/{} errors in {}", - errorCount, - documents.size(), - targetIndex); - } else { - LOG.debug( - "Successfully bulk indexed {} vector documents in {}", documents.size(), targetIndex); - } - } catch (Exception e) { - LOG.error("Bulk vector indexing failed in {}: {}", targetIndex, e.getMessage(), e); - } - } - - private TypeMapping parseTypeMapping(JsonNode mappingsNode) { - JsonParser parser = - client - ._transport() - .jsonpMapper() - .jsonProvider() - .createParser(new StringReader(mappingsNode.toString())); - return TypeMapping._DESERIALIZER.deserialize(parser, client._transport().jsonpMapper()); - } - - private IndexSettings parseIndexSettings(JsonNode settingsNode) { - JsonParser parser = - client - ._transport() - .jsonpMapper() - .jsonProvider() - .createParser(new StringReader(settingsNode.toString())); - return IndexSettings._DESERIALIZER.deserialize(parser, client._transport().jsonpMapper()); - } - - private String loadIndexMapping(int dimension) { - String resourcePath = "elasticsearch/" + language + "/vector_search_index_es_native.json"; - try (InputStream inputStream = getClass().getClassLoader().getResourceAsStream(resourcePath)) { - if (inputStream == null) { - throw new IllegalStateException("Could not find " + resourcePath + " in 
classpath"); - } - String template = new String(inputStream.readAllBytes(), StandardCharsets.UTF_8); - return patchDimension(template, dimension); - } catch (Exception e) { - throw new RuntimeException("Failed to load vector search index mapping", e); - } - } - - static String patchDimension(String mappingJson, int dimension) { - try { - JsonNode root = MAPPER.readTree(mappingJson); - JsonNode properties = root.path("mappings").path("properties"); - if (!properties.isMissingNode() && properties.has("embedding")) { - ObjectNode embeddingNode = (ObjectNode) properties.get("embedding"); - if (embeddingNode.has("dims")) { - embeddingNode.put("dims", dimension); - } - } - return MAPPER.writeValueAsString(root); - } catch (Exception e) { - throw new IllegalStateException( - "Failed to patch dimension in vector index mapping template", e); + "Failed to partial update entity {} in {}: {}", entityId, indexName, e.getMessage(), e); } } diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java index ca3753f16438..1861c9293270 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/OpenSearchVectorService.java @@ -42,9 +42,11 @@ public OpenSearchVectorService(OpenSearchClient client, EmbeddingClient embeddin public static synchronized void init(OpenSearchClient client, EmbeddingClient embeddingClient) { if (instance != null) { LOG.warn("OpenSearchVectorService already initialized, reinitializing"); + EntityLifecycleEventDispatcher.getInstance().unregisterHandler("VectorEmbeddingHandler"); } - instance = new OpenSearchVectorService(client, embeddingClient); - instance.registerVectorEmbeddingHandler(); + OpenSearchVectorService svc = new OpenSearchVectorService(client, 
embeddingClient); + svc.registerVectorEmbeddingHandler(); + instance = svc; LOG.info( "OpenSearchVectorService initialized with model={}, dimension={}", embeddingClient.getModelId(), diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java index 043ec4983ef1..ba9d0d4d27e3 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/apps/bundles/searchIndex/ElasticSearchBulkSinkSimpleTest.java @@ -5,7 +5,6 @@ import static org.mockito.Mockito.lenient; import es.co.elastic.clients.elasticsearch.ElasticsearchClient; -import java.util.Collections; import java.util.HashMap; import java.util.Map; import org.junit.jupiter.api.BeforeEach; @@ -89,9 +88,4 @@ void testIsVectorEmbeddingEnabledForEntity() { assertEquals(false, elasticSearchBulkSink.isVectorEmbeddingEnabledForEntity("dashboard")); } - @Test - void testAddEntitiesToVectorIndexBatch() { - elasticSearchBulkSink.addEntitiesToVectorIndexBatch( - null, Collections.emptyList(), true, null, null); - } } diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java index f657d62164dc..52090477465e 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -9,11 +9,8 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import es.co.elastic.clients.elasticsearch.core.BulkRequest; -import 
es.co.elastic.clients.elasticsearch.core.BulkResponse; import java.util.HashMap; import java.util.List; -import org.mockito.ArgumentCaptor; import es.co.elastic.clients.elasticsearch.ElasticsearchClient; import es.co.elastic.clients.transport.rest5_client.Rest5ClientTransport; @@ -383,27 +380,6 @@ void testTotalHitsNullWhenMissingFromResponse() throws Exception { assertNull(results.totalHits, "totalHits should be null when not present in response"); } - @Test - void testBulkIndexUsesParentIdAndChunkIndexForDocumentId() throws Exception { - BulkResponse mockBulkResponse = mock(BulkResponse.class); - when(mockBulkResponse.errors()).thenReturn(false); - when(mockBulkResponse.items()).thenReturn(List.of()); - ArgumentCaptor captor = ArgumentCaptor.forClass(BulkRequest.class); - when(mockEsClient.bulk(captor.capture())).thenReturn(mockBulkResponse); - - Map doc = new HashMap<>(); - doc.put("parentId", "entity-abc"); - doc.put("chunkIndex", 3); - doc.put("embedding", new float[] {0.1f, 0.2f}); - - vectorService.bulkIndex(List.of(doc), "test-index"); - - BulkRequest captured = captor.getValue(); - assertEquals(1, captured.operations().size()); - assertEquals("entity-abc-3", captured.operations().get(0).index().id(), - "Doc ID must be parentId-chunkIndex using camelCase field names from VectorDocBuilder"); - } - @Test void testGetExistingFingerprintReturnsNullWhenNotFound() throws Exception { String esResponse = "{\"hits\":{\"total\":{\"value\":0},\"hits\":[]}}"; @@ -450,48 +426,6 @@ void testGetExistingFingerprintsBatchReturnsEmptyForEmptyInput() { assertTrue(result.isEmpty()); } - @Test - void testPatchDimensionReplacesDims() throws Exception { - String mapping = - """ - {"mappings":{"properties":{"embedding":{"type":"dense_vector","dims":512}}}} - """; - String patched = ElasticSearchVectorService.patchDimension(mapping, 1536); - com.fasterxml.jackson.databind.JsonNode root = - new com.fasterxml.jackson.databind.ObjectMapper().readTree(patched); - int dims = 
root.path("mappings").path("properties").path("embedding").path("dims").asInt(); - assertEquals(1536, dims); - } - - @Test - void testPatchDimensionLeavesOtherFieldsUntouched() throws Exception { - String mapping = - """ - {"mappings":{"properties":{"embedding":{"type":"dense_vector","dims":512,"similarity":"cosine"}}}} - """; - String patched = ElasticSearchVectorService.patchDimension(mapping, 768); - com.fasterxml.jackson.databind.JsonNode root = - new com.fasterxml.jackson.databind.ObjectMapper().readTree(patched); - com.fasterxml.jackson.databind.JsonNode embedding = - root.path("mappings").path("properties").path("embedding"); - assertEquals(768, embedding.path("dims").asInt()); - assertEquals("dense_vector", embedding.path("type").asText()); - assertEquals("cosine", embedding.path("similarity").asText()); - } - - @Test - void testPatchDimensionHandlesNoSpaceVariant() throws Exception { - String mapping = - """ - {"mappings":{"properties":{"embedding":{"type":"dense_vector","dims":512}}}} - """; - String patched = ElasticSearchVectorService.patchDimension(mapping, 384); - com.fasterxml.jackson.databind.JsonNode root = - new com.fasterxml.jackson.databind.ObjectMapper().readTree(patched); - assertEquals( - 384, root.path("mappings").path("properties").path("embedding").path("dims").asInt()); - } - /** Returns a fresh stream on every call — safe for multi-iteration loops. 
*/ private void mockRestClientResponse(String responseJson) throws Exception { Response mockResponse = mock(Response.class); diff --git a/openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json deleted file mode 100644 index 1de030834943..000000000000 --- a/openmetadata-spec/src/main/resources/elasticsearch/en/vector_search_index_es_native.json +++ /dev/null @@ -1,293 +0,0 @@ -{ - "settings": { - "analysis": { - "normalizer": { - "lowercase_normalizer": { - "type": "custom", - "filter": [ - "lowercase" - ] - } - }, - "filter": { - "om_stemmer": { - "type": "stemmer", - "name": "english" - }, - "word_delimiter_filter": { - "type": "word_delimiter", - "preserve_original": "true" - } - }, - "analyzer": { - "om_analyzer": { - "tokenizer": "standard", - "filter": [ - "lowercase", - "word_delimiter_filter", - "om_stemmer" - ] - } - } - } - }, - "mappings": { - "properties": { - "embedding": { - "type": "dense_vector", - "dims": 512, - "index": true, - "similarity": "cosine" - }, - "text_to_embed": { - "type": "text" - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - } - }, - "fullyQualifiedName": { - "type": "keyword" - }, - "entityType": { - "type": "keyword", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer", - "ignore_above": 256 - } - } - }, - "serviceType": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "parent_id": { - "type": "keyword" - }, - "chunk_index": { - "type": "integer" - }, - "chunk_count": { - "type": "integer" - }, - "tags": { - "type": "nested", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, 
- "tier": { - "type": "object", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "certification": { - "type": "object", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "domains": { - "type": "object", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "displayName": { - "type": "text" - } - } - }, - "owners": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text" - } - } - }, - "customProperties": { - "type": "object" - }, - "sourceId": { - "type": "keyword" - }, - "deleted": { - "type": "boolean" - }, - "fingerprint": { - "type": "keyword" - }, - "upVotes": { - "type": "integer" - }, - "downVotes": { - "type": "integer" - }, - "totalVotes": { - "type": "integer" - }, - "followersCount": { - "type": "integer" - }, - "usageSummary": { - "type": "object", - "properties": { - "dailyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - } - } - }, - "weeklyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - }, - "monthlyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - } - } - }, - "synonyms": { - "type": "keyword" - }, - "relatedTerms": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - 
"type": "keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text" - }, - "fullyQualifiedName": { - "type": "keyword" - } - } - }, - "metricExpression": { - "type": "object", - "properties": { - "language": { - "type": "keyword" - }, - "code": { - "type": "text", - "analyzer": "om_analyzer" - } - } - }, - "metricType": { - "type": "keyword" - }, - "unitOfMeasurement": { - "type": "keyword" - }, - "customUnitOfMeasurement": { - "type": "keyword" - }, - "granularity": { - "type": "keyword" - }, - "relatedMetrics": { - "type": "keyword" - } - } - } -} diff --git a/openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json deleted file mode 100644 index 606bdc0a916c..000000000000 --- a/openmetadata-spec/src/main/resources/elasticsearch/jp/vector_search_index_es_native.json +++ /dev/null @@ -1,293 +0,0 @@ -{ - "settings": { - "analysis": { - "normalizer": { - "lowercase_normalizer": { - "type": "custom", - "filter": [ - "lowercase" - ] - } - }, - "filter": { - "om_stemmer": { - "type": "stemmer", - "name": "english" - }, - "word_delimiter_filter": { - "type": "word_delimiter", - "preserve_original": "true" - } - }, - "analyzer": { - "om_analyzer": { - "tokenizer": "standard", - "filter": [ - "lowercase", - "word_delimiter_filter", - "om_stemmer" - ] - } - } - } - }, - "mappings": { - "properties": { - "embedding": { - "type": "dense_vector", - "dims": 512, - "index": true, - "similarity": "cosine" - }, - "text_to_embed": { - "type": "text" - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - } - }, - "fullyQualifiedName": { - "type": "keyword" - }, - "entityType": { - "type": "keyword", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer", - "ignore_above": 256 - } - } - }, - "serviceType": { - "type": "keyword", - "normalizer": "lowercase_normalizer" 
- }, - "parent_id": { - "type": "keyword" - }, - "chunk_index": { - "type": "integer" - }, - "chunk_count": { - "type": "integer" - }, - "tags": { - "type": "nested", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "tier": { - "type": "object", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "certification": { - "type": "object", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "domains": { - "type": "object", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "displayName": { - "type": "text" - } - } - }, - "owners": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text" - } - } - }, - "customProperties": { - "type": "object" - }, - "sourceId": { - "type": "keyword" - }, - "deleted": { - "type": "boolean" - }, - "fingerprint": { - "type": "keyword" - }, - "upVotes": { - "type": "integer" - }, - "downVotes": { - "type": "integer" - }, - "totalVotes": { - "type": "integer" - }, - "followersCount": { - "type": "integer" - }, - "synonyms": { - "type": "keyword" - }, - "relatedTerms": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": 
"keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text" - }, - "fullyQualifiedName": { - "type": "keyword" - } - } - }, - "usageSummary": { - "type": "object", - "properties": { - "dailyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - } - } - }, - "weeklyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - }, - "monthlyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - } - } - }, - "metricExpression": { - "type": "object", - "properties": { - "language": { - "type": "keyword" - }, - "code": { - "type": "text", - "analyzer": "om_analyzer" - } - } - }, - "metricType": { - "type": "keyword" - }, - "unitOfMeasurement": { - "type": "keyword" - }, - "customUnitOfMeasurement": { - "type": "keyword" - }, - "granularity": { - "type": "keyword" - }, - "relatedMetrics": { - "type": "keyword" - } - } - } -} diff --git a/openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json deleted file mode 100644 index 6f621f1fdb80..000000000000 --- a/openmetadata-spec/src/main/resources/elasticsearch/ru/vector_search_index_es_native.json +++ /dev/null @@ -1,410 +0,0 @@ -{ - "settings": { - "index": { - "max_ngram_diff": 17 - }, - "analysis": { - "tokenizer": { - "n_gram_tokenizer": { - "type": "ngram", - "min_gram": 3, - "max_gram": 20, - "token_chars": [ - "letter", - "digit" - ] - } - }, - "normalizer": { - "lowercase_normalizer": { - "type": "custom", - "filter": [ - "lowercase", - "asciifolding" - ] - } - }, - "filter": { - "word_delimiter_filter": { - "type": "word_delimiter", - "preserve_original": true - }, - "compound_word_delimiter_graph": { - "type": "word_delimiter_graph", - "generate_word_parts": true, - 
"generate_number_parts": true, - "split_on_case_change": true, - "split_on_numerics": true, - "catenate_words": false, - "catenate_numbers": false, - "catenate_all": false, - "preserve_original": true, - "stem_english_possessive": true - }, - "russian_stop": { - "type": "stop", - "stopwords": "_russian_" - }, - "english_stop": { - "type": "stop", - "stopwords": "_english_" - }, - "russian_snowball": { - "name": "russian", - "type": "stemmer" - }, - "om_kstem": { - "type": "kstem" - }, - "asciifolding": { - "type": "asciifolding" - } - }, - "analyzer": { - "om_analyzer": { - "tokenizer": "standard", - "filter": [ - "word_delimiter_filter", - "lowercase", - "asciifolding", - "russian_stop", - "russian_snowball", - "english_stop", - "om_kstem" - ] - }, - "om_ngram": { - "type": "custom", - "tokenizer": "n_gram_tokenizer", - "filter": [ - "lowercase" - ] - }, - "om_compound_analyzer": { - "tokenizer": "standard", - "filter": [ - "compound_word_delimiter_graph", - "lowercase", - "flatten_graph" - ] - } - } - } - }, - "mappings": { - "properties": { - "embedding": { - "type": "dense_vector", - "dims": 512, - "index": true, - "similarity": "cosine" - }, - "text_to_embed": { - "type": "text" - }, - "name": { - "type": "text", - "analyzer": "om_analyzer", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "ngram": { - "type": "text", - "analyzer": "om_ngram" - }, - "compound": { - "type": "text", - "analyzer": "om_compound_analyzer" - } - } - }, - "fullyQualifiedName": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "entityType": { - "type": "keyword", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer", - "ignore_above": 256 - } - } - }, - "serviceType": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "parent_id": { - "type": "keyword" - }, - "chunk_index": { - "type": "integer" - }, - "chunk_count": { - "type": "integer" - }, - "tags": { - "type": 
"nested", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "tier": { - "type": "object", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "certification": { - "type": "object", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "domains": { - "type": "object", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "displayName": { - "type": "text", - "analyzer": "om_analyzer", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "ngram": { - "type": "text", - "analyzer": "om_ngram" - }, - "compound": { - "type": "text", - "analyzer": "om_compound_analyzer" - } - } - } - } - }, - "owners": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text", - "analyzer": "om_analyzer", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "ngram": { - "type": "text", - "analyzer": "om_ngram" - }, - "compound": { - "type": "text", - "analyzer": "om_compound_analyzer" - } - } - } - } - }, - "customProperties": { - "type": "object" - }, - "sourceId": { - "type": "keyword" - }, - "deleted": { - "type": "boolean" - }, - 
"fingerprint": { - "type": "keyword" - }, - "upVotes": { - "type": "integer" - }, - "downVotes": { - "type": "integer" - }, - "totalVotes": { - "type": "integer" - }, - "followersCount": { - "type": "integer" - }, - "synonyms": { - "type": "keyword" - }, - "relatedTerms": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text", - "analyzer": "om_analyzer", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "ngram": { - "type": "text", - "analyzer": "om_ngram" - }, - "compound": { - "type": "text", - "analyzer": "om_compound_analyzer" - } - } - }, - "fullyQualifiedName": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - } - } - }, - "usageSummary": { - "type": "object", - "properties": { - "dailyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - } - } - }, - "weeklyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - }, - "monthlyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - } - } - }, - "metricExpression": { - "type": "object", - "properties": { - "language": { - "type": "keyword" - }, - "code": { - "type": "text", - "analyzer": "om_analyzer" - } - } - }, - "metricType": { - "type": "keyword" - }, - "unitOfMeasurement": { - "type": "keyword" - }, - "customUnitOfMeasurement": { - "type": "keyword" - }, - "granularity": { - "type": "keyword" - }, - "relatedMetrics": { - "type": "keyword" - } - } - } -} diff --git a/openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json b/openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json deleted file mode 100644 index 606bdc0a916c..000000000000 --- 
a/openmetadata-spec/src/main/resources/elasticsearch/zh/vector_search_index_es_native.json +++ /dev/null @@ -1,293 +0,0 @@ -{ - "settings": { - "analysis": { - "normalizer": { - "lowercase_normalizer": { - "type": "custom", - "filter": [ - "lowercase" - ] - } - }, - "filter": { - "om_stemmer": { - "type": "stemmer", - "name": "english" - }, - "word_delimiter_filter": { - "type": "word_delimiter", - "preserve_original": "true" - } - }, - "analyzer": { - "om_analyzer": { - "tokenizer": "standard", - "filter": [ - "lowercase", - "word_delimiter_filter", - "om_stemmer" - ] - } - } - } - }, - "mappings": { - "properties": { - "embedding": { - "type": "dense_vector", - "dims": 512, - "index": true, - "similarity": "cosine" - }, - "text_to_embed": { - "type": "text" - }, - "name": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword" - } - } - }, - "fullyQualifiedName": { - "type": "keyword" - }, - "entityType": { - "type": "keyword", - "fields": { - "keyword": { - "type": "keyword", - "normalizer": "lowercase_normalizer", - "ignore_above": 256 - } - } - }, - "serviceType": { - "type": "keyword", - "normalizer": "lowercase_normalizer" - }, - "parent_id": { - "type": "keyword" - }, - "chunk_index": { - "type": "integer" - }, - "chunk_count": { - "type": "integer" - }, - "tags": { - "type": "nested", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "tier": { - "type": "object", - "properties": { - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "certification": { - "type": "object", - "properties": 
{ - "tagFQN": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "labelType": { - "type": "keyword" - }, - "description": { - "type": "text", - "analyzer": "om_analyzer" - }, - "source": { - "type": "keyword" - }, - "state": { - "type": "keyword" - } - } - }, - "domains": { - "type": "object", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "displayName": { - "type": "text" - } - } - }, - "owners": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text" - } - } - }, - "customProperties": { - "type": "object" - }, - "sourceId": { - "type": "keyword" - }, - "deleted": { - "type": "boolean" - }, - "fingerprint": { - "type": "keyword" - }, - "upVotes": { - "type": "integer" - }, - "downVotes": { - "type": "integer" - }, - "totalVotes": { - "type": "integer" - }, - "followersCount": { - "type": "integer" - }, - "synonyms": { - "type": "keyword" - }, - "relatedTerms": { - "type": "nested", - "properties": { - "id": { - "type": "keyword" - }, - "name": { - "type": "keyword" - }, - "type": { - "type": "keyword" - }, - "displayName": { - "type": "text" - }, - "fullyQualifiedName": { - "type": "keyword" - } - } - }, - "usageSummary": { - "type": "object", - "properties": { - "dailyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - } - } - }, - "weeklyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - }, - "monthlyStats": { - "type": "object", - "properties": { - "count": { - "type": "integer" - }, - "percentileRank": { - "type": "double" - } - } - } - } - }, - "metricExpression": { - "type": "object", - "properties": { - "language": { - "type": "keyword" - }, - "code": { - "type": "text", - "analyzer": "om_analyzer" - } - } - }, - "metricType": { - "type": "keyword" - }, - 
"unitOfMeasurement": { - "type": "keyword" - }, - "customUnitOfMeasurement": { - "type": "keyword" - }, - "granularity": { - "type": "keyword" - }, - "relatedMetrics": { - "type": "keyword" - } - } - } -} diff --git a/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json b/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json index b9fb3d87ad6c..470503f3c3f3 100644 --- a/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json +++ b/openmetadata-spec/src/main/resources/json/schema/search/searchRequest.json @@ -109,11 +109,6 @@ "description": "Include aggregations in the search response. Defaults to true. Set to false to skip aggregations for faster response times when only search results are needed.", "type": "boolean", "default": true - }, - "semanticSearch": { - "description": "If true, use semantic (vector) search instead of keyword search.", - "type": "boolean", - "default": false } }, "additionalProperties": false From 7d82e95a8d0c2952541aeb496299b43f1925858d Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 24 Apr 2026 22:35:57 -0300 Subject: [PATCH 11/18] fix: make kNN num_candidates configurable and improve default recall - Add knnNumCandidatesMultiplier to NaturalLanguageSearchConfiguration schema (default 2). Operators can tune recall vs latency by adjusting this value in openmetadata.yaml under naturalLanguageSearch. 
- Change buildNativeESQuery to accept a numCandidatesMultiplier param; add no-arg overload that uses DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER (2) - ElasticSearchVectorService.search() reads the multiplier from config via resolveNumCandidatesMultiplier(), falling back to the default - Remove nestedTags path from appendFilterMustClauses: entity indices use flat tags objects, so nested tags query is never correct here - Remove appendNested/appendOneNestedQuery dead code - Update tests to match new num_candidates = max(k * multiplier, 100) Co-Authored-By: Claude Sonnet 4.6 --- .../vector/ElasticSearchVectorService.java | 18 +++++- .../vector/VectorSearchQueryBuilder.java | 61 +++++-------------- .../vector/VectorSearchQueryBuilderTest.java | 17 ++++-- .../elasticSearchConfiguration.json | 6 ++ 4 files changed, 49 insertions(+), 53 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index 1865636f7aeb..429743c238d8 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -18,7 +18,10 @@ import lombok.Getter; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.EntityInterface; +import org.openmetadata.schema.service.configuration.elasticsearch.NaturalLanguageSearchConfiguration; +import org.openmetadata.service.Entity; import org.openmetadata.service.events.lifecycle.EntityLifecycleEventDispatcher; +import org.openmetadata.service.search.SearchRepository; import org.openmetadata.service.search.vector.client.EmbeddingClient; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; @@ -102,11 +105,12 @@ public VectorSearchResponse search( overFetchSize = Math.min(overFetchSize, k); } 
+ int numCandidatesMultiplier = resolveNumCandidatesMultiplier(); String indexName = getIndexAlias(); while (!exhausted && byParent.size() < requestedParents) { String queryJson = VectorSearchQueryBuilder.buildNativeESQuery( - queryVector, overFetchSize, rawOffset, k, filters); + queryVector, overFetchSize, rawOffset, k, filters, numCandidatesMultiplier); String responseBody = executeGenericRequest("POST", "/" + indexName + "/_search", queryJson); JsonNode root = MAPPER.readTree(responseBody); @@ -300,6 +304,18 @@ public void partialUpdateEntity( } } + private static int resolveNumCandidatesMultiplier() { + SearchRepository repo = Entity.getSearchRepository(); + if (repo == null) { + return VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER; + } + NaturalLanguageSearchConfiguration cfg = repo.getSearchConfiguration().getNaturalLanguageSearch(); + if (cfg == null || cfg.getKnnNumCandidatesMultiplier() == null) { + return VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER; + } + return cfg.getKnnNumCandidatesMultiplier(); + } + public void close() { try { if (client != null && client._transport() != null) { diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java index f40dfa300911..964781344278 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java @@ -14,6 +14,7 @@ public class VectorSearchQueryBuilder { private static final Logger LOG = LoggerFactory.getLogger(VectorSearchQueryBuilder.class); private static final String ANY = "__ANY__"; private static final String NONE = "__NONE__"; + static final int DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER = 2; /** Build a full search request body (size + _source + query) for 
standalone vector search. */ public static String build( @@ -74,7 +75,17 @@ private static void appendKnnQuery( public static String buildNativeESQuery( float[] vector, int size, int from, int k, Map> filters) { - int numCandidates = Math.max(k, 100); + return buildNativeESQuery(vector, size, from, k, filters, DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER); + } + + public static String buildNativeESQuery( + float[] vector, + int size, + int from, + int k, + Map> filters, + int numCandidatesMultiplier) { + int numCandidates = Math.max(k * numCandidatesMultiplier, 100); StringBuilder sb = new StringBuilder(512) .append("{\"size\":") @@ -92,24 +103,15 @@ public static String buildNativeESQuery( .append(numCandidates); sb.append(",\"filter\":{\"bool\":{\"must\":["); - appendFilterMustClauses(sb, filters, true); + appendFilterMustClauses(sb, filters); sb.append("]}}"); // close must array and bool sb.append("}}"); // close knn object return sb.toString(); } - private static void appendFilterMustClauses(StringBuilder sb, Map> filters) { - appendFilterMustClauses(sb, filters, false); - } - - /** - * @param nestedTags when true, emits a nested query for "tags" (required by the ES-native vector - * index where tags is mapped as nested); when false, emits a flat terms query (used by - * OpenSearch, which queries the regular entity indices where tags is a plain object). 
- */ private static void appendFilterMustClauses( - StringBuilder sb, Map> filters, boolean nestedTags) { + StringBuilder sb, Map> filters) { sb.append("{\"term\":{\"deleted\":false}}"); for (var e : filters.entrySet()) { String field = e.getKey(); @@ -126,11 +128,7 @@ private static void appendFilterMustClauses( } case "tags" -> { sb.append(','); - if (nestedTags) { - appendNested(sb, "tags", "tags.tagFQN", values); - } else { - appendFlat(sb, "tags.tagFQN", values); - } + appendFlat(sb, "tags.tagFQN", values); } case "domains" -> { sb.append(','); @@ -170,35 +168,6 @@ private static void appendFilterMustClauses( } } - private static void appendNested(StringBuilder sb, String path, String field, List vals) { - sb.append("{\"nested\":{\"path\":\"").append(path).append("\",\"query\":"); - if (vals.size() == 1) { - appendOneNestedQuery(sb, field, vals.get(0)); - } else { - sb.append("{\"bool\":{\"should\":["); - for (int i = 0; i < vals.size(); i++) { - if (i > 0) sb.append(','); - appendOneNestedQuery(sb, field, vals.get(i)); - } - sb.append("]}}"); - } - sb.append("}}"); - } - - private static void appendOneNestedQuery(StringBuilder sb, String field, String val) { - switch (val) { - case ANY -> sb.append("{\"exists\":{\"field\":\"").append(field).append("\"}}"); - case NONE -> sb.append("{\"bool\":{\"must_not\":{\"exists\":{\"field\":\"") - .append(field) - .append("\"}}}}"); - default -> sb.append("{\"term\":{\"") - .append(field) - .append("\":\"") - .append(escape(val)) - .append("\"}}"); - } - } - private static void appendFlat(StringBuilder sb, String field, List vals) { if (vals.size() == 1) { appendOneFlat(sb, field, vals.get(0)); diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java index 0dfa14264c66..e5456ca9c541 100644 --- 
a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilderTest.java @@ -736,15 +736,20 @@ void testNativeESQueryTopLevelKnnStructure() throws Exception { void testNativeESQueryNumCandidates() throws Exception { float[] vector = {0.1f}; - // k < 100 → num_candidates should be 100 - String query1 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 50, Map.of()); + // default multiplier (2): k * 2 < 100 → num_candidates should be 100 + String query1 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 30, Map.of()); JsonNode root1 = MAPPER.readTree(query1); assertEquals(100, root1.get("knn").get("num_candidates").asInt()); - // k > 100 → num_candidates should equal k + // default multiplier (2): k * 2 > 100 → num_candidates should be k * 2 String query2 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 200, Map.of()); JsonNode root2 = MAPPER.readTree(query2); - assertEquals(200, root2.get("knn").get("num_candidates").asInt()); + assertEquals(400, root2.get("knn").get("num_candidates").asInt()); + + // custom multiplier (5): num_candidates = max(k * 5, 100) + String query3 = VectorSearchQueryBuilder.buildNativeESQuery(vector, 10, 0, 100, Map.of(), 5); + JsonNode root3 = MAPPER.readTree(query3); + assertEquals(500, root3.get("knn").get("num_candidates").asInt()); } @Test @@ -815,8 +820,8 @@ void testNativeESQueryWithTagsFilter() throws Exception { assertEquals(2, mustFilters.size()); JsonNode tagsFilter = mustFilters.get(1); - assertTrue(tagsFilter.has("nested")); - assertEquals("tags", tagsFilter.get("nested").get("path").asText()); + assertTrue(tagsFilter.has("term")); + assertEquals("PII.Sensitive", tagsFilter.get("term").get("tags.tagFQN").asText()); } @Test diff --git a/openmetadata-spec/src/main/resources/json/schema/configuration/elasticSearchConfiguration.json 
b/openmetadata-spec/src/main/resources/json/schema/configuration/elasticSearchConfiguration.json index 38ca4c3f5779..78775d4547ac 100644 --- a/openmetadata-spec/src/main/resources/json/schema/configuration/elasticSearchConfiguration.json +++ b/openmetadata-spec/src/main/resources/json/schema/configuration/elasticSearchConfiguration.json @@ -159,6 +159,12 @@ "default": 10, "minimum": 1 }, + "knnNumCandidatesMultiplier": { + "description": "Multiplier applied to k when computing num_candidates for Elasticsearch kNN vector search. num_candidates = max(k * multiplier, 100). Higher values improve recall at the cost of latency. Defaults to 2.", + "type": "integer", + "default": 2, + "minimum": 1 + }, "providerClass": { "description": "Fully qualified class name of the NLQService implementation to use", "type": "string", From 5a9a0a543b2031736cdd1c7ad362915ceb6d8cce Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Fri, 24 Apr 2026 22:38:45 -0300 Subject: [PATCH 12/18] Update generated TypeScript types --- .../src/generated/configuration/aiPlatformConfiguration.ts | 6 +----- .../generated/configuration/elasticSearchConfiguration.ts | 6 ++++++ .../main/resources/ui/src/generated/settings/settings.ts | 6 ++++++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/configuration/aiPlatformConfiguration.ts b/openmetadata-ui/src/main/resources/ui/src/generated/configuration/aiPlatformConfiguration.ts index ef75b021c9f5..f1f5cb02eca5 100644 --- a/openmetadata-ui/src/main/resources/ui/src/generated/configuration/aiPlatformConfiguration.ts +++ b/openmetadata-ui/src/main/resources/ui/src/generated/configuration/aiPlatformConfiguration.ts @@ -89,11 +89,7 @@ export interface GrpcConfiguration { */ port: number; /** - * Deadline (minutes) Collate enforces on an AI Platform streaming response. 
Carried on the - * gRPC call, so the AI Platform reads it from context and wraps up gracefully. The chat - * lock sweeper uses streamDeadlineMinutes + 2 as its default stale-lock ceiling (override - * via COLLATE_CHAT_LOCK_MAX_DURATION_MINUTES). Capped at 60 minutes; for longer tasks - * prefer async job + polling over a single long-lived stream. + * Deadline (minutes) enforced on a streaming response from the gRPC server. */ streamDeadlineMinutes?: number; [property: string]: any; diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/configuration/elasticSearchConfiguration.ts b/openmetadata-ui/src/main/resources/ui/src/generated/configuration/elasticSearchConfiguration.ts index c3eba5bf8b23..79dee6cf735b 100644 --- a/openmetadata-ui/src/main/resources/ui/src/generated/configuration/elasticSearchConfiguration.ts +++ b/openmetadata-ui/src/main/resources/ui/src/generated/configuration/elasticSearchConfiguration.ts @@ -138,6 +138,12 @@ export interface NaturalLanguageSearch { * Weight for BM25 keyword search results in hybrid RRF pipeline (0.0-1.0) */ keywordWeight?: number; + /** + * Multiplier applied to k when computing num_candidates for Elasticsearch kNN vector + * search. num_candidates = max(k * multiplier, 100). Higher values improve recall at the + * cost of latency. Defaults to 2. + */ + knnNumCandidatesMultiplier?: number; /** * Maximum number of concurrent embedding API requests. 
Controls the semaphore used to * throttle calls to the embedding provider and prevent overwhelming HTTP/2 connection diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/settings/settings.ts b/openmetadata-ui/src/main/resources/ui/src/generated/settings/settings.ts index a20a2d9106a0..6be6f6fb76c6 100644 --- a/openmetadata-ui/src/main/resources/ui/src/generated/settings/settings.ts +++ b/openmetadata-ui/src/main/resources/ui/src/generated/settings/settings.ts @@ -2184,6 +2184,12 @@ export interface NaturalLanguageSearch { * Weight for BM25 keyword search results in hybrid RRF pipeline (0.0-1.0) */ keywordWeight?: number; + /** + * Multiplier applied to k when computing num_candidates for Elasticsearch kNN vector + * search. num_candidates = max(k * multiplier, 100). Higher values improve recall at the + * cost of latency. Defaults to 2. + */ + knnNumCandidatesMultiplier?: number; /** * Maximum number of concurrent embedding API requests. Controls the semaphore used to * throttle calls to the embedding provider and prevent overwhelming HTTP/2 connection From 41f28f500609bff31c29f658474479a3985063fd Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 25 Apr 2026 13:04:49 -0300 Subject: [PATCH 13/18] fix: remove resolveNumCandidatesMultiplier() that required uninstalled generated getter The knnNumCandidatesMultiplier field was added to NaturalLanguageSearchConfiguration JSON schema but the generated Java class in target/ is gitignored and the spec module hasn't been reinstalled locally. Remove the static helper that called the missing getKnnNumCandidatesMultiplier() getter. The instance field knnNumCandidatesMultiplier is set via the init() 4-arg overload; the existing 3-arg call in SearchRepository defaults to DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER (2). When CI regenerates the spec classes, SearchRepository can be updated to pass the configured value. 
Co-Authored-By: Claude Sonnet 4.6 --- .../vector/ElasticSearchVectorService.java | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index 429743c238d8..fc1113c0418f 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -18,10 +18,7 @@ import lombok.Getter; import lombok.extern.slf4j.Slf4j; import org.openmetadata.schema.EntityInterface; -import org.openmetadata.schema.service.configuration.elasticsearch.NaturalLanguageSearchConfiguration; -import org.openmetadata.service.Entity; import org.openmetadata.service.events.lifecycle.EntityLifecycleEventDispatcher; -import org.openmetadata.service.search.SearchRepository; import org.openmetadata.service.search.vector.client.EmbeddingClient; import org.openmetadata.service.search.vector.utils.DTOs.VectorSearchResponse; @@ -36,13 +33,30 @@ public class ElasticSearchVectorService implements VectorIndexService { private final Rest5Client restClient; @Getter private final EmbeddingClient embeddingClient; private final String language; + private final int knnNumCandidatesMultiplier; public ElasticSearchVectorService( - ElasticsearchClient client, EmbeddingClient embeddingClient, String language) { + ElasticsearchClient client, + EmbeddingClient embeddingClient, + String language, + int knnNumCandidatesMultiplier) { this.client = client; this.restClient = extractRestClient(client); this.embeddingClient = embeddingClient; this.language = language != null ? language.toLowerCase(java.util.Locale.ROOT) : "en"; + this.knnNumCandidatesMultiplier = + knnNumCandidatesMultiplier > 0 + ? 
knnNumCandidatesMultiplier + : VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER; + } + + public ElasticSearchVectorService( + ElasticsearchClient client, EmbeddingClient embeddingClient, String language) { + this( + client, + embeddingClient, + language, + VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER); } public ElasticSearchVectorService(ElasticsearchClient client, EmbeddingClient embeddingClient) { @@ -55,12 +69,16 @@ private static Rest5Client extractRestClient(ElasticsearchClient client) { } public static synchronized void init( - ElasticsearchClient client, EmbeddingClient embeddingClient, String language) { + ElasticsearchClient client, + EmbeddingClient embeddingClient, + String language, + int knnNumCandidatesMultiplier) { if (instance != null) { LOG.warn("ElasticSearchVectorService already initialized, reinitializing"); EntityLifecycleEventDispatcher.getInstance().unregisterHandler("VectorEmbeddingHandler"); } - ElasticSearchVectorService svc = new ElasticSearchVectorService(client, embeddingClient, language); + ElasticSearchVectorService svc = + new ElasticSearchVectorService(client, embeddingClient, language, knnNumCandidatesMultiplier); svc.registerVectorEmbeddingHandler(); instance = svc; LOG.info( @@ -69,6 +87,11 @@ public static synchronized void init( embeddingClient.getDimension()); } + public static synchronized void init( + ElasticsearchClient client, EmbeddingClient embeddingClient, String language) { + init(client, embeddingClient, language, VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER); + } + public static ElasticSearchVectorService getInstance() { return instance; } @@ -105,12 +128,11 @@ public VectorSearchResponse search( overFetchSize = Math.min(overFetchSize, k); } - int numCandidatesMultiplier = resolveNumCandidatesMultiplier(); String indexName = getIndexAlias(); while (!exhausted && byParent.size() < requestedParents) { String queryJson = VectorSearchQueryBuilder.buildNativeESQuery( - 
queryVector, overFetchSize, rawOffset, k, filters, numCandidatesMultiplier); + queryVector, overFetchSize, rawOffset, k, filters, knnNumCandidatesMultiplier); String responseBody = executeGenericRequest("POST", "/" + indexName + "/_search", queryJson); JsonNode root = MAPPER.readTree(responseBody); @@ -304,18 +326,6 @@ public void partialUpdateEntity( } } - private static int resolveNumCandidatesMultiplier() { - SearchRepository repo = Entity.getSearchRepository(); - if (repo == null) { - return VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER; - } - NaturalLanguageSearchConfiguration cfg = repo.getSearchConfiguration().getNaturalLanguageSearch(); - if (cfg == null || cfg.getKnnNumCandidatesMultiplier() == null) { - return VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER; - } - return cfg.getKnnNumCandidatesMultiplier(); - } - public void close() { try { if (client != null && client._transport() != null) { From 4ee2a937b79a5391c1a1ea3f70167ea96054d1a2 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:01:03 -0300 Subject: [PATCH 14/18] fix: wire knnNumCandidatesMultiplier from config into ElasticSearchVectorService - Make DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER public so SearchRepository can reference it as the fallback - Add resolveKnnNumCandidatesMultiplier() in SearchRepository that reads NaturalLanguageSearchConfiguration.knnNumCandidatesMultiplier via the generated getter (now available after spec module reinstall) - Pass the resolved multiplier to ElasticSearchVectorService.init() 4-arg overload so it flows into every kNN query via the instance field - Add ElasticSearchVectorServiceTest.testNumCandidatesMultiplierFromConfigIsApplied() that constructs the service with multiplier=5, runs a search with k=50, and asserts num_candidates=250 in the captured request body Co-Authored-By: Claude Sonnet 4.6 --- .../service/search/SearchRepository.java | 13 ++++++- 
.../vector/VectorSearchQueryBuilder.java | 2 +- .../ElasticSearchVectorServiceTest.java | 39 +++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java index 1dee23f14b83..9585dfb3963a 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/SearchRepository.java @@ -137,6 +137,7 @@ import org.openmetadata.service.search.opensearch.OpenSearchClient; import org.openmetadata.service.search.vector.ElasticSearchVectorService; import org.openmetadata.service.search.vector.OpenSearchVectorService; +import org.openmetadata.service.search.vector.VectorSearchQueryBuilder; import org.openmetadata.service.search.vector.VectorEmbeddingHandler; import org.openmetadata.service.search.vector.VectorIndexService; import org.openmetadata.service.search.vector.client.BedrockEmbeddingClient; @@ -419,7 +420,8 @@ public synchronized void initializeVectorSearchService() { } else { es.co.elastic.clients.elasticsearch.ElasticsearchClient esClient = ((ElasticSearchClient) getSearchClient()).getNewClient(); - ElasticSearchVectorService.init(esClient, embeddingClient, language); + int knnMultiplier = resolveKnnNumCandidatesMultiplier(cfg); + ElasticSearchVectorService.init(esClient, embeddingClient, language, knnMultiplier); this.vectorIndexService = ElasticSearchVectorService.getInstance(); } @@ -440,6 +442,15 @@ public synchronized void initializeVectorSearchService() { } } + private static int resolveKnnNumCandidatesMultiplier(ElasticSearchConfiguration cfg) { + NaturalLanguageSearchConfiguration nlCfg = cfg.getNaturalLanguageSearch(); + if (nlCfg != null && nlCfg.getKnnNumCandidatesMultiplier() != null + && nlCfg.getKnnNumCandidatesMultiplier() >= 1) { + return 
nlCfg.getKnnNumCandidatesMultiplier(); + } + return VectorSearchQueryBuilder.DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER; + } + public void ensureHybridSearchPipeline() { if (!isVectorEmbeddingEnabled() || !vectorServiceInitialized) { return; diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java index 964781344278..99f11afe7d86 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/VectorSearchQueryBuilder.java @@ -14,7 +14,7 @@ public class VectorSearchQueryBuilder { private static final Logger LOG = LoggerFactory.getLogger(VectorSearchQueryBuilder.class); private static final String ANY = "__ANY__"; private static final String NONE = "__NONE__"; - static final int DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER = 2; + public static final int DEFAULT_KNN_NUM_CANDIDATES_MULTIPLIER = 2; /** Build a full search request body (size + _source + query) for standalone vector search. 
*/ public static String build( diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java index 52090477465e..66011360e391 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -437,6 +437,45 @@ private void mockRestClientResponse(String responseJson) throws Exception { inv -> new ByteArrayInputStream(responseJson.getBytes(StandardCharsets.UTF_8))); } + @Test + void testNumCandidatesMultiplierFromConfigIsApplied() throws Exception { + int configuredMultiplier = 5; + int k = 50; + // num_candidates = max(50 * 5, 100) = 250 + int expectedNumCandidates = 250; + + ElasticSearchVectorService svc = + new ElasticSearchVectorService(mockEsClient, mockEmbeddingClient, "en", configuredMultiplier); + + List<String> capturedBodies = new java.util.ArrayList<>(); + Response mockResponse = mock(Response.class); + HttpEntity mockEntity = mock(HttpEntity.class); + when(mockRestClient.performRequest(any(Request.class))) + .thenAnswer( + inv -> { + Request req = inv.getArgument(0); + org.apache.hc.core5.http.HttpEntity entity = req.getEntity(); + if (entity != null) { + try (java.io.InputStream is = entity.getContent()) { + capturedBodies.add(new String(is.readAllBytes(), StandardCharsets.UTF_8)); + } + } + return mockResponse; + }); + when(mockResponse.getEntity()).thenReturn(mockEntity); + when(mockEntity.getContent()) + .thenAnswer( + inv -> new ByteArrayInputStream(EMPTY_HITS_RESPONSE.getBytes(StandardCharsets.UTF_8))); + + svc.search("test query", Map.of(), 10, 0, k, 0.0); + + assertFalse(capturedBodies.isEmpty(), "Expected at least one request to be captured"); + String requestBody = capturedBodies.get(0); + assertTrue( +
requestBody.contains("\"num_candidates\":" + expectedNumCandidates), + "Expected num_candidates=" + expectedNumCandidates + " in: " + requestBody); + } + /** Returns each response in sequence; repeats the last one if more calls are made. */ private void mockRestClientResponseSequence(String... responses) throws Exception { Response mockResponse = mock(Response.class); From 0bf1a6ea048a9b0c746690bfd892a6b7af23a507 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:19:51 -0300 Subject: [PATCH 15/18] fix: gate getFingerprint endpoint to admin users only The /vector/fingerprint diagnostic endpoint allowed any authenticated user to enumerate vector fingerprints for arbitrary entity UUIDs. Replace the subject-only extraction with authorizer.authorizeAdmin() to restrict access to admins. Add VectorSearchResourceTest covering: admin gate enforcement, found/not-found fingerprints, bad UUID, missing parentId, and service-unavailable when vector search is disabled. 
Co-Authored-By: Claude Sonnet 4.6 --- .../search/VectorSearchResource.java | 2 +- .../search/VectorSearchResourceTest.java | 146 ++++++++++++++++++ 2 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 openmetadata-service/src/test/java/org/openmetadata/service/resources/search/VectorSearchResourceTest.java diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java b/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java index 7d18d03c4108..ed7018dccabf 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/resources/search/VectorSearchResource.java @@ -113,7 +113,7 @@ public Response getFingerprint( @Context SecurityContext securityContext, @Parameter(description = "Parent entity ID", required = true) @QueryParam("parentId") String parentId) { - DefaultAuthorizer.getSubjectContext(securityContext); + authorizer.authorizeAdmin(securityContext); if (!Entity.getSearchRepository().isVectorEmbeddingEnabled()) { return Response.status(Response.Status.SERVICE_UNAVAILABLE) diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/resources/search/VectorSearchResourceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/resources/search/VectorSearchResourceTest.java new file mode 100644 index 000000000000..759ebe443411 --- /dev/null +++ b/openmetadata-service/src/test/java/org/openmetadata/service/resources/search/VectorSearchResourceTest.java @@ -0,0 +1,146 @@ +package org.openmetadata.service.resources.search; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; 
+import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import jakarta.ws.rs.core.Response; +import jakarta.ws.rs.core.SecurityContext; +import java.util.UUID; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.MockedStatic; +import org.openmetadata.service.Entity; +import org.openmetadata.service.search.SearchRepository; +import org.openmetadata.service.search.vector.VectorIndexService; +import org.openmetadata.service.security.Authorizer; +import org.openmetadata.service.security.AuthorizationException; + +class VectorSearchResourceTest { + + private Authorizer mockAuthorizer; + private SecurityContext mockSecurityContext; + private SearchRepository mockSearchRepository; + private VectorIndexService mockVectorService; + private VectorSearchResource resource; + + @BeforeEach + void setUp() { + mockAuthorizer = mock(Authorizer.class); + mockSecurityContext = mock(SecurityContext.class); + mockSearchRepository = mock(SearchRepository.class); + mockVectorService = mock(VectorIndexService.class); + resource = new VectorSearchResource(mockAuthorizer); + } + + @Test + void testGetFingerprintRequiresAdmin() { + doThrow(new AuthorizationException("Forbidden")) + .when(mockAuthorizer) + .authorizeAdmin(mockSecurityContext); + + try (MockedStatic<Entity> entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(mockSearchRepository); + when(mockSearchRepository.isVectorEmbeddingEnabled()).thenReturn(true); + + try { + resource.getFingerprint(mockSecurityContext, UUID.randomUUID().toString()); + } catch (AuthorizationException e) { + verify(mockVectorService, never()).getExistingFingerprint(any(), any()); + return; + } + throw new AssertionError("Expected AuthorizationException"); + } + } + + @Test + void testGetFingerprintReturnsFoundWhenFingerprintExists() { +
doNothing().when(mockAuthorizer).authorizeAdmin(mockSecurityContext); + + try (MockedStatic<Entity> entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(mockSearchRepository); + when(mockSearchRepository.isVectorEmbeddingEnabled()).thenReturn(true); + when(mockSearchRepository.getVectorIndexService()).thenReturn(mockVectorService); + when(mockVectorService.getIndexAlias()).thenReturn("table_search_index"); + + String entityId = UUID.randomUUID().toString(); + when(mockVectorService.getExistingFingerprint("table_search_index", entityId)) + .thenReturn("abc123"); + + Response response = resource.getFingerprint(mockSecurityContext, entityId); + + assertEquals(Response.Status.OK.getStatusCode(), response.getStatus()); + } + } + + @Test + void testGetFingerprintReturnsNotFoundWhenFingerprintMissing() { + doNothing().when(mockAuthorizer).authorizeAdmin(mockSecurityContext); + + try (MockedStatic<Entity> entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(mockSearchRepository); + when(mockSearchRepository.isVectorEmbeddingEnabled()).thenReturn(true); + when(mockSearchRepository.getVectorIndexService()).thenReturn(mockVectorService); + when(mockVectorService.getIndexAlias()).thenReturn("table_search_index"); + + String entityId = UUID.randomUUID().toString(); + when(mockVectorService.getExistingFingerprint("table_search_index", entityId)) + .thenReturn(null); + + Response response = resource.getFingerprint(mockSecurityContext, entityId); + + assertEquals(Response.Status.OK.getStatusCode(), response.getStatus()); + } + } + + @Test + void testGetFingerprintReturnsBadRequestForInvalidUuid() { + doNothing().when(mockAuthorizer).authorizeAdmin(mockSecurityContext); + + try (MockedStatic<Entity> entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(mockSearchRepository); + when(mockSearchRepository.isVectorEmbeddingEnabled()).thenReturn(true); +
when(mockSearchRepository.getVectorIndexService()).thenReturn(mockVectorService); + + Response response = resource.getFingerprint(mockSecurityContext, "not-a-uuid"); + + assertEquals(Response.Status.BAD_REQUEST.getStatusCode(), response.getStatus()); + } + } + + @Test + void testGetFingerprintReturnsBadRequestForMissingParentId() { + doNothing().when(mockAuthorizer).authorizeAdmin(mockSecurityContext); + + try (MockedStatic<Entity> entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(mockSearchRepository); + when(mockSearchRepository.isVectorEmbeddingEnabled()).thenReturn(true); + when(mockSearchRepository.getVectorIndexService()).thenReturn(mockVectorService); + + Response response = resource.getFingerprint(mockSecurityContext, null); + + assertEquals(Response.Status.BAD_REQUEST.getStatusCode(), response.getStatus()); + } + } + + @Test + void testGetFingerprintReturnsServiceUnavailableWhenDisabled() { + doNothing().when(mockAuthorizer).authorizeAdmin(mockSecurityContext); + + try (MockedStatic<Entity> entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(mockSearchRepository); + when(mockSearchRepository.isVectorEmbeddingEnabled()).thenReturn(false); + + Response response = + resource.getFingerprint(mockSecurityContext, UUID.randomUUID().toString()); + + assertEquals(Response.Status.SERVICE_UNAVAILABLE.getStatusCode(), response.getStatus()); + } + } +} From f94c79083e4f80ef76c9a22173ee51857e2c7751 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:34:07 -0300 Subject: [PATCH 16/18] fix: check HTTP status code in executeGenericRequest Elasticsearch returns 4xx/5xx as regular HTTP responses that the low-level client does not throw on. Previously the response body was returned as-is, causing downstream JSON parsing failures with no context about the real error.
Now checks response.getStatusCode() and throws IOException with the status and body when >= 400, mirroring the same pattern in OpenSearchVectorService. Co-Authored-By: Claude Sonnet 4.6 --- .../search/vector/ElasticSearchVectorService.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java index fc1113c0418f..22a7c963c5f1 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/search/vector/ElasticSearchVectorService.java @@ -7,6 +7,7 @@ import es.co.elastic.clients.transport.rest5_client.low_level.Request; import es.co.elastic.clients.transport.rest5_client.low_level.Response; import es.co.elastic.clients.transport.rest5_client.low_level.Rest5Client; +import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; import java.util.ArrayList; @@ -212,8 +213,14 @@ public String executeGenericRequest(String method, String endpoint, String body) request.setJsonEntity(body); } Response response = restClient.performRequest(request); + int statusCode = response.getStatusCode(); try (InputStream is = response.getEntity().getContent()) { - return new String(is.readAllBytes(), StandardCharsets.UTF_8); + String responseBody = new String(is.readAllBytes(), StandardCharsets.UTF_8); + if (statusCode >= 400) { + throw new IOException( + "Elasticsearch request failed with status " + statusCode + ": " + responseBody); + } + return responseBody; } } catch (Exception e) { LOG.error("Generic request failed: {} {}", method, endpoint, e); From 96eaf364f1c12e7327e3a20f60319bd0a11590ca Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 25 Apr 
2026 14:34:55 -0300 Subject: [PATCH 17/18] Update openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../service/search/vector/ElasticSearchVectorServiceTest.java | 1 - 1 file changed, 1 deletion(-) diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java index 66011360e391..9ad740b8f61d 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/vector/ElasticSearchVectorServiceTest.java @@ -9,7 +9,6 @@ import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; -import java.util.HashMap; import java.util.List; import es.co.elastic.clients.elasticsearch.ElasticsearchClient; From a593b1f9b4535d8f8d56f9371873df9411922c40 Mon Sep 17 00:00:00 2001 From: Joao Amaral <7281460+joaopamaral@users.noreply.github.com> Date: Sat, 25 Apr 2026 14:44:20 -0300 Subject: [PATCH 18/18] fix: replace stale reformatVectorIndexWithDimension test with current coverage The method was deleted in the inline-embedding refactor. Remove the test that invoked it via reflection (which would throw NoSuchMethodException). 
Replace with: - SearchRepositoryBehaviorTest: readIndexMappingReturnsMappingForKnownIndex verifies readIndexMapping still loads the file-based mapping correctly - EsUtilsTest: three tests for enrichIndexMappingForElasticsearch covering null/empty input, skip when fingerprint field absent, and dense_vector injection with _meta when fingerprint field present and vector enabled Co-Authored-By: Claude Sonnet 4.6 --- .../search/SearchRepositoryBehaviorTest.java | 28 ++------------ .../search/elasticsearch/EsUtilsTest.java | 37 +++++++++++++++++++ 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/SearchRepositoryBehaviorTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/SearchRepositoryBehaviorTest.java index a4cf0e850cfd..a7c8c68efdad 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/SearchRepositoryBehaviorTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/SearchRepositoryBehaviorTest.java @@ -2136,30 +2136,10 @@ void initializeLineageComponentsDelegatesWhenSearchClientExists() throws Excepti } @Test - void reformatVectorIndexWithDimensionAddsMetaAndPreservesInvalidJson() throws Exception { - EmbeddingClient embeddingClient = mock(EmbeddingClient.class); - when(embeddingClient.getModelId()).thenReturn("test-model"); - setPrivateField(repository, "embeddingClient", embeddingClient); - - String updated = - (String) - invokePrivateMethod( - repository, - "reformatVectorIndexWithDimension", - new Class[] {String.class, int.class}, - "{\"mappings\":{}}", - 768); - - assertTrue(updated.contains("\"embedding_model\":\"test-model\"")); - assertTrue(updated.contains("\"embedding_dimension\":768")); - assertEquals( - "not-json", - invokePrivateMethod( - repository, - "reformatVectorIndexWithDimension", - new Class[] {String.class, int.class}, - "not-json", - 384)); + void 
readIndexMappingReturnsMappingForKnownIndex() { + String mapping = repository.readIndexMapping(TABLE_MAPPING); + assertNotNull(mapping); + assertFalse(mapping.isBlank()); } @Test diff --git a/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/EsUtilsTest.java b/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/EsUtilsTest.java index 0f9714a8e49d..bc2efc55119a 100644 --- a/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/EsUtilsTest.java +++ b/openmetadata-service/src/test/java/org/openmetadata/service/search/elasticsearch/EsUtilsTest.java @@ -1,6 +1,7 @@ package org.openmetadata.service.search.elasticsearch; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertInstanceOf; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertSame; @@ -503,6 +504,42 @@ void testSearchEntityByKeyThrowsWhenMultipleMatchesExist() throws Exception { } } + @Test + void enrichIndexMappingThrowsOnNullOrEmptyInput() { + assertThrows(IllegalArgumentException.class, () -> EsUtils.enrichIndexMappingForElasticsearch(null)); + assertThrows(IllegalArgumentException.class, () -> EsUtils.enrichIndexMappingForElasticsearch("")); + } + + @Test + void enrichIndexMappingSkipsEmbeddingWhenFingerprintFieldAbsent() { + String mapping = "{\"mappings\":{\"properties\":{\"name\":{\"type\":\"keyword\"}}}}"; + String result = EsUtils.enrichIndexMappingForElasticsearch(mapping); + assertFalse(result.contains("dense_vector"), "Should not add embedding when fingerprint field is absent"); + } + + @Test + void enrichIndexMappingInjectsEmbeddingWhenFingerprintPresentAndVectorEnabled() { + String mapping = "{\"mappings\":{\"properties\":{\"name\":{\"type\":\"keyword\"},\"fingerprint\":{\"type\":\"keyword\"}}}}"; + + 
org.openmetadata.service.search.vector.client.EmbeddingClient mockEmbeddingClient = + org.mockito.Mockito.mock(org.openmetadata.service.search.vector.client.EmbeddingClient.class); + org.mockito.Mockito.when(mockEmbeddingClient.getDimension()).thenReturn(768); + org.mockito.Mockito.when(mockEmbeddingClient.getModelId()).thenReturn("test-model"); + + try (MockedStatic<Entity> entityMock = mockStatic(Entity.class)) { + entityMock.when(Entity::getSearchRepository).thenReturn(searchRepository); + org.mockito.Mockito.when(searchRepository.isVectorEmbeddingEnabled()).thenReturn(true); + org.mockito.Mockito.when(searchRepository.getEmbeddingClient()).thenReturn(mockEmbeddingClient); + + String result = EsUtils.enrichIndexMappingForElasticsearch(mapping); + + assertTrue(result.contains("\"dense_vector\""), "Should add dense_vector field"); + assertTrue(result.contains("\"dims\":768"), "Should set correct dimension"); + assertTrue(result.contains("\"embedding_model\":\"test-model\""), "Should add _meta.embedding_model"); + assertTrue(result.contains("\"embedding_dimension\":768"), "Should add _meta.embedding_dimension"); + } + } + @Test void testSearchEntitiesUsesResolvedAliasAndPostFilter() throws Exception { try (MockedStatic<Entity> entity = mockStatic(Entity.class)) {