From 55f244e577b2c3c871af94ebf0af508e90c383c4 Mon Sep 17 00:00:00 2001 From: dgupta Date: Thu, 22 Jan 2026 13:55:25 +0000 Subject: [PATCH 01/10] Getting ES and faceting working --- .gitlab-ci.yml | 1 - docker-compose.yaml | 38 ++++--- k8s/helm/values.yaml | 12 +- readme.md | 2 +- .../search/samples/SampleServiceGrpcIT.java | 8 +- server/src/it/resources/application.yml | 3 + .../biosamples/search/facet/FacetService.java | 6 +- .../facet/SamplingFacetingStrategy.java | 103 ++++++++++++++++-- 8 files changed, 135 insertions(+), 38 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b89a5de..419e161 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -29,7 +29,6 @@ build_docker_image: after_script: - docker logout ${CI_REGISTRY} - deploy_primary_dev: variables: ENVIRONMENT_NAME: primary_dev diff --git a/docker-compose.yaml b/docker-compose.yaml index 900b7ff..1943ea7 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,19 +1,26 @@ services: -# biosamples-search: -# build: . -# image: biosamples-search:latest -# ports: -# - "8080:8080" -# - "9090:9090" -# environment: -# - spring.elasticsearch.username=elastic -# - spring.elasticsearch.password=elastic -# - spring.elasticsearch.uris=http://elastic:9200 -# links: -# - elastic + biosamples-search: + build: . 
+ image: biosamples-search:latest + ports: + - "8080:8080" + - "9090:9090" + depends_on: + elastic: + condition: service_healthy + rabbitmq: + condition: service_started + environment: + - spring.elasticsearch.username=elastic + - spring.elasticsearch.password=elastic + - spring.elasticsearch.uris=http://elastic:9200 + - spring.rabbitmq.host=rabbitmq setup: image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION} user: "0" + depends_on: + elastic: + condition: service_healthy command: > bash -c ' if [ x${ELASTIC_PASSWORD} == x ]; then @@ -29,11 +36,6 @@ services: until curl -s -X POST -u "elastic:${ELASTIC_PASSWORD}" -H "Content-Type: application/json" http://elastic:9200/_security/user/kibana_system/_password -d "{\"password\":\"${KIBANA_PASSWORD}\"}" | grep -q "^{}"; do sleep 10; done; echo "All done!"; ' - healthcheck: - test: [ "CMD-SHELL", "[ -f config/certs/es01/es01.crt ]" ] - interval: 1s - timeout: 5s - retries: 120 elastic: image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION} volumes: @@ -101,4 +103,4 @@ volumes: kibana_data: driver: local rabbitmq_data: - driver: local \ No newline at end of file + driver: local diff --git a/k8s/helm/values.yaml b/k8s/helm/values.yaml index 9556550..0447866 100644 --- a/k8s/helm/values.yaml +++ b/k8s/helm/values.yaml @@ -61,13 +61,11 @@ ingress: # hosts: # - chart-example.local -resources: {} - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi +resources: + limits: + memory: 4Gi + requests: + memory: 2Gi autoscaling: enabled: false diff --git a/readme.md b/readme.md index c970df7..c1fb71c 100644 --- a/readme.md +++ b/readme.md @@ -20,7 +20,7 @@ Three main APIs are exposed by the application. 2. Search samples streaming (GRPC) 3. Get facets for search (POST, GRPC) -BioSamples core services uses GRPC to communicate with `biosamples-search`. The RESTfull services are implemented mainly for the testing and development purposes. 
+BioSamples core services use GRPC to communicate with `biosamples-search`. The RESTful services are implemented mainly for testing and development purposes. ### Build diff --git a/server/src/it/java/uk/ac/ebi/biosamples/search/samples/SampleServiceGrpcIT.java b/server/src/it/java/uk/ac/ebi/biosamples/search/samples/SampleServiceGrpcIT.java index 068ab25..4557968 100644 --- a/server/src/it/java/uk/ac/ebi/biosamples/search/samples/SampleServiceGrpcIT.java +++ b/server/src/it/java/uk/ac/ebi/biosamples/search/samples/SampleServiceGrpcIT.java @@ -3,9 +3,11 @@ import io.grpc.ManagedChannel; import io.grpc.ManagedChannelBuilder; import org.junit.jupiter.api.Test; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.boot.test.context.SpringBootTest; import org.springframework.boot.testcontainers.context.ImportTestcontainers; import org.springframework.context.annotation.Import; +import org.springframework.grpc.server.lifecycle.GrpcServerLifecycle; import uk.ac.ebi.biosamples.search.IntegrationTestConfiguration; import uk.ac.ebi.biosamples.search.TestDependencyContainers; import uk.ac.ebi.biosamples.search.grpc.*; @@ -22,6 +24,9 @@ @ImportTestcontainers(TestDependencyContainers.class) public class SampleServiceGrpcIT { + @Autowired + private GrpcServerLifecycle grpcServerLifecycle; + @Test void searchSamples_shouldReturnFirstAccessionPage() { runTestWithSetupAndTearDown((stub) -> { @@ -159,7 +164,8 @@ void streamSamplesWithAdvancedTextSearch_shouldReturnCorrectSampleAccessionsStre } void runTestWithSetupAndTearDown(Consumer test) { - ManagedChannel channel = ManagedChannelBuilder.forAddress("localhost", 9090) + int grpcPort = grpcServerLifecycle.getPort(); + ManagedChannel channel = ManagedChannelBuilder.forAddress("localhost", grpcPort) .usePlaintext() .build(); SearchGrpc.SearchBlockingStub searchBlockingStub = SearchGrpc.newBlockingStub(channel); diff --git a/server/src/it/resources/application.yml
b/server/src/it/resources/application.yml index da1f522..042aef6 100644 --- a/server/src/it/resources/application.yml +++ b/server/src/it/resources/application.yml @@ -6,6 +6,9 @@ spring: port: 5672 username: guest password: guest + grpc: + server: + port: 0 jackson: deserialization: FAIL_ON_UNKNOWN_PROPERTIES: false diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java b/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java index 9a500fd..c9300c8 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java @@ -34,8 +34,9 @@ public FacetService(ElasticsearchOperations elasticsearchOperations, public List getFacets(SearchQuery searchQuery) { Query esSearchQuery = QueryHelper.getSearchQuery(searchQuery); Map aggregations = getAggregations(searchQuery); + log.info("Generated Elasticsearch Aggregations: {} aggregations configured", aggregations.size()); NativeQuery query = getEsNativeQuery(esSearchQuery, aggregations); - return retrieveFacets(query); + return retrieveFacets(query, aggregations); } private Map getAggregations(SearchQuery searchQuery) { @@ -60,8 +61,9 @@ private NativeQuery getEsNativeQuery(Query searchQuery, Map return builder.build(); } - public List retrieveFacets(NativeQuery query) { + public List retrieveFacets(NativeQuery query, Map aggregations) { log.info("Generated Elasticsearch Query: {}", query.getQuery()); + log.info("Generated Elasticsearch Aggregations ({}): {}", aggregations.size(), aggregations.keySet()); SearchHits hits = elasticsearchOperations.search(query, Sample.class); return facetingStrategy.retrieveFacets(hits); } diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/facet/SamplingFacetingStrategy.java b/server/src/main/java/uk/ac/ebi/biosamples/search/facet/SamplingFacetingStrategy.java index b56934a..8f8e7ed 100644 --- 
a/server/src/main/java/uk/ac/ebi/biosamples/search/facet/SamplingFacetingStrategy.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/facet/SamplingFacetingStrategy.java @@ -18,81 +18,168 @@ @Service("samplingFacetingStrategy") public class SamplingFacetingStrategy implements FacetingStrategy { + /** + * Builds the Elasticsearch aggregation query structure for sampling-based faceting. + * This method creates a hierarchical aggregation structure where: + * 1. A sampler aggregation limits the number of documents processed per shard + * 2. Nested aggregations run on the sampled documents to compute facets + * 3. An extrapolation factor is calculated to scale facet counts to the full dataset + */ @Override public Map getDefaultAggregations() { + // Create a value_count aggregation that counts accession values (value_count does not de-duplicate) + // This aggregation will be used in two places: + // 1. Inside the sampler to count sampled documents + // 2. At the top level to count total matching documents Aggregation totalCountAgg = Aggregation.of(a -> a - .valueCount(v -> v.field("accession.keyword")) + .valueCount(v -> v.field("accession.keyword")) // Count documents by accession.keyword field ); + // Create a map to hold sub-aggregations that will run INSIDE the sampler aggregation + // These aggregations will only process the sampled documents (not all documents) Map subAggregations = new HashMap<>(); + + // Add a count aggregation to track how many documents were actually sampled + // This is needed to calculate the extrapolation factor later subAggregations.put("total_sampled", totalCountAgg); + + // Add nested aggregations for characteristics (e.g., organism, sex, etc.) + // This will create facets for attribute key-value pairs from the characteristics nested field subAggregations.put("characteristics", AttributeFacet.getAggregations()); + + // Add aggregations for relationships (e.g., "derived from", "parent of", etc.)
subAggregations.put("relationships", RelationshipFacet.getAggregations()); + + // Add aggregations for external references (e.g., ENA, SRA accessions) subAggregations.put("externalReferences", ExternalRefFacet.getAggregations()); + // Create the sampler aggregation - this is the key optimization + // The sampler aggregation keeps only the top-scoring documents on each shard (not a random sample) BEFORE running nested aggregations Aggregation samplerAgg = new Aggregation.Builder() - .sampler(s -> s.shardSize(100000)) - .aggregations(subAggregations) + .sampler(s -> s.shardSize(100000)) // Sample up to 100,000 documents per shard + // E.g. with 3 shards, this means up to ~300k total documents sampled + // All nested aggregations below will run ONLY on these sampled docs + .aggregations(subAggregations) // Attach all the sub-aggregations to run on sampled docs .build(); + // Create the top-level aggregation map that will be sent to Elasticsearch + // This structure has aggregations at two levels: + // - Top level: aggregations that run on ALL matching documents + // - Inside sampler: aggregations that run only on SAMPLED documents Map aggregationMap = new HashMap<>(); + + // Add the sampler aggregation with all its nested sub-aggregations + // This will compute facets from sampled documents only aggregationMap.put("sampled_facets", samplerAgg); + + // Add date range facet for the "update" field + // This runs on ALL documents (not sampled) because date ranges don't need sampling aggregationMap.put("update", DateRangeFacet.getAggregations()); + + // Add a count of total matching documents (runs on ALL documents, not sampled) + // This is used to calculate: extrapolationFactor = totalDocs / sampledDocs aggregationMap.put("total_docs", totalCountAgg); return aggregationMap; } + /** + * Extracts and processes facet results from Elasticsearch aggregation response. + * This method: + * 1. Extracts the total document count and sampled document count + * 2. 
Calculates an extrapolation factor to scale sampled facet counts to full dataset + * 3. Applies the extrapolation factor to sampled facets (characteristics, relationships, externalRefs) + * 4. Adds non-sampled facets (date ranges) without extrapolation + */ @Override public List retrieveFacets(SearchHits hits) { + // Initialize the list that will hold all facet results List facets = new ArrayList<>(); + // Extract aggregations from the Elasticsearch search response + // The aggregations contain the facet counts computed by Elasticsearch ElasticsearchAggregations aggregations = (ElasticsearchAggregations) hits.getAggregations(); if (aggregations == null) { + // If no aggregations were returned, return empty list return facets; } + // Convert aggregations to a map for easier access by name + // Keys are: "sampled_facets", "update", "total_docs" Map aggMap = aggregations.aggregationsAsMap(); - long totalDocs = 0; - long sampledDocs = 0; - double extrapolationFactor = 1.0; + // Initialize variables to calculate extrapolation factor + long totalDocs = 0; // Total number of documents matching the query (from all shards) + long sampledDocs = 0; // Number of documents actually sampled (from sampler aggregation) + double extrapolationFactor = 1.0; // Factor to multiply sampled counts by to estimate full counts + // Default is 1.0 (no extrapolation) if calculation fails + // Extract the total document count from the top-level "total_docs" aggregation + // This counts ALL documents matching the query (not just sampled ones) + // Example: If query matches 50M documents, totalDocs = 50,000,000 if (aggMap.containsKey("total_docs")) { + // Get the value_count aggregation result and extract the count value totalDocs = (long) aggMap.get("total_docs").aggregation().getAggregate().valueCount().value(); } - + // Process the sampler aggregation results + // The "sampled_facets" key contains the sampler aggregation with all its nested sub-aggregations if 
(aggMap.containsKey("sampled_facets")) { + // Get the container holding the sampler aggregation ElasticsearchAggregation sampledAggContainer = aggMap.get("sampled_facets"); + + // Verify that this is indeed a sampler aggregation (safety check) if (sampledAggContainer != null && sampledAggContainer.aggregation().getAggregate().isSampler()) { + // Extract the sub-aggregations that ran INSIDE the sampler + // These contain: "total_sampled", "characteristics", "relationships", "externalReferences" Map subAggs = sampledAggContainer.aggregation().getAggregate().sampler().aggregations(); - + // Extract the count of documents that were actually sampled + // This is the count from the "total_sampled" aggregation that ran inside the sampler + // Example: If sampler selected 300k docs (100k per shard × 3 shards), sampledDocs = 300,000 if (subAggs.containsKey("total_sampled")) { sampledDocs = (long) subAggs.get("total_sampled").valueCount().value(); } + // Calculate the extrapolation factor + // This factor tells us how much to multiply sampled counts to estimate full dataset counts + // Example: If totalDocs = 50M and sampledDocs = 300k, then factor = 50M / 300k = ~166.67 + // This means each sampled document represents ~167 documents in the full dataset if (sampledDocs > 0 && totalDocs > 0) { extrapolationFactor = (double) totalDocs / sampledDocs; } + // Extract characteristics facets and apply extrapolation factor + // The AttributeFacet.populateFacetFromAggregationResults method will: + // 1. Parse the nested aggregation results for characteristics + // 2. Multiply each facet count by extrapolationFactor + // 3. 
Return a list of Facet objects (e.g., "organism" -> {"Homo sapiens": 1000, "Mus musculus": 500}) if (subAggs.containsKey("characteristics")) { facets.addAll(AttributeFacet.populateFacetFromAggregationResults(subAggs.get("characteristics"), extrapolationFactor)); } + + // Extract relationship facets and apply extrapolation factor + // Examples: "derived from", "parent of", etc. if (subAggs.containsKey("relationships")) { facets.addAll(RelationshipFacet.populateFacetFromAggregationResults(subAggs.get("relationships"), extrapolationFactor)); } + + // Extract external reference facets and apply extrapolation factor + // Examples: ENA accessions, SRA accessions, etc. if (subAggs.containsKey("externalReferences")) { facets.addAll(ExternalRefFacet.populateFacetFromAggregationResults(subAggs.get("externalReferences"), extrapolationFactor)); } } } + // Extract date range facets for the "update" field + // Date range facets are NOT sampled - they run on ALL documents + // Therefore, no extrapolation factor is needed (extrapolationFactor = 1.0) if (aggMap.containsKey("update")) { facets.addAll(DateRangeFacet.populateFacetFromAggregationResults(aggMap.get("update"))); } + // Return the complete list of facets with extrapolated counts return facets; } } From 5bebb867685d19d87ee9fe1be5491b810aa8fcef Mon Sep 17 00:00:00 2001 From: dgupta Date: Thu, 22 Jan 2026 15:07:54 +0000 Subject: [PATCH 02/10] Getting ES and faceting working --- .gitlab-ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 419e161..10765dd 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -39,6 +39,7 @@ deploy_primary_dev: only: - dev - main + - chore/working-es-search-and-faceting when: manual extends: .kube_deploy_script @@ -52,6 +53,7 @@ deploy_primary_prod: only: - dev - main + - chore/working-es-search-and-faceting when: manual extends: .kube_deploy_script @@ -65,6 +67,7 @@ deploy_fallback_prod: only: - dev - main + - chore/working-es-search-and-faceting 
when: manual extends: .kube_deploy_script From c2a9a9ac69a572075f03a2d4d111788f1e996b3d Mon Sep 17 00:00:00 2001 From: dgupta Date: Thu, 22 Jan 2026 15:41:50 +0000 Subject: [PATCH 03/10] increase facet load timeout to 60 seconds. --- .../java/uk/ac/ebi/biosamples/search/facet/FacetService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java b/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java index c9300c8..c347882 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/facet/FacetService.java @@ -52,7 +52,7 @@ private NativeQuery getEsNativeQuery(Query searchQuery, Map NativeQueryBuilder builder = NativeQuery.builder() .withQuery(searchQuery) .withMaxResults(0) - .withTimeout(Duration.ofSeconds(30)); + .withTimeout(Duration.ofSeconds(60)); if (!aggregations.isEmpty()) { aggregations.forEach(builder::withAggregation); From ea0fdb89ea338df0e74e5103f69356f580105298 Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Wed, 4 Feb 2026 10:52:10 +0000 Subject: [PATCH 04/10] reindexing queue to be used to reindexing and also improvements to StructuredDataSearchFilter --- .../filter/StructuredDataSearchFilter.java | 45 ++++++++++++------- .../biosamples/search/index/RabbitConfig.java | 12 ++++- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/StructuredDataSearchFilter.java b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/StructuredDataSearchFilter.java index 3670150..d7192b6 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/StructuredDataSearchFilter.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/StructuredDataSearchFilter.java @@ -3,32 +3,45 @@ import co.elastic.clients.elasticsearch._types.query_dsl.NestedQuery; import 
co.elastic.clients.elasticsearch._types.query_dsl.Query; import co.elastic.clients.elasticsearch._types.query_dsl.TermQuery; +import org.springframework.util.StringUtils; +import java.util.ArrayList; import java.util.List; public record StructuredDataSearchFilter(String dataType, String key, String value) implements SearchFilter { public Query getQuery() { + List queries = new ArrayList<>(); + + // dataType is required + if (StringUtils.hasText(dataType)) { + queries.add(TermQuery.of(t -> t + .field("structuredData.dataType.keyword") + .value(dataType) + )._toQuery()); + } + + // key is optional + if (StringUtils.hasText(key)) { + queries.add(TermQuery.of(t -> t + .field("structuredData.key.keyword") + .value(key) + )._toQuery()); + } + + // value is optional + if (StringUtils.hasText(value)) { + queries.add(TermQuery.of(t -> t + .field("structuredData.value.keyword") + .value(value) + )._toQuery()); + } + return NestedQuery.of(n -> n .path("structuredData") .query(q -> q .bool(b -> b - .must( - List.of( - TermQuery.of(t -> t - .field("structuredData.dataType.keyword") - .value(dataType) - )._toQuery(), - TermQuery.of(t -> t - .field("structuredData.key.keyword") - .value(key) - )._toQuery(), - TermQuery.of(t -> t - .field("structuredData.value.keyword") - .value(value) - )._toQuery() - ) - ) + .must(queries) ) ) )._toQuery(); diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java b/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java index 69a5964..ff48d39 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java @@ -21,6 +21,11 @@ Queue queue() { return new Queue(INDEXING_QUEUE, true); } + @Bean + Queue reindexingQueue() { + return new Queue(REINDEXING_QUEUE, true); + } + @Bean DirectExchange exchange() { return new DirectExchange(INDEXING_EXCHANGE); @@ -31,12 +36,17 @@ Binding binding(Queue queue, 
DirectExchange exchange) { return BindingBuilder.bind(queue).to(exchange).with(INDEXING_QUEUE); } + @Bean + Binding reindexingBinding(Queue reindexingQueue, DirectExchange exchange) { + return BindingBuilder.bind(reindexingQueue).to(exchange).with(REINDEXING_QUEUE); + } + @Bean SimpleMessageListenerContainer container(ConnectionFactory connectionFactory, MessageListenerAdapter listenerAdapter) { SimpleMessageListenerContainer container = new SimpleMessageListenerContainer(); container.setConnectionFactory(connectionFactory); - container.setQueueNames(INDEXING_QUEUE); + container.setQueueNames(INDEXING_QUEUE, REINDEXING_QUEUE); container.setMessageListener(listenerAdapter); return container; } From 288dad017799b2283ac55b1a819e14dbc3dae112 Mon Sep 17 00:00:00 2001 From: dgupta Date: Thu, 5 Feb 2026 11:46:43 +0000 Subject: [PATCH 05/10] have separate reindexing exchange to avoid federation conflicts --- .../ebi/biosamples/search/index/RabbitConfig.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java b/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java index ff48d39..85d1ef8 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/index/RabbitConfig.java @@ -13,6 +13,7 @@ @Configuration public class RabbitConfig { public static final String INDEXING_EXCHANGE = "biosamples.indexing"; + public static final String REINDEXING_EXCHANGE = "biosamples.reindexing"; public static final String INDEXING_QUEUE = "biosamples.indexing.es"; public static final String REINDEXING_QUEUE = "biosamples.reindexing.es"; @@ -32,13 +33,18 @@ DirectExchange exchange() { } @Bean - Binding binding(Queue queue, DirectExchange exchange) { - return BindingBuilder.bind(queue).to(exchange).with(INDEXING_QUEUE); + DirectExchange reindexingExchange() { + return new 
DirectExchange(REINDEXING_EXCHANGE); } @Bean - Binding reindexingBinding(Queue reindexingQueue, DirectExchange exchange) { - return BindingBuilder.bind(reindexingQueue).to(exchange).with(REINDEXING_QUEUE); + Binding reindexingBinding(Queue reindexingQueue, DirectExchange reindexingExchange) { + return BindingBuilder.bind(reindexingQueue).to(reindexingExchange).with(REINDEXING_QUEUE); + } + + @Bean + Binding binding(Queue queue, DirectExchange exchange) { + return BindingBuilder.bind(queue).to(exchange).with(INDEXING_QUEUE); } @Bean From ea36969023340f4916876a9ff06412707914f5e0 Mon Sep 17 00:00:00 2001 From: dgupta Date: Thu, 5 Feb 2026 11:58:10 +0000 Subject: [PATCH 06/10] request and limit set to 4 Gi and 8 Gi respectively --- k8s/helm/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/helm/values.yaml b/k8s/helm/values.yaml index 0447866..73f76b5 100644 --- a/k8s/helm/values.yaml +++ b/k8s/helm/values.yaml @@ -63,9 +63,9 @@ ingress: resources: limits: - memory: 4Gi + memory: 8Gi requests: - memory: 2Gi + memory: 4Gi autoscaling: enabled: false From bec06afbed8c68829d7fb0447c53b22373ba7eba Mon Sep 17 00:00:00 2001 From: dgupta Date: Thu, 12 Feb 2026 11:04:34 +0000 Subject: [PATCH 07/10] performance optimization for release date range filter queries --- .../ebi/biosamples/search/es/QueryHelper.java | 29 ++++++++++++++-- .../search/filter/PublicSearchFilter.java | 33 +++++-------------- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java b/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java index b64441e..89023af 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java @@ -4,8 +4,11 @@ import org.springframework.util.CollectionUtils; import org.springframework.util.StringUtils; import uk.ac.ebi.biosamples.search.samples.SearchQuery; 
+import uk.ac.ebi.biosamples.search.filter.DateRangeSearchFilter; +import uk.ac.ebi.biosamples.search.filter.PublicSearchFilter; import uk.ac.ebi.biosamples.search.filter.SearchFilter; +import java.util.ArrayList; import java.util.List; public class QueryHelper { @@ -46,9 +49,29 @@ private static Query getFilterQuery(SearchQuery searchQuery) { return MatchAllQuery.of(m -> m)._toQuery(); } - List filterQueries = searchQuery.getFilters().stream() - .map(SearchFilter::getQuery).toList(); - return BoolQuery.of(b -> b.must(filterQueries))._toQuery(); + List filters = searchQuery.getFilters(); + boolean hasReleaseDateRange = filters.stream() + .anyMatch(f -> f instanceof DateRangeSearchFilter dt + && dt.field() == DateRangeSearchFilter.DateField.RELEASE); + + List mustQueries = new ArrayList<>(); + boolean needExcludeSuppressed = false; + + for (SearchFilter filter : filters) { + if (filter instanceof PublicSearchFilter && hasReleaseDateRange) { + needExcludeSuppressed = true; + continue; + } + mustQueries.add(filter.getQuery()); + } + + if (needExcludeSuppressed) { + return BoolQuery.of(b -> b + .must(mustQueries) + .mustNot(PublicSearchFilter.getExcludeSuppressedQuery()) + )._toQuery(); + } + return BoolQuery.of(b -> b.must(mustQueries))._toQuery(); } diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/PublicSearchFilter.java b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/PublicSearchFilter.java index dc608e4..55d249a 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/PublicSearchFilter.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/PublicSearchFilter.java @@ -1,9 +1,7 @@ package uk.ac.ebi.biosamples.search.filter; import co.elastic.clients.elasticsearch._types.query_dsl.BoolQuery; -import co.elastic.clients.elasticsearch._types.query_dsl.NestedQuery; import co.elastic.clients.elasticsearch._types.query_dsl.Query; -import co.elastic.clients.elasticsearch._types.query_dsl.TermQuery; import 
com.fasterxml.jackson.annotation.JsonIgnore; import org.springframework.util.StringUtils; @@ -17,11 +15,10 @@ public Query getQuery() { Query publicDateQuery = new DateRangeSearchFilter( DateRangeSearchFilter.DateField.RELEASE, null, Instant.now().toString()).getQuery(); - Query suppressedStatusQuery = - new AttributeSearchFilter("INSDC status", List.of("suppressed")).getQuery(); + Query suppressedStatusQuery = getExcludeSuppressedQuery(); Query publicQuery = new BoolQuery.Builder() - .must(List.of(publicDateQuery/*, getSurpressedStatusQuery()*/)) + .must(List.of(publicDateQuery)) .mustNot(List.of(suppressedStatusQuery)) .build() ._toQuery(); @@ -34,25 +31,11 @@ public Query getQuery() { return publicQuery; } - public Query getSurpressedStatusQuery() { - return NestedQuery.of(n -> n - .path("characteristics") - .query(q -> q - .bool(b -> b - .mustNot( - List.of( - TermQuery.of(t -> t - .field("characteristics.key.keyword") - .value("INSDC status") - )._toQuery(), - TermQuery.of(t -> t - .field("characteristics.value.keyword") - .value("suppressed") - )._toQuery() - ) - ) - ) - ) - )._toQuery(); + /** + * Returns the "exclude suppressed" query only (must_not for INSDC status=suppressed). + * Used when combining with a release date range so we can avoid a redundant 1970→now range. 
+ */ + public static Query getExcludeSuppressedQuery() { + return new AttributeSearchFilter("INSDC status", List.of("suppressed")).getQuery(); } } From ad2fa3e81c860efd04d7b9f2485f9ee69b31085b Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Thu, 26 Feb 2026 11:33:50 +0000 Subject: [PATCH 08/10] performance optimization for release date range filter queries and fix date queries without a to date --- .../ebi/biosamples/search/es/QueryHelper.java | 56 +++++++++++++++++-- .../search/filter/DateRangeSearchFilter.java | 23 +++++++- 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java b/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java index 89023af..1878ea5 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java @@ -8,14 +8,20 @@ import uk.ac.ebi.biosamples.search.filter.PublicSearchFilter; import uk.ac.ebi.biosamples.search.filter.SearchFilter; +import java.time.Instant; import java.util.ArrayList; import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; public class QueryHelper { public static Query getSearchQuery(SearchQuery searchQuery) { - Query match = getTextMatchQuery(searchQuery); Query filter = getFilterQuery(searchQuery); + if (!StringUtils.hasText(searchQuery.getText())) { + return filter; + } + Query match = getTextMatchQuery(searchQuery); return BoolQuery.of(b -> b .must(match) .filter(filter) @@ -44,17 +50,22 @@ private static Query getTextMatchQuery(SearchQuery searchQuery) { } + private static final String DEFAULT_DATE_FROM = "1970-01-01T00:00:00.000Z"; + private static Query getFilterQuery(SearchQuery searchQuery) { if (CollectionUtils.isEmpty(searchQuery.getFilters())) { return MatchAllQuery.of(m -> m)._toQuery(); } List filters = searchQuery.getFilters(); - boolean hasReleaseDateRange = filters.stream() - .anyMatch(f -> f 
instanceof DateRangeSearchFilter dt - && dt.field() == DateRangeSearchFilter.DateField.RELEASE); - List mustQueries = new ArrayList<>(); + // Merge multiple date ranges on the same field into one (intersection: max from, min to) + Map mergedDateRangeQueries = getMergedDateRangeQueries(filters); + List mustQueries = new ArrayList<>(mergedDateRangeQueries.values()); + + boolean hasReleaseDateRange = mergedDateRangeQueries.containsKey(DateRangeSearchFilter.DateField.RELEASE) + || filters.stream().anyMatch(f -> f instanceof DateRangeSearchFilter dt + && dt.field() == DateRangeSearchFilter.DateField.RELEASE); boolean needExcludeSuppressed = false; for (SearchFilter filter : filters) { @@ -62,6 +73,12 @@ private static Query getFilterQuery(SearchQuery searchQuery) { needExcludeSuppressed = true; continue; } + + if (filter instanceof DateRangeSearchFilter dt + && mergedDateRangeQueries.containsKey(dt.field())) { + continue; + } + mustQueries.add(filter.getQuery()); } @@ -74,5 +91,34 @@ private static Query getFilterQuery(SearchQuery searchQuery) { return BoolQuery.of(b -> b.must(mustQueries))._toQuery(); } + /** + * Groups date range filters by field and merges multiple ranges on the same field + * into one (intersection: latest from, earliest to). Returns a map of field -> a merged query + * only for fields that had more than one filter. + */ + private static Map getMergedDateRangeQueries(List filters) { + final String defaultTo = Instant.now().toString(); + final Map> byField = filters.stream() + .filter(f -> f instanceof DateRangeSearchFilter) + .map(f -> (DateRangeSearchFilter) f) + .collect(Collectors.groupingBy(DateRangeSearchFilter::field)); + + return byField.entrySet().stream() + .filter(e -> e.getValue().size() > 1) + .collect(Collectors.toMap(Map.Entry::getKey, e -> { + final List list = e.getValue(); + final String mergedFrom = list.stream() + .map(dt -> StringUtils.hasText(dt.from()) ? 
dt.from() : DEFAULT_DATE_FROM) + .max(String::compareTo) + .orElse(DEFAULT_DATE_FROM); + final String mergedTo = list.stream() + .map(dt -> StringUtils.hasText(dt.to()) ? dt.to() : defaultTo) + .min(String::compareTo) + .orElse(defaultTo); + + return new DateRangeSearchFilter(e.getKey(), mergedFrom, mergedTo).getQuery(); + })); + } + } diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/DateRangeSearchFilter.java b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/DateRangeSearchFilter.java index 0e2a5da..a779272 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/DateRangeSearchFilter.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/DateRangeSearchFilter.java @@ -11,6 +11,9 @@ public record DateRangeSearchFilter(DateField field, String from, String to) implements SearchFilter { + /** Sentinel "end of time" sent by clients when "to" is omitted; ES cannot parse it. Treat as "no end" → now. */ + private static final String SENTINEL_FAR_FUTURE_PREFIX = "+999999999"; + @JsonIgnore public Query getQuery() { //todo from, to validation "2023-05-12T15:12:56.113Z" @@ -21,8 +24,11 @@ public Query getQuery() { } else { builder.gte("1970-01-01T00:00:00.000Z"); } - if (StringUtils.hasText(to)) { - builder.lte(to); + + String effectiveTo = effectiveTo(); + + if (StringUtils.hasText(effectiveTo)) { + builder.lte(effectiveTo); } else { builder.lte(Instant.now().toString()); } @@ -32,6 +38,19 @@ public Query getQuery() { )._toQuery(); } + /** Use {@code to} unless it is the far-future sentinel (e.g. +999999999-12-31...); then treat as no end. 
*/ + private String effectiveTo() { + if (!StringUtils.hasText(to)) { + return null; + } + + if (to.startsWith(SENTINEL_FAR_FUTURE_PREFIX)) { + return null; + } + + return to; + } + public enum DateField { RELEASE, UPDATE, SUBMITTED, CREATE; From 5f91681522c5eaae4f731b83da1aef25763614dd Mon Sep 17 00:00:00 2001 From: dipayan1985 Date: Fri, 27 Feb 2026 14:27:57 +0000 Subject: [PATCH 09/10] fix accession wildcard search --- Dockerfile | 6 +++-- server/build.gradle.kts | 4 ++++ .../search/filter/AccessionSearchFilter.java | 23 ++++++++++++++++++- 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8f507e7..24e3fa1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,10 +2,12 @@ FROM gradle:8.14-jdk24 AS builder WORKDIR /app -# Copy module build files only, to warm up and pre-resolve dependencies +# Copy minimal Gradle build files first, to warm up and pre-resolve dependencies +COPY settings.gradle.kts . +COPY build.gradle.kts . COPY proto/build.gradle.kts proto/ COPY server/build.gradle.kts server/ -RUN gradle --no-daemon clean build -x test || return 0 +RUN gradle --no-daemon dependencies || true COPY . . 
RUN gradle :server:bootJar --no-daemon -x test diff --git a/server/build.gradle.kts b/server/build.gradle.kts index a4bdaf8..35dc8a7 100644 --- a/server/build.gradle.kts +++ b/server/build.gradle.kts @@ -4,6 +4,10 @@ plugins { id("io.spring.dependency-management") version "1.1.7" } +tasks.bootJar { + mainClass.set("uk.ac.ebi.biosamples.search.BiosamplesSearchApplication") +} + java { toolchain { languageVersion = JavaLanguageVersion.of(24) diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/AccessionSearchFilter.java b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/AccessionSearchFilter.java index b01c644..05616d4 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/filter/AccessionSearchFilter.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/filter/AccessionSearchFilter.java @@ -2,13 +2,34 @@ import co.elastic.clients.elasticsearch._types.query_dsl.Query; import co.elastic.clients.elasticsearch._types.query_dsl.TermQuery; +import co.elastic.clients.elasticsearch._types.query_dsl.WildcardQuery; public record AccessionSearchFilter(String accession) implements SearchFilter { + /** Index mapping: accession is text with fields.keyword (ignore_above 256). Use keyword subfield for term/wildcard. */ + private static final String ACCESSION_KEYWORD_FIELD = "accession.keyword"; + public Query getQuery() { + if (isWildcardPattern(accession)) { + return WildcardQuery.of(w -> w + .field(ACCESSION_KEYWORD_FIELD) + .value(toElasticsearchWildcard(accession)) + )._toQuery(); + } return TermQuery.of(t -> t - .field("accession.keyword") + .field(ACCESSION_KEYWORD_FIELD) .value(accession) )._toQuery(); } + + /** + * Converts regex-style .* to ES wildcard * (literal dot in ES wildcard would break SAME.* matching SAMEA1). + */ + private static String toElasticsearchWildcard(String value) { + return value == null ? 
null : value.replace(".*", "*"); + } + + private static boolean isWildcardPattern(String value) { + return value != null && (value.indexOf('*') >= 0 || value.indexOf('?') >= 0); + } } From 79e0c7aae4bd9839ec04569412da0fae68f245b2 Mon Sep 17 00:00:00 2001 From: dgupta Date: Tue, 3 Mar 2026 13:19:11 +0000 Subject: [PATCH 10/10] Fix text search --- docs/cursor/README_search_service_query.md | 49 ++ docs/cursor/check_characteristics_field.md | 116 ++++ ...check_documents_without_characteristics.sh | 96 +++ .../cursor/faceting-aggregations-explained.md | 416 ++++++++++++ docs/cursor/faceting-strategy-plan.md | 622 ++++++++++++++++++ docs/cursor/get_all_facets_postman.json | 6 + docs/cursor/grpc-and-proto-notes.md | 71 ++ .../postman-es-investigation-acc-filter.md | 217 ++++++ docs/cursor/query_search_service.ps1 | 17 + docs/cursor/query_search_service.sh | 15 + .../search-query-optimization-analysis.md | 117 ++++ .../search_date_range_filter_postman.json | 19 + docs/cursor/search_organism_request.json | 16 + ..._release_2014_2015_exclude_suppressed.json | 24 + docs/cursor/search_release_date_only_curl.sh | 29 + .../search_release_date_only_postman.json | 22 + docs/cursor/search_release_date_range_curl.sh | 26 + .../search_release_date_range_postman.json | 17 + .../search_release_exclude_suppressed_curl.sh | 32 + ...ch_release_exclude_suppressed_postman.json | 25 + .../search_service_query_acc_exact.json | 12 + .../search_service_query_acc_wildcard.json | 12 + .../search_structured_data_postman.json | 17 + .../ebi/biosamples/search/es/QueryHelper.java | 22 +- 24 files changed, 2011 insertions(+), 4 deletions(-) create mode 100644 docs/cursor/README_search_service_query.md create mode 100644 docs/cursor/check_characteristics_field.md create mode 100644 docs/cursor/check_documents_without_characteristics.sh create mode 100644 docs/cursor/faceting-aggregations-explained.md create mode 100644 docs/cursor/faceting-strategy-plan.md create mode 100644 
docs/cursor/get_all_facets_postman.json create mode 100644 docs/cursor/grpc-and-proto-notes.md create mode 100644 docs/cursor/postman-es-investigation-acc-filter.md create mode 100644 docs/cursor/query_search_service.ps1 create mode 100644 docs/cursor/query_search_service.sh create mode 100644 docs/cursor/search-query-optimization-analysis.md create mode 100644 docs/cursor/search_date_range_filter_postman.json create mode 100644 docs/cursor/search_organism_request.json create mode 100644 docs/cursor/search_release_2014_2015_exclude_suppressed.json create mode 100644 docs/cursor/search_release_date_only_curl.sh create mode 100644 docs/cursor/search_release_date_only_postman.json create mode 100644 docs/cursor/search_release_date_range_curl.sh create mode 100644 docs/cursor/search_release_date_range_postman.json create mode 100644 docs/cursor/search_release_exclude_suppressed_curl.sh create mode 100644 docs/cursor/search_release_exclude_suppressed_postman.json create mode 100644 docs/cursor/search_service_query_acc_exact.json create mode 100644 docs/cursor/search_service_query_acc_wildcard.json create mode 100644 docs/cursor/search_structured_data_postman.json diff --git a/docs/cursor/README_search_service_query.md b/docs/cursor/README_search_service_query.md new file mode 100644 index 0000000..afa510e --- /dev/null +++ b/docs/cursor/README_search_service_query.md @@ -0,0 +1,49 @@ +# Querying the search service directly + +The **biosamples-search** service exposes a REST API for development/testing (core app uses gRPC). Use it to see what the search service returns without going through the core app. + +- **Search service** usually runs on **port 8080** (or set `SEARCH_PORT`). +- **Core app** (biosamples-v4) runs on **port 8081** and calls the search service via gRPC. + +## Endpoint + +- **POST** `http://localhost:8080/search` + Body: JSON `SearchQuery` (see examples below). 
+ +## Quick test (PowerShell) + +From the repo root: + +```powershell +# Exact accession +Invoke-RestMethod -Uri "http://localhost:8080/search" -Method Post -ContentType "application/json" -Body '{"filters":[{"type":"pub","webinId":""},{"type":"acc","accession":"SAMEA26"}],"page":0,"size":20}' + +# Wildcard accession (asterisk in JSON is not URL-encoded) +Invoke-RestMethod -Uri "http://localhost:8080/search" -Method Post -ContentType "application/json" -Body '{"filters":[{"type":"pub","webinId":""},{"type":"acc","accession":"SAME*"}],"page":0,"size":20}' +``` + +## Scripts + +- **Bash:** `./docs/cursor/query_search_service.sh` (from project root; needs `jq` for pretty output). +- **PowerShell:** `.\docs\cursor\query_search_service.ps1` + +Override port: `$env:SEARCH_PORT=9090; .\docs\cursor\query_search_service.ps1` + +## Request JSON format + +Filters use a `type` discriminator and type-specific fields: + +| type | Example | +|------|--------| +| `pub` | `{"type":"pub","webinId":""}` | +| `acc` | `{"type":"acc","accession":"SAME*"}` or `{"type":"acc","accession":"SAMEA26"}` | + +Full query: `{"text":null,"filters":[...],"facets":null,"page":0,"size":20,"sort":null,"searchAfter":null}` + +Pre-made bodies: `search_service_query_acc_exact.json`, `search_service_query_acc_wildcard.json`. + +## How to interpret results + +- If **exact** returns hits and **wildcard** returns hits → search service and ES are fine; the core app or URL handling is likely dropping `*` for `filter=acc:SAME*`. +- If **exact** returns hits and **wildcard** returns 0 → issue is inside the search service (e.g. wildcard query building). +- If both return 0 → check that the search service is pointing at the same ES index and that data exists (e.g. public, not suppressed). 
diff --git a/docs/cursor/check_characteristics_field.md b/docs/cursor/check_characteristics_field.md new file mode 100644 index 0000000..3d99cfc --- /dev/null +++ b/docs/cursor/check_characteristics_field.md @@ -0,0 +1,116 @@ +# Checking for Documents Without Characteristics Field + +## Quick Check Commands + +### 1. Count documents WITHOUT characteristics field + +```bash +curl -u elastic:elastic -X POST "http://localhost:9200/samples/_search?pretty" \ +-H 'Content-Type: application/json' \ +-d '{ + "size": 0, + "query": { + "bool": { + "must_not": [ + { + "exists": { + "field": "characteristics" + } + } + ] + } + }, + "aggs": { + "total_without_characteristics": { + "value_count": { + "field": "_id" + } + } + } +}' +``` + +### 2. Count documents WITH characteristics field + +```bash +curl -u elastic:elastic -X POST "http://localhost:9200/samples/_search?pretty" \ +-H 'Content-Type: application/json' \ +-d '{ + "size": 0, + "query": { + "exists": { + "field": "characteristics" + } + }, + "aggs": { + "total_with_characteristics": { + "value_count": { + "field": "_id" + } + } + } +}' +``` + +### 3. Get sample documents without characteristics (first 5) + +```bash +curl -u elastic:elastic -X POST "http://localhost:9200/samples/_search?pretty" \ +-H 'Content-Type: application/json' \ +-d '{ + "size": 5, + "_source": ["accession", "name"], + "query": { + "bool": { + "must_not": [ + { + "exists": { + "field": "characteristics" + } + } + ] + } + } +}' +``` + +### 4. Check if characteristics field is empty array + +```bash +curl -u elastic:elastic -X POST "http://localhost:9200/samples/_search?pretty" \ +-H 'Content-Type: application/json' \ +-d '{ + "size": 5, + "_source": ["accession", "characteristics"], + "query": { + "script": { + "script": { + "source": "doc[\"characteristics\"].size() == 0", + "lang": "painless" + } + } + } +}' +``` + +### 5. 
Check mapping for characteristics field + +```bash +curl -u elastic:elastic -X GET "http://localhost:9200/samples/_mapping/field/characteristics?pretty" +``` + +## Understanding the Results + +- **Documents without characteristics**: These might cause issues with nested queries if the query doesn't handle missing fields properly +- **Empty characteristics arrays**: Some documents might have `characteristics: []` which is different from missing field +- **Nested query behavior**: Nested queries in `must_not` should handle missing fields, but there might be edge cases + +## Potential Issues + +If you find documents without `characteristics`: +1. The nested query in `must_not` should still work (it will match documents without the field) +2. However, if there's a query syntax issue, it might fail +3. Check Elasticsearch version compatibility with nested queries in `must_not` + + + diff --git a/docs/cursor/check_documents_without_characteristics.sh b/docs/cursor/check_documents_without_characteristics.sh new file mode 100644 index 0000000..f892dee --- /dev/null +++ b/docs/cursor/check_documents_without_characteristics.sh @@ -0,0 +1,96 @@ +#!/bin/bash + +# Check if there are documents without the characteristics field +# Replace with your Elasticsearch credentials and index name + +ELASTICSEARCH_URL="http://localhost:9200" +INDEX_NAME="samples" +USERNAME="elastic" +PASSWORD="elastic" + +echo "=== Checking for documents WITHOUT characteristics field ===" +echo "" + +# Query 1: Count documents that don't have characteristics field +curl -u "${USERNAME}:${PASSWORD}" -X POST "${ELASTICSEARCH_URL}/${INDEX_NAME}/_search?pretty" \ +-H 'Content-Type: application/json' \ +-d '{ + "size": 0, + "query": { + "bool": { + "must_not": [ + { + "exists": { + "field": "characteristics" + } + } + ] + } + }, + "aggs": { + "total_without_characteristics": { + "value_count": { + "field": "_id" + } + } + } +}' + +echo "" +echo "" +echo "=== Checking for documents WITH characteristics field 
===" +echo "" + +# Query 2: Count documents that have characteristics field +curl -u "${USERNAME}:${PASSWORD}" -X POST "${ELASTICSEARCH_URL}/${INDEX_NAME}/_search?pretty" \ +-H 'Content-Type: application/json' \ +-d '{ + "size": 0, + "query": { + "exists": { + "field": "characteristics" + } + }, + "aggs": { + "total_with_characteristics": { + "value_count": { + "field": "_id" + } + } + } +}' + +echo "" +echo "" +echo "=== Sample documents without characteristics (first 5) ===" +echo "" + +# Query 3: Get sample documents without characteristics +curl -u "${USERNAME}:${PASSWORD}" -X POST "${ELASTICSEARCH_URL}/${INDEX_NAME}/_search?pretty" \ +-H 'Content-Type: application/json' \ +-d '{ + "size": 5, + "_source": ["accession", "name"], + "query": { + "bool": { + "must_not": [ + { + "exists": { + "field": "characteristics" + } + } + ] + } + } +}' + +echo "" +echo "" +echo "=== Total document count ===" +echo "" + +# Query 4: Total document count +curl -u "${USERNAME}:${PASSWORD}" -X GET "${ELASTICSEARCH_URL}/${INDEX_NAME}/_count?pretty" + + + diff --git a/docs/cursor/faceting-aggregations-explained.md b/docs/cursor/faceting-aggregations-explained.md new file mode 100644 index 0000000..2068fae --- /dev/null +++ b/docs/cursor/faceting-aggregations-explained.md @@ -0,0 +1,416 @@ +# Faceting Aggregations Explained + +## Overview + +This document explains how Elasticsearch aggregations work for faceting in the BioSamples Search service, specifically covering: +- **Characteristics** (key-value attributes like organism, sex, geographic location) +- **Relationships** (sample relationships like "derived from", "parent of") +- **External References** (links to external archives like ENA, SRA) + +## Data Structure + +Each `Sample` document in Elasticsearch contains three nested fields that are used for faceting: + +### 1. 
Characteristics (Nested Field) + +A set of key-value pairs representing sample attributes: + +```json +{ + "characteristics": [ + {"key": "organism", "value": "Homo sapiens"}, + {"key": "sex", "value": "male"}, + {"key": "geographic location", "value": "USA"}, + {"key": "env_medium", "value": "soil"} + ] +} +``` + +**Model**: `Attribute` class with `key` (String) and `value` (String) fields. + +**Common Keys**: +- `organism` - Species name +- `sex` - Gender/sex +- `geographic location` - Location information +- `env_medium` - Environmental medium +- And many more dynamic attributes + +### 2. Relationships (Nested Field) + +A set of relationships between samples: + +```json +{ + "relationships": [ + { + "type": "derived from", + "source": "SAMD00000001", + "target": "SAMD00000002" + }, + { + "type": "parent of", + "source": "SAMD00000001", + "target": "SAMD00000003" + } + ] +} +``` + +**Model**: `Relationship` record with `type`, `source`, and `target` fields. + +**Common Types**: +- `derived from` - Sample was derived from another +- `parent of` - Sample is a parent of another +- `child of` - Sample is a child of another + +### 3. External References (Nested Field) + +A set of links to external archives and databases: + +```json +{ + "externalReferences": [ + { + "archive": "ENA", + "accession": "SRR123456", + "url": "https://www.ebi.ac.uk/ena/browser/view/SRR123456" + }, + { + "archive": "SRA", + "accession": "SRS789012", + "url": "https://..." + } + ] +} +``` + +**Model**: `ExternalReference` record with `archive`, `accession`, `url`, and `duo` fields. + +**Common Archives**: +- `ENA` - European Nucleotide Archive +- `SRA` - Sequence Read Archive +- `GEO` - Gene Expression Omnibus + +## Why Nested Aggregations? + +All three fields are stored as **nested fields** in Elasticsearch. This means: + +1. **Nested fields are stored as separate internal documents** - Elasticsearch treats each array element as a separate document internally +2. 
**Nested aggregations are required** - To properly aggregate nested data, you must use a `nested` aggregation that: + - Enters the nested document context + - Runs aggregations on nested documents + - Returns results in the nested context + +Without nested aggregation, Elasticsearch would flatten the arrays and give incorrect counts. + +## Aggregation Structure + +### Overall Architecture + +The faceting system uses a **sampler aggregation** to optimize performance on large datasets (50M+ samples): + +``` +Top Level Query + ↓ +Sampler Aggregation (samples 100k docs per shard) + ↓ + ├── Nested Aggregation: characteristics + │ └── Terms Aggregation: by key + │ └── Terms Aggregation: by value + │ + ├── Nested Aggregation: relationships + │ └── Terms Aggregation: by type + │ + └── Nested Aggregation: externalReferences + └── Terms Aggregation: by archive +``` + +### Characteristics Aggregation + +**Location**: `AttributeFacet.java` + +**Structure**: Two-level nested aggregation + +```java +.nested(n -> n.path("characteristics")) // Enter nested field +.aggregations("dynamic", a1 -> a1 + .terms(t -> t + .field("characteristics.key.keyword") // Level 1: Group by KEY + .size(10) // Top 10 keys + .shardSize(200) // Consider top 200 per shard + ) + .aggregations("by_value", a2 -> a2 + .terms(t2 -> t2 + .field("characteristics.value.keyword") // Level 2: Group by VALUE + .size(10) // Top 10 values per key + .shardSize(200) // Consider top 200 per shard + ) + ) +) +``` + +**Result Structure**: +``` +characteristics +├── organism (key) +│ ├── Homo sapiens: 250,000 samples +│ ├── Mus musculus: 150,000 samples +│ └── ... +├── sex (key) +│ ├── male: 200,000 samples +│ ├── female: 175,000 samples +│ └── ... +└── geographic location (key) + ├── USA: 100,000 samples + ├── UK: 80,000 samples + └── ... 
+``` + +**Excluded Keys**: Certain keys are excluded from faceting (see `EXCLUDED_FACETS` in `AttributeFacet.java`): +- `description`, `sample name`, `title` +- `INSDC first public`, `ENA first public` +- `collection date`, `SRA accession` +- And others that are not useful for filtering + +### Relationships Aggregation + +**Location**: `RelationshipFacet.java` + +**Structure**: Single-level nested aggregation + +```java +.nested(n -> n.path("relationships")) // Enter nested field +.aggregations("by_type", a1 -> a1 + .terms(t -> t + .field("relationships.type.keyword") // Group by relationship type + .size(10) // Top 10 types + ) +) +``` + +**Result Structure**: +``` +relationships +└── relationship type + ├── derived from: 133,000 samples + ├── parent of: 100,000 samples + ├── child of: 85,000 samples + └── ... +``` + +**Note**: The code also supports aggregating by `source` and `target` (see `getAggregationsWithSourceAndTarget()`), but the default only uses `type`. + +### External References Aggregation + +**Location**: `ExternalRefFacet.java` + +**Structure**: Single-level nested aggregation + +```java +.nested(n -> n.path("externalReferences")) // Enter nested field +.aggregations("by_archive", a1 -> a1 + .terms(t -> t + .field("externalReferences.archive.keyword") // Group by archive type + .size(10) // Top 10 archives + .minDocCount(1) // Include all archives + ) +) +``` + +**Result Structure**: +``` +externalReferences +└── external archive + ├── ENA: 333,000 samples + ├── SRA: 250,000 samples + ├── GEO: 50,000 samples + └── ... +``` + +## Sampling and Extrapolation + +### Why Sampling? 
+ +For a dataset with 50 million samples, running aggregations on all documents would be: +- **Slow**: Could take minutes +- **Memory-intensive**: Requires significant heap space +- **Unnecessary**: Approximate counts are sufficient for faceting + +### How Sampling Works + +**Location**: `SamplingFacetingStrategy.java` (lines 56-63) + +```java +.sampler(s -> s.shardSize(100000)) // Sample 100k documents per shard +.aggregations(subAggregations) // Run nested aggregations on sampled docs +``` + +**Process**: +1. **Sampling**: Selects up to 100,000 of the top-scoring matching documents per shard (the `sampler` aggregation is score-based, not random) + - With 3 shards: ~300,000 total documents sampled + - Sampling rate: 300k / 50M = 0.6% + +2. **Aggregation**: Runs nested aggregations on sampled documents only + - Much faster than processing all 50M documents + - Results are approximate but statistically valid + +3. **Extrapolation**: Scales up the counts to estimate full dataset + +### Extrapolation Factor + +**Location**: `SamplingFacetingStrategy.java` (lines 144-150) + +```java +if (sampledDocs > 0 && totalDocs > 0) { + extrapolationFactor = (double) totalDocs / sampledDocs; +} +``` + +**Example**: +- Total documents matching query: 50,000,000 +- Sampled documents: 300,000 +- Extrapolation factor: 50M / 300k = **166.67** + +**Application**: Each sampled count is multiplied by the factor + +```java +// From AttributeFacet.java +long keyCount = Math.round(bucket.docCount() * extrapolationFactor); +long valueCount = Math.round(valueBucket.docCount() * extrapolationFactor); +``` + +**Example Result**: +- Sampled: "Homo sapiens" appears 1,500 times in 300k samples +- Extrapolated: 1,500 × 166.67 = **~250,000** samples in full dataset + +## Complete Flow Example + +### Query: "All public samples" (50M results) + +``` +1. Query Execution + └── Matches 50,000,000 sample documents + +2. Sampler Aggregation + └── Selects up to 300,000 top-scoring samples (100k per shard × 3 shards) + +3. 
Nested Aggregations (run on 300k samples) + ├── characteristics + │ ├── organism → {Homo sapiens: 1,500, Mus musculus: 900} + │ └── sex → {male: 1,200, female: 1,050} + │ + ├── relationships + │ └── by_type → {derived from: 800, parent of: 600} + │ + └── externalReferences + └── by_archive → {ENA: 2,000, SRA: 1,500} + +4. Extrapolation Calculation + └── Factor: 50M / 300k = 166.67 + +5. Final Facets (extrapolated counts) + ├── organism + │ ├── Homo sapiens: 250,000 (1,500 × 166.67) + │ └── Mus musculus: 150,000 (900 × 166.67) + │ + ├── sex + │ ├── male: 200,000 (1,200 × 166.67) + │ └── female: 175,000 (1,050 × 166.67) + │ + ├── relationship type + │ ├── derived from: 133,000 (800 × 166.67) + │ └── parent of: 100,000 (600 × 166.67) + │ + └── external archive + ├── ENA: 333,000 (2,000 × 166.67) + └── SRA: 250,000 (1,500 × 166.67) +``` + +## Code Locations + +### Key Files + +1. **`SamplingFacetingStrategy.java`** + - Builds the aggregation structure + - Implements sampling and extrapolation + - Extracts and processes facet results + +2. **`AttributeFacet.java`** + - Builds characteristics aggregation (two-level) + - Processes characteristics facet results + - Excludes certain keys from faceting + +3. **`RelationshipFacet.java`** + - Builds relationships aggregation + - Processes relationship facet results + +4. **`ExternalRefFacet.java`** + - Builds external references aggregation + - Processes external reference facet results + +5. 
**`FacetService.java`** + - Orchestrates facet retrieval + - Builds Elasticsearch queries with aggregations + - Calls strategy to retrieve facets + +### Model Classes + +- **`Sample.java`**: Main document model with nested fields +- **`Attribute.java`**: Characteristics key-value pairs +- **`Relationship.java`**: Relationship records +- **`ExternalReference.java`**: External reference records + +## Performance Considerations + +### Current Configuration + +- **Sampler size**: 100,000 documents per shard +- **Total samples**: ~300,000 (with 3 shards) +- **Sampling rate**: ~0.6% for 50M dataset +- **Extrapolation factor**: ~166x + +### Aggregation Parameters + +- **`size`**: Top 10 values returned per aggregation +- **`shardSize`**: Top 200 candidates per shard (for characteristics) +- **Timeout**: 60 seconds (recently increased from 30s) + +### Optimization Opportunities + +See `docs/cursor/faceting-strategy-plan.md` for proposed improvements: +- Increase sampler size to 500k (1% sampling rate) +- Increase `shardSize` to 1000 for better accuracy +- Implement adaptive sampling based on result set size +- Add caching for common queries + +## Accuracy Notes + +### Sampling Accuracy + +- **Common values** (> 1% prevalence): ±10-20% accuracy +- **Rare values** (< 0.1% prevalence): May be missed or inaccurate +- **Extrapolation error**: Amplified by factor of 166x + +### Limitations + +1. **Rare values**: Values appearing < 10 times in sample may be inaccurate +2. **Top-N limitation**: Only top 10 values per facet are returned +3. 
**Shard-level merging**: With `shardSize=200`, only 600 candidates are considered (200 × 3 shards) before merging to top 10 + +### Future Improvements + +- Increase sampling rate to 1-2% for better accuracy +- Increase `shardSize` to 1000 for better top-value accuracy +- Implement confidence intervals for facet counts +- Flag low-confidence values (< 10 sampled occurrences) + +## Related Documentation + +- **`docs/cursor/faceting-strategy-plan.md`**: Comprehensive plan for improving faceting performance and accuracy +- **Elasticsearch Nested Aggregations**: [Official Documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-nested-aggregation.html) +- **Elasticsearch Sampler Aggregation**: [Official Documentation](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-sampler-aggregation.html) + +--- + +**Last Updated**: 2025-01-23 +**Author**: Documentation generated from codebase analysis diff --git a/docs/cursor/faceting-strategy-plan.md b/docs/cursor/faceting-strategy-plan.md new file mode 100644 index 0000000..f5ddcfa --- /dev/null +++ b/docs/cursor/faceting-strategy-plan.md @@ -0,0 +1,622 @@ +# Faceting Strategy Plan for 50M+ Samples +## 3-Node, 3-Shard Elasticsearch Cluster + +### Executive Summary +This document outlines a comprehensive faceting strategy optimized for a 50 million sample dataset on a 3-node, 3-shard Elasticsearch cluster, with scalability considerations for future growth. + +--- + +## 1. 
Current State Analysis + +### 1.1 Current Configuration +- **Cluster**: 3 nodes, 3 shards (primary shards only) +- **Data Distribution**: ~16.7M documents per shard +- **Current Strategy**: `SamplingFacetingStrategy` +- **Sampler Size**: 100,000 documents per shard (300k total) +- **Sampling Rate**: 0.6% (300k / 50M) +- **Extrapolation Factor**: ~166x (50M / 300k) +- **Terms Aggregation**: + - `size`: 10 (top 10 values returned) + - `shardSize`: 200 (top 200 per shard before merging) + +### 1.2 Current Facet Types +1. **Characteristics** (nested field) + - Dynamic attributes (key-value pairs) + - Excludes: description, sample name, title, etc. + - Includes: organism, sex, geographic location, etc. +2. **Relationships** (nested field) + - Relationship types, sources, targets +3. **External References** (nested field) + - Archive types (ENA, SRA, etc.) +4. **Date Ranges** + - Update dates (yearly histogram) + - Runs on ALL documents (not sampled) + +### 1.3 Identified Issues + +#### Accuracy Problems +- **Low Sampling Rate (0.6%)**: + - Rare values (< 0.6% prevalence) may be completely missed + - Extrapolation factor of 166x amplifies small counting errors + - Example: If a value appears 1 time in 300k samples, it extrapolates to 166, but could be 0 or 500 in reality + +#### Performance Problems +- **Small `shardSize` (200)**: + - May miss important terms that rank differently across shards + - With 3 shards, only 600 candidate terms considered before merging to top 10 + - High-cardinality fields (e.g., organism with thousands of species) may show incorrect top values + +#### Scalability Problems +- **Fixed Sampler Size**: + - Doesn't adapt to query result set size + - For filtered queries returning 1M docs, 300k sample is good (30%) + - For filtered queries returning 50M docs, 300k sample is poor (0.6%) + - For filtered queries returning 100k docs, 300k sample is wasteful + +#### Missing Features +- **No Caching**: Every facet request hits Elasticsearch +- **No 
Pagination**: Can only see top 10 values per facet +- **No Adaptive Strategy**: Same approach for all query sizes +- **No Timeout Handling**: 30s timeout may not be enough for complex queries + +--- + +## 2. Proposed Multi-Tier Strategy + +### 2.1 Strategy Selection Logic + +The system should automatically choose the best faceting strategy based on the query result set size: + +``` +IF resultSetSize < 100,000: + → Use RegularFacetingStrategy (no sampling, full accuracy) + +ELSE IF resultSetSize < 1,000,000: + → Use AdaptiveSamplingStrategy (10-20% sampling rate) + +ELSE IF resultSetSize < 10,000,000: + → Use OptimizedSamplingStrategy (1-5% sampling rate) + +ELSE (resultSetSize >= 10,000,000): + → Use HighVolumeSamplingStrategy (0.5-1% sampling rate) +``` + +**Rationale**: +- Small result sets (< 100k) can be fully aggregated quickly +- Medium result sets (100k-1M) benefit from moderate sampling +- Large result sets (1M-10M) need aggressive sampling but maintain accuracy +- Very large result sets (10M+) prioritize speed over perfect accuracy + +### 2.2 Optimized Sampling Strategy (Primary Focus) + +For the 50M dataset, this will be the default strategy for unfiltered queries. 
+ +#### 2.2.1 Improved Sampler Configuration + +**Current**: 100k per shard = 300k total +**Proposed**: Adaptive based on result set size + +``` +samplerSize = min( + max(resultSetSize * 0.01, 500000), // At least 1% of result set, minimum 500k + 2000000 // Maximum 2M samples (memory limit) +) +perShardSize = samplerSize / numShards +``` + +**Example for 50M docs**: +- `samplerSize = min(50M * 0.01, 2M) = 500k` +- `perShardSize = 500k / 3 = ~167k per shard` +- **Sampling Rate**: 1% (vs current 0.6%) +- **Extrapolation Factor**: 100x (vs current 166x) + +**Benefits**: +- Better accuracy (1% vs 0.6%) +- Lower extrapolation error (100x vs 166x) +- Still fast (500k samples process quickly) + +#### 2.2.2 Enhanced Terms Aggregation + +**Current**: `size=10, shardSize=200` +**Proposed**: `size=20, shardSize=1000` + +**Rationale**: +- **`size=20`**: Return top 20 values instead of 10 (better UX, minimal performance impact) +- **`shardSize=1000`**: + - With 3 shards, consider 3000 candidate terms before merging + - Better chance of finding correct top values across shards + - Still manageable memory footprint + +**Memory Impact**: +- Per shard: 1000 terms × ~50 bytes = ~50KB per aggregation +- With 3 nested aggregations (characteristics, relationships, externalRefs) = ~150KB per shard +- Total: ~450KB across cluster (negligible) + +#### 2.2.3 Smart Field Selection + +**Problem**: Some characteristics fields have very high cardinality (thousands of unique values) +**Solution**: Use different strategies for different field types + +**High-Cardinality Fields** (e.g., organism, geographic location): +- Use `shardSize=2000` (more candidates) +- Use `minDocCount=2` (filter out singletons in sampled set) +- Apply extrapolation with confidence intervals + +**Low-Cardinality Fields** (e.g., sex, ENA-CHECKLIST): +- Use `shardSize=500` (fewer candidates needed) +- Use `minDocCount=1` +- Standard extrapolation + +#### 2.2.4 Composite Aggregation for Pagination + +**Current 
Limitation**: Can only see top 10-20 values +**Solution**: Use Elasticsearch `composite` aggregation for paginated facet browsing + +**Implementation**: +- For characteristics: Composite aggregation on `[key, value]` +- Supports `after_key` parameter for pagination +- Client can request "next page" of facet values +- Useful for browsing all organisms, all geographic locations, etc. + +**Trade-off**: +- Slightly slower than terms aggregation +- But enables "show all" functionality for power users + +### 2.3 Caching Strategy + +#### 2.3.1 Query Result Caching + +**Cache Key**: Hash of query + filter combination +**Cache TTL**: +- Unfiltered queries: 5 minutes (data changes frequently) +- Filtered queries: 15 minutes (more stable) +- Highly filtered queries (< 1M results): 30 minutes + +**Cache Storage**: +- In-memory cache (Caffeine or similar) +- Size limit: 100MB (approximately 100-200 cached facet results) +- Eviction: LRU (Least Recently Used) + +**Cache Invalidation**: +- On new sample indexing (invalidate all) +- On sample update (invalidate matching filters) +- Manual invalidation endpoint for admin + +#### 2.3.2 Popular Facet Caching + +**Problem**: Common queries (e.g., "all public samples") are requested frequently +**Solution**: Pre-compute and cache popular facet combinations + +**Popular Queries to Cache**: +1. Unfiltered public samples +2. Public samples by domain +3. Recent samples (last 30 days) +4. 
Samples by top 10 organisms + +**Update Frequency**: Every 10 minutes (background job) + +### 2.4 Performance Optimizations + +#### 2.4.1 Global Ordinals + +**Problem**: Nested field aggregations are slower than regular field aggregations +**Solution**: Enable `eager_global_ordinals` for frequently aggregated fields + +**Fields to Optimize**: +- `characteristics.key.keyword` +- `characteristics.value.keyword` +- `relationships.type.keyword` + +**Trade-off**: +- Slightly slower indexing (ordinals built on refresh) +- Much faster aggregation queries +- Worth it for read-heavy workloads + +#### 2.4.2 Doc Values Optimization + +**Ensure**: All `.keyword` fields use `doc_values: true` (default, but verify) +**Benefit**: Faster aggregations, lower memory usage + +#### 2.4.3 Request Timeout Strategy + +**Current**: Fixed 30 seconds +**Proposed**: Adaptive timeout + +``` +IF resultSetSize < 1M: + timeout = 10 seconds +ELSE IF resultSetSize < 10M: + timeout = 30 seconds +ELSE: + timeout = 60 seconds +``` + +**Fallback**: If timeout occurs, return partial results with a flag indicating incompleteness + +### 2.5 Accuracy Improvements + +#### 2.5.1 Confidence Intervals + +**Problem**: Extrapolated counts are estimates, but we don't know how accurate +**Solution**: Calculate confidence intervals based on sampling statistics + +**Formula** (simplified): +``` +estimatedCount = sampledCount * extrapolationFactor +marginOfError = 1.96 * sqrt(sampledCount) * extrapolationFactor // 95% confidence +confidenceInterval = [estimatedCount - marginOfError, estimatedCount + marginOfError] +``` + +**Display**: Show counts as "~1,000,000 (±50,000)" in UI + +#### 2.5.2 Rare Value Detection + +**Problem**: Values appearing < 10 times in sample may be inaccurate +**Solution**: Flag low-confidence values + +**Logic**: +``` +IF sampledCount < 10: + confidence = "low" + display = "~" + estimatedCount + " (approximate)" +ELSE IF sampledCount < 100: + confidence = "medium" + display = estimatedCount 
+ELSE: + confidence = "high" + display = estimatedCount +``` + +#### 2.5.3 Stratified Sampling (Future Enhancement) + +**Advanced**: Instead of random sampling, use stratified sampling +- Sample proportionally from each shard +- Weight by document distribution +- More accurate for skewed data + +**Complexity**: High - requires custom aggregation logic +**Benefit**: Better accuracy for rare values + +--- + +## 3. Implementation Phases + +### Phase 1: Quick Wins (Week 1) +**Goal**: Improve current strategy with minimal code changes + +1. **Increase Sampler Size** + - Change from 100k to 167k per shard (500k total) + - Improves accuracy from 0.6% to 1% sampling rate + +2. **Increase Terms Aggregation Parameters** + - Change `size` from 10 to 20 + - Change `shardSize` from 200 to 1000 + - Better top-value accuracy + +3. **Add Adaptive Timeout** + - Implement timeout logic based on result set size + - Prevents premature timeouts + +**Expected Impact**: +- Accuracy: +40% (1% vs 0.6% sampling) +- Performance: Similar (slightly slower, but acceptable) +- UX: Better (20 values vs 10) + +### Phase 2: Adaptive Strategy (Week 2-3) +**Goal**: Implement strategy selection based on result set size + +1. **Result Set Size Estimation** + - Add `value_count` aggregation before main query + - Or use `track_total_hits: true` in search + - Cache result set size for strategy selection + +2. **Strategy Selection Logic** + - Implement `if/else` logic in `FacetService` + - Route to appropriate strategy based on size + +3. **RegularFacetingStrategy for Small Queries** + - Use existing `RegularFacetingStrategy` for < 100k results + - No sampling = perfect accuracy + +**Expected Impact**: +- Small queries: Perfect accuracy, faster (no sampling overhead) +- Large queries: Better accuracy (adaptive sampling) + +### Phase 3: Caching (Week 4) +**Goal**: Reduce Elasticsearch load for common queries + +1. 
**Query Result Cache** + - Implement Caffeine cache + - Cache key: query + filter hash + - Cache value: facet results + +2. **Cache Invalidation** + - On indexing events + - Manual invalidation endpoint + +3. **Monitoring** + - Cache hit rate metrics + - Cache size metrics + +**Expected Impact**: +- Common queries: 10-100x faster (cache hit) +- Elasticsearch load: 30% to 50% reduction + +### Phase 4: Advanced Features (Week 5-6) +**Goal**: Add pagination and accuracy improvements + +1. **Composite Aggregation** + - Implement for characteristics pagination + - Add `after_key` parameter support + +2. **Confidence Intervals** + - Calculate and return with facet results + - Display in UI (optional) + +3. **Global Ordinals** + - Enable for key fields + - Monitor indexing performance impact + +**Expected Impact**: +- UX: Pagination enables "show all" functionality +- Accuracy: Users understand estimate quality + +--- + +## 4. Performance Targets + +### 4.1 Response Time Targets + +| Query Type | Current | Target | Strategy | +|------------|---------|--------|----------| +| Unfiltered (50M) | 15-30s | 10-20s | Optimized sampling | +| Filtered (1-10M) | 10-20s | 5-10s | Adaptive sampling | +| Filtered (< 1M) | 5-10s | 1-3s | Regular (no sampling) | +| Cached | N/A | < 100ms | Cache hit | + +### 4.2 Accuracy Targets + +| Metric | Current | Target | +|--------|---------|--------| +| Sampling Rate (50M) | 0.6% | 1-2% | +| Extrapolation Error | ±20% | ±10% | +| Rare Value Detection | Poor | Good (flag < 10 count) | + +### 4.3 Scalability Targets + +- **Current**: 50M samples, 3 shards +- **Target**: Support 100M+ samples with same performance +- **Approach**: + - Increase shards to 5-7 (when scaling) + - Maintain 1-2% sampling rate + - Use caching to offset increased load + +--- + +## 5. Monitoring & Metrics + +### 5.1 Key Metrics to Track + +1.
**Performance Metrics** + - Facet query response time (p50, p95, p99) + - Timeout rate + - Cache hit rate + - Elasticsearch query time + +2. **Accuracy Metrics** + - Sampling rate per query + - Extrapolation factor + - Rare value detection rate + +3. **Resource Metrics** + - Memory usage (cache size) + - Elasticsearch CPU usage + - Network I/O + +### 5.2 Alerts + +- **Response Time**: Alert if p95 > 30s +- **Timeout Rate**: Alert if > 5% of queries time out +- **Cache Hit Rate**: Alert if < 20% (indicates cache too small or TTL too short) +- **Elasticsearch Errors**: Alert on any aggregation errors + +--- + +## 6. Scaling Considerations + +### 6.1 Horizontal Scaling (More Nodes) + +**Current**: 3 nodes, 3 shards +**Future**: 5-7 nodes, 5-7 shards + +**Impact on Strategy**: +- More shards = more parallel processing +- Can maintain same per-shard sampler size +- Total samples increase proportionally +- Example: 7 shards × 167k = 1.17M samples (≈2.3% of 50M, slightly above the 1-2% target) + +**Action**: Strategy automatically adapts (sampler size per shard remains constant) + +### 6.2 Vertical Scaling (More Memory/CPU) + +**Benefit**: Can increase sampler size without performance degradation +**Action**: Increase `maxSamplerSize` from 2M to 5M + +### 6.3 Data Growth (100M+ Samples) + +**Challenge**: Maintaining performance as data grows +**Solutions**: +1. **Increase Shards**: 3 → 5 → 7 shards +2. **Maintain Sampling Rate**: Keep 1-2% sampling rate +3. **Aggressive Caching**: Cache more query combinations +4. **Consider Separate Facet Index**: Pre-computed facets (advanced) + +--- + +## 7.
Risk Assessment + +### 7.1 High Risk + +**Risk**: Increasing sampler size causes timeouts +**Mitigation**: +- Implement adaptive timeout +- Monitor timeout rates +- Fallback to smaller sampler if timeout occurs + +### 7.2 Medium Risk + +**Risk**: Cache memory usage grows too large +**Mitigation**: +- Set strict size limits (100MB) +- Use LRU eviction +- Monitor memory usage + +### 7.3 Low Risk + +**Risk**: Accuracy improvements not noticeable to users +**Mitigation**: +- A/B test old vs new strategy +- Collect user feedback +- Monitor accuracy metrics + +--- + +## 8. Success Criteria + +### 8.1 Performance +- ✅ 90% of facet queries complete in < 20s (currently 15-30s) +- ✅ Cache hit rate > 30% +- ✅ Timeout rate < 1% + +### 8.2 Accuracy +- ✅ Sampling rate ≥ 1% for 50M+ result sets +- ✅ Extrapolation error < ±10% for common values +- ✅ Rare values (< 0.1% prevalence) detected and flagged + +### 8.3 Scalability +- ✅ Strategy works for 100M+ samples with same performance +- ✅ Easy to add more shards without code changes + +--- + +## 9. 
Alternative Approaches (Not Recommended) + +### 9.1 Pre-computed Facets +**Approach**: Maintain a separate Elasticsearch index with pre-computed facets +**Pros**: Very fast queries +**Cons**: +- Complex to maintain +- High storage cost +- Difficult to keep in sync +- Doesn't work with dynamic filters + +**Verdict**: Not recommended unless caching proves insufficient + +### 9.2 Materialized Views +**Approach**: Use Elasticsearch transform API to create materialized facet views +**Pros**: Automatic updates, fast queries +**Cons**: +- Requires Elasticsearch Platinum license +- Still need to handle dynamic filters +- Complex setup + +**Verdict**: Consider if Elasticsearch license allows and caching insufficient + +### 9.3 Separate Facet Service +**Approach**: Dedicated microservice for faceting with specialized optimizations +**Pros**: Can optimize independently +**Cons**: +- Increased complexity +- Network overhead +- More services to maintain + +**Verdict**: Not needed for current scale + +--- + +## 10. Next Steps + +1. **Review this plan** with team +2. **Prioritize phases** based on business needs +3. **Set up monitoring** before making changes +4. **Implement Phase 1** (quick wins) +5. **Measure impact** and iterate +6. 
**Proceed to Phase 2** if Phase 1 is successful + +--- + +## Appendix A: Elasticsearch Configuration Recommendations + +### A.1 Index Settings + +```json +{ + "settings": { + "number_of_shards": 3, + "number_of_replicas": 1, + "refresh_interval": "30s", // Reduce refresh frequency for better indexing performance + "index": { + "max_result_window": 50000 // Allow deeper pagination if needed + } + } +} +``` + +### A.2 Field Mappings (Key Fields) + +```json +{ + "characteristics.key.keyword": { + "type": "keyword", + "eager_global_ordinals": true // Enable for faster aggregations + }, + "characteristics.value.keyword": { + "type": "keyword", + "eager_global_ordinals": true + } +} +``` + +### A.3 Cluster Settings + +- **Heap Size**: No more than 50% of available RAM (and below ~32GB so compressed object pointers stay enabled) +- **Field Data Cache**: Monitor usage, increase if needed +- **Query Cache**: Enable (default) + +--- + +## Appendix B: Example Query Performance + +### B.1 Current Performance (Estimated) + +**Query**: Unfiltered public samples (50M results) +- **Sampler**: 300k samples (0.6%) +- **Time**: 15-30 seconds +- **Accuracy**: ±20% for common values + +### B.2 Phase 1 Performance (Estimated) + +**Query**: Unfiltered public samples (50M results) +- **Sampler**: 500k samples (1%) +- **Time**: 12-25 seconds +- **Accuracy**: ±15% for common values + +### B.3 Phase 2 Performance (Estimated) + +**Query**: Filtered by organism (5M results) +- **Strategy**: Adaptive (500k samples, 10% sampling) +- **Time**: 8-15 seconds +- **Accuracy**: ±8% for common values + +**Query**: Filtered by domain (500k results) +- **Strategy**: Regular (no sampling) +- **Time**: 2-5 seconds +- **Accuracy**: Perfect (100%) + +--- + +**Document Version**: 1.0 +**Last Updated**: 2025-12-22 +**Author**: AI Assistant +**Status**: Draft - Pending Review + diff --git a/docs/cursor/get_all_facets_postman.json b/docs/cursor/get_all_facets_postman.json new file mode 100644 index 0000000..46b5f28 --- /dev/null +++
b/docs/cursor/get_all_facets_postman.json @@ -0,0 +1,6 @@ +{ + "text": "", + "filters": [], + "page": 0, + "size": 0 +} diff --git a/docs/cursor/grpc-and-proto-notes.md b/docs/cursor/grpc-and-proto-notes.md new file mode 100644 index 0000000..f4fff53 --- /dev/null +++ b/docs/cursor/grpc-and-proto-notes.md @@ -0,0 +1,71 @@ +# gRPC and Protocol Buffers (Proto) Notes + +## Protocol Buffers (Proto) + +Protocol Buffers (protobuf) is a language-agnostic serialization format: + +1. **Define your data structures** in `.proto` files: + + ```protobuf + message Sample { + string id = 1; + string name = 2; + int32 count = 3; + } + ``` + +2. **Compile** the `.proto` file using `protoc` to generate code in your target language (Java, Python, Go, etc.) + +3. **Serialize/deserialize** data efficiently — more compact and faster than JSON/XML + +## gRPC + +gRPC is an RPC framework that uses Protocol Buffers by default: + +1. **Define services** in `.proto` files: + + ```protobuf + service SampleService { + rpc GetSample(SampleRequest) returns (Sample); + rpc ListSamples(ListRequest) returns (stream Sample); + } + ``` + +2. **Generate client/server code** — `protoc` creates: + - Server stubs (implement the service methods) + - Client stubs (call the service methods) + +3. **Communication**: + - Uses HTTP/2 (multiplexing, streaming) + - Binary protobuf messages (efficient) + - Strong typing (contract-driven) + +## How They Work Together + +1. **Design phase**: Write `.proto` files defining messages and services +2. **Code generation**: `protoc` generates language-specific code +3. **Implementation**: + - **Server**: Implement the generated service interface + - **Client**: Use the generated client to make calls +4. 
**Runtime**: gRPC handles serialization, networking, and RPC + +## Key Benefits + +- **Performance**: Binary format, HTTP/2, efficient serialization +- **Type safety**: Compile-time checks via generated code +- **Streaming**: Supports unary, server streaming, client streaming, bidirectional streaming +- **Language agnostic**: Same `.proto` works across languages +- **Code generation**: Reduces boilerplate + +## Example Flow + +``` +Client Server + | | + |--(protobuf message)---->| + | | (deserializes) + | | (processes) + | | (serializes) + |<--(protobuf response)---| + | | +``` diff --git a/docs/cursor/postman-es-investigation-acc-filter.md b/docs/cursor/postman-es-investigation-acc-filter.md new file mode 100644 index 0000000..60cc0b3 --- /dev/null +++ b/docs/cursor/postman-es-investigation-acc-filter.md @@ -0,0 +1,217 @@ +# Postman queries: investigate `filter=acc:SAME.*` at Elasticsearch level + +Use these against your Elasticsearch instance. Default from `application.yml`: **base URL** `http://localhost:9200`, and if auth is enabled use **Basic Auth** (e.g. `elastic` / `elastic`). + +--- + +## 1. Cluster health (connectivity) + +**Method:** `GET` +**URL:** `http://localhost:9200/` + +**Purpose:** Confirm ES is reachable. + +--- + +## 2. Index exists and doc count + +**Method:** `GET` +**URL:** `http://localhost:9200/samples/_count` + +**Purpose:** Confirm `samples` index exists and has documents. + +--- + +## 3. Index mapping (how `accession` is stored) + +**Method:** `GET` +**URL:** `http://localhost:9200/samples/_mapping` + +**Purpose:** Check whether you have: +- `accession` as **keyword** (no `.keyword` subfield), or +- `accession` as **text** with subfield `accession.keyword`. + +Our code queries `accession.keyword`; if the mapping only has `accession` (keyword), wildcard must target `accession`, not `accession.keyword`. + +--- + +## 4. 
Sample documents (see real accession values) + +**Method:** `POST` +**URL:** `http://localhost:9200/samples/_search` + +**Headers:** `Content-Type: application/json` + +**Body (raw JSON):** + +```json +{ + "size": 5, + "_source": ["accession", "name", "update"], + "query": { "match_all": {} } +} +``` + +**Purpose:** See actual accession values (e.g. SAMEA, SAME, SAMN, SAMD). Check if any start with `SAME` for the pattern `SAME.*`. + +--- + +## 5. Wildcard query on `accession.keyword` (what the app sends) + +**Method:** `POST` +**URL:** `http://localhost:9200/samples/_search` + +**Headers:** `Content-Type: application/json` + +**Body (raw JSON):** + +```json +{ + "size": 20, + "_source": ["accession", "name"], + "query": { + "wildcard": { + "accession.keyword": { + "value": "SAME.*" + } + } + } +} +``` + +**Purpose:** Replicate the app’s wildcard filter. If this returns 0 hits but step 6 (on `accession`) returns hits, the index uses `accession` (keyword) and the app should query that field. + +--- + +## 6. Wildcard query on `accession` (if mapping has no `.keyword`) + +**Method:** `POST` +**URL:** `http://localhost:9200/samples/_search` + +**Headers:** `Content-Type: application/json` + +**Body (raw JSON):** + +```json +{ + "size": 20, + "_source": ["accession", "name"], + "query": { + "wildcard": { + "accession": { + "value": "SAME.*" + } + } + } +} +``` + +**Purpose:** If step 5 returns nothing but the mapping (step 3) shows `accession` as keyword (no `accession.keyword`), this checks whether wildcard on `accession` works and has data. + +--- + +## 7. Full app-like query (bool + wildcard + public filter) + +The app wraps the accession filter in a bool with other filters (e.g. release date, must_not suppressed). 
To test with only the wildcard (no date/status): + +**Method:** `POST` +**URL:** `http://localhost:9200/samples/_search` + +**Headers:** `Content-Type: application/json` + +**Body (raw JSON):** + +```json +{ + "size": 20, + "_source": ["accession", "name", "release", "status"], + "query": { + "bool": { + "must": [ + { + "wildcard": { + "accession.keyword": { "value": "SAME.*" } + } + } + ] + } + } +} +``` + +If you use the same bool structure as the app (with `must_not` for suppressed, etc.), add those clauses here to see if they remove all hits. + +--- + +## 8. Term query (exact accession) – sanity check + +**Method:** `POST` +**URL:** `http://localhost:9200/samples/_search` + +**Headers:** `Content-Type: application/json` + +**Body (raw JSON):** Replace `SAMD00000001` with a real accession from step 4. + +```json +{ + "size": 5, + "_source": ["accession", "name"], + "query": { + "term": { + "accession.keyword": { "value": "SAMD00000001" } + } + } +} +``` + +**Purpose:** If term on `accession.keyword` works, the field exists. If it fails, try `"accession": { "value": "SAMD00000001" }` (no `.keyword`). + +--- + +## Summary + +| Step | What to check | +|------|----------------| +| 3 | Mapping: `accession` vs `accession.keyword` | +| 4 | Real accessions: any starting with `SAME`? | +| 5 | Wildcard on `accession.keyword` (app behaviour) | +| 6 | Wildcard on `accession` if mapping has no `.keyword` | +| 8 | Term on `accession.keyword` or `accession` to confirm field name | + +If step 5 returns 0 and step 6 returns hits, the fix is to use field `accession` instead of `accession.keyword` in `AccessionSearchFilter` when your index mapping has no `.keyword` subfield. + +--- + +## When `filter=acc:SAME*` returns nothing (API) + +1. **URL encoding** – Some clients or proxies drop `*` in the query string. Try with the asterisk encoded: + `http://localhost:8081/biosamples/samples?filter=acc:SAME%2A` + +2. 
**Other filters** – The app always adds a “public” filter: `release <= now` and `must_not` INSDC status = suppressed. If all documents matching `SAME*` are suppressed or not yet released, the API returns 0. Replicate the full query in ES (item 3 below) to confirm. + +3. **Full app query in ES** – Run this to see if the combination of wildcard + release + not suppressed returns any hits (replace the date with current ISO instant if needed): + +**POST** `http://localhost:9200/samples/_search` + +```json +{ + "size": 20, + "_source": ["accession", "name", "release", "characteristics"], + "query": { + "bool": { + "must": [ + { "wildcard": { "accession.keyword": { "value": "SAME*" } } }, + { "range": { "release": { "lte": "now" } } } + ], + "must_not": [ + { "nested": { "path": "characteristics", "query": { "bool": { "must": [ + { "term": { "characteristics.key.keyword": "INSDC status" } }, + { "term": { "characteristics.value.keyword": "suppressed" } } + ]}}}} + ] + } + } +} +``` + +If this returns 0 hits, the 43 SAME* documents are all filtered out by release date or suppressed status. diff --git a/docs/cursor/query_search_service.ps1 b/docs/cursor/query_search_service.ps1 new file mode 100644 index 0000000..9631bef --- /dev/null +++ b/docs/cursor/query_search_service.ps1 @@ -0,0 +1,17 @@ +# Query the biosamples-search REST API to see what it returns. +# Use the port where the search service is running (default 8080; core app is usually 8081). +$SearchPort = if ($env:SEARCH_PORT) { $env:SEARCH_PORT } else { "8080" } +$BaseUrl = "http://localhost:$SearchPort/search" + +Write-Host "=== 1. Exact accession (acc:SAMEA26) ===" +$exactBody = @' +{"text":null,"filters":[{"type":"pub","webinId":""},{"type":"acc","accession":"SAMEA26"}],"facets":null,"page":0,"size":20,"sort":null,"searchAfter":null} +'@ +Invoke-RestMethod -Uri $BaseUrl -Method Post -ContentType "application/json" -Body $exactBody | ConvertTo-Json -Depth 10 + +Write-Host "" +Write-Host "=== 2.
Wildcard accession (acc:SAME*) ===" +$wildcardBody = @' +{"text":null,"filters":[{"type":"pub","webinId":""},{"type":"acc","accession":"SAME*"}],"facets":null,"page":0,"size":20,"sort":null,"searchAfter":null} +'@ +Invoke-RestMethod -Uri $BaseUrl -Method Post -ContentType "application/json" -Body $wildcardBody | ConvertTo-Json -Depth 10 diff --git a/docs/cursor/query_search_service.sh b/docs/cursor/query_search_service.sh new file mode 100644 index 0000000..9525a7d --- /dev/null +++ b/docs/cursor/query_search_service.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +# Query the biosamples-search REST API to see what it returns. +# Use the port where the search service is running (default 8080; core app is usually 8081). +SEARCH_PORT="${SEARCH_PORT:-8080}" + +echo "=== 1. Exact accession (acc:SAMEA26) ===" +curl -s -X POST "http://localhost:${SEARCH_PORT}/search" \ + -H "Content-Type: application/json" \ + -d @docs/cursor/search_service_query_acc_exact.json | jq . + +echo "" +echo "=== 2. Wildcard accession (acc:SAME*) ===" +curl -s -X POST "http://localhost:${SEARCH_PORT}/search" \ + -H "Content-Type: application/json" \ + -d @docs/cursor/search_service_query_acc_wildcard.json | jq . diff --git a/docs/cursor/search-query-optimization-analysis.md b/docs/cursor/search-query-optimization-analysis.md new file mode 100644 index 0000000..b2469b3 --- /dev/null +++ b/docs/cursor/search-query-optimization-analysis.md @@ -0,0 +1,117 @@ +# Search query optimization analysis + +Analysis of the biosamples-search query-building logic for further optimization opportunities. **No code changes** — recommendations only. + +--- + +## 1. Already done + +- **Public + release date range:** When both `pub` and a release `dt` filter are present, the query now uses a single release range and one `must_not` (exclude suppressed) instead of two ranges and extra nesting. Implemented in `QueryHelper` + `PublicSearchFilter.getExcludeSuppressedQuery()`. + +--- + +## 2. 
Recommended optimizations + +### 2.1 Multiple date ranges on the same field + +**Current behaviour:** If a request includes more than one `dt` filter on the same field (e.g. two release ranges), each becomes a separate range clause and ES applies both (intersection). + +**Opportunity:** In `QueryHelper.getFilterQuery()`, before building the final bool: + +- Collect all `DateRangeSearchFilter` instances and group by `(field)`. +- For each field with multiple ranges, compute one range: `gte = max(froms)`, `lte = min(tos)` (intersection). +- Add a single range query per field and add other filters as today. + +**Benefit:** One range per field instead of N; simpler and slightly more efficient. +**Effort:** Low–medium (grouping + intersection logic, then single `DateRangeSearchFilter` or a single range query per field). + +--- + +### 2.2 Top-level query when search text is empty + +**Current behaviour:** The top-level query is always: + +```text +bool.must( match_query ) // match_all when text is empty +.filter( filter_query ) +``` + +**Opportunity:** When `searchQuery.getText()` is null or blank, use only the filter as the root query (e.g. `bool.filter(filterQuery)` with no `must`), instead of `bool.must(match_all).filter(filterQuery)`. + +**Benefit:** Slightly smaller query tree and one less clause; semantics unchanged. +**Effort:** Low (branch in `QueryHelper.getSearchQuery()` on empty text). + +--- + +### 2.3 AttributeSearchFilter with a single value + +**Current behaviour:** For one value we still build: + +```text +nested(path=characteristics, bool.must( term(key), bool.should( term(value) ) )) +``` + +**Opportunity:** When `values.size() == 1`, use a single `term` for the value instead of `bool.should([term])`. + +**Benefit:** Slightly simpler inner bool; minor. +**Effort:** Low (branch in `AttributeSearchFilter.buildSubQueryForOrCondition()` or equivalent). + +--- + +## 3. 
Edge cases and robustness (no performance gain, but safer) + +### 3.1 StructuredDataSearchFilter with no criteria + +**Current behaviour:** If `dataType`, `key`, and `value` are all null/empty, `queries` is empty and we build a nested query with `bool.must([])`. In Elasticsearch, an empty `must` can lead to surprising matching behaviour. + +**Recommendation:** Reject or ignore the filter when none of `dataType`, `key`, `value` have text (e.g. throw or skip adding this filter), or document that at least one is required. Comment in code already says “dataType is required” but it’s not enforced. + +--- + +### 3.2 ExternalRefSearchFilter with no criteria + +**Current behaviour:** If both `archive` and `accession` are null/empty, `queries` is empty and we build `nested(path=externalReferences, bool.must([]))`. + +**Recommendation:** Same as 3.1 — reject or skip when both are empty, or document that at least one is required. + +--- + +### 3.3 RelationshipSearchFilter with no criteria + +**Current behaviour:** If `relType`, `source`, and `target` are all null/empty, we build a nested query with `bool.must([])`. + +**Recommendation:** Reject or skip when all three are empty; or require at least one. + +--- + +## 4. Lower priority / optional + +### 4.1 PublicSearchFilter + multiple release date ranges + +If a client ever sends **pub** plus **two** release `dt` filters, we already skip the pub “1970→now” range (because `hasReleaseDateRange` is true), but we still add two separate range clauses. Merging date ranges on the same field (see 2.1) would automatically turn that into one range (intersection) and keep the current pub optimization behaviour. + +### 4.2 Filter ordering + +Elasticsearch does not guarantee evaluation order for filter clauses. Putting the most selective filters first is sometimes suggested for caching or early termination, but it’s index-dependent and not a general win. Leave as-is unless profiling shows a clear benefit. + +--- + +## 5. 
Summary table + +| Item | Benefit | Effort | Recommendation | +|------------------------------------|----------------|----------|------------------------| +| Multiple date ranges, same field | Clear | Low–Med | Implement | +| No must(match_all) when text empty | Small | Low | Implement | +| Single-value AttributeSearchFilter| Minor | Low | Optional | +| Empty StructuredDataSearchFilter | Correctness | Low | Validate / reject | +| Empty ExternalRefSearchFilter | Correctness | Low | Validate / reject | +| Empty RelationshipSearchFilter | Correctness | Low | Validate / reject | + +--- + +## 6. Files involved + +- **Query building:** `QueryHelper.java`, `SearchService.java`, `FacetService.java` +- **Filters:** All under `server/.../filter/` (e.g. `DateRangeSearchFilter`, `PublicSearchFilter`, `AttributeSearchFilter`, `StructuredDataSearchFilter`, `ExternalRefSearchFilter`, `RelationshipSearchFilter`) + +No changes have been made in the codebase; this document is analysis and recommendations only. 
diff --git a/docs/cursor/search_date_range_filter_postman.json b/docs/cursor/search_date_range_filter_postman.json new file mode 100644 index 0000000..3af05b5 --- /dev/null +++ b/docs/cursor/search_date_range_filter_postman.json @@ -0,0 +1,19 @@ +{ + "text": "", + "filters": [ + { + "type": "dt", + "field": "update", + "from": "2024-01-01T00:00:00.000Z", + "to": "2024-12-31T23:59:59.999Z" + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +} diff --git a/docs/cursor/search_organism_request.json b/docs/cursor/search_organism_request.json new file mode 100644 index 0000000..cecefdd --- /dev/null +++ b/docs/cursor/search_organism_request.json @@ -0,0 +1,16 @@ +{ + "text": "organism", + "filters": [], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +} + + + + diff --git a/docs/cursor/search_release_2014_2015_exclude_suppressed.json b/docs/cursor/search_release_2014_2015_exclude_suppressed.json new file mode 100644 index 0000000..7940c0f --- /dev/null +++ b/docs/cursor/search_release_2014_2015_exclude_suppressed.json @@ -0,0 +1,24 @@ +{ + "text": "", + "filters": [ + { + "type": "dt", + "field": "release", + "from": "2014-01-01T00:00:00Z", + "to": "2015-01-02T00:00:00Z" + }, + { + "type": "excludeAttr", + "field": "INSDC status", + "values": ["suppressed"] + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +} diff --git a/docs/cursor/search_release_date_only_curl.sh b/docs/cursor/search_release_date_only_curl.sh new file mode 100644 index 0000000..540aaef --- /dev/null +++ b/docs/cursor/search_release_date_only_curl.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Curl command with explicit release date range filter only +# Use this to test the release date filtering separately from suppressed exclusion + +curl --location 'http://localhost:8080/search' \ +--header 'Content-Type: application/json' \ +--data '{ + "text": "", + 
"filters": [ + { + "type": "dt", + "field": "release", + "from": "1970-01-01T00:00:00.000Z", + "to": "2025-12-15T23:59:59.999Z" + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +}' + + + diff --git a/docs/cursor/search_release_date_only_postman.json b/docs/cursor/search_release_date_only_postman.json new file mode 100644 index 0000000..995968b --- /dev/null +++ b/docs/cursor/search_release_date_only_postman.json @@ -0,0 +1,22 @@ +{ + "text": "", + "filters": [ + { + "type": "dt", + "field": "release", + "from": "1970-01-01T00:00:00.000Z", + "to": "2025-12-15T23:59:59.999Z" + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +} + + + diff --git a/docs/cursor/search_release_date_range_curl.sh b/docs/cursor/search_release_date_range_curl.sh new file mode 100644 index 0000000..f6660d8 --- /dev/null +++ b/docs/cursor/search_release_date_range_curl.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Curl command with release date range AND suppressed exclusion +# Uses "pub" filter which internally: +# 1. Filters by release date (release <= current time) +# 2. 
Excludes suppressed samples (INSDC status != "suppressed") + +curl --location 'http://localhost:8080/search' \ +--header 'Content-Type: application/json' \ +--data '{ + "text": "", + "filters": [ + { + "type": "pub" + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +}' + diff --git a/docs/cursor/search_release_date_range_postman.json b/docs/cursor/search_release_date_range_postman.json new file mode 100644 index 0000000..64b219c --- /dev/null +++ b/docs/cursor/search_release_date_range_postman.json @@ -0,0 +1,17 @@ +{ + "text": "", + "filters": [ + { + "type": "pub" + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +} + diff --git a/docs/cursor/search_release_exclude_suppressed_curl.sh b/docs/cursor/search_release_exclude_suppressed_curl.sh new file mode 100644 index 0000000..665296a --- /dev/null +++ b/docs/cursor/search_release_exclude_suppressed_curl.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +# Curl command with explicit release date range AND exclude suppressed samples +# This breaks down the "pub" filter into explicit filters for debugging + +curl --location 'http://localhost:8080/search' \ +--header 'Content-Type: application/json' \ +--data '{ + "text": "", + "filters": [ + { + "type": "dt", + "field": "release", + "from": "1970-01-01T00:00:00.000Z", + "to": "2025-12-15T23:59:59.999Z" + }, + { + "type": "excludeAttr", + "field": "INSDC status", + "values": ["suppressed"] + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +}' + diff --git a/docs/cursor/search_release_exclude_suppressed_postman.json b/docs/cursor/search_release_exclude_suppressed_postman.json new file mode 100644 index 0000000..fe8a8b7 --- /dev/null +++ b/docs/cursor/search_release_exclude_suppressed_postman.json @@ -0,0 +1,25 @@ +{ + "text": "", + "filters": [ + { + "type": "dt", + "field": "release", + "from": "1970-01-01T00:00:00.000Z", + 
"to": "2025-12-15T23:59:59.999Z" + }, + { + "type": "excludeAttr", + "field": "INSDC status", + "values": ["suppressed"] + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +} + diff --git a/docs/cursor/search_service_query_acc_exact.json b/docs/cursor/search_service_query_acc_exact.json new file mode 100644 index 0000000..ced30e7 --- /dev/null +++ b/docs/cursor/search_service_query_acc_exact.json @@ -0,0 +1,12 @@ +{ + "text": null, + "filters": [ + { "type": "pub", "webinId": "" }, + { "type": "acc", "accession": "SAMEA26" } + ], + "facets": null, + "page": 0, + "size": 20, + "sort": null, + "searchAfter": null +} diff --git a/docs/cursor/search_service_query_acc_wildcard.json b/docs/cursor/search_service_query_acc_wildcard.json new file mode 100644 index 0000000..ea3a5ac --- /dev/null +++ b/docs/cursor/search_service_query_acc_wildcard.json @@ -0,0 +1,12 @@ +{ + "text": null, + "filters": [ + { "type": "pub", "webinId": "" }, + { "type": "acc", "accession": "SAME*" } + ], + "facets": null, + "page": 0, + "size": 20, + "sort": null, + "searchAfter": null +} diff --git a/docs/cursor/search_structured_data_postman.json b/docs/cursor/search_structured_data_postman.json new file mode 100644 index 0000000..9d68c38 --- /dev/null +++ b/docs/cursor/search_structured_data_postman.json @@ -0,0 +1,17 @@ +{ + "text": "", + "filters": [ + { + "type": "sdata", + "dataType": "AMR" + } + ], + "page": 0, + "size": 10, + "sort": [ + { + "direction": "DESC", + "field": "update" + } + ] +} diff --git a/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java b/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java index 1878ea5..7a69cd6 100644 --- a/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java +++ b/server/src/main/java/uk/ac/ebi/biosamples/search/es/QueryHelper.java @@ -42,10 +42,24 @@ private static Query getTextMatchQuery(SearchQuery searchQuery) { // return MatchQuery.of(m -> 
m.field("sample_full_text").query(searchText))._toQuery(); - return QueryStringQuery.of(qs -> qs - .defaultField("sample_full_text") - .query(searchText) - .defaultOperator(Operator.Or) // Default to OR if no operator is specified by the user + // Unquoted text: prefer exact phrase matches, but still include documents + // that contain all terms in any order. Phrase matches are boosted so they rank first. + return BoolQuery.of(b -> b + .should(s -> s + .matchPhrase(mp -> mp + .field("sample_full_text") + .query(searchText) + .boost(5.0f) + ) + ) + .should(s -> s + .queryString(qs -> qs + .defaultField("sample_full_text") + .query(searchText) + .defaultOperator(Operator.And) + ) + ) + .minimumShouldMatch("1") )._toQuery(); }