diff --git a/Cargo.toml b/Cargo.toml index ab5a0ce..37dbe0b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ authors = ["genomehubs "] description = "Generic CLI generator for genomehubs instances" license = "MIT" readme = "README.md" +default-run = "cli-generator" [workspace] members = [".", "crates/genomehubs-query", "crates/genomehubs-api"] diff --git a/agent-logs/2026-05-28_001_histogram-scatter-modular-refactor.md b/agent-logs/2026-05-28_001_histogram-scatter-modular-refactor.md new file mode 100644 index 0000000..6106b7e --- /dev/null +++ b/agent-logs/2026-05-28_001_histogram-scatter-modular-refactor.md @@ -0,0 +1,140 @@ +--- +date: 2026-05-28 +agent: GitHub Copilot +model: claude-sonnet-4-6 +task: "Refactor histogram and scatter report functions and parsing to be modular, DRY, and cover all edge cases" +files_changed: + - crates/genomehubs-api/src/report/field.rs + - crates/genomehubs-api/src/report/mod.rs + - crates/genomehubs-api/src/report/agg.rs + - crates/genomehubs-api/src/report/bounds.rs + - crates/genomehubs-api/src/report/report_types.rs + - crates/genomehubs-api/src/report/spec_builder.rs + - crates/genomehubs-query/src/report/mod.rs +--- + +## Task summary + +The user requested a full architectural refactor of the server-side report infrastructure in +`crates/genomehubs-api/src/report/`. The prior code had duplicated field-type helpers spread +across `agg.rs` and `bounds.rs`, a 4-case branch in `build_nested_attribute_histogram_with_categories` +(~200 lines), non-deterministic category histogram extraction paths (4 candidate paths +searched at runtime), and ~200 lines of duplicated tick label/value extraction in +`spec_builder.rs`. Two additional bug fixes preceded this session: presence filters were +not ANDed into histogram bounds queries, and per-category histograms used `{x_field}` as +the inner container name, making extraction non-deterministic. + +This session replaced all of the above with a clean, type-agnostic architecture. 
+ +## Key decisions + +- **New `field.rs` module as single source of truth**: All field-type resolution + (`is_rank`, `is_attribute`, `get_attribute_value_field`) and all ES nested-path logic + now live in a `FieldStorage` enum (`Attribute{key, es_value_field}`, `Lineage{rank}`, + `Root{es_field}`). Methods encode every path decision in one place, eliminating drift + between builder and extractor code. + +- **Canonical container naming enforced in `build_inner_x_agg_block`**: The per-category + inner x aggregation container is now always `"by_key"` (attribute) or `"at_rank"` + (lineage), never `{x_field}`. This makes extraction paths `O(1)` pointer dereferences + instead of a 4-candidate runtime search. The previous `{x_field}` naming was the root + cause of the "most cats fall in first bin" bug fixed in the prior session. + +- **`GenericBucketAgg` replaces the per-type builder structs**: A single + `GenericBucketAgg { storage, bucket_type, bucket_params }` implements `AggBuilder` for + all field types. `build()` delegates wrapping to `wrap_in_nested(storage, …)`. + `extract()` delegates path resolution to `storage.main_bucket_path(agg_name, bucket_type)`. + +- **`build_nested_attribute_histogram_with_categories` reduced from ~200 to ~80 lines**: + Replaced 4 hand-written `(x_type, cat_type)` cases with type-agnostic composition: + `build_inner_x_agg_block` + `wrap_cat_in_nested` + `inject_category_histograms`. + Adding a new field storage type in future requires no changes here. + +- **`fill_tick_data_from_buckets` helper in `spec_builder.rs`**: Extracts the identical + ~60-line pattern (keyword label list OR numeric boundary computation) shared between the + histogram x-axis and scatter x-axis branches. Y-axis handling reuses the same helper + with an `explicit_labels` parameter for `yBucketLabels`. Raw scalar `yBuckets` values + are wrapped into pseudo-bucket objects before dispatch so the helper has a uniform + interface. 
+ +- **`grouped_bar_size_px` helper in `genomehubs-query/src/report/mod.rs`**: Extracted + the 5-line pixel-width computation duplicated in the keyword and numeric branches of the + grouped histogram path into a pure helper: `grouped_bar_size_px(n_bins, n_cats, plot_width_px)`. + +- **No speculative abstraction**: Only patterns that appeared at least twice in the + current code were extracted. The VL `vl_scatter` function was not restructured because + its internal branching differs significantly from `vl_histogram`. + +## Interaction log + +| Turn | Role | Summary | +| ---- | ----- | --------------------------------------------------------------------------------- | +| 1 | User | Requested full modular refactor of histogram/scatter report infrastructure | +| 2 | Agent | Created `field.rs` with `FieldStorage`, `resolve_field_storage`, wrapping helpers | +| 3 | Agent | Registered `pub mod field;` in `mod.rs`; refactored `agg.rs` (GenericBucketAgg) | +| 4 | Agent | Refactored `bounds.rs` — removed duplicated helpers, imported from `field.rs` | +| 5 | Agent | Refactored `report_types.rs` — new `extract_cat_histograms`, `align_to_keys` | +| 6 | Agent | Confirmed clean build (0 errors, 0 warnings) | +| 7 | Agent | Added `bucket_keys_to_boundaries` + `fill_tick_data_from_buckets` to spec_builder | +| 8 | Agent | (New session) Replaced duplicate tick extraction blocks in `build_plot_spec` | +| 9 | Agent | Extracted `grouped_bar_size_px` from `vl_histogram` numeric/keyword branches | +| 10 | Agent | Confirmed clean `cargo clippy --all-targets -- -D warnings` across workspace | + +## Changes made + +### `crates/genomehubs-api/src/report/field.rs` (new file, ~350 lines) + +- `FieldStorage` enum with `nested_path()`, `key_filter()`, `x_container_name()`, + `cat_wrapper_names()`, `presence_filter()`, `bucket_field()`, `main_bucket_path()`, + `cat_histograms_base()`, `inner_x_path()` +- `resolve_field_storage(field, value_type, cache)` — prefers `TaxonRank` over `Attribute` +- 
`is_rank()`, `is_attribute()`, `get_attribute_value_field()` — canonical, previously duplicated +- `wrap_in_nested()`, `wrap_cat_in_nested()`, `build_inner_x_agg_block()` — composition helpers + +### `crates/genomehubs-api/src/report/agg.rs` + +- Removed: `HistogramAggBuilder`, `DateHistogramAggBuilder`, `TermsAggBuilder`, + `StatsAggBuilder`, `NestedAttributeAggBuilder`, `NestedRankAggBuilder`, + `CompositeAggBuilder`, `ReverseNestedAggBuilder`, `GeoHashAggBuilder` +- Added: `GenericBucketAgg` — single `AggBuilder` impl for all field types +- `build_nested_attribute_histogram_with_categories`: 200 lines → 80 lines, fully type-agnostic +- `inject_category_histograms`: uses `x_storage.x_container_name()` for deterministic insertion + +### `crates/genomehubs-api/src/report/bounds.rs` + +- Removed duplicated `is_rank`, `is_attribute`, `get_attribute_value_field` functions +- Imported canonical versions from `field.rs` + +### `crates/genomehubs-api/src/report/report_types.rs` + +- Removed `presence_filter_for_axis` — replaced by `FieldStorage::presence_filter()` +- Replaced old 4-candidate-path `extract_cat_histograms` with `FieldStorage`-based deterministic version +- Added `align_to_keys` shared helper for per-category count alignment + +### `crates/genomehubs-api/src/report/spec_builder.rs` + +- Added `bucket_keys_to_boundaries(sorted_keys, axis_obj)` — `N` keys → `N+1` VL bin boundaries +- Added `fill_tick_data_from_buckets(meta, axis_obj, buckets, label_source)` — unified tick extraction +- Replaced two ~60-line duplicated blocks (histogram x-axis and scatter x-axis) with calls to helper +- Replaced ~80-line y-axis block (scatter) with wrapped call to same helper using `explicit_labels` + +### `crates/genomehubs-query/src/report/mod.rs` + +- Added `grouped_bar_size_px(n_bins, n_cats, plot_width_px)` — extracted from two identical 5-line computations in `vl_histogram` + +## Notes / warnings + +- The new `"by_key"` / `"at_rank"` canonical container names are a 
**breaking change** relative + to any client-side code or externally cached raw response that expected `{x_field}` as the + container name. The server itself is unaffected because aggregation responses are computed fresh per request, + but any client that manually inspects or caches the raw ES response shape should be updated. + +- `geohash_precision_for_size` in `agg.rs` has `#[allow(dead_code)]` — it is used by the + geo report path which is not currently exercised by the test suite. + +- The scatter `vl_scatter` function in `genomehubs-query/src/report/mod.rs` still has some + duplication with `vl_histogram` in the category handling paths. Full extraction was deferred + because the two functions diverge significantly in their data transformation logic. + +- Pending feature (deferred): 3-level nested binning for x/y/cat scatter (x-binned + y-binned + + category breakdown). The `FieldStorage` composition pattern makes this straightforward to add. diff --git a/config/swagger-examples-goat.yaml b/config/swagger-examples-goat.yaml index 6a0d94b..72382e4 100644 --- a/config/swagger-examples-goat.yaml +++ b/config/swagger-examples-goat.yaml @@ -256,3 +256,31 @@ examples: - record_id: "7227" result: taxon fields: ["genome_size"] + + # ── POST /api/v3/report/batch ───────────────────────────────────────────── + + - path: "/api/v3/report/batch" + method: post + name: chordata_and_nematoda_report_batch + summary: "Run arc reports for Chordata and Nematoda in parallel" + value: + concurrency: 2 + reports: + - query: + taxa: ["Chordata"] + taxon_filter_type: tree + rank: species + report: + report: arc + feature: "bioproject=prjna533106" + reference: assembly_level + context: "" + - query: + taxa: ["Nematoda"] + taxon_filter_type: tree + rank: species + report: + report: arc + feature: "bioproject=prjna533106" + reference: assembly_level + context: "" diff --git a/crates/genomehubs-api/Cargo.toml b/crates/genomehubs-api/Cargo.toml index ceb03eb..08fcc02 100644 --- 
a/crates/genomehubs-api/Cargo.toml +++ b/crates/genomehubs-api/Cargo.toml @@ -2,6 +2,7 @@ name = "genomehubs-api" version = "0.1.0" edition = "2021" +default-run = "genomehubs-api" [dependencies] axum = { version = "0.7", features = ["tokio", "http1", "macros"] } diff --git a/crates/genomehubs-api/src/main.rs b/crates/genomehubs-api/src/main.rs index 3e3ac13..456eb5d 100644 --- a/crates/genomehubs-api/src/main.rs +++ b/crates/genomehubs-api/src/main.rs @@ -63,6 +63,7 @@ pub struct AppState { routes::record::get_record, routes::record_batch::post_record_batch, routes::report::post_report, + routes::report_batch::post_report_batch, routes::positional::post_positional, routes::result_fields::get_result_fields, routes::search::post_search, @@ -100,6 +101,9 @@ pub struct AppState { routes::record_batch::RecordBatchResponse, routes::report::ReportRequest, routes::report::ReportResponse, + routes::report_batch::ReportBatchRequest, + routes::report_batch::ReportBatchResponse, + routes::report_batch::ReportBatchResultItem, routes::positional::PositionalRequest, routes::positional::PositionalResponse, routes::metadata::MetadataResponse, @@ -461,6 +465,10 @@ async fn main() { "/api/v3/report", axum::routing::post(routes::report::post_report), ) + .route( + "/api/v3/report/batch", + axum::routing::post(routes::report_batch::post_report_batch), + ) .route( "/api/v3/positional", axum::routing::post(routes::positional::post_positional), diff --git a/crates/genomehubs-api/src/report/agg.rs b/crates/genomehubs-api/src/report/agg.rs index 3298557..59e607e 100644 --- a/crates/genomehubs-api/src/report/agg.rs +++ b/crates/genomehubs-api/src/report/agg.rs @@ -1,13 +1,23 @@ //! Elasticsearch aggregation builders for report axes. //! -//! Each `AggBuilder` produces the JSON fragment for one ES aggregation, and extracts -//! the bucket list from the response. Builders are composable: use `CompositeAggBuilder` -//! to nest them (e.g., histogram containing stats). +//! 
All field-type detection and path logic is centralised in [`super::field`]; +//! this module is responsible only for composing valid ES aggregation JSON and +//! extracting buckets from responses. +//! +//! ## Key types +//! - [`AggBuilder`] — trait for all bucket aggregations +//! - [`GenericBucketAgg`] — single implementation that handles attribute/lineage/root fields +//! - [`build_nested_attribute_histogram_with_categories`] — type-agnostic 2-level agg +//! - [`build_nested_attribute_scatter_agg`] — scatter 2-level agg with optional categories use serde_json::{json, Value}; use crate::es_metadata::MetadataCache; -use genomehubs_query::report::axis::{Scale, ValueType}; +use crate::report::field::{ + build_inner_x_agg_block, resolve_field_storage, wrap_cat_in_nested, wrap_in_nested, + FieldStorage, +}; +use genomehubs_query::report::axis::{DateInterval, Scale, ValueType}; use genomehubs_query::report::{AxisSpec, BoundsResult}; use std::sync::Arc; @@ -27,302 +37,43 @@ pub trait AggBuilder: Send + Sync { fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets; } -/// Build a numeric `histogram` aggregation. -pub struct HistogramAggBuilder { - pub field: String, - pub interval: f64, - pub min: f64, - pub max: f64, - pub script: Option, -} - -impl AggBuilder for HistogramAggBuilder { - fn build(&self, agg_name: &str) -> Value { - let mut hist = json!({ - "field": &self.field, - "interval": self.interval, - "extended_bounds": { "min": self.min, "max": self.max }, - "min_doc_count": 0 - }); - - if let Some(script) = &self.script { - hist["script"] = Value::String(script.clone()); - } - - json!({ - agg_name: { - "histogram": hist - } - }) - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - resp.pointer(&format!("/aggregations/{agg_name}/buckets")) - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default() - } -} - -/// Build a `date_histogram` aggregation with `calendar_interval`. 
-pub struct DateHistogramAggBuilder { - pub field: String, - pub calendar_interval: String, // "1d", "1w", "1M", "3M", "1y", "10y" - pub time_zone: Option, -} - -impl AggBuilder for DateHistogramAggBuilder { - fn build(&self, agg_name: &str) -> Value { - let mut agg = json!({ - "date_histogram": { - "field": &self.field, - "calendar_interval": &self.calendar_interval, - "min_doc_count": 0 - } - }); - if let Some(tz) = &self.time_zone { - agg["date_histogram"]["time_zone"] = Value::String(tz.clone()); - } - json!({ agg_name: agg }) - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - resp.pointer(&format!("/aggregations/{agg_name}/buckets")) - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default() - } -} - -/// Build a `terms` aggregation for categorical axes. -pub struct TermsAggBuilder { - pub field: String, - pub size: usize, - pub include: Option>, // fixed term list -} - -impl AggBuilder for TermsAggBuilder { - fn build(&self, agg_name: &str) -> Value { - let mut terms = json!({ - "field": format!("{}.keyword", &self.field), - "size": self.size, - "min_doc_count": 0 - }); - if let Some(include) = &self.include { - terms["include"] = json!(include); - } - json!({ agg_name: { "terms": terms } }) - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - resp.pointer(&format!("/aggregations/{agg_name}/buckets")) - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default() - } -} - -/// Build a `stats` sub-aggregation (used for Y-axis values within X buckets). 
-#[allow(dead_code)] -pub struct StatsAggBuilder { - pub field: String, -} +// ── GenericBucketAgg ───────────────────────────────────────────────────────── -impl AggBuilder for StatsAggBuilder { - fn build(&self, agg_name: &str) -> Value { - json!({ agg_name: { "stats": { "field": &self.field } } }) - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - // Stats returns a single object, not a bucket list; return wrapped - resp.pointer(&format!("/aggregations/{agg_name}")) - .cloned() - .into_iter() - .collect() - } -} - -/// Build a `geohash_grid` aggregation for map reports. -pub struct GeoHashAggBuilder { - pub field: String, - pub precision: u8, - pub size: usize, +/// A single `AggBuilder` implementation that covers attribute, lineage and +/// root-level fields by deriving the nested path from [`FieldStorage`]. +/// +/// Replaces the previous `NestedAttributeAggBuilder`, `NestedRankAggBuilder`, +/// `HistogramAggBuilder` and `TermsAggBuilder` specialisations. Call +/// [`agg_builder_for`] to obtain a boxed instance. +pub struct GenericBucketAgg { + /// Where the field lives in the ES document. + pub storage: FieldStorage, + /// ES aggregation type: `"terms"`, `"histogram"`, `"date_histogram"`, etc. + pub bucket_type: String, + /// Parameters object placed inside `{ bucket_type: params }`. + pub bucket_params: Value, } -impl AggBuilder for GeoHashAggBuilder { +impl AggBuilder for GenericBucketAgg { fn build(&self, agg_name: &str) -> Value { - json!({ - agg_name: { - "geohash_grid": { - "field": &self.field, - "precision": self.precision, - "size": self.size - } - } - }) + // ES requires {agg_name: {agg_type: params}}. Wrap params in the type first, + // then wrap the whole named agg in the nested envelope. 
+ let named_agg = + json!({ &self.bucket_type: { &self.bucket_type: self.bucket_params.clone() } }); + let container = self.storage.x_container_name(); + let wrapped = wrap_in_nested(&self.storage, container, named_agg); + json!({ agg_name: wrapped }) } fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - resp.pointer(&format!("/aggregations/{agg_name}/buckets")) + let path = self.storage.main_bucket_path(agg_name, &self.bucket_type); + resp.pointer(&path) .and_then(|b| b.as_array()) .cloned() .unwrap_or_default() } } -/// Build a `reverse_nested` aggregation (used for tree node counts). -#[allow(dead_code)] -pub struct ReverseNestedAggBuilder; - -impl AggBuilder for ReverseNestedAggBuilder { - fn build(&self, agg_name: &str) -> Value { - json!({ agg_name: { "reverse_nested": {} } }) - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - resp.pointer(&format!("/aggregations/{agg_name}")) - .cloned() - .into_iter() - .collect() - } -} - -/// Compose two `AggBuilder`s: parent builds outer agg; inner is nested within each bucket. -/// -/// Used for patterns like: x-axis histogram → y-axis stats within each x bucket. -#[allow(dead_code)] -pub struct CompositeAggBuilder<'a> { - pub outer: &'a dyn AggBuilder, - pub inner: &'a dyn AggBuilder, - pub inner_name: String, -} - -impl<'a> AggBuilder for CompositeAggBuilder<'a> { - fn build(&self, agg_name: &str) -> Value { - let mut outer = self.outer.build(agg_name); - let inner_agg = self.inner.build(&self.inner_name); - - // Recursively inject inner agg into outer's nested structure - self.inject_inner_agg(&mut outer, agg_name, &inner_agg); - outer - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - self.outer.extract(resp, agg_name) - } -} - -impl<'a> CompositeAggBuilder<'a> { - /// Recursively inject inner aggregation into nested structures. - /// Handles both direct aggregations and nested attribute aggregations. 
- #[allow(dead_code)] - fn inject_inner_agg(&self, outer: &mut Value, agg_name: &str, inner_agg: &Value) { - if let Some(outer_obj) = outer.get_mut(agg_name) { - // First try direct injection (for simple histogram, terms, etc.) - for key in &["histogram", "date_histogram", "terms", "geohash_grid"] { - if outer_obj.get(key).is_some() { - outer_obj["aggs"] = inner_agg.clone(); - return; - } - } - - // If not found directly, look inside nested aggregations - if outer_obj.get("nested").is_some() { - if let Some(nested_aggs) = outer_obj.get_mut("aggs") { - let filter_agg_opt = if nested_aggs.get("by_key").is_some() { - nested_aggs.get_mut("by_key") - } else { - nested_aggs.get_mut("by_value") - }; - - if let Some(filter_agg) = filter_agg_opt { - if let Some(inner_aggs) = filter_agg.get_mut("aggs") { - for key in &["histogram", "date_histogram", "terms", "geohash_grid"] { - if inner_aggs.get(key).is_some() { - if self.inner_name == "cat_agg" { - if let Some(agg_def) = inner_aggs.get_mut(key) { - if agg_def.get("aggs").is_none() { - agg_def["aggs"] = json!({}); - } - if let Some(cat_agg_inner) = inner_agg.get("cat_agg") { - agg_def["aggs"]["cat_agg"] = cat_agg_inner.clone(); - } - } - } else if let Some(inner_value) = - inner_agg.get(&self.inner_name) - { - inner_aggs[&self.inner_name.clone()] = inner_value.clone(); - } else { - inner_aggs[&self.inner_name.clone()] = inner_agg.clone(); - } - return; - } - } - } - } - } - } - } - } -} - -/// Determine if a field is a taxonomic rank. -fn is_rank(field: &str, cache: &Option>>) -> bool { - if let Some(cache_lock) = cache { - if let Ok(c) = cache_lock.try_read() { - return c.taxonomic_ranks.contains(&field.to_string()); - } - } - false -} - -/// Determine if a field is an attribute. 
-fn is_attribute(field: &str, cache: &Option>>) -> bool { - if let Some(cache_lock) = cache { - if let Ok(c) = cache_lock.try_read() { - if let Value::Object(groups) = &c.attr_types { - for (_, group) in groups { - if let Value::Object(fields) = group { - if fields.contains_key(field) { - return true; - } - } - } - } - } - } - false -} - -/// Get the exact value field for an attribute from metadata. -/// Returns the processed_summary field (e.g., "attributes.long_value" for type=long). -/// This MUST come from metadata, not guessed. -fn get_attribute_value_field( - field: &str, - cache: &Option>>, -) -> Result { - if let Some(cache_lock) = cache { - if let Ok(c) = cache_lock.try_read() { - if let Value::Object(groups) = &c.attr_types { - // Search all groups for this field - for (_, group) in groups { - if let Value::Object(fields) = group { - if let Some(Value::Object(meta_obj)) = fields.get(field) { - // Get processed_summary which is the exact ES field name - if let Some(ps) = - meta_obj.get("processed_summary").and_then(|v| v.as_str()) - { - return Ok(format!("attributes.{}", ps)); - } - } - } - } - } - } - } - Err(format!("field '{}' not found in metadata", field)) -} - /// Return the ES aggregation type name (and matching agg name) for an x-axis value type. /// /// The agg is always named the same as its type so extraction paths are predictable: @@ -357,6 +108,22 @@ fn build_x_agg_params( } t } + ValueType::Date => { + // For date histograms use calendar_interval. 
+ let calendar_interval = x_bounds + .interval + .map(|i| i.to_es_interval().to_string()) + .unwrap_or_else(|| "1y".to_string()); + let mut params = json!({ + "field": x_value_field, + "calendar_interval": calendar_interval, + "min_doc_count": 0 + }); + if let Some(domain_arr) = x_bounds.domain { + params["extended_bounds"] = json!({ "min": domain_arr[0], "max": domain_arr[1] }); + } + params + } _ => { let [domain_min, domain_max] = x_bounds.domain.unwrap_or([0.0, 1.0]); let (hist_min, hist_max, script_opt) = match x_spec.opts.scale { @@ -394,79 +161,172 @@ fn build_x_agg_params( (agg_type, params) } -/// Build the `yHistograms` sub-aggregation used inside each x-histogram bucket. -/// -/// Escapes nested context via `reverse_nested`, re-enters attributes, and runs a histogram -/// on the y-field value. Supports log/sqrt scale transforms via ES script. -/// -/// ```text -/// yHistograms: reverse_nested -/// by_attribute: nested(attributes) -/// {y_field}: filter(y_field) -/// histogram: histogram(y_value_field) -/// ``` -fn build_y_histogram_sub_agg( +#[allow(clippy::too_many_arguments)] +/// Build a Y-axis sub-aggregation that adapts to the Y value type. +/// For numeric values this produces a histogram, for date a date_histogram, +/// and for keyword/taxon-rank a `terms` (named `top_terms`) aggregation. 
+fn build_y_sub_agg( y_field: &str, y_value_field: &str, + y_value_type: ValueType, y_scale: Scale, - y_domain_min: f64, - y_domain_max: f64, + y_bounds_min: f64, + y_bounds_max: f64, y_ticks: usize, + y_interval: Option, ) -> Value { - let (hist_min, hist_max, script_opt) = match y_scale { - Scale::Log | Scale::Log10 => { - let mn = y_domain_min.max(1.0).log10(); - let mx = y_domain_max.max(1.0).log10(); - (mn, mx, Some("Math.log10(_value)".to_string())) - } - Scale::Log2 => { - let mn = y_domain_min.max(1.0).log2(); - let mx = y_domain_max.max(1.0).log2(); - ( - mn, - mx, - Some("Math.max(Math.log(_value)/Math.log(2), 0)".to_string()), - ) + match y_value_type { + ValueType::TaxonRank => { + // For taxon ranks, aggregate within the `lineage` nested path and + // filter ancestors by the requested rank (e.g., "genus"), then + // terms-aggregate on `lineage.taxon_id` (or configured y_value_field). + let mut y_field_agg = serde_json::Map::new(); + y_field_agg.insert( + y_field.to_string(), + json!({ + "filter": { "term": { "lineage.taxon_rank": y_field } }, + "aggs": { + "top_terms": { + "terms": { + "field": y_value_field, + "size": y_ticks, + "min_doc_count": 0 + } + } + } + }), + ); + json!({ + "reverse_nested": {}, + "aggs": { + "by_attribute": { + "nested": { "path": "lineage" }, + "aggs": Value::Object(y_field_agg) + } + } + }) } - Scale::Sqrt => { - let mn = y_domain_min.max(0.0).sqrt(); - let mx = y_domain_max.sqrt(); - (mn, mx, None) + ValueType::Keyword => { + // terms agg named `top_terms` inside the `attributes` nested path + let mut y_field_agg = serde_json::Map::new(); + y_field_agg.insert( + y_field.to_string(), + json!({ + "filter": { "term": { "attributes.key": y_field } }, + "aggs": { + "top_terms": { + "terms": { + "field": y_value_field, + "size": y_ticks, + "min_doc_count": 0 + } + } + } + }), + ); + json!({ + "reverse_nested": {}, + "aggs": { + "by_attribute": { + "nested": { "path": "attributes" }, + "aggs": Value::Object(y_field_agg) + } + } + 
}) } - _ => (y_domain_min, y_domain_max, None), - }; + ValueType::Date => { + // date_histogram using calendar_interval derived from bounds tick_count or provided interval + let calendar_interval = y_interval + .map(|i| i.to_es_interval().to_string()) + .unwrap_or_else(|| "1y".to_string()); - let interval = (hist_max - hist_min) / y_ticks.max(1) as f64; + let mut date_hist_params = json!({ + "field": y_value_field, + "calendar_interval": calendar_interval, + "min_doc_count": 0 + }); + // Ensure buckets for empty intervals cover the full domain + date_hist_params["extended_bounds"] = + json!({ "min": y_bounds_min, "max": y_bounds_max }); - let mut hist_params = json!({ - "field": y_value_field, - "interval": interval, - "extended_bounds": { "min": hist_min, "max": hist_max }, - "offset": hist_min, - "min_doc_count": 0 - }); - if let Some(script) = script_opt { - hist_params["script"] = Value::String(script); - } + let mut y_field_agg = serde_json::Map::new(); + y_field_agg.insert( + y_field.to_string(), + json!({ + "filter": { "term": { "attributes.key": y_field } }, + "aggs": { + "date_histogram": { "date_histogram": date_hist_params } + } + }), + ); + json!({ + "reverse_nested": {}, + "aggs": { + "by_attribute": { + "nested": { "path": "attributes" }, + "aggs": Value::Object(y_field_agg) + } + } + }) + } + _ => { + // Numeric histogram path (existing behaviour) + let (hist_min, hist_max, script_opt) = match y_scale { + Scale::Log | Scale::Log10 => { + let mn = y_bounds_min.max(1.0).log10(); + let mx = y_bounds_max.max(1.0).log10(); + (mn, mx, Some("Math.log10(_value)".to_string())) + } + Scale::Log2 => { + let mn = y_bounds_min.max(1.0).log2(); + let mx = y_bounds_max.max(1.0).log2(); + ( + mn, + mx, + Some("Math.max(Math.log(_value)/Math.log(2), 0)".to_string()), + ) + } + Scale::Sqrt => { + let mn = y_bounds_min.max(0.0).sqrt(); + let mx = y_bounds_max.sqrt(); + (mn, mx, None) + } + _ => (y_bounds_min, y_bounds_max, None), + }; - let mut y_field_agg = 
serde_json::Map::new(); - y_field_agg.insert( - y_field.to_string(), - json!({ - "filter": { "term": { "attributes.key": y_field } }, - "aggs": { "histogram": { "histogram": hist_params } } - }), - ); + let interval = (hist_max - hist_min) / y_ticks.max(1) as f64; - json!({ - "reverse_nested": {}, - "aggs": { - "by_attribute": { - "nested": { "path": "attributes" }, - "aggs": Value::Object(y_field_agg) + let mut hist_params = json!({ + "field": y_value_field, + "interval": interval, + "extended_bounds": { "min": hist_min, "max": hist_max }, + "offset": hist_min, + "min_doc_count": 0 + }); + if let Some(script) = script_opt { + hist_params["script"] = Value::String(script); } + + let mut y_field_agg = serde_json::Map::new(); + y_field_agg.insert( + y_field.to_string(), + json!({ + "filter": { "term": { "attributes.key": y_field } }, + "aggs": { "histogram": { "histogram": hist_params } } + }), + ); + + json!({ + "reverse_nested": {}, + "aggs": { + "by_attribute": { + "nested": { "path": "attributes" }, + "aggs": Value::Object(y_field_agg) + } + } + }) } - }) + } } #[allow(clippy::too_many_arguments)] @@ -500,20 +360,21 @@ pub fn build_nested_attribute_scatter_agg( show_other: bool, cache: &Option>>, ) -> Result { - let x_field = x_spec.field.as_str(); - let x_value_field = get_attribute_value_field(x_field, cache)?; - let y_value_field = get_attribute_value_field(y_field, cache)?; - + let x_storage = resolve_field_storage(&x_spec.field, x_spec.value_type, cache)?; + let y_storage = resolve_field_storage(y_field, y_bounds.value_type, cache)?; + let x_value_field = x_storage.bucket_field().to_string(); + let y_value_field = y_storage.bucket_field().to_string(); let (x_agg_type, x_agg_params) = build_x_agg_params(x_spec, &x_value_field, x_bounds); - let [y_domain_min, y_domain_max] = y_bounds.domain.unwrap_or([0.0, 1.0]); - let y_histogram_sub_agg = build_y_histogram_sub_agg( + let y_histogram_sub_agg = build_y_sub_agg( y_field, &y_value_field, + y_bounds.value_type, 
y_scale, y_domain_min, y_domain_max, y_bounds.tick_count, + y_bounds.interval, ); // Main x agg with nested y-histograms. @@ -525,8 +386,8 @@ pub fn build_nested_attribute_scatter_agg( // Category histograms (optional). let category_histograms_opt = if let Some(cat) = cat_field { - let cat_value_field = get_attribute_value_field(cat, cache)?; let cat_vt = cat_value_type.unwrap_or(ValueType::Keyword); + let cat_storage = resolve_field_storage(cat, cat_vt, cache)?; let is_numeric_cat = !matches!(cat_vt, ValueType::Keyword | ValueType::TaxonRank); // Skip only when keyword cat has no known labels. @@ -536,72 +397,81 @@ pub fn build_nested_attribute_scatter_agg( let default_bounds = cat_bounds.unwrap_or(x_bounds); let (by_value_agg_type, by_value_def) = build_by_value_agg( cat_vt, - &cat_value_field, + cat_storage.bucket_field(), default_bounds, cat_labels, show_other, ); // Per-cat x agg: same type as main x, with y-histograms nested inside. - let mut cat_x_field_agg = serde_json::Map::new(); - cat_x_field_agg.insert( - x_field.to_string(), - json!({ - "filter": { "term": { "attributes.key": x_field } }, - "aggs": { x_agg_type: x_with_y.clone() } - }), + // Uses build_inner_x_agg_block so extraction paths remain deterministic. + // Pass raw x_agg_params; build_inner_x_agg_block wraps them in the + // required {name: {type: params}} nesting. yHistograms is provided + // as sub_aggs so it sits inside the x bucket agg, not alongside it. 
+ let per_cat_x_with_y = build_inner_x_agg_block( + &x_storage, + x_agg_type, + x_agg_params.clone(), + Some(json!({ "yHistograms": y_histogram_sub_agg.clone() })), ); + let by_value_with_inner = json!({ + "by_value": { + by_value_agg_type: by_value_def, + "aggs": per_cat_x_with_y + } + }); + let cat_aggs = wrap_cat_in_nested(&cat_storage, by_value_with_inner); + Some(json!({ "reverse_nested": {}, - "aggs": { - "by_attribute": { - "nested": { "path": "attributes" }, - "aggs": { - "by_cat": { - "filter": { "term": { "attributes.key": cat } }, - "aggs": { - "by_value": { - by_value_agg_type: by_value_def, - "aggs": { - "histogram": { - "reverse_nested": {}, - "aggs": { - "by_attribute": { - "nested": { "path": "attributes" }, - "aggs": Value::Object(cat_x_field_agg) - } - } - } - } - } - } - } - } - } - } + "aggs": cat_aggs })) } } else { None }; - let mut by_key_aggs = json!({ x_agg_type: x_with_y }); + // Build main x agg via generic factory and inject category histograms. + let x_agg_builder = agg_builder_for(x_spec, x_bounds, cache)?; + let mut final_agg = x_agg_builder.build(agg_name); + + // Inject yHistograms into the inner agg. + inject_y_histograms_into_agg(&mut final_agg, agg_name, &x_storage, x_agg_type, x_with_y); + if let Some(cat_hist) = category_histograms_opt { - by_key_aggs["categoryHistograms"] = cat_hist; + inject_category_histograms(&mut final_agg, agg_name, &x_storage, cat_hist); } - Ok(json!({ - agg_name: { - "nested": { "path": "attributes" }, - "aggs": { - "by_key": { - "filter": { "term": { "attributes.key": x_field } }, - "aggs": by_key_aggs - } - } + Ok(final_agg) +} + +/// Inject `x_with_y` (the x bucket agg including yHistograms sub-agg) into the built x agg. 
+fn inject_y_histograms_into_agg( + final_agg: &mut Value, + agg_name: &str, + x_storage: &FieldStorage, + x_agg_type: &str, + x_with_y: Value, +) { + let container = x_storage.x_container_name(); + let root = match final_agg.get_mut(agg_name) { + Some(v) => v, + None => return, + }; + let aggs_obj = match root.get_mut("aggs") { + Some(v) => v, + None => return, + }; + if container.is_empty() { + aggs_obj[x_agg_type] = x_with_y; + return; + } + if let Some(container_obj) = aggs_obj.get_mut(container) { + if let Some(inner_aggs) = container_obj.get_mut("aggs") { + inner_aggs[x_agg_type] = x_with_y; } - })) + } } /// Build the `by_value` aggregation used for per-category sub-histograms. @@ -628,6 +498,20 @@ fn build_by_value_agg( } ("filters", def) } + ValueType::Date => { + let calendar_interval = cat_bounds + .interval + .map(|i| i.to_es_interval().to_string()) + .unwrap_or_else(|| "1y".to_string()); + ( + "date_histogram", + json!({ + "field": cat_value_field, + "calendar_interval": calendar_interval, + "min_doc_count": 0 + }), + ) + } _ => { let [domain_min, domain_max] = cat_bounds.domain.unwrap_or([0.0, 1.0]); let ticks = cat_bounds.tick_count.max(1) as f64; @@ -647,28 +531,25 @@ fn build_by_value_agg( } #[allow(clippy::too_many_arguments)] -/// Build a complete nested-attribute histogram aggregation with per-category sub-histograms. -/// -/// Supports any x-axis value type: numeric fields use `histogram`, keyword/rank fields use -/// `terms`. The cat axis is always filtered by term (keyword/rank); pass a keyword or rank -/// field for `cat_field`. +/// Build a complete histogram aggregation with per-category sub-histograms. /// -/// Supports any cat-axis value type: keyword/rank fields use named `filters` (one per label), -/// numeric fields use a `histogram` agg bucketed by the cat domain. 
+/// Fully type-agnostic: any combination of `(x_storage, cat_storage)` — +/// attribute × attribute, attribute × lineage, lineage × attribute, lineage × lineage — +/// is handled by composing [`FieldStorage`] values from [`field`][crate::report::field] +/// rather than by hand-writing separate cases. /// -/// # Aggregation structure +/// # Aggregation structure (generalised) /// ```text -/// {agg_name}: nested(attributes) -/// by_key: filter(x_field) -/// {x_agg_type}: histogram or terms (main x-axis counts) -/// categoryHistograms: reverse_nested -/// by_attribute: nested(attributes) -/// by_cat: filter(cat_field) -/// by_value: filters (keyword) or histogram (numeric) -/// histogram: reverse_nested -/// by_attribute: nested(attributes) -/// {x_field}: filter(x_field) -/// {x_agg_type}: histogram or terms (per-category counts) +/// {agg_name}: +/// [x nested envelope] +/// x_container: filter(x) +/// {x_bucket_type}: … ← main x counts +/// categoryHistograms: +/// reverse_nested: {} +/// [cat nested envelope] +/// cat_container: filter(cat) +/// by_value: filters/histogram ← per-cat buckets +/// [per-cat inner x agg — same x nested envelope] /// ``` pub fn build_nested_attribute_histogram_with_categories( agg_name: &str, @@ -681,310 +562,101 @@ pub fn build_nested_attribute_histogram_with_categories( show_other: bool, cache: &Option>>, ) -> Result { - let x_field = x_spec.field.as_str(); - let x_value_field = get_attribute_value_field(x_field, cache)?; - let cat_value_field = get_attribute_value_field(cat_field, cache)?; + let x_storage = resolve_field_storage(&x_spec.field, x_spec.value_type, cache)?; + let cat_storage = resolve_field_storage(cat_field, cat_value_type, cache)?; - let (x_agg_type, x_agg_params) = build_x_agg_params(x_spec, &x_value_field, x_bounds); + let (x_bucket_type, x_bucket_params) = + build_x_agg_params(x_spec, x_storage.bucket_field(), x_bounds); let (by_value_agg_type, by_value_def) = build_by_value_agg( cat_value_type, - 
&cat_value_field, + cat_storage.bucket_field(), cat_bounds, cat_labels, show_other, ); - // Per-category inner x agg (same type as main). - let mut x_field_agg = serde_json::Map::new(); - x_field_agg.insert( - x_field.to_string(), - json!({ - "filter": { "term": { "attributes.key": x_field } }, - "aggs": { x_agg_type: { x_agg_type: x_agg_params.clone() } } - }), - ); + // Build the inner x agg block for each category bucket. + // Uses canonical container names ("by_key"/"at_rank") so extraction paths + // are deterministic via FieldStorage::inner_x_path(). + let per_cat_x_block = + build_inner_x_agg_block(&x_storage, x_bucket_type, x_bucket_params.clone(), None); + + // Assemble: a named "by_value" agg → per_cat_x_block sub-aggs. + // ES requires a name for every agg; "by_value" matches the extraction + // path in FieldStorage::cat_histograms_base(). + let by_value_with_inner_x = json!({ + "by_value": { + by_value_agg_type: by_value_def, + "aggs": per_cat_x_block + } + }); + + // Wrap in the cat nested envelope (by_attribute/at_cat_rank/etc.) + let cat_aggs = wrap_cat_in_nested(&cat_storage, by_value_with_inner_x); let category_histograms = json!({ "reverse_nested": {}, - "aggs": { - "by_attribute": { - "nested": { "path": "attributes" }, - "aggs": { - "by_cat": { - "filter": { "term": { "attributes.key": cat_field } }, - "aggs": { - "by_value": { - by_value_agg_type: by_value_def, - "aggs": { - "histogram": { - "reverse_nested": {}, - "aggs": { - "by_attribute": { - "nested": { "path": "attributes" }, - "aggs": Value::Object(x_field_agg) - } - } - } - } - } - } - } - } - } - } + "aggs": cat_aggs }); - Ok(json!({ - agg_name: { - "nested": { "path": "attributes" }, - "aggs": { - "by_key": { - "filter": { "term": { "attributes.key": x_field } }, - "aggs": { - x_agg_type: { x_agg_type: x_agg_params }, - "categoryHistograms": category_histograms - } - } - } + // Build the main x agg via the generic factory and inject categoryHistograms. 
+ let x_agg_builder = agg_builder_for(x_spec, x_bounds, cache)?; + let mut final_agg = x_agg_builder.build(agg_name); + + inject_category_histograms(&mut final_agg, agg_name, &x_storage, category_histograms); + + Ok(final_agg) +} + +/// Inject `category_histograms` into the correct inner `aggs` map of a pre-built x agg. +fn inject_category_histograms( + final_agg: &mut Value, + agg_name: &str, + x_storage: &FieldStorage, + category_histograms: Value, +) { + let container = x_storage.x_container_name(); + let root = match final_agg.get_mut(agg_name) { + Some(v) => v, + None => return, + }; + let aggs_obj = match root.get_mut("aggs") { + Some(v) => v, + None => return, + }; + if container.is_empty() { + // Root-level x: inject directly into the top-level aggs. + aggs_obj["categoryHistograms"] = category_histograms; + return; + } + if let Some(container_obj) = aggs_obj.get_mut(container) { + if let Some(inner_aggs) = container_obj.get_mut("aggs") { + inner_aggs["categoryHistograms"] = category_histograms; } - })) + } } /// Select the appropriate `AggBuilder` for an axis spec. /// -/// This is the main factory function; report handlers call it rather than -/// constructing builders directly. +/// Delegates all field-type detection to [`resolve_field_storage`] and returns a +/// [`GenericBucketAgg`] — a single type that handles attributes, lineage ranks and +/// root-level fields uniformly. 
pub fn agg_builder_for( spec: &AxisSpec, bounds: &BoundsResult, cache: &Option>>, ) -> Result, String> { - let is_attr = is_attribute(&spec.field, cache); - let is_rk = is_rank(&spec.field, cache); - - match spec.value_type { - ValueType::Numeric => { - let [domain_min, domain_max] = bounds.domain.unwrap_or([0.0, 1.0]); - - // For log scales, transform bounds to log space for histogram interval calculation - let (hist_min, hist_max) = match spec.opts.scale { - Scale::Log | Scale::Log10 => { - let min_val = domain_min.max(1.0).log10(); - let max_val = domain_max.max(1.0).log10(); - (min_val, max_val) - } - Scale::Log2 => { - let min_val = domain_min.max(1.0).log2(); - let max_val = domain_max.max(1.0).log2(); - (min_val, max_val) - } - Scale::Sqrt => { - let min_val = domain_min.max(0.0).sqrt(); - let max_val = domain_max.sqrt(); - (min_val, max_val) - } - _ => (domain_min, domain_max), - }; - - // Compute interval in transformed space - let ticks = bounds.tick_count.max(1) as f64; - let interval = (hist_max - hist_min) / ticks; - - if is_attr { - let value_field = get_attribute_value_field(&spec.field, cache)?; - - // Build script transform for log scales - let script_opt = match spec.opts.scale { - Scale::Log10 => Some("Math.log10(_value)".to_string()), - Scale::Log => Some("Math.log(_value)".to_string()), - Scale::Log2 => Some("Math.max(Math.log(_value)/Math.log(2), 0)".to_string()), - Scale::Sqrt => Some("Math.sqrt(_value)".to_string()), - _ => None, - }; - - let mut inner_agg = json!({ - "histogram": { - "field": &value_field, - "interval": interval, - "extended_bounds": { "min": hist_min, "max": hist_max }, - "min_doc_count": 0 - } - }); - - if let Some(script) = script_opt { - inner_agg["histogram"]["script"] = Value::String(script); - } - - Ok(Box::new(NestedAttributeAggBuilder { - field: spec.field.clone(), - inner_agg_body: inner_agg, - inner_agg_name: "histogram".to_string(), - })) - } else { - let script_opt = match spec.opts.scale { - Scale::Log10 => 
Some("Math.log10(_value)".to_string()), - Scale::Log => Some("Math.log(_value)".to_string()), - Scale::Log2 => Some("Math.max(Math.log(_value)/Math.log(2), 0)".to_string()), - Scale::Sqrt => Some("Math.sqrt(_value)".to_string()), - _ => None, - }; - - Ok(Box::new(HistogramAggBuilder { - field: spec.field.clone(), - interval, - min: hist_min, - max: hist_max, - script: script_opt, - })) - } - } - ValueType::Date => { - let calendar_interval = bounds - .interval - .map(|i| i.to_es_interval().to_string()) - .unwrap_or_else(|| "1y".to_string()); - - if is_attr { - let value_field = get_attribute_value_field(&spec.field, cache)?; - let inner_agg = json!({ - "date_histogram": { - "field": &value_field, - "calendar_interval": &calendar_interval, - "min_doc_count": 0 - } - }); - Ok(Box::new(NestedAttributeAggBuilder { - field: spec.field.clone(), - inner_agg_body: inner_agg, - inner_agg_name: "date_histogram".to_string(), - })) - } else { - Ok(Box::new(DateHistogramAggBuilder { - field: spec.field.clone(), - calendar_interval, - time_zone: None, - })) - } - } - ValueType::Keyword | ValueType::TaxonRank => { - if is_attr { - let value_field = get_attribute_value_field(&spec.field, cache)?; - let inner_agg = json!({ - "terms": { - "field": &value_field, - "size": spec.opts.size, - "min_doc_count": 0 - } - }); - Ok(Box::new(NestedAttributeAggBuilder { - field: spec.field.clone(), - inner_agg_body: inner_agg, - inner_agg_name: "terms".to_string(), - })) - } else if is_rk { - let inner_agg = json!({ - "terms": { - "field": "lineage.taxon_id", - "size": spec.opts.size, - "min_doc_count": 0 - } - }); - Ok(Box::new(NestedRankAggBuilder { - field: spec.field.clone(), - inner_agg_body: inner_agg, - inner_agg_name: "terms".to_string(), - })) - } else { - Ok(Box::new(TermsAggBuilder { - field: spec.field.clone(), - size: spec.opts.size, - include: if bounds.fixed_terms.is_empty() { - None - } else { - Some(bounds.fixed_terms.clone()) - }, - })) - } - } - ValueType::GeoPoint => 
Ok(Box::new(GeoHashAggBuilder { - field: spec.field.clone(), - precision: geohash_precision_for_size(spec.opts.size), - size: spec.opts.size, - })), - } -} - -/// Wrapper that adds nested query logic around a base aggregation for nested attributes. -pub struct NestedAttributeAggBuilder { - pub field: String, - pub inner_agg_body: Value, - pub inner_agg_name: String, -} - -impl AggBuilder for NestedAttributeAggBuilder { - fn build(&self, agg_name: &str) -> Value { - json!({ - agg_name: { - "nested": { "path": "attributes" }, - "aggs": { - "by_key": { - "filter": { "term": { "attributes.key": &self.field } }, - "aggs": { - &self.inner_agg_name: self.inner_agg_body.clone() - } - } - } - } - }) - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - resp.pointer(&format!( - "/aggregations/{}/by_key/{}/buckets", - agg_name, self.inner_agg_name - )) - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default() - } -} - -/// Wrapper that adds nested query logic around a base aggregation for nested rank (lineage) fields. 
-pub struct NestedRankAggBuilder { - pub field: String, - pub inner_agg_body: Value, - pub inner_agg_name: String, -} - -impl AggBuilder for NestedRankAggBuilder { - fn build(&self, agg_name: &str) -> Value { - json!({ - agg_name: { - "nested": { "path": "lineage" }, - "aggs": { - "at_rank": { - "filter": { "term": { "lineage.taxon_rank": &self.field } }, - "aggs": { - &self.inner_agg_name: self.inner_agg_body.clone() - } - } - } - } - }) - } - - fn extract(&self, resp: &Value, agg_name: &str) -> RawBuckets { - resp.pointer(&format!( - "/aggregations/{}/at_rank/{}/buckets", - agg_name, self.inner_agg_name - )) - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default() - } + let storage = resolve_field_storage(&spec.field, spec.value_type, cache)?; + let (bucket_type, bucket_params) = build_x_agg_params(spec, storage.bucket_field(), bounds); + Ok(Box::new(GenericBucketAgg { + storage, + bucket_type: bucket_type.to_string(), + bucket_params, + })) } /// Map a requested geohash count to an ES geohash precision level (1–12). 
+#[allow(dead_code)] fn geohash_precision_for_size(size: usize) -> u8 { match size { 0..=50 => 3, diff --git a/crates/genomehubs-api/src/report/arc.rs b/crates/genomehubs-api/src/report/arc.rs index eb045a4..c814ccd 100644 --- a/crates/genomehubs-api/src/report/arc.rs +++ b/crates/genomehubs-api/src/report/arc.rs @@ -204,24 +204,31 @@ pub async fn run_arc_report( if config.ranks.is_some() { return run_per_rank_report(client, es_base, index, base_query, config).await; } - if config.rings.is_some() { - return run_rings_report(client, es_base, index, base_query, config).await; - } - let feature_ref_filter = filter_expr_to_es_query( &combine_terms(&config.feature_term, &config.reference_term), base_query, )?; let reference_filter = filter_expr_to_es_query(&config.reference_term, base_query)?; - if let Some(ref context_term) = config.context_term { + let (feature_count, reference_count) = tokio::try_join!( + count_matching(client, es_base, index, &feature_ref_filter), + count_matching(client, es_base, index, &reference_filter), + )?; + let context_count = if let Some(ref context_term) = config.context_term { let context_filter = filter_expr_to_es_query(context_term, base_query)?; - let (feature_count, reference_count, context_count) = tokio::try_join!( - count_matching(client, es_base, index, &feature_ref_filter), - count_matching(client, es_base, index, &reference_filter), - count_matching(client, es_base, index, &context_filter), - )?; + let (context_count,) = + tokio::try_join!(count_matching(client, es_base, index, &context_filter))?; + Some(context_count) + } else { + None + }; + if config.rings.is_some() { + return run_rings_report(client, es_base, index, base_query, config, context_count).await; + } + + if let Some(ref context_term) = config.context_term { + let context_count = context_count.unwrap_or(0); let arc = safe_fraction(feature_count, reference_count); let arc2 = safe_fraction(reference_count, context_count); @@ -240,11 +247,6 @@ pub async fn 
run_arc_report( }); Ok((feature_count, 0, report_data)) } else { - let (feature_count, reference_count) = tokio::try_join!( - count_matching(client, es_base, index, &feature_ref_filter), - count_matching(client, es_base, index, &reference_filter), - )?; - let arc = safe_fraction(feature_count, reference_count); let report_data = json!({ @@ -274,6 +276,7 @@ async fn run_rings_report( index: &str, base_query: &Value, config: &ArcConfig, + context_count: Option, ) -> Result<(u64, u64, Value), String> { let rings = config.rings.as_deref().unwrap_or(&[]); @@ -317,6 +320,16 @@ async fn run_rings_report( entry.insert("reference_count".to_string(), json!(reference_count)); entry.insert("featureTerm".to_string(), json!(ring.feature_term)); entry.insert("referenceTerm".to_string(), json!(ring_ref)); + + if let Some(context_count) = context_count { + let arc2 = safe_fraction(reference_count, context_count); + entry.insert("arc2".to_string(), json!(arc2)); + entry.insert("context_count".to_string(), json!(context_count)); + entry.insert( + "contextTerm".to_string(), + json!(config.context_term.as_deref().unwrap_or("")), + ); + } Value::Object(entry) }) .collect(); @@ -421,7 +434,10 @@ async fn msearch_counts( for query in queries { body.push_str(&serde_json::to_string(&header).unwrap()); body.push('\n'); - body.push_str(&serde_json::to_string(&json!({ "query": query, "size": 0 })).unwrap()); + body.push_str( + &serde_json::to_string(&json!({ "query": query, "size": 0, "track_total_hits": true })) + .unwrap(), + ); body.push('\n'); } diff --git a/crates/genomehubs-api/src/report/bounds.rs b/crates/genomehubs-api/src/report/bounds.rs index 115f9b1..288229b 100644 --- a/crates/genomehubs-api/src/report/bounds.rs +++ b/crates/genomehubs-api/src/report/bounds.rs @@ -2,6 +2,8 @@ //! //! Each `compute_*_bounds()` function issues one ES aggregation query to determine //! the actual data range for a field, then wraps it in a `BoundsResult`. +//! +//! 
All field-type detection delegates to [`crate::report::field`]. use genomehubs_query::report::axis::{DateInterval, Scale, ValueType}; use genomehubs_query::report::{AxisSpec, BoundsResult}; @@ -10,70 +12,7 @@ use serde_json::{json, Value}; use crate::es_client; use crate::es_metadata::MetadataCache; - -/// Determine if a field is a taxonomic rank (from lineage). -/// Ranks are stored in the lineage.taxon_rank nested field. -fn is_rank( - field: &str, - cache: &Option>>, -) -> bool { - if let Some(cache_lock) = cache { - if let Ok(c) = cache_lock.try_read() { - return c.taxonomic_ranks.contains(&field.to_string()); - } - } - false -} - -/// Determine if a field is an attribute (from attributes nested array). -fn is_attribute( - field: &str, - cache: &Option>>, -) -> bool { - if let Some(cache_lock) = cache { - if let Ok(c) = cache_lock.try_read() { - if let Value::Object(groups) = &c.attr_types { - for (_, group) in groups { - if let Value::Object(fields) = group { - if fields.contains_key(field) { - return true; - } - } - } - } - } - } - false -} - -/// Get the exact value field for an attribute from metadata. -/// Returns the processed_summary field (e.g., "attributes.long_value" for type=long). -/// This MUST come from metadata, not guessed. 
-fn get_attribute_value_field( - field: &str, - cache: &Option>>, -) -> Result { - if let Some(cache_lock) = cache { - if let Ok(c) = cache_lock.try_read() { - if let Value::Object(groups) = &c.attr_types { - // Search all groups for this field - for (_, group) in groups { - if let Value::Object(fields) = group { - if let Some(Value::Object(meta_obj)) = fields.get(field) { - // Get processed_summary which is the exact ES field name - if let Some(ps) = - meta_obj.get("processed_summary").and_then(|v| v.as_str()) - { - return Ok(format!("attributes.{}", ps)); - } - } - } - } - } - } - } - Err(format!("field '{}' not found in metadata", field)) -} +use crate::report::field::{get_attribute_value_field, is_attribute, is_rank}; /// Probe Elasticsearch for the domain of a single axis field. /// @@ -122,6 +61,22 @@ async fn compute_numeric_bounds( let is_attr = is_attribute(&spec.field, cache); let is_rk = is_rank(&spec.field, cache); + // If this field is a taxonomic rank, prefer that interpretation and + // return a `BoundsResult` for rank-type axes without probing attribute + // subdocuments. This avoids treating rank-like names that may also appear + // in attribute metadata as attributes. 
+ if is_rk { + return Ok(BoundsResult { + domain: None, + tick_count: spec.opts.size, + interval: None, + scale: Scale::Ordinal, + value_type: ValueType::TaxonRank, + fixed_terms: vec![], + cat_labels: vec![], + }); + } + let agg_body = if is_attr { let value_field = get_attribute_value_field(&spec.field, cache)?; json!({ @@ -143,16 +98,6 @@ async fn compute_numeric_bounds( } } }) - } else if is_rk { - return Ok(BoundsResult { - domain: None, - tick_count: spec.opts.size, - interval: None, - scale: Scale::Ordinal, - value_type: ValueType::TaxonRank, - fixed_terms: vec![], - cat_labels: vec![], - }); } else { json!({ "size": 0, @@ -322,20 +267,22 @@ async fn compute_keyword_bounds( let is_attr = is_attribute(&spec.field, cache); let is_rk = is_rank(&spec.field, cache); - let agg_body = if is_attr { + // Prefer taxon ranks over attributes: if the field looks like a rank, + // query lineage buckets rather than attribute nested terms. + let agg_body = if is_rk { json!({ "size": 0, "query": base_query, "aggs": { - "by_attribute": { - "nested": { "path": "attributes" }, + "by_lineage": { + "nested": { "path": "lineage" }, "aggs": { - "by_key": { - "filter": { "term": { "attributes.key": &spec.field } }, + "at_rank": { + "filter": { "term": { "lineage.taxon_rank": &spec.field } }, "aggs": { "top_terms": { "terms": { - "field": "attributes.keyword_value.raw", + "field": "lineage.taxon_id", "size": spec.opts.size, "min_doc_count": 0 } @@ -346,20 +293,20 @@ async fn compute_keyword_bounds( } } }) - } else if is_rk { + } else if is_attr { json!({ "size": 0, "query": base_query, "aggs": { - "by_lineage": { - "nested": { "path": "lineage" }, + "by_attribute": { + "nested": { "path": "attributes" }, "aggs": { - "at_rank": { - "filter": { "term": { "lineage.taxon_rank": &spec.field } }, + "by_key": { + "filter": { "term": { "attributes.key": &spec.field } }, "aggs": { "top_terms": { "terms": { - "field": "lineage.taxon_id", + "field": "attributes.keyword_value.raw", "size": 
spec.opts.size, "min_doc_count": 0 } @@ -482,9 +429,9 @@ async fn compute_geo_bounds( /// /// Selects the most appropriate calendar interval for rendering: /// - < 30 days → Day -/// - < 6 years → Month -/// - < 50 years → Year -/// - >= 50 years → Decade +/// - < 2 years → Month +/// - < 4 years → Quarter +/// - >= 4 years → Year pub fn auto_date_interval(range_ms: f64) -> Option { const DAY_MS: f64 = 86_400_000.0; const YEAR_MS: f64 = DAY_MS * 365.25; @@ -495,11 +442,11 @@ pub fn auto_date_interval(range_ms: f64) -> Option { Some(if range_ms < 30.0 * DAY_MS { DateInterval::Day - } else if range_ms < 6.0 * YEAR_MS { + } else if range_ms < 2.0 * YEAR_MS { DateInterval::Month - } else if range_ms < 50.0 * YEAR_MS { - DateInterval::Year + } else if range_ms < 4.0 * YEAR_MS { + DateInterval::Quarter } else { - DateInterval::Decade + DateInterval::Year }) } diff --git a/crates/genomehubs-api/src/report/field.rs b/crates/genomehubs-api/src/report/field.rs new file mode 100644 index 0000000..14c6fb7 --- /dev/null +++ b/crates/genomehubs-api/src/report/field.rs @@ -0,0 +1,442 @@ +//! Field storage resolution: single source of truth for where a field lives in ES. +//! +//! Every ES nested/attribute/lineage path decision in agg builders, bounds +//! computation and extraction is derived from [`FieldStorage`]. No other file +//! should call `is_rank`, `is_attribute`, or `get_attribute_value_field` +//! directly; use [`resolve_field_storage`] instead. + +use serde_json::{json, Value}; + +use crate::es_metadata::MetadataCache; +use genomehubs_query::report::axis::ValueType; + +// ── FieldStorage ───────────────────────────────────────────────────────────── + +/// Where a field's values are physically stored in the ES document. +/// +/// All agg builders and extractors derive their nested path structure from +/// this enum so that the (x_type × cat_type) combinations are handled by +/// composing two `FieldStorage` values rather than hand-writing four cases. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FieldStorage { + /// `attributes[].key == key`, value at `es_value_field`. + Attribute { + key: String, + /// Full dotted ES field path, e.g. `"attributes.keyword_value.raw"`. + es_value_field: String, + }, + /// `lineage[].taxon_rank == rank`; canonical bucket key is `lineage.taxon_id`. + Lineage { rank: String }, + /// Top-level document field; `es_field` includes `.keyword` suffix when needed. + Root { es_field: String }, +} + +impl FieldStorage { + /// ES `nested` path required before filtering this field, if any. + #[allow(dead_code)] + pub fn nested_path(&self) -> Option<&str> { + match self { + FieldStorage::Attribute { .. } => Some("attributes"), + FieldStorage::Lineage { .. } => Some("lineage"), + FieldStorage::Root { .. } => None, + } + } + + /// Term filter that restricts to documents/sub-docs containing this field. + #[allow(dead_code)] + pub fn key_filter(&self) -> Value { + match self { + FieldStorage::Attribute { key, .. } => json!({ "term": { "attributes.key": key } }), + FieldStorage::Lineage { rank } => json!({ "term": { "lineage.taxon_rank": rank } }), + FieldStorage::Root { .. } => json!({ "match_all": {} }), + } + } + + /// The name of the inner filter-container used inside the nested agg. + /// + /// - Attribute in *x* position: `"by_key"` + /// - Lineage in *x* position: `"at_rank"` + /// - Root: `""` (no container needed) + /// + /// The same names are used in the per-cat inner x agg so extraction + /// paths are deterministic. + pub fn x_container_name(&self) -> &str { + match self { + FieldStorage::Attribute { .. } => "by_key", + FieldStorage::Lineage { .. } => "at_rank", + FieldStorage::Root { .. } => "", + } + } + + /// Names used when this field is in the *cat* position inside + /// `categoryHistograms`. 
+ /// + /// Returns `(outer_wrapper, inner_container)` for the cat-level nesting: + /// + /// ```text + /// outer_wrapper: { + /// nested: {path: ...}, + /// aggs: { + /// inner_container: { filter: ..., aggs: { by_value: ... } } + /// } + /// } + /// ``` + pub fn cat_wrapper_names(&self) -> (&str, &str) { + match self { + FieldStorage::Attribute { .. } => ("by_attribute", "by_cat"), + FieldStorage::Lineage { .. } => ("by_lineage", "at_cat_rank"), + FieldStorage::Root { .. } => ("", ""), + } + } + + /// Build a presence-existence filter at the *document* level. + /// + /// Used when anding presence filters into the base query so bounds + /// reflect only documents that will actually appear in the final plot. + pub fn presence_filter(&self) -> Value { + match self { + FieldStorage::Attribute { key, .. } => json!({ + "nested": { + "path": "attributes", + "query": { "term": { "attributes.key": key } } + } + }), + FieldStorage::Lineage { rank } => json!({ + "nested": { + "path": "lineage", + "query": { "term": { "lineage.taxon_rank": rank } } + } + }), + FieldStorage::Root { es_field } => json!({ "exists": { "field": es_field } }), + } + } + + /// The canonical ES field to bucket on (passed to `terms`, `histogram`, etc.). + pub fn bucket_field(&self) -> &str { + match self { + FieldStorage::Attribute { es_value_field, .. } => es_value_field.as_str(), + FieldStorage::Lineage { .. } => "lineage.taxon_id", + FieldStorage::Root { es_field } => es_field.as_str(), + } + } + + // ── Path helpers ───────────────────────────────────────────────────────── + + /// JSON pointer to the main bucket list for a top-level agg (`agg_name`). 
+ /// + /// ```text + /// attribute x: /aggregations/{agg_name}/by_key/{bucket_type}/buckets + /// lineage x: /aggregations/{agg_name}/at_rank/{bucket_type}/buckets + /// root x: /aggregations/{agg_name}/{bucket_type}/buckets + /// ``` + pub fn main_bucket_path(&self, agg_name: &str, bucket_type: &str) -> String { + match self { + FieldStorage::Attribute { .. } => { + format!("/aggregations/{}/by_key/{}/buckets", agg_name, bucket_type) + } + FieldStorage::Lineage { .. } => { + format!("/aggregations/{}/at_rank/{}/buckets", agg_name, bucket_type) + } + FieldStorage::Root { .. } => { + format!("/aggregations/{}/{}/buckets", agg_name, bucket_type) + } + } + } + + /// JSON pointer to the `by_value` buckets object inside `categoryHistograms`, + /// given the x-storage (self) and cat-storage. + /// + /// ```text + /// /aggregations/{agg_name}/{x_container}/categoryHistograms/{cat_outer}/{cat_inner}/by_value/buckets + /// ``` + pub fn cat_histograms_base( + &self, + agg_name: &str, + cat_storage: &FieldStorage, + ) -> Option { + let x_container = self.x_container_name(); + let (cat_outer, cat_inner) = cat_storage.cat_wrapper_names(); + if cat_outer.is_empty() { + // root cat not yet supported in category histogram path + return None; + } + let path = if x_container.is_empty() { + // root x + format!( + "/aggregations/{}/categoryHistograms/{}/{}/by_value/buckets", + agg_name, cat_outer, cat_inner + ) + } else { + format!( + "/aggregations/{}/{}/categoryHistograms/{}/{}/by_value/buckets", + agg_name, x_container, cat_outer, cat_inner + ) + }; + Some(path) + } + + /// JSON pointer from a per-category bucket root to the inner x histogram + /// buckets array. + /// + /// ```text + /// attribute x: /histogram/by_attribute/by_key/{bucket_type}/buckets + /// lineage x: /histogram/by_lineage/at_rank/{bucket_type}/buckets + /// root x: /histogram/{bucket_type}/buckets + /// ``` + pub fn inner_x_path(&self, bucket_type: &str) -> String { + match self { + FieldStorage::Attribute { .. 
} => { + format!("/histogram/by_attribute/by_key/{}/buckets", bucket_type) + } + FieldStorage::Lineage { .. } => { + format!("/histogram/by_lineage/at_rank/{}/buckets", bucket_type) + } + FieldStorage::Root { .. } => { + format!("/histogram/{}/buckets", bucket_type) + } + } + } +} + +// ── Resolution ─────────────────────────────────────────────────────────────── + +/// Determine where `field` is stored, given its declared `value_type` and the +/// metadata cache. +/// +/// Taxon ranks take priority over same-named attributes. Unknown fields fall +/// back to a root-level field. +pub fn resolve_field_storage( + field: &str, + value_type: ValueType, + cache: &Option>>, +) -> Result { + // Rank interpretation takes priority + if matches!(value_type, ValueType::TaxonRank) || is_rank(field, cache) { + return Ok(FieldStorage::Lineage { + rank: field.to_string(), + }); + } + + if is_attribute(field, cache) { + let es_value_field = get_attribute_value_field(field, cache)?; + return Ok(FieldStorage::Attribute { + key: field.to_string(), + es_value_field, + }); + } + + // Root-level field — add .keyword suffix for keyword types + let es_field = if matches!(value_type, ValueType::Keyword) { + format!("{}.keyword", field) + } else { + field.to_string() + }; + Ok(FieldStorage::Root { es_field }) +} + +// ── Low-level helpers (used internally and by bounds.rs / agg.rs) ───────── + +/// Return `true` if `field` is a known taxonomic rank. +pub fn is_rank( + field: &str, + cache: &Option>>, +) -> bool { + if let Some(lock) = cache { + if let Ok(c) = lock.try_read() { + return c.taxonomic_ranks.contains(&field.to_string()); + } + } + false +} + +/// Return `true` if `field` is a nested attribute (and not a taxonomic rank). 
+pub fn is_attribute( + field: &str, + cache: &Option>>, +) -> bool { + if let Some(lock) = cache { + if let Ok(c) = lock.try_read() { + if c.taxonomic_ranks.contains(&field.to_string()) { + return false; + } + if let Value::Object(groups) = &c.attr_types { + for (_, group) in groups { + if let Value::Object(fields) = group { + if fields.contains_key(field) { + return true; + } + } + } + } + } + } + false +} + +/// Return the fully-qualified ES field path for a nested attribute value. +/// +/// Reads `processed_summary` from the metadata cache, e.g. +/// `"keyword_value.raw"` → `"attributes.keyword_value.raw"`. +pub fn get_attribute_value_field( + field: &str, + cache: &Option>>, +) -> Result { + if let Some(lock) = cache { + if let Ok(c) = lock.try_read() { + if let Value::Object(groups) = &c.attr_types { + for (_, group) in groups { + if let Value::Object(fields) = group { + if let Some(Value::Object(meta)) = fields.get(field) { + if let Some(ps) = meta.get("processed_summary").and_then(|v| v.as_str()) + { + return Ok(format!("attributes.{}", ps)); + } + } + } + } + } + } + } + Err(format!("field '{}' not found in metadata", field)) +} + +// ── Agg block builders ─────────────────────────────────────────────────────── + +/// Wrap `inner_aggs` in the nested + filter envelope appropriate for this storage. +/// +/// For `Attribute`/`Lineage` this emits: +/// ```json +/// { "nested": {"path":"..."}, "aggs": { "{container}": { "filter": {...}, "aggs": inner_aggs } } } +/// ``` +/// For `Root` it returns `inner_aggs` unchanged (no nesting needed). +/// +/// `container_name` is the name of the inner filter-agg key +/// (use [`FieldStorage::x_container_name`] or the cat container names). +pub fn wrap_in_nested(storage: &FieldStorage, container_name: &str, inner_aggs: Value) -> Value { + match storage { + FieldStorage::Root { .. } => inner_aggs, + FieldStorage::Attribute { key, .. 
} => json!({ + "nested": { "path": "attributes" }, + "aggs": { + container_name: { + "filter": { "term": { "attributes.key": key } }, + "aggs": inner_aggs + } + } + }), + FieldStorage::Lineage { rank } => json!({ + "nested": { "path": "lineage" }, + "aggs": { + container_name: { + "filter": { "term": { "lineage.taxon_rank": rank } }, + "aggs": inner_aggs + } + } + }), + } +} + +/// Wrap `inner_aggs` in the cat-level nested envelope (used inside +/// `categoryHistograms` reverse-nested context). +/// +/// Uses `cat_wrapper_names()` to determine the outer wrapper and inner +/// container keys, keeping extraction paths deterministic. +pub fn wrap_cat_in_nested(storage: &FieldStorage, inner_aggs: Value) -> Value { + let (outer, container) = storage.cat_wrapper_names(); + if outer.is_empty() { + return inner_aggs; + } + match storage { + FieldStorage::Attribute { key, .. } => json!({ + outer: { + "nested": { "path": "attributes" }, + "aggs": { + container: { + "filter": { "term": { "attributes.key": key } }, + "aggs": inner_aggs + } + } + } + }), + FieldStorage::Lineage { rank } => json!({ + outer: { + "nested": { "path": "lineage" }, + "aggs": { + container: { + "filter": { "term": { "lineage.taxon_rank": rank } }, + "aggs": inner_aggs + } + } + } + }), + FieldStorage::Root { .. } => inner_aggs, + } +} + +/// Build the inner x bucket agg block used inside each category bucket +/// (within `categoryHistograms`). +/// +/// `bucket_params` is the **raw** aggregation params object (e.g. the +/// `{"field": …, "interval": …}` body for a histogram agg). This function +/// wraps it in the required ES `{name: {type: params}}` nesting and then +/// adds the `reverse_nested` envelope with the correct nested path. +/// +/// `sub_aggs` is an optional inner `"aggs"` object to attach to the x bucket +/// agg (used by scatter to add `yHistograms` inside each x-bucket). 
+///
+/// The filter containers are always named `"by_key"` (attribute) or
+/// `"at_rank"` (lineage), which is what lets
+/// [`FieldStorage::inner_x_path`] derive the extraction path
+/// deterministically.
+pub fn build_inner_x_agg_block(
+    x_storage: &FieldStorage,
+    bucket_type: &str,
+    bucket_params: Value,
+    sub_aggs: Option<Value>,
+) -> Value {
+    // The x bucket agg is named after its own type:
+    // {type: {type: raw_params[, "aggs": sub_aggs]}}
+    let mut named_agg = json!({ bucket_type: bucket_params });
+    if let Some(children) = sub_aggs {
+        named_agg["aggs"] = children;
+    }
+    let x_buckets = json!({ bucket_type: named_agg });
+
+    // Re-enter the nested scope that stores the x field; root-level fields
+    // need no extra scoping.
+    let scoped = match x_storage {
+        FieldStorage::Root { .. } => x_buckets,
+        FieldStorage::Attribute { key, .. } => json!({
+            "by_attribute": {
+                "nested": { "path": "attributes" },
+                "aggs": {
+                    "by_key": {
+                        "filter": { "term": { "attributes.key": key } },
+                        "aggs": x_buckets
+                    }
+                }
+            }
+        }),
+        FieldStorage::Lineage { rank } => json!({
+            "by_lineage": {
+                "nested": { "path": "lineage" },
+                "aggs": {
+                    "at_rank": {
+                        "filter": { "term": { "lineage.taxon_rank": rank } },
+                        "aggs": x_buckets
+                    }
+                }
+            }
+        }),
+    };
+
+    // Every storage variant shares the same reverse_nested "histogram" envelope.
+    json!({
+        "histogram": {
+            "reverse_nested": {},
+            "aggs": scoped
+        }
+    })
+}
diff --git a/crates/genomehubs-api/src/report/mod.rs b/crates/genomehubs-api/src/report/mod.rs index ee72ffb..67c4458 100644 --- a/crates/genomehubs-api/src/report/mod.rs +++ b/crates/genomehubs-api/src/report/mod.rs @@ -11,7 +11,9 @@ pub mod agg; pub mod arc; pub mod bounds; +pub mod field; pub mod filter_expr; pub mod pipeline; pub mod positional; pub mod report_types; +pub mod spec_builder; diff --git a/crates/genomehubs-api/src/report/report_types.rs b/crates/genomehubs-api/src/report/report_types.rs index 31dc9ab..80e9e3c 100644 --- a/crates/genomehubs-api/src/report/report_types.rs +++ b/crates/genomehubs-api/src/report/report_types.rs @@ -3,47 +3,74 @@ //!
Each handler issues ES queries, applies bounds/aggregation/pipeline logic, //! and returns structured report data. +use chrono::Datelike; use genomehubs_query::query::{QueryParams, SearchQuery}; use genomehubs_query::report::axis::{AxisInput, AxisRole, AxisSpec, AxisSummary, ValueType}; use serde_json::{json, Value}; use std::sync::Arc; use crate::es_client; +use crate::index_name; use crate::report::agg::{ agg_builder_for, build_nested_attribute_histogram_with_categories, build_nested_attribute_scatter_agg, x_bucket_agg_name, }; use crate::report::bounds::compute_bounds; +use crate::report::field::{resolve_field_storage, FieldStorage}; use crate::report::pipeline::{Pipeline, ReportContext, ScaleStep}; use crate::AppState; -/// Extract per-category per-bucket counts from a v2-pattern `categoryHistograms` response. +fn value_type_to_string(v: ValueType) -> &'static str { + match v { + ValueType::Numeric => "float", + ValueType::Keyword => "keyword", + ValueType::Date => "date", + ValueType::GeoPoint => "coordinate", + ValueType::TaxonRank => "keyword", + } +} + +/// Extract per-category per-bucket counts from a `categoryHistograms` ES response. /// -/// For each category label the function follows: -/// `.../categoryHistograms/by_attribute/by_cat/by_value/buckets/{label}/histogram/by_attribute/{x_field}/histogram/buckets` +/// Uses [`FieldStorage`] to compute deterministic JSON pointer paths rather +/// than searching a candidate list. The x-axis inner histogram container +/// is always `"by_key"` (attribute) or `"at_rank"` (lineage) — see +/// [`build_inner_x_agg_block`][crate::report::field::build_inner_x_agg_block]. /// -/// Returns a JSON object mapping each category key to an array of `doc_count` values, one per -/// main-histogram bucket. Includes an `"other"` key when `show_other` is true. +/// Returns a JSON object mapping each category key to an array of `doc_count` +/// values, one per main-histogram bucket, aligned by key to the main buckets. 
#[allow(clippy::too_many_arguments)] fn extract_cat_histograms( resp: &Value, agg_name: &str, - x_field: &str, + x_storage: &FieldStorage, + cat_storage: &FieldStorage, x_bucket_agg: &str, main_bucket_count: usize, cat_labels: &[String], show_other: bool, cat_is_numeric: bool, main_counts: &[u64], + main_buckets: &[Value], ) -> Value { - let base = format!( - "/aggregations/{}/by_key/categoryHistograms/by_attribute/by_cat/by_value/buckets", - agg_name - ); + let base = match x_storage.cat_histograms_base(agg_name, cat_storage) { + Some(p) if resp.pointer(&p).is_some() => p, + _ => return Value::Null, + }; - if resp.pointer(&base).is_none() { - return Value::Null; - } + let inner_x = x_storage.inner_x_path(x_bucket_agg); + + // Build main bucket keys list for alignment. + let main_keys: Vec = main_buckets + .iter() + .map(|b| { + b.get("key") + .and_then(|k| k.as_str().map(|s| s.to_string())) + .or_else(|| b.get("key").map(|k| k.to_string())) + .or_else(|| b.get("id").and_then(|i| i.as_str().map(|s| s.to_string()))) + .unwrap_or_default() + }) + .collect(); let mut by_cat = serde_json::Map::new(); @@ -55,60 +82,45 @@ fn extract_cat_histograms( .cloned() .unwrap_or_default(); for bucket in &cat_buckets { - let key = bucket.get("key").and_then(|k| k.as_f64()).unwrap_or(0.0); - let label = key.to_string(); - let hist_path = format!( - "/histogram/by_attribute/{}/{}/buckets", - x_field, x_bucket_agg - ); - let mut counts: Vec = bucket - .pointer(&hist_path) + let key_val = bucket.get("key").cloned().unwrap_or(json!(0)); + let label = if let Some(kf) = key_val.as_f64() { + kf.to_string() + } else if let Some(ks) = key_val.as_str() { + ks.to_string() + } else { + key_val.to_string() + }; + let hist_buckets = bucket + .pointer(&inner_x) .and_then(|b| b.as_array()) - .map(|buckets| { - buckets - .iter() - .map(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) - .collect() - }) + .cloned() .unwrap_or_default(); - counts.resize(main_bucket_count, 0); - 
by_cat.insert(label, json!(counts)); + by_cat.insert( + label, + json!(align_to_keys(&hist_buckets, &main_keys, main_bucket_count)), + ); } } else { // by_value uses a filters agg — buckets is an object keyed by label. let mut named_sums: Vec> = Vec::with_capacity(cat_labels.len()); for label in cat_labels { - let hist_path = format!( - "{}/{}/histogram/by_attribute/{}/{}/buckets", - base, label, x_field, x_bucket_agg - ); - let mut counts: Vec = resp + let hist_path = format!("{}/{}{}", base, label, inner_x); + let hist_buckets = resp .pointer(&hist_path) .and_then(|b| b.as_array()) - .map(|buckets| { - buckets - .iter() - .map(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) - .collect() - }) + .cloned() .unwrap_or_default(); - counts.resize(main_bucket_count, 0); + let counts = align_to_keys(&hist_buckets, &main_keys, main_bucket_count); named_sums.push(counts.clone()); by_cat.insert(label.clone(), json!(counts)); } if show_other { - let other_path = format!( - "{}/other/histogram/by_attribute/{}/{}/buckets", - base, x_field, x_bucket_agg - ); + let other_path = format!("{}/other{}", base, inner_x); let other_counts: Vec = if let Some(buckets) = resp.pointer(&other_path).and_then(|b| b.as_array()) { - let mut v: Vec = buckets - .iter() - .map(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) - .collect(); + let mut v = align_to_keys(buckets, &main_keys, main_bucket_count); v.resize(main_bucket_count, 0); v } else { @@ -137,6 +149,38 @@ fn extract_cat_histograms( } } +/// Align a per-category inner histogram bucket list to the main-axis key ordering. +/// +/// Returns a `Vec` of length `main_bucket_count`, each entry being the +/// `doc_count` for the corresponding main bucket key. Missing inner keys +/// produce a zero count. 
+fn align_to_keys(
+    inner_buckets: &[Value],
+    main_keys: &[String],
+    main_bucket_count: usize,
+) -> Vec<u64> {
+    // Index the inner buckets by the string form of their key. The flag
+    // records whether the key was a JSON number, so the numeric fallback
+    // below never matches a keyword bucket that merely looks like a number.
+    let mut by_key: std::collections::HashMap<String, (u64, bool)> =
+        std::collections::HashMap::with_capacity(inner_buckets.len());
+    for b in inner_buckets {
+        let (kstr, is_numeric) = match b.get("key") {
+            Some(Value::String(s)) => (s.clone(), false),
+            Some(k) => match k.as_f64() {
+                Some(n) => (n.to_string(), true),
+                None => (k.to_string(), false),
+            },
+            // A bucket without a key is filed under the empty string.
+            None => (String::new(), false),
+        };
+        let cnt = b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0);
+        by_key.insert(kstr, (cnt, is_numeric));
+    }
+
+    let mut counts: Vec<u64> = main_keys
+        .iter()
+        .map(|k| {
+            // Exact match first: keyword keys, and numeric keys whose two
+            // renderings already agree (e.g. "0.5", "1420070400000").
+            if let Some((cnt, _)) = by_key.get(k) {
+                return *cnt;
+            }
+            // A main key rendered from a JSON float keeps its trailing ".0"
+            // ("1.0"), whereas the inner key above went through
+            // `f64::to_string` ("1"). Retry through the same f64 rendering so
+            // whole-number bucket keys line up instead of counting as zero.
+            k.parse::<f64>()
+                .ok()
+                .and_then(|n| by_key.get(&n.to_string()))
+                .filter(|(_, is_numeric)| *is_numeric)
+                .map(|(cnt, _)| *cnt)
+                .unwrap_or(0)
+        })
+        .collect();
+    // Pad (or trim) so the result always has one entry per main bucket.
+    counts.resize(main_bucket_count, 0);
+    counts
+}
+
 /// Run a histogram (or categorised histogram) report. /// /// Returns `(doc_count, took_ms, report_json)` or error. @@ -149,33 +193,60 @@ pub async fn run_histogram_report( base_query: &Value, ) -> Result<(u64, u64, Value), String> { let x_spec = resolve_axis_spec(AxisRole::X, report_config, state) + .await .ok_or("report config missing 'x' axis (set 'x' field or use 'axes')")?; - let x_field = x_spec.field.clone(); - let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state); + let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state).await; + + // Resolve storage types up-front so presence filters and extraction paths + // are computed from the same source of truth. + let x_storage = resolve_field_storage(&x_spec.field, x_spec.value_type, &state.cache)?; + let cat_storage_opt: Option = if let Some(ref cat_spec) = cat_spec_opt { + Some(resolve_field_storage( + &cat_spec.field, + cat_spec.value_type, + &state.cache, + )?) + } else { + None + }; + + // Augment the base query for bounds computation with a presence-filter + // for the opposite axis so bounds reflect only records that will be + // plotted.
+ let cat_presence = cat_storage_opt.as_ref().map(|s| s.presence_filter()); + let x_base_query = if let Some(f) = cat_presence { + json!({ "bool": { "must": [ base_query.clone(), f ] } }) + } else { + base_query.clone() + }; let x_bounds = compute_bounds( &state.client, &state.es_base, index, &x_spec, - base_query, + &x_base_query, &state.cache, ) .await?; let agg_name = "x_agg"; - let x_inner_agg = x_bucket_agg_name(x_spec.value_type); // Build aggregation — categorized path supports both keyword (filters) and numeric (histogram) cat. let (final_agg, cat_labels, show_other_cat, cat_is_numeric) = if let Some(ref cat_spec) = cat_spec_opt { + // Require the x-axis presence when computing cat bounds so returned + // categories are only those that will be plotted. + let x_presence = x_storage.presence_filter(); + let cat_base_query = json!({ "bool": { "must": [ base_query.clone(), x_presence ] } }); + let cat_bounds = compute_bounds( &state.client, &state.es_base, index, cat_spec, - base_query, + &cat_base_query, &state.cache, ) .await?; @@ -220,18 +291,24 @@ pub async fn run_histogram_report( .map(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) .collect(); - let by_cat = if !cat_labels.is_empty() || cat_is_numeric { - extract_cat_histograms( - &resp, - agg_name, - x_field.as_str(), - x_inner_agg, - raw_buckets.len(), - &cat_labels, - show_other_cat, - cat_is_numeric, - &main_counts, - ) + let by_cat = if let Some(ref cat_storage) = cat_storage_opt { + if !cat_labels.is_empty() || cat_is_numeric { + extract_cat_histograms( + &resp, + agg_name, + &x_storage, + cat_storage, + x_inner_agg, + raw_buckets.len(), + &cat_labels, + show_other_cat, + cat_is_numeric, + &main_counts, + &raw_buckets, + ) + } else { + Value::Null + } } else { Value::Null }; @@ -242,7 +319,34 @@ pub async fn run_histogram_report( cat_labels: x_bounds.cat_labels.clone(), show_other: x_spec.opts.show_other, }; - let processed_buckets = pipeline.run(raw_buckets.clone(), &ctx); + let 
processed_raw = pipeline.run(raw_buckets.clone(), &ctx); + + // Align and label processed buckets. When the bounds provide an + // authoritative `fixed_terms` list, use that ordering and drop any + // unexpected buckets. Otherwise, for keyword axes drop zero-count + // placeholder buckets and ensure each bucket has a label. + let processed_buckets = if !x_bounds.fixed_terms.is_empty() { + align_and_label_processed_buckets( + processed_raw, + &x_bounds.fixed_terms, + &x_bounds.cat_labels, + ) + } else { + let mut pb = processed_raw; + if matches!(x_spec.value_type, ValueType::Keyword) { + pb.retain(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0) > 0); + } + for b in pb.iter_mut() { + if b.get("label").is_none() { + let id_str = b + .get("key") + .and_then(|k| k.as_str().map(|s| s.to_string())) + .unwrap_or_else(|| b.get("key").map(|k| k.to_string()).unwrap_or_default()); + b["label"] = json!(id_str); + } + } + pb + }; // allValues: flat array of doc_counts parallel to buckets. let all_values: Vec = raw_buckets @@ -253,19 +357,210 @@ pub async fn run_histogram_report( let mut report_data = json!({ "type": "histogram", "x": { - "field": &x_field, + "field": x_spec.field.as_str(), "scale": format!("{:?}", x_spec.opts.scale).to_lowercase(), "domain": x_bounds.domain, - "tickCount": x_bounds.tick_count + "tickCount": x_bounds.tick_count, + "value_type": value_type_to_string(x_spec.value_type) }, "buckets": processed_buckets, "allValues": all_values }); if !by_cat.is_null() { - report_data["by_cat"] = by_cat; - report_data["cat"] = json!(cat_spec_opt.as_ref().map(|s| s.field.as_str())); - report_data["cats"] = json!(cat_labels); + report_data["by_cat"] = by_cat.clone(); + if let Some(ref cat_spec) = cat_spec_opt { + report_data["cat"] = json!({ + "field": cat_spec.field, + "value_type": value_type_to_string(cat_spec.value_type), + "scale": format!("{:?}", cat_spec.opts.scale).to_lowercase() + }); + } + + // Determine the final `cats` labels. 
Prefer the pre-computed + // `cat_labels` (from bounds) when present; otherwise derive + // readable labels from the `by_cat` histogram keys. This covers + // numeric/date category histograms where `cat_labels` is empty. + // Also compute numeric `tick_values` (boundaries) when applicable + // and attach them to `report_data["cat"]["tick_values"]` so the + // plot-spec builder can use them for binned encodings. + let mut final_cat_labels = cat_labels.clone(); + let mut cat_tick_values: Option> = None; + // Keep canonical raw category keys (object keys) for `report.cats` + // so downstream converters can look up `by_cat[cat_key]`. We'll add + // human-readable labels into `report.cat.tick_labels`. + let mut cat_keys: Vec = Vec::new(); + if final_cat_labels.is_empty() { + if let Some(obj) = by_cat.as_object() { + if !obj.is_empty() { + // Preserve insertion order of the buckets as returned by ES. + let keys: Vec = obj.keys().cloned().collect(); + cat_keys = keys.clone(); + if let Some(ref cat_spec) = cat_spec_opt { + match cat_spec.value_type { + ValueType::Date => { + // Parse numeric keys and compute adjacent boundaries + // to present human-friendly date ranges. + let nums: Vec = keys + .iter() + .map(|k| k.parse::().unwrap_or_default()) + .collect(); + if nums.is_empty() { + final_cat_labels = keys; + } else { + // estimate interval from first two keys, fallback to 1 + let width = if nums.len() >= 2 { + nums[1] - nums[0] + } else { + 1.0 + }; + let mut boundaries = nums.clone(); + boundaries.push(nums.last().copied().unwrap_or(0.0) + width); + // Attach numeric boundaries for the axis + cat_tick_values = Some(boundaries.clone()); + let mut labels: Vec = Vec::with_capacity(nums.len()); + for i in 0..nums.len() { + let left = nums[i]; + let right = boundaries[i + 1]; + // Heuristic: treat large values as milliseconds, + // otherwise seconds since epoch. 
+ let left_i = left as i64; + let right_i = right as i64; + let left_dt = if left_i.abs() > 1_000_000_000_000 { + // milliseconds -> seconds + nanos + let s = left_i / 1000; + #[allow(clippy::cast_abs_to_unsigned)] + let ns = ((left_i % 1000).abs() as u32) * 1_000_000; + #[allow(deprecated)] + chrono::NaiveDateTime::from_timestamp_opt(s, ns) + } else { + #[allow(deprecated)] + chrono::NaiveDateTime::from_timestamp_opt(left_i, 0) + }; + let right_dt = if right_i.abs() > 1_000_000_000_000 { + let s = right_i / 1000; + #[allow(clippy::cast_abs_to_unsigned)] + let ns = ((right_i % 1000).abs() as u32) * 1_000_000; + #[allow(deprecated)] + chrono::NaiveDateTime::from_timestamp_opt(s, ns) + } else { + #[allow(deprecated)] + chrono::NaiveDateTime::from_timestamp_opt(right_i, 0) + }; + if let (Some(ldt), Some(rdt)) = (left_dt, right_dt) { + #[allow(deprecated)] + let ldt = chrono::DateTime::::from_utc( + ldt, + chrono::Utc, + ); + #[allow(deprecated)] + let rdt = chrono::DateTime::::from_utc( + rdt, + chrono::Utc, + ); + // Format as %Y-%m if day is 1, else %Y-%m-%d; collapse to %Y if month and day are both 1 + let fmt_date = |dt: &chrono::DateTime| { + let y = dt.year(); + let m = dt.month(); + let d = dt.day(); + if m == 1 && d == 1 { + format!("{:04}", y) + } else if d == 1 { + format!("{:04}-{:02}", y, m) + } else { + format!("{:04}-{:02}-{:02}", y, m, d) + } + }; + labels.push(format!( + "{} to {}", + fmt_date(&ldt), + fmt_date(&rdt) + )); + } else if let Some(ldt) = left_dt { + #[allow(deprecated)] + let ldt = chrono::DateTime::::from_utc( + ldt, + chrono::Utc, + ); + labels.push(format!("{}", ldt.format("%Y-%m-%d"))); + } else { + labels.push(keys[i].clone()); + } + } + final_cat_labels = labels; + } + } + _ => { + // Numeric buckets: produce readable range labels + let nums: Vec = keys + .iter() + .map(|k| k.parse::().unwrap_or_default()) + .collect(); + if nums.is_empty() { + final_cat_labels = keys; + } else { + let width = if nums.len() >= 2 { + nums[1] - nums[0] 
+ } else { + 1.0 + }; + let mut boundaries = nums.clone(); + boundaries.push(nums.last().copied().unwrap_or(0.0) + width); + // Attach numeric boundaries for the axis + cat_tick_values = Some(boundaries.clone()); + let mut labels: Vec = Vec::with_capacity(nums.len()); + for i in 0..nums.len() { + let left = nums[i]; + let right = boundaries[i + 1];
+                                        let fmt = |v: f64| {
+                                            // Scale by an SI suffix (k/M/G) and print three
+                                            // decimal places, e.g. 2130000000 -> "2.130G".
+                                            let abs_v = v.abs();
+                                            let (scaled, suffix) = if abs_v >= 1e9 {
+                                                (v / 1e9, "G")
+                                            } else if abs_v >= 1e6 {
+                                                (v / 1e6, "M")
+                                            } else if abs_v >= 1e3 {
+                                                (v / 1e3, "k")
+                                            } else {
+                                                (v, "")
+                                            };
+                                            // An empty suffix appends nothing, so one format call covers both cases.
+                                            format!("{:.3}{}", scaled, suffix)
+                                        };
+                                        // Same "<left> to <right>" shape as the date branch; the
+                                        // separator needs a space on both sides.
+                                        labels.push(format!("{} to {}", fmt(left), fmt(right)));
+ } + final_cat_labels = labels; + } + } + } + } else { + final_cat_labels = keys.clone(); + cat_keys = keys; + } + } + } + } + if let Some(tvals) = cat_tick_values { + report_data["cat"]["tick_values"] = json!(tvals); + } + + // Use raw category keys for `report.cats` (these are the keys used + // to index `by_cat`). Provide human-readable labels under + // `report.cat.tick_labels` so the plot-spec builder can use them for + // legend/axis labeling while converters still match counts by key. + if cat_keys.is_empty() { + // Fallback: when we didn't capture keys earlier, attempt to + // populate from final_cat_labels (they may already be raw keys). + report_data["cats"] = json!(final_cat_labels.clone()); + } else { + report_data["cats"] = json!(cat_keys); + } + if !final_cat_labels.is_empty() { + report_data["cat"]["tick_labels"] = json!(final_cat_labels); + } } Ok((total_hits, took, report_data)) @@ -506,7 +801,7 @@ pub async fn run_tree_report( // Prefer structured `axes` array; fall back to flat `y:` / `y_opts:` or legacy // `fields:` sequence (AxisSummary::Value for all).
let tree_field_specs: Vec<(String, AxisSummary)> = { - let from_axes = resolve_y_specs(report_config, state); + let from_axes = resolve_y_specs(report_config, state).await; if !from_axes.is_empty() { from_axes .into_iter() @@ -531,7 +826,7 @@ pub async fn run_tree_report( }; // --- Cat axis: resolve full AxisSpec + bounds (same pipeline as histogram) --- - let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state); + let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state).await; let cat_bounds_opt = if let Some(ref cat_spec) = cat_spec_opt { Some( compute_bounds( @@ -717,8 +1012,27 @@ pub async fn run_tree_report( // One extra ES query using nested lineage → reverse_nested to count how many // taxa at `count_rank` descend from each tree node. if let Some(ref rank) = count_rank { - let descendant_counts = - fetch_descendant_counts(&state.client, &state.es_base, index, &lca_id, rank).await?; + // Speed-up: restrict descendant count aggregation to the set of tree + // node IDs we've already collected so ES only computes counts for + // those ancestors. This is much faster than enumerating all + // ancestors under the LCA when the tree is small or moderate-sized. + let candidate_ids: Vec = tree_nodes.keys().cloned().collect(); + // Use fast-path only when candidate set is reasonably small to avoid + // building a huge `terms` filter; fallback to composite when > 10k. 
+ let candidate_slice: Option<&[String]> = if candidate_ids.len() <= 10_000 { + Some(candidate_ids.as_slice()) + } else { + None + }; + let descendant_counts = fetch_descendant_counts( + &state.client, + &state.es_base, + index, + &lca_id, + rank, + candidate_slice, + ) + .await?; took_total += descendant_counts.took; for (taxon_id, count) in descendant_counts.counts { if let Some(node) = tree_nodes.get_mut(&taxon_id) { @@ -823,7 +1137,7 @@ pub async fn run_map_report( let hexbin_field = format!("hexbin{hex_resolution}"); // --- Cat axis (optional) --- - let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state); + let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state).await; let cat_bounds_opt = if let Some(ref spec) = cat_spec_opt { compute_bounds( &state.client, @@ -1039,40 +1353,36 @@ pub async fn run_map_report( // Helper functions // ============================================================================ -/// Infer the ValueType of a field from metadata cache. -/// Defaults to Numeric if not found or cache unavailable. -fn infer_value_type( +/// Async variant of `infer_value_type` that acquires the cache read lock +/// via `read().await` so callers in async handlers can reliably observe +/// populated metadata without falling back on the non-blocking `try_read`. 
+async fn infer_value_type_async( field: &str, cache: &Option>>, ) -> ValueType { - // Check if it's a rank in the metadata if let Some(cache_lock) = cache { - if let Ok(c) = cache_lock.try_read() { - if c.taxonomic_ranks.contains(&field.to_string()) { - return ValueType::TaxonRank; - } - - // Check if it's an attribute in the metadata - if let serde_json::Value::Object(groups) = &c.attr_types { - for (_, group) in groups { - if let serde_json::Value::Object(fields) = group { - if let Some(serde_json::Value::Object(meta_obj)) = fields.get(field) { - if let Some(type_str) = meta_obj.get("type").and_then(|v| v.as_str()) { - return match type_str { - "date" => ValueType::Date, - "keyword" => ValueType::Keyword, - "long" | "integer" | "float" | "double" => ValueType::Numeric, - "geo_point" => ValueType::GeoPoint, - _ => ValueType::Keyword, - }; - } + let guard = cache_lock.read().await; + if guard.taxonomic_ranks.contains(&field.to_string()) { + return ValueType::TaxonRank; + } + if let serde_json::Value::Object(groups) = &guard.attr_types { + for (_, group) in groups { + if let serde_json::Value::Object(fields) = group { + if let Some(serde_json::Value::Object(meta_obj)) = fields.get(field) { + if let Some(type_str) = meta_obj.get("type").and_then(|v| v.as_str()) { + return match type_str { + "date" => ValueType::Date, + "keyword" => ValueType::Keyword, + "long" | "integer" | "float" | "double" => ValueType::Numeric, + "geo_point" => ValueType::GeoPoint, + _ => ValueType::Keyword, + }; } } } } } } - // Default to Numeric if not found in metadata ValueType::Numeric } @@ -1081,7 +1391,7 @@ fn infer_value_type( /// Checks the structured `axes` array first. Falls back to legacy flat keys /// (`x`/`x_opts`, `y`/`y_opts`, `cat`/`cat_opts`) so existing request bodies /// continue to work unchanged. 
-fn resolve_axis_spec( +async fn resolve_axis_spec( role: AxisRole, report_config: &serde_yaml::Value, state: &Arc, @@ -1100,7 +1410,7 @@ fn resolve_axis_spec( continue; } if let Ok(input) = serde_yaml::from_value::(entry.clone()) { - let inferred = infer_value_type(&input.field, &state.cache); + let inferred = infer_value_type_async(&input.field, &state.cache).await; return Some(input.into_spec(inferred)); } } @@ -1117,7 +1427,7 @@ fn resolve_axis_spec( .get(&opts_key) .and_then(|v| v.as_str()) .unwrap_or(""); - let value_type = infer_value_type(field, &state.cache); + let value_type = infer_value_type_async(field, &state.cache).await; Some(AxisSpec { field: field.to_string(), role, @@ -1136,20 +1446,24 @@ fn resolve_axis_spec( /// Prefers the structured `axes` array (multiple entries, per-field `summary` and /// opts). Falls back to the flat `y:` + `y_opts:` shorthand for a single field /// with `AxisSummary::Value`. Returns an empty vec when neither is present. -fn resolve_y_specs(report_config: &serde_yaml::Value, state: &Arc) -> Vec { +async fn resolve_y_specs( + report_config: &serde_yaml::Value, + state: &Arc, +) -> Vec { // Structured form: collect every entry with position == "y" if let Some(axes) = report_config.get("axes").and_then(|a| a.as_sequence()) { - let specs: Vec = axes + let inputs: Vec = axes .iter() .filter(|e| e.get("position").and_then(|p| p.as_str()) == Some("y")) .filter_map(|e| serde_yaml::from_value::(e.clone()).ok()) - .map(|input| { - let inferred = infer_value_type(&input.field, &state.cache); - input.into_spec(inferred) - }) .collect(); - if !specs.is_empty() { - return specs; + if !inputs.is_empty() { + let mut out = Vec::with_capacity(inputs.len()); + for input in inputs { + let inferred = infer_value_type_async(&input.field, &state.cache).await; + out.push(input.into_spec(inferred)); + } + return out; } } @@ -1162,7 +1476,7 @@ fn resolve_y_specs(report_config: &serde_yaml::Value, state: &Arc) -> .get("y_opts") .and_then(|v| 
v.as_str()) .unwrap_or(""); - let value_type = infer_value_type(field, &state.cache); + let value_type = infer_value_type_async(field, &state.cache).await; vec![AxisSpec { field: field.to_string(), role: AxisRole::Y, @@ -1495,63 +1809,172 @@ async fn fetch_descendant_counts( index: &str, lca_id: &str, count_rank: &str, + candidate_ids: Option<&[String]>, ) -> Result { - let body = json!({ - "size": 0, - "query": { - "bool": { - "must": [ - { "term": { "taxon_rank": count_rank } }, - { - "nested": { - "path": "lineage", - "query": { "term": { "lineage.taxon_id": lca_id } } + // If caller provides a candidate ID set, restrict to those IDs using a + // nested `filter` + `terms` agg. This avoids paging and is much faster + // when the ID set is small relative to the full space. + if let Some(ids) = candidate_ids { + if ids.is_empty() { + return Ok(DescendantCounts { + counts: std::collections::HashMap::new(), + took: 0, + }); + } + // Build a nested -> filter(terms(ids)) -> terms agg over lineage.taxon_id + let body = json!({ + "size": 0, + "query": { + "bool": { + "must": [ + { "term": { "taxon_rank": count_rank } }, + { + "nested": { + "path": "lineage", + "query": { "term": { "lineage.taxon_id": lca_id } } + } + } + ] + } + }, + "aggs": { + "by_ancestor": { + "nested": { "path": "lineage" }, + "aggs": { + "filtered": { + "filter": { "terms": { "lineage.taxon_id": ids } }, + "aggs": { + "ancestors": { + "terms": { "field": "lineage.taxon_id", "size": 10000 }, + "aggs": { "node_count": { "reverse_nested": {} } } + } + } } } - ] + } } - }, - "aggs": { - "by_ancestor": { - "nested": { "path": "lineage" }, - "aggs": { - "ancestors": { - "terms": { - "field": "lineage.taxon_id", - "size": 100000 - }, - "aggs": { - "node_count": { "reverse_nested": {} } + }); + + let resp = crate::es_client::execute_search(client, es_base, index, &body).await?; + let took = resp.get("took").and_then(|t| t.as_u64()).unwrap_or(0); + let buckets = resp + 
.pointer("/aggregations/by_ancestor/filtered/ancestors/buckets") + .and_then(|b| b.as_array()) + .cloned() + .unwrap_or_default(); + let mut counts = std::collections::HashMap::with_capacity(buckets.len()); + for bucket in &buckets { + let taxon_id = match bucket.get("key").and_then(|k| k.as_str()) { + Some(id) => id.to_string(), + None => continue, + }; + let count = bucket + .pointer("/node_count/doc_count") + .and_then(|c| c.as_u64()) + .unwrap_or(0); + counts.insert(taxon_id, count); + } + return Ok(DescendantCounts { counts, took }); + } + + // Use a composite aggregation inside the nested `lineage` agg so we can + // page through ancestor buckets without materialising them all at once. + // Loop until `after_key` is absent. + let mut counts: std::collections::HashMap = std::collections::HashMap::new(); + let mut took_total: u64 = 0; + let mut after_key: Option = None; + + loop { + // Build composite aggregation block, including `after` when present. + let mut composite_obj = json!({ + "size": 1000, + "sources": [{ "ancestor_id": { "terms": { "field": "lineage.taxon_id" } } }] + }); + if let Some(ref ak) = after_key { + composite_obj["after"] = ak.clone(); + } + + let body = json!({ + "size": 0, + "query": { + "bool": { + "must": [ + { "term": { "taxon_rank": count_rank } }, + { + "nested": { + "path": "lineage", + "query": { "term": { "lineage.taxon_id": lca_id } } + } + } + ] + } + }, + "aggs": { + "by_ancestor": { + "nested": { "path": "lineage" }, + "aggs": { + "ancestors": { + "composite": composite_obj, + "aggs": { + "node_count": { "reverse_nested": {} } + } } } } } - } - }); + }); - let resp = crate::es_client::execute_search(client, es_base, index, &body).await?; - let took = resp.get("took").and_then(|t| t.as_u64()).unwrap_or(0); + let resp = crate::es_client::execute_search(client, es_base, index, &body).await?; + took_total += resp.get("took").and_then(|t| t.as_u64()).unwrap_or(0); - let buckets = resp - 
.pointer("/aggregations/by_ancestor/ancestors/buckets") - .and_then(|b| b.as_array()) - .cloned() - .unwrap_or_default(); + let buckets = resp + .pointer("/aggregations/by_ancestor/ancestors/buckets") + .and_then(|b| b.as_array()) + .cloned() + .unwrap_or_default(); - let mut counts = std::collections::HashMap::with_capacity(buckets.len()); - for bucket in &buckets { - let taxon_id = match bucket.get("key").and_then(|k| k.as_str()) { - Some(id) => id.to_string(), - None => continue, - }; - let count = bucket - .pointer("/node_count/doc_count") - .and_then(|c| c.as_u64()) - .unwrap_or(0); - counts.insert(taxon_id, count); + for bucket in &buckets { + // Composite bucket keys are objects like { "ancestor_id": "123" } + let taxon_id = if let Some(obj) = bucket.get("key").and_then(|k| k.as_object()) { + if let Some(v) = obj.get("ancestor_id") { + if let Some(s) = v.as_str() { + s.to_string() + } else if let Some(n) = v.as_f64() { + n.to_string() + } else { + continue; + } + } else { + continue; + } + } else if let Some(s) = bucket.get("key").and_then(|k| k.as_str()) { + s.to_string() + } else if let Some(n) = bucket.get("key").and_then(|k| k.as_f64()) { + n.to_string() + } else { + continue; + }; + + let count = bucket + .pointer("/node_count/doc_count") + .and_then(|c| c.as_u64()) + .unwrap_or(0); + counts.insert(taxon_id, count); + } + + // Check for pagination `after_key` + after_key = resp + .pointer("/aggregations/by_ancestor/ancestors/after_key") + .cloned(); + if after_key.is_none() { + break; + } } - Ok(DescendantCounts { counts, took }) + Ok(DescendantCounts { + counts, + took: took_total, + }) } /// Compute subtree counts via iterative post-order DFS. @@ -1881,6 +2304,51 @@ fn find_attr_numeric(attrs: &[Value], field: &str) -> Option { None } +/// Find the first date attribute value for `field` in an `attributes` array. +/// Returns milliseconds since epoch as `f64` when possible. 
+fn find_attr_date(attrs: &[Value], field: &str) -> Option { + for attr in attrs { + if attr.get("key").and_then(|k| k.as_str()) != Some(field) { + continue; + } + + // If ES stored the date as a numeric epoch (stats use this), accept it. + if let Some(n) = attr.get("date_value").and_then(|v| v.as_f64()) { + return Some(n); + } + + // If it's a string (ISO or yyyy-mm-dd), try parsing common formats. + if let Some(s) = attr.get("date_value").and_then(|v| v.as_str()) { + // Try RFC3339 first + if let Ok(dt) = chrono::DateTime::parse_from_rfc3339(s) { + return Some(dt.timestamp_millis() as f64); + } + // Try simple date-only form YYYY-MM-DD + if let Ok(nd) = chrono::NaiveDate::parse_from_str(s, "%Y-%m-%d") { + if let Some(naive_dt) = nd.and_hms_opt(0, 0, 0) { + let dt = chrono::DateTime::::from_naive_utc_and_offset( + naive_dt, + chrono::Utc, + ); + return Some(dt.timestamp_millis() as f64); + } + } + // Try common datetime without timezone + if let Ok(ndt) = chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S") { + let dt = + chrono::DateTime::::from_naive_utc_and_offset(ndt, chrono::Utc); + return Some(dt.timestamp_millis() as f64); + } + if let Ok(ndt) = chrono::NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S") { + let dt = + chrono::DateTime::::from_naive_utc_and_offset(ndt, chrono::Utc); + return Some(dt.timestamp_millis() as f64); + } + } + } + None +} + /// Find the first keyword attribute value for `field` in an `attributes` array. 
fn find_attr_keyword(attrs: &[Value], field: &str) -> Option { attrs @@ -1902,24 +2370,26 @@ fn find_attr_keyword(attrs: &[Value], field: &str) -> Option { fn extract_scatter_by_cat( resp: &Value, agg_name: &str, - x_field: &str, + x_storage: &FieldStorage, x_bucket_agg: &str, + cat_storage: &FieldStorage, y_field: &str, + y_inner_agg: &str, x_bucket_count: usize, y_bucket_count: usize, cat_labels: &[String], show_other: bool, cat_is_numeric: bool, main_counts: &[u64], + y_fixed_terms: Option<&[String]>, ) -> (Value, Value) { - let base = format!( - "/aggregations/{}/by_key/categoryHistograms/by_attribute/by_cat/by_value/buckets", - agg_name - ); + let base = match x_storage.cat_histograms_base(agg_name, cat_storage) { + Some(p) if resp.pointer(&p).is_some() => p, + _ => return (Value::Null, Value::Null), + }; - if resp.pointer(&base).is_none() { - return (Value::Null, Value::Null); - } + // Relative path from a per-category bucket to the inner x histogram buckets. + let inner_x = x_storage.inner_x_path(x_bucket_agg); let mut by_cat = serde_json::Map::new(); let mut y_values_by_cat = serde_json::Map::new(); @@ -1934,12 +2404,8 @@ fn extract_scatter_by_cat( for bucket in &cat_buckets { let key = bucket.get("key").and_then(|k| k.as_f64()).unwrap_or(0.0); let label = key.to_string(); - let x_path = format!( - "/histogram/by_attribute/{}/{}/buckets", - x_field, x_bucket_agg - ); let x_buckets_inner = bucket - .pointer(&x_path) + .pointer(&inner_x) .and_then(|b| b.as_array()) .cloned() .unwrap_or_default(); @@ -1952,16 +2418,35 @@ fn extract_scatter_by_cat( .and_then(|c| c.as_u64()) .unwrap_or(0), ); - let y_path = format!("/yHistograms/by_attribute/{}/histogram/buckets", y_field); - let y_counts = x_bucket - .pointer(&y_path) - .and_then(|b| b.as_array()) - .map(|yb| { + let y_path = format!( + "/yHistograms/by_attribute/{}/{}/buckets", + y_field, y_inner_agg + ); + let y_counts = if let Some(yb) = + x_bucket.pointer(&y_path).and_then(|b| b.as_array()) + { + if let 
Some(fixed) = y_fixed_terms { + use std::collections::HashMap; + let mut map: HashMap = HashMap::new(); + for b in yb { + if let Some(k) = b.get("key").and_then(|k| k.as_str()) { + let c = b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0); + map.insert(k.to_string(), c); + } + } + let mut aligned = Vec::with_capacity(fixed.len()); + for key in fixed { + aligned.push(map.get(key.as_str()).copied().unwrap_or(0)); + } + aligned + } else { yb.iter() .map(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) .collect() - }) - .unwrap_or_else(|| vec![0; y_bucket_count]); + } + } else { + vec![0; y_bucket_count] + }; y_counts_per_x.push(y_counts); } x_counts.resize(x_bucket_count, 0); @@ -1980,10 +2465,7 @@ fn extract_scatter_by_cat( }; for label in &all_labels { - let x_hist_path = format!( - "{}/{}/histogram/by_attribute/{}/{}/buckets", - base, label, x_field, x_bucket_agg - ); + let x_hist_path = format!("{}/{}{}", base, label, inner_x); let x_buckets = resp .pointer(&x_hist_path) .and_then(|b| b.as_array()) @@ -1999,17 +2481,35 @@ fn extract_scatter_by_cat( .and_then(|c| c.as_u64()) .unwrap_or(0), ); - let y_hist_path = - format!("/yHistograms/by_attribute/{}/histogram/buckets", y_field); - let y_counts = x_bucket - .pointer(&y_hist_path) - .and_then(|b| b.as_array()) - .map(|yb| { + let y_hist_path = format!( + "/yHistograms/by_attribute/{}/{}/buckets", + y_field, y_inner_agg + ); + let y_counts = if let Some(yb) = + x_bucket.pointer(&y_hist_path).and_then(|b| b.as_array()) + { + if let Some(fixed) = y_fixed_terms { + use std::collections::HashMap; + let mut map: HashMap = HashMap::new(); + for b in yb { + if let Some(k) = b.get("key").and_then(|k| k.as_str()) { + let c = b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0); + map.insert(k.to_string(), c); + } + } + let mut aligned = Vec::with_capacity(fixed.len()); + for key in fixed { + aligned.push(map.get(key.as_str()).copied().unwrap_or(0)); + } + aligned + } else { yb.iter() .map(|b| 
b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) .collect() - }) - .unwrap_or_else(|| vec![0; y_bucket_count]); + } + } else { + vec![0; y_bucket_count] + }; y_counts_per_x.push(y_counts); } x_counts.resize(x_bucket_count, 0); @@ -2059,6 +2559,189 @@ fn compute_z_domain(all_y_values: &[Vec]) -> [u64; 2] { } } +/// Fetch scientific name labels for a list of taxon ids in the configured taxon index. +/// Returns a Vec of labels aligned to the input `ids` (falls back to the id string when +/// a name is not found). +async fn fetch_taxon_labels( + state: &Arc, + ids: &[String], + rank: &str, +) -> Result, String> { + if ids.is_empty() { + return Ok(vec![]); + } + + let taxon_index = index_name::resolve_index_str(&state.default_result, state); + + // Build msearch body: one query per id so we can preserve order in the + // responses. + let mut searches: Vec<(String, Value)> = Vec::new(); + for id in ids { + let q = json!({ + "query": { + "bool": { + "filter": [ + { "term": { "taxon_id": id } }, + { "term": { "taxon_rank": rank } } + ] + } + }, + "_source": ["taxon_id", "scientific_name"] + }); + searches.push((taxon_index.clone(), q)); + } + + let nd = es_client::build_msearch_body(&searches); + let resp = es_client::execute_msearch(&state.client, &state.es_base, &nd).await?; + + let mut labels: Vec = Vec::with_capacity(ids.len()); + if let Some(resps) = resp.get("responses").and_then(|r| r.as_array()) { + for (i, r) in resps.iter().enumerate() { + if let Some(total) = r.pointer("/hits/total/value").and_then(|v| v.as_u64()) { + if total >= 1 { + if let Some(hit) = r.pointer("/hits/hits/0/_source/scientific_name") { + if let Some(s) = hit.as_str() { + labels.push(s.to_string()); + continue; + } + } + } + } + // fallback: use the id string + labels.push(ids.get(i).cloned().unwrap_or_default()); + } + } + Ok(labels) +} + +/// Build a canonical, labelled buckets array from raw ES buckets. 
+/// +/// If `fixed_terms` is non-empty, produce buckets in that order and +/// include only those terms (this prevents appending unexpected buckets +/// produced by ES). If `bucket_labels` aligns with `fixed_terms`, use +/// those human-readable labels; otherwise fall back to any `label` field +/// on the bucket or the id string. +fn build_structured_buckets( + raw_buckets: &[Value], + fixed_terms: &[String], + bucket_labels: &[String], +) -> Vec { + use std::collections::HashMap; + // Build key -> bucket map for fast lookup + let mut map: HashMap = HashMap::new(); + for b in raw_buckets { + if let Some(kv) = b.get("key") { + let ks = if let Some(s) = kv.as_str() { + s.to_string() + } else { + kv.to_string() + }; + map.insert(ks, b.clone()); + } + } + + let mut out: Vec = Vec::new(); + if !fixed_terms.is_empty() { + // Use fixed_terms ordering and labels when available + for (i, id) in fixed_terms.iter().enumerate() { + let id_str = id.clone(); + let label = if !bucket_labels.is_empty() && bucket_labels.len() == fixed_terms.len() { + bucket_labels.get(i).cloned().unwrap_or(id_str.clone()) + } else if let Some(b) = map.get(&id_str) { + b.get("label") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or(id_str.clone()) + } else { + id_str.clone() + }; + let count = map + .get(&id_str) + .and_then(|b| b.get("doc_count").and_then(|c| c.as_u64())) + .unwrap_or(0); + out.push(json!({"id": id_str, "label": label, "count": count})); + } + } else { + // No fixed terms: preserve raw bucket order, attach label if present + for b in raw_buckets { + let id_val = b.get("key").cloned().unwrap_or(Value::Null); + let id_str = if let Some(s) = id_val.as_str() { + s.to_string() + } else { + id_val.to_string() + }; + let label = b + .get("label") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .unwrap_or(id_str.clone()); + let count = b.get("doc_count").and_then(|v| v.as_u64()).unwrap_or(0); + out.push(json!({"id": id_str, "label": label, "count": count})); + } 
+ } + out +} + +/// Align processed buckets (which may include `key_scaled` etc.) to `fixed_terms` and +/// attach `label` fields. If `fixed_terms` is empty, return processed buckets with +/// labels attached where possible. +fn align_and_label_processed_buckets( + mut processed: Vec, + fixed_terms: &[String], + fixed_labels: &[String], +) -> Vec { + use std::collections::HashMap; + if fixed_terms.is_empty() { + // Attach labels if provided in fixed_labels (unlikely when empty) + for (i, b) in processed.iter_mut().enumerate() { + if let Some(_lbl) = b.get("label").and_then(|v| v.as_str()) { + // already has label + } else if i < fixed_labels.len() { + b["label"] = json!(fixed_labels[i].clone()); + } + } + return processed; + } + + // Map existing processed buckets by id string + let mut map: HashMap = HashMap::new(); + for b in processed.into_iter() { + let id_str = if let Some(k) = b.get("key") { + if let Some(s) = k.as_str() { + s.to_string() + } else { + k.to_string() + } + } else if let Some(id) = b.get("id") { + id.as_str().map(|s| s.to_string()).unwrap_or(id.to_string()) + } else { + continue; + }; + map.insert(id_str, b); + } + + let mut out: Vec = Vec::new(); + for (i, id) in fixed_terms.iter().enumerate() { + let bucket = map.remove(id); + let mut b = if let Some(existing) = bucket { + existing + } else { + // Create empty bucket placeholder + json!({"key": id.clone(), "doc_count": 0}) + }; + let label = if !fixed_labels.is_empty() && fixed_labels.len() == fixed_terms.len() { + fixed_labels.get(i).cloned().unwrap_or(id.clone()) + } else if let Some(lbl) = b.get("label").and_then(|v| v.as_str()) { + lbl.to_string() + } else { + id.clone() + }; + b["label"] = json!(label); + out.push(b); + } + out +} + /// Fetch raw point data for scatter when total hits are within the scatter threshold. 
/// /// Returns an object mapping category name to an array of `{scientific_name, taxonId, x, y, cat}` @@ -2069,16 +2752,68 @@ async fn fetch_raw_point_data( index: &str, base_query: &Value, x_field: &str, + x_is_taxon_rank: bool, y_field: &str, + y_is_taxon_rank: bool, cat_field: Option<&str>, cat_labels: &[String], show_other: bool, + x_fixed_terms: Option<&[String]>, threshold: usize, ) -> Value { + // Build combined query: base_query AND optional x_bucket filter when + // `x_fixed_terms` provided. This ensures raw points align with the + // canonical buckets used to compute axis ticks and avoid stray points + // that fall outside those buckets. + let mut final_query: Value = base_query.clone(); + if let Some(fixed) = x_fixed_terms { + // Convert slice into JSON array + let fixed_json = json!(fixed); + if x_is_taxon_rank { + // Nested lineage filter: require ancestor at the requested rank + // whose taxon_id is one of the fixed terms. + let extra_filter = json!({ + "nested": { + "path": "lineage", + "query": { + "bool": { + "must": [ + { "term": { "lineage.taxon_rank": x_field } }, + { "terms": { "lineage.taxon_id": fixed_json } } + ] + } + } + } + }); + final_query = json!({ "bool": { "must": [ base_query.clone(), extra_filter ] } }); + } else { + // Non-rank: try to match either nested attributes (attributes.key) + // or a top-level `.keyword` field. Use a SHOULD so either form + // matching will include the document. 
+ let attr_filter = json!({ + "nested": { + "path": "attributes", + "query": { + "bool": { + "must": [ + { "term": { "attributes.key": x_field } }, + { "terms": { "attributes.keyword_value.raw": fixed_json } } + ] + } + } + } + }); + let top_filter = json!({ "terms": { format!("{}.keyword", x_field): fixed_json } }); + let should_filter = json!({ "bool": { "should": [ attr_filter, top_filter ], "minimum_should_match": 1 } }); + final_query = json!({ "bool": { "must": [ base_query.clone(), should_filter ] } }); + } + } + + // Request `lineage` so we can resolve ancestor IDs when the axis is a taxon rank. let es_body = json!({ "size": threshold, - "query": base_query, - "_source": ["scientific_name", "taxon_id", "attributes"] + "query": final_query, + "_source": ["scientific_name", "taxon_id", "attributes", "lineage"] }); let resp = match es_client::execute_search(&state.client, &state.es_base, index, &es_body).await @@ -2120,13 +2855,105 @@ async fn fetch_raw_point_data( .cloned() .unwrap_or_default(); - let x_val = match find_attr_numeric(&attrs, x_field) { - Some(v) => v, - None => continue, + // Extract x and y values. When the axis is a taxon rank, prefer the + // ancestor id found in `lineage`. Otherwise prefer numeric/date/keyword + // attributes as before. + let mut x_label_for_point: Option = None; + let x_json_val = if x_is_taxon_rank { + // Try to find ancestor at the requested rank in the `lineage` array. 
+ let mut found: Option = None; + if let Some(lineage_arr) = src.get("lineage").and_then(|l| l.as_array()) { + for anc in lineage_arr { + if anc.get("taxon_rank").and_then(|r| r.as_str()) == Some(x_field) { + if let Some(idv) = anc.get("taxon_id") { + if let Some(s) = idv.as_str() { + found = Some(json!(s.to_string())); + } else if let Some(n) = idv.as_u64() { + found = Some(json!(n.to_string())); + } + } + // Try to capture scientific_name from the ancestor for labeling + if x_label_for_point.is_none() { + if let Some(sn) = anc.get("scientific_name").and_then(|v| v.as_str()) { + x_label_for_point = Some(sn.to_string()); + } else if let Some(nm) = anc.get("name").and_then(|v| v.as_str()) { + x_label_for_point = Some(nm.to_string()); + } + } + if found.is_some() { + break; + } + } + } + } + if let Some(v) = found { + v + } else if let Some(v) = find_attr_numeric(&attrs, x_field) { + json!(v) + } else if let Some(d) = find_attr_date(&attrs, x_field) { + json!(d) + } else if let Some(s) = find_attr_keyword(&attrs, x_field) { + json!(s) + } else { + continue; + } + } else if let Some(v) = find_attr_numeric(&attrs, x_field) { + json!(v) + } else if let Some(d) = find_attr_date(&attrs, x_field) { + json!(d) + } else if let Some(s) = find_attr_keyword(&attrs, x_field) { + json!(s) + } else { + continue; }; - let y_val = match find_attr_numeric(&attrs, y_field) { - Some(v) => v, - None => continue, + + let mut y_label_for_point: Option = None; + let y_json_val = if y_is_taxon_rank { + // y-axis as taxon rank — resolve ancestor id from lineage if present. 
+ let mut found: Option = None; + if let Some(lineage_arr) = src.get("lineage").and_then(|l| l.as_array()) { + for anc in lineage_arr { + if anc.get("taxon_rank").and_then(|r| r.as_str()) == Some(y_field) { + if let Some(idv) = anc.get("taxon_id") { + if let Some(s) = idv.as_str() { + found = Some(json!(s.to_string())); + } else if let Some(n) = idv.as_u64() { + found = Some(json!(n.to_string())); + } + } + // capture ancestor scientific name for label + if y_label_for_point.is_none() { + if let Some(sn) = anc.get("scientific_name").and_then(|v| v.as_str()) { + y_label_for_point = Some(sn.to_string()); + } else if let Some(nm) = anc.get("name").and_then(|v| v.as_str()) { + y_label_for_point = Some(nm.to_string()); + } + } + if found.is_some() { + break; + } + } + } + } + if let Some(v) = found { + v + } else if let Some(v) = find_attr_numeric(&attrs, y_field) { + json!(v) + } else if let Some(d) = find_attr_date(&attrs, y_field) { + json!(d) + } else if let Some(s) = find_attr_keyword(&attrs, y_field) { + json!(s) + } else { + continue; + } + } else if let Some(v) = find_attr_numeric(&attrs, y_field) { + json!(v) + } else if let Some(d) = find_attr_date(&attrs, y_field) { + json!(d) + } else if let Some(s) = find_attr_keyword(&attrs, y_field) { + json!(s) + } else { + continue; }; let cat_key = if let Some(cf) = cat_field { @@ -2142,13 +2969,20 @@ async fn fetch_raw_point_data( "all".to_string() }; - raw_data.entry(cat_key.clone()).or_default().push(json!({ + let mut point_obj = json!({ "scientific_name": scientific_name, "taxonId": taxon_id, - "x": x_val, - "y": y_val, + "x": x_json_val, + "y": y_json_val, "cat": cat_key - })); + }); + if let Some(lbl) = x_label_for_point { + point_obj["x_label"] = json!(lbl); + } + if let Some(lbl) = y_label_for_point { + point_obj["y_label"] = json!(lbl); + } + raw_data.entry(cat_key.clone()).or_default().push(point_obj); } let mut result = serde_json::Map::new(); @@ -2176,32 +3010,59 @@ pub async fn run_scatter_report( 
base_query: &Value, ) -> Result<(u64, u64, Value), String> { let x_spec = resolve_axis_spec(AxisRole::X, report_config, state) + .await .ok_or("report config missing 'x' axis (set 'x' field or use 'axes')")?; let y_spec = resolve_axis_spec(AxisRole::Y, report_config, state) + .await .ok_or("scatter report requires 'y' axis (set 'y' field or use 'axes')")?; let x_field = x_spec.field.clone(); let y_field = y_spec.field.clone(); - let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state); + let cat_spec_opt = resolve_axis_spec(AxisRole::Cat, report_config, state).await; let scatter_threshold = report_config .get("scatter_threshold") .and_then(|v| v.as_u64()) .unwrap_or(1000) as usize; + // Augment the base query for bounds computation with a presence-filter + // for the opposite axis so bounds reflect only records that will appear + // in the final plot. This avoids empty buckets caused by one axis being + // filtered out by the other. + let y_storage = resolve_field_storage(&y_spec.field, y_spec.value_type, &state.cache)?; + let x_base_query = json!({ + "bool": { "must": [ base_query.clone(), y_storage.presence_filter() ] } + }); + let x_bounds = compute_bounds( &state.client, &state.es_base, index, &x_spec, - base_query, + &x_base_query, &state.cache, ) .await?; + // If this is a taxon-rank axis and bounds provided a fixed term list (ids), + // attempt to fetch human-readable labels (scientific names) for each id so + // the final report can include a labelled mapping. Fall back to the + // original bounds.cat_labels when lookup fails. 
+ let mut x_bucket_labels: Vec = x_bounds.cat_labels.clone(); + if matches!(x_spec.value_type, ValueType::TaxonRank) && !x_bounds.fixed_terms.is_empty() { + if let Ok(labels) = fetch_taxon_labels(state, &x_bounds.fixed_terms, &x_spec.field).await { + if labels.len() == x_bounds.fixed_terms.len() { + x_bucket_labels = labels; + } + } + } + let x_storage = resolve_field_storage(&x_spec.field, x_spec.value_type, &state.cache)?; + let y_presence = x_storage.presence_filter(); + let y_base_query = json!({ "bool": { "must": [ base_query.clone(), y_presence ] } }); + let y_bounds = compute_bounds( &state.client, &state.es_base, index, &y_spec, - base_query, + &y_base_query, &state.cache, ) .await?; @@ -2257,19 +3118,36 @@ pub async fn run_scatter_report( .unwrap_or(0); // ---- Extract main x buckets (histogram or terms depending on x type) ---- - let x_hist_path = format!("/aggregations/{}/by_key/{}/buckets", agg_name, x_inner_agg); - let x_raw_buckets = resp + let x_hist_path = x_storage.main_bucket_path(agg_name, x_inner_agg); + let mut x_raw_buckets = resp .pointer(&x_hist_path) .and_then(|b| b.as_array()) .cloned() .unwrap_or_default(); - let x_bucket_count = x_raw_buckets.len(); - // Keys may be numeric (histogram) or string (terms) — collect as raw JSON Values. - let x_bucket_keys: Vec = x_raw_buckets - .iter() - .filter_map(|b| b.get("key").cloned()) - .collect(); + // Respect the definitive fixed term order calculated during bounds. + // If `x_bounds.fixed_terms` is non-empty, reorder the returned buckets to + // match that list. Append any unexpected buckets at the end. 
+ if !x_bounds.fixed_terms.is_empty() { + let mut ordered: Vec = Vec::with_capacity(x_raw_buckets.len()); + for id in &x_bounds.fixed_terms { + if let Some(pos) = x_raw_buckets + .iter() + .position(|b| b.get("key").and_then(|k| k.as_str()) == Some(id.as_str())) + { + ordered.push(x_raw_buckets[pos].clone()); + } + } + // Append any remaining buckets not present in fixed_terms + for b in &x_raw_buckets { + let key = b.get("key").and_then(|k| k.as_str()).unwrap_or(""); + if !x_bounds.fixed_terms.iter().any(|t| t == key) { + ordered.push(b.clone()); + } + } + x_raw_buckets = ordered; + } + let x_bucket_count = x_raw_buckets.len(); let all_values: Vec = x_raw_buckets .iter() @@ -2279,27 +3157,164 @@ pub async fn run_scatter_report( // ---- Extract allYValues (per x-bucket y-histogram) and yBuckets ---- let y_bucket_count = y_bounds.tick_count; let mut all_y_values: Vec> = Vec::with_capacity(x_bucket_count); - let mut y_bucket_keys: Vec = Vec::new(); + let mut y_bucket_keys: Vec = Vec::new(); + let mut y_bucket_labels: Vec = Vec::new(); + + // If bounds provided canonical fixed terms for a keyword/rank Y axis, + // prefer that ordering for yBuckets so keys are consistent across x buckets. + if matches!( + y_bounds.value_type, + ValueType::Keyword | ValueType::TaxonRank + ) && !y_bounds.fixed_terms.is_empty() + { + y_bucket_keys = y_bounds + .fixed_terms + .iter() + .map(|s| Value::String(s.clone())) + .collect(); + } + + // If this is a taxon-rank Y axis and bounds provided fixed term ids, + // attempt to fetch human-readable labels (scientific names). Keep + // `y_bucket_keys` as the canonical ids used for bin alignment, and + // separately store `y_bucket_labels` for display. 
+ if matches!(y_spec.value_type, ValueType::TaxonRank) && !y_bounds.fixed_terms.is_empty() { + if let Ok(labels) = fetch_taxon_labels(state, &y_bounds.fixed_terms, &y_spec.field).await { + if labels.len() == y_bounds.fixed_terms.len() { + y_bucket_labels = labels; + // Ensure the canonical keys remain the ids from fixed_terms + // (they were set earlier from `y_bounds.fixed_terms`). + } + } + } + + // Determine inner agg name for y histograms so we can locate buckets + // inside each x-bucket's `yHistograms` result. + let y_inner_agg = if matches!( + y_bounds.value_type, + ValueType::Keyword | ValueType::TaxonRank + ) { + "top_terms" + } else if matches!(y_bounds.value_type, ValueType::Date) { + "date_histogram" + } else { + "histogram" + }; + + // If we still have no canonical y keys, scan *all* x-buckets and collect + // the union of y bucket keys found. This avoids using a single + // first-non-empty-bucket ordering which can produce too-small yBuckets + // when some x-buckets yield sparse date/rank histograms. + if y_bucket_keys.is_empty() { + use std::collections::HashSet; + let mut seen: HashSet = HashSet::new(); + let mut ordered_keys: Vec = Vec::new(); + for x_bucket in &x_raw_buckets { + let y_hist_path = format!( + "/yHistograms/by_attribute/{}/{}/buckets", + y_field, y_inner_agg + ); + if let Some(ybuckets) = x_bucket.pointer(&y_hist_path).and_then(|b| b.as_array()) { + for b in ybuckets { + if let Some(kv) = b.get("key").cloned() { + let ks = if let Some(s) = kv.as_str() { + s.to_string() + } else { + kv.to_string() + }; + if seen.insert(ks) { + ordered_keys.push(kv); + } + } + } + } + } + + if !ordered_keys.is_empty() { + // If all keys are numeric, sort ascending numerically; otherwise + // keep discovery order which tends to reflect term ordering. 
+ let all_numeric = ordered_keys.iter().all(|v| v.as_f64().is_some()); + if all_numeric { + ordered_keys.sort_by(|a, b| { + a.as_f64() + .partial_cmp(&b.as_f64()) + .unwrap_or(std::cmp::Ordering::Equal) + }); + } + y_bucket_keys = ordered_keys; + } + } for x_bucket in &x_raw_buckets { - let y_hist_path = format!("/yHistograms/by_attribute/{}/histogram/buckets", y_field); + let y_hist_path = format!( + "/yHistograms/by_attribute/{}/{}/buckets", + y_field, y_inner_agg + ); let y_buckets_opt = x_bucket.pointer(&y_hist_path).and_then(|b| b.as_array()); if let Some(ybuckets) = y_buckets_opt { + // If we don't already have canonical keys, initialise from this first non-empty bucket if y_bucket_keys.is_empty() { y_bucket_keys = ybuckets .iter() - .filter_map(|b| b.get("key").and_then(|k| k.as_f64())) + .filter_map(|b| b.get("key").cloned()) .collect(); } - all_y_values.push( - ybuckets - .iter() - .map(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) - .collect(), - ); - } else { + + // Build counts aligned to `y_bucket_keys`. For keyword/rank keys this + // ensures the same ordering even if some x buckets lack particular terms. 
+ if matches!( + y_bounds.value_type, + ValueType::Keyword | ValueType::TaxonRank + ) { + use std::collections::HashMap; + let mut map: HashMap = HashMap::new(); + for b in ybuckets { + if let Some(kv) = b.get("key") { + // Normalize the bucket key to a string regardless of JSON type + let key_s = if let Some(s) = kv.as_str() { + s.to_string() + } else if let Some(n) = kv.as_u64() { + n.to_string() + } else if let Some(n) = kv.as_i64() { + n.to_string() + } else if let Some(f) = kv.as_f64() { + f.to_string() + } else { + kv.to_string() + }; + let c = b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0); + map.insert(key_s, c); + } + } + let mut aligned: Vec = Vec::with_capacity(y_bucket_keys.len()); + for k in &y_bucket_keys { + let key_s = if let Some(s) = k.as_str() { + s.to_string() + } else if let Some(n) = k.as_u64() { + n.to_string() + } else if let Some(n) = k.as_i64() { + n.to_string() + } else if let Some(f) = k.as_f64() { + f.to_string() + } else { + k.to_string() + }; + aligned.push(map.get(&key_s).copied().unwrap_or(0)); + } + all_y_values.push(aligned); + } else { + all_y_values.push( + ybuckets + .iter() + .map(|b| b.get("doc_count").and_then(|c| c.as_u64()).unwrap_or(0)) + .collect(), + ); + } + } else if y_bucket_keys.is_empty() { all_y_values.push(vec![0; y_bucket_count]); + } else { + all_y_values.push(vec![0; y_bucket_keys.len()]); } } @@ -2307,34 +3322,64 @@ pub async fn run_scatter_report( // ---- Extract per-category data ---- let (by_cat, y_values_by_cat) = if !cat_labels.is_empty() || cat_is_numeric { + // Resolve cat_storage so extract_scatter_by_cat can build deterministic paths. 
+ let cat_storage_for_extract = cat_spec_opt + .as_ref() + .and_then(|spec| resolve_field_storage(&spec.field, spec.value_type, &state.cache).ok()) + .unwrap_or(FieldStorage::Root { + es_field: String::new(), + }); extract_scatter_by_cat( &resp, agg_name, - x_field.as_str(), + &x_storage, x_inner_agg, + &cat_storage_for_extract, y_field.as_str(), + y_inner_agg, x_bucket_count, y_bucket_count, &cat_labels, show_other_cat, cat_is_numeric, &all_values, + if !y_bounds.fixed_terms.is_empty() { + Some(&y_bounds.fixed_terms[..]) + } else { + None + }, ) } else { (Value::Null, Value::Null) }; - // ---- Fetch raw point data if below threshold ---- - let raw_data = if total_hits as usize <= scatter_threshold { + // ---- Fetch raw point data when needed ---- + // Previously we only fetched raw points when total hits <= threshold. + // For categorical axes (keyword/taxon) we also want raw points so the + // client/converter can jitter points within categories for visibility. + // Only fetch rawData when the total matched hits are within the configured + // `scatter_threshold`. Previously we also fetched raw points for categorical + // axes to enable client jittering; that behaviour is opt-in and not the + // default — respect the threshold by default. + let should_fetch_raw = total_hits as usize <= scatter_threshold; + + let raw_data = if should_fetch_raw { fetch_raw_point_data( state, index, base_query, x_field.as_str(), + matches!(x_spec.value_type, ValueType::TaxonRank), y_field.as_str(), + matches!(y_spec.value_type, ValueType::TaxonRank), cat_field_str, &cat_labels, show_other_cat, + if !x_bounds.fixed_terms.is_empty() { + Some(&x_bounds.fixed_terms[..]) + } else { + None + }, scatter_threshold, ) .await @@ -2342,29 +3387,49 @@ pub async fn run_scatter_report( Value::Null }; + // Build a single structured `buckets` array where each element is an + // object `{ id, label, count }`. 
Use `x_bounds.fixed_terms` (when + // present) as the authoritative ordering to avoid appending spurious + // buckets returned by ES. + let buckets_struct: Vec = + build_structured_buckets(&x_raw_buckets, &x_bounds.fixed_terms, &x_bucket_labels); + let mut report_data = json!({ "type": "scatter", "x": { "field": x_field, "scale": format!("{:?}", x_spec.opts.scale).to_lowercase(), - "domain": x_bounds.domain + "domain": x_bounds.domain, + "value_type": value_type_to_string(x_spec.value_type) }, "y": { "field": y_field, "scale": format!("{:?}", y_spec.opts.scale).to_lowercase(), - "domain": y_bounds.domain + "domain": y_bounds.domain, + "value_type": value_type_to_string(y_spec.value_type) }, - "buckets": x_bucket_keys, + "buckets": buckets_struct, "allValues": all_values, "yBuckets": y_bucket_keys, + "yBucketLabels": y_bucket_labels, "allYValues": all_y_values, "zDomain": z_domain }); + // Historically we returned `bucketLabels` separately; clients should now + // consume the structured `buckets` array. Keep `bucketLabels` absent to + // avoid duplication. + if !by_cat.is_null() { report_data["by_cat"] = by_cat; report_data["yValuesByCat"] = y_values_by_cat; - report_data["cat"] = json!(cat_spec_opt.as_ref().map(|s| s.field.as_str())); + if let Some(ref cat_spec) = cat_spec_opt { + report_data["cat"] = json!({ + "field": cat_spec.field, + "value_type": value_type_to_string(cat_spec.value_type), + "scale": format!("{:?}", cat_spec.opts.scale).to_lowercase() + }); + } report_data["cats"] = json!(cat_labels); } diff --git a/crates/genomehubs-api/src/report/spec_builder.rs b/crates/genomehubs-api/src/report/spec_builder.rs new file mode 100644 index 0000000..e28c3c3 --- /dev/null +++ b/crates/genomehubs-api/src/report/spec_builder.rs @@ -0,0 +1,737 @@ +//! Server-side PlotSpec construction helpers. +//! +//! Build a fully-resolved `PlotSpec` from a report payload and optional +//! `display` hints. This lives in the API crate because it may consult +//! 
server-side knowledge (report JSON shapes) and is not intended for the +//! WASM-local build path. + +use serde_json::{json, Value}; + +use genomehubs_query::report::display::TickLabelPlacement; +use genomehubs_query::report::plot_spec::{AxisMeta, PlotReportType, SeriesMeta}; +use genomehubs_query::report::DisplaySpec; +use genomehubs_query::report::PlotSpec; + +fn parse_display(display: Option<&Value>) -> DisplaySpec { + if let Some(dv) = display { + if let Some(s) = dv.as_str() { + serde_yaml::from_str(s).unwrap_or_default() + } else { + serde_json::from_value(dv.clone()).unwrap_or_default() + } + } else { + DisplaySpec::default() + } +} + +fn domain_from_value(v: Option<&Value>) -> [f64; 2] { + if let Some(Value::Array(arr)) = v { + if arr.len() >= 2 { + let a = arr[0].as_f64().unwrap_or(0.0); + let b = arr[1].as_f64().unwrap_or(a + 1.0); + return [a, b]; + } + } + [0.0, 1.0] +} + +fn make_axis_meta( + field: &str, + scale: Option<&str>, + domain_val: Option<&Value>, + value_type_hint: Option<&str>, +) -> AxisMeta { + let domain = domain_from_value(domain_val); + let scale_s = scale + .map(|s| s.to_string()) + .unwrap_or_else(|| "linear".to_string()); + let value_type = value_type_hint.map(|s| s.to_string()).unwrap_or_else(|| { + if domain != [0.0, 1.0] { + "float".to_string() + } else { + "keyword".to_string() + } + }); + + let tick_label_placement = if value_type == "keyword" { + TickLabelPlacement::BetweenTicks + } else { + TickLabelPlacement::OnTick + }; + + AxisMeta { + field: field.to_string(), + label: None, + scale: scale_s, + domain, + tick_values: vec![], + tick_labels: vec![], + value_type, + tick_label_placement, + tick_label_stride: 1, + tick_label_max_length: None, + } +} + +fn build_series_from_cats(cats: Option<&Value>) -> Vec { + if let Some(Value::Array(arr)) = cats { + arr.iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .map(|key| SeriesMeta { + key: key.clone(), + label: key, + color: None, + }) + .collect() + } else { + 
Vec::new() + } +} + +/// Compute bin boundary values from an ordered list of numeric bucket keys. +/// +/// Returns `keys.len() + 1` values: the original `keys` plus one extra right +/// boundary estimated as `last_key + (last_key − second_to_last_key)`. +/// +/// The boundary list is used by Vega-Lite `binned` encodings to draw each bar +/// from its left edge to its right edge without overlap or gap. +fn bucket_keys_to_boundaries(sorted_keys: &[f64], axis_obj: &Value) -> Vec { + if sorted_keys.is_empty() { + return vec![]; + } + let width = if sorted_keys.len() >= 2 { + sorted_keys[1] - sorted_keys[0] + } else { + // Estimate width from domain / tickCount when there is only one bucket. + axis_obj + .get("domain") + .and_then(|d| d.as_array()) + .map(|arr| { + if arr.len() >= 2 { + let lo = arr[0].as_f64().unwrap_or(0.0); + let hi = arr[1].as_f64().unwrap_or(lo + 1.0); + let ticks = axis_obj + .get("tickCount") + .and_then(|v| v.as_u64()) + .unwrap_or(10) as f64; + (hi - lo) / ticks.max(1.0) + } else { + 1.0 + } + }) + .unwrap_or(1.0) + }; + let mut boundaries = sorted_keys.to_vec(); + boundaries.push(sorted_keys[sorted_keys.len() - 1] + width); + boundaries +} + +/// Extract numeric or keyword tick data from a bucket array and write it onto `meta`. +/// +/// For keyword axes: extracts `label` (or `id`) strings → `meta.tick_labels`. +/// For numeric axes: extracts `key` / `id` floats, sorts them, computes bin +/// boundaries → `meta.tick_values`. +/// +/// `axis_obj` is the axis spec JSON object (provides `domain` / `tickCount` +/// for single-bucket width estimation). +/// +/// `label_source` is an optional pre-built label list that takes priority over +/// bucket-derived labels (used for y-axis when the server supplies +/// `yBucketLabels` directly). 
+fn fill_tick_data_from_buckets( + meta: &mut AxisMeta, + axis_obj: &Value, + buckets: &[Value], + label_source: Option<&[Value]>, +) { + if meta.value_type == "keyword" { + // Prefer explicit labels when provided (e.g. yBucketLabels for taxon ranks). + if let Some(lbls) = label_source { + let labels: Vec = lbls + .iter() + .map(|v| v.as_str().unwrap_or("").to_string()) + .collect(); + if !labels.is_empty() { + meta.tick_labels = labels; + return; + } + } + // Fall back to label/id fields from bucket objects. + let labels: Vec = buckets + .iter() + .map(|b| { + b.get("label") + .and_then(|l| l.as_str()) + .map(|s| s.to_string()) + .or_else(|| { + b.get("id") + .or_else(|| b.get("key")) + .and_then(|v| v.as_str().map(|s| s.to_string())) + }) + .unwrap_or_default() + }) + .collect(); + if !labels.is_empty() { + meta.tick_labels = labels; + } + } else { + // Numeric: build sorted boundary list from bucket numeric keys. + let mut keys: Vec = buckets + .iter() + .filter_map(|b| { + b.get("key").and_then(|v| v.as_f64()).or_else(|| { + b.get("id") + .and_then(|v| v.as_str()) + .and_then(|s| s.parse().ok()) + }) + }) + .collect(); + if !keys.is_empty() { + keys.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + meta.tick_values = bucket_keys_to_boundaries(&keys, axis_obj); + } + } +} + +/// Build a merged PlotSpec for multiple arc reports. +/// +/// Each entry in `reports` is expected to be the `report` object returned by +/// the arc report handlers (either a scalar `arc` or an array of ring objects). +/// The function normalises ring entries, computes a `scaled` value in [0,1] +/// (arc is already in [0,1]; arc2 values are scaled relative to the max +/// arc2 value across the batch), and returns a `PlotSpec` with `x` axis using +/// the `scaled` field and a `data.entries` array containing all rings. 
+pub fn build_arc_plot_spec_from_reports( + reports: &[Value], + batch_display: Option<&Value>, +) -> Result { + // Parse batch display into a DisplaySpec so defaults are available. + let display_spec = parse_display(batch_display); + + // Normalise reports into a flat list of ring-like objects. + let mut entries: Vec = Vec::new(); + let mut report_labels: Vec = Vec::new(); + + for (ri, rep) in reports.iter().enumerate() { + let label = rep + .get("featureTerm") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .or_else(|| { + rep.get("referenceTerm") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + }) + .or_else(|| { + rep.get("queryString") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + }) + .unwrap_or_else(|| format!("report{}", ri)); + report_labels.push(label.clone()); + + if let Some(arr) = rep.get("arc").and_then(|v| v.as_array()) { + for ring in arr.iter() { + let mut obj = match ring { + Value::Object(m) => m.clone(), + other => { + let mut m = serde_json::Map::new(); + m.insert("arc".to_string(), other.clone()); + m + } + }; + obj.insert("report_index".to_string(), json!(ri)); + obj.insert("report_label".to_string(), json!(label.clone())); + entries.push(Value::Object(obj)); + } + } else if let Some(arc_v) = rep.get("arc") { + let mut obj = serde_json::Map::new(); + obj.insert("arc".to_string(), arc_v.clone()); + if let Some(arc2v) = rep.get("arc2") { + obj.insert("arc2".to_string(), arc2v.clone()); + } + if let Some(fc) = rep.get("feature_count") { + obj.insert("feature_count".to_string(), fc.clone()); + } + if let Some(rc) = rep.get("reference_count") { + obj.insert("reference_count".to_string(), rc.clone()); + } + if let Some(ft) = rep.get("featureTerm") { + obj.insert("featureTerm".to_string(), ft.clone()); + } + if let Some(rt) = rep.get("referenceTerm") { + obj.insert("referenceTerm".to_string(), rt.clone()); + } + obj.insert("report_index".to_string(), json!(ri)); + obj.insert("report_label".to_string(), json!(label.clone())); 
+ entries.push(Value::Object(obj)); + } + } + + // Compute scaling factor for arc2 values (if present) + let max_arc2 = entries + .iter() + .filter_map(|e| e.get("arc2").and_then(|v| v.as_f64())) + .fold(0.0_f64, f64::max); + + // Compute scaled value for each entry and assemble final entries array. + let mut final_entries: Vec = Vec::new(); + for e in entries.into_iter() { + if let Value::Object(mut m) = e { + let scaled = if let Some(a) = m.get("arc").and_then(|v| v.as_f64()) { + a + } else if let Some(a2) = m.get("arc2").and_then(|v| v.as_f64()) { + if max_arc2 > 0.0 { + a2 / max_arc2 + } else { + 0.0 + } + } else { + 0.0 + }; + m.insert("scaled".to_string(), json!(scaled)); + final_entries.push(Value::Object(m)); + } else { + // Non-object entry: wrap it into an object with scaled=0 + final_entries.push(json!({"scaled": 0.0})); + } + } + + // Series metadata: one series per input report (labeled by report label) + let mut series_meta: Vec = Vec::new(); + for (i, label) in report_labels.iter().enumerate() { + series_meta.push(SeriesMeta { + key: format!("report_{i}"), + label: label.clone(), + color: None, + }); + } + + // X axis: scaled arc values in [0,1] + let x_meta = AxisMeta { + field: "scaled".to_string(), + label: Some("Arc (scaled)".to_string()), + scale: "linear".to_string(), + domain: [0.0, 1.0], + tick_values: vec![], + tick_labels: vec![], + value_type: "float".to_string(), + tick_label_placement: TickLabelPlacement::OnTick, + tick_label_stride: 1, + tick_label_max_length: None, + }; + + let data = json!({ + "type": "arc_batch", + "entries": final_entries, + "reports": report_labels, + }); + + let spec = PlotSpec { + report_type: PlotReportType::Arc, + x: Some(x_meta), + y: None, + cat: None, + z: None, + series: series_meta, + display: display_spec, + data, + }; + + Ok(spec) +} + +/// Build a `PlotSpec` from a v3 report payload and optional `display` hints. +/// +/// `report_type` is the canonical report string (e.g. "histogram", "scatter"). 
+/// `report_data` is the JSON returned by the report handlers. `display` may be +/// a YAML string or JSON object and will be merged into the resulting spec. +pub fn build_plot_spec( + report_type: &str, + report_data: &Value, + display: Option<&Value>, +) -> Result { + let pr = PlotReportType::parse(report_type).unwrap_or(PlotReportType::Histogram); + let display_spec = parse_display(display); + + // Normalise histogram display options: prefer explicit `mode` when present; + // otherwise derive it from legacy boolean flags for compatibility. + if let Some(hist) = display_spec.histogram.as_ref() { + // nothing to do when mode already set + if hist.mode.is_none() { + // We'll fill in a sensible default later when serialising the + // PlotSpec; clone and adjust the DisplaySpec to ensure the + // resulting `plot_spec.display.histogram.mode` is always present + // for clients and converters. + } + } + + // Ensure the returned DisplaySpec contains a canonical `histogram.mode` + // when histogram options are present. This keeps downstream converters + // simple: `mode` is authoritative and overrides `stacked`/`cumulative`. 
+ let mut display_spec = display_spec; + if let Some(hist_opts) = display_spec.histogram.as_mut() { + if hist_opts.mode.is_none() { + if hist_opts.stacked.unwrap_or(false) { + hist_opts.mode = Some("stacked".to_string()); + } else if hist_opts.cumulative.unwrap_or(false) { + hist_opts.mode = Some("cumulative".to_string()); + } else { + // default behaviour remains stacked for backward-compatibility + hist_opts.mode = Some("stacked".to_string()); + } + } + // Keep boolean `stacked` consistent with `mode` for consumers + match hist_opts.mode.as_deref() { + Some("stacked") => hist_opts.stacked = Some(true), + Some("grouped") | Some("facet") | Some("cumulative") => hist_opts.stacked = Some(false), + _ => {} + } + } + if let Some(arc_opts) = display_spec.arc.as_mut() { + if arc_opts.mode.is_none() { + arc_opts.mode = Some("grouped".to_string()); + } + if arc_opts.shape.is_none() { + arc_opts.shape = Some("auto".to_string()); + } + } + + // Default empty values + let mut x: Option = None; + let mut y: Option = None; + let z: Option = None; + let mut series: Vec = Vec::new(); + + match pr { + PlotReportType::Histogram => { + if let Some(x_obj) = report_data.get("x") { + if let Some(field) = x_obj.get("field").and_then(|v| v.as_str()) { + let scale = x_obj.get("scale").and_then(|v| v.as_str()); + let domain = x_obj.get("domain"); + let value_type_hint = x_obj.get("value_type").and_then(|v| v.as_str()); + x = Some(make_axis_meta(field, scale, domain, value_type_hint)); + if let Some(meta) = x.as_mut() { + let axis_opts = display_spec + .histogram + .as_ref() + .and_then(|h| h.x_axis.as_ref()); + genomehubs_query::report::spec_builder::resolve_axis_display( + meta, axis_opts, + ); + let buckets = report_data + .get("buckets") + .and_then(|v| v.as_array()) + .map(|a| a.as_slice()) + .unwrap_or(&[]); + fill_tick_data_from_buckets(meta, x_obj, buckets, None); + } + } + } + // Series from cats + series = build_series_from_cats(report_data.get("cats")); + // Y axis: histogram 
counts (doc_count) — ensure converter receives + // authoritative axis metadata so it does not need to guess. + if let Some(buckets) = report_data.get("buckets").and_then(|v| v.as_array()) { + let counts: Vec = buckets + .iter() + .map(|b| b.get("doc_count").and_then(|c| c.as_f64()).unwrap_or(0.0)) + .collect(); + let max = counts.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + let domain = if max.is_finite() { + [0.0, if max > 0.0 { max } else { 1.0 }] + } else { + [0.0, 1.0] + }; + y = Some(AxisMeta { + field: "doc_count".to_string(), + label: Some("count".to_string()), + scale: "linear".to_string(), + domain, + tick_values: vec![], + tick_labels: vec![], + value_type: "integer".to_string(), + tick_label_placement: TickLabelPlacement::OnTick, + tick_label_stride: 1, + tick_label_max_length: None, + }); + } else { + y = Some(make_axis_meta( + "doc_count", + Some("linear"), + None, + Some("integer"), + )); + } + } + PlotReportType::Scatter => { + if let Some(x_obj) = report_data.get("x") { + if let Some(field) = x_obj.get("field").and_then(|v| v.as_str()) { + let scale = x_obj.get("scale").and_then(|v| v.as_str()); + let domain = x_obj.get("domain"); + let value_type_hint = x_obj.get("value_type").and_then(|v| v.as_str()); + x = Some(make_axis_meta(field, scale, domain, value_type_hint)); + if let Some(meta) = x.as_mut() { + let axis_opts = display_spec + .scatter + .as_ref() + .and_then(|s| s.x_axis.as_ref()) + .or_else(|| { + display_spec + .histogram + .as_ref() + .and_then(|h| h.x_axis.as_ref()) + }); + genomehubs_query::report::spec_builder::resolve_axis_display( + meta, axis_opts, + ); + let buckets = report_data + .get("buckets") + .and_then(|v| v.as_array()) + .map(|a| a.as_slice()) + .unwrap_or(&[]); + fill_tick_data_from_buckets(meta, x_obj, buckets, None); + } + } + } + if let Some(y_obj) = report_data.get("y") { + if let Some(field) = y_obj.get("field").and_then(|v| v.as_str()) { + let scale = y_obj.get("scale").and_then(|v| v.as_str()); + let 
domain = y_obj.get("domain"); + let value_type_hint = y_obj.get("value_type").and_then(|v| v.as_str()); + y = Some(make_axis_meta(field, scale, domain, value_type_hint)); + if let Some(meta) = y.as_mut() { + let axis_opts = display_spec + .scatter + .as_ref() + .and_then(|s| s.y_axis.as_ref()) + .or_else(|| { + display_spec + .histogram + .as_ref() + .and_then(|h| h.y_axis.as_ref()) + }); + genomehubs_query::report::spec_builder::resolve_axis_display( + meta, axis_opts, + ); + // Prefer explicit yBucketLabels (human-readable taxon rank names) + // then fall back to yBuckets for both keyword and numeric axes. + let explicit_labels: Option<&[Value]> = report_data + .get("yBucketLabels") + .and_then(|v| v.as_array()) + .map(|a| a.as_slice()); + let y_buckets: Vec = report_data + .get("yBuckets") + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .map(|v| match v { + // Convert raw scalar bucket keys into fake bucket objects + // so fill_tick_data_from_buckets can process them. + Value::Number(_) | Value::String(_) => { + serde_json::json!({ "key": v }) + } + other => other.clone(), + }) + .collect() + }) + .unwrap_or_default(); + fill_tick_data_from_buckets(meta, y_obj, &y_buckets, explicit_labels); + } + } + } + series = build_series_from_cats(report_data.get("cats")); + } + PlotReportType::CountPerRank => { + // Count per rank: x is rank labels (keyword), y is count + if let Some(buckets) = report_data.get("buckets").and_then(|v| v.as_array()) { + // pick first bucket's rank field name via keys + // we'll construct a dummy x axis named "rank" + x = Some(make_axis_meta( + "rank", + Some("ordinal"), + None, + Some("keyword"), + )); + // y domain from counts + let counts: Vec = buckets + .iter() + .map(|b| b.get("count").and_then(|c| c.as_f64()).unwrap_or(0.0)) + .collect(); + let min = counts.iter().cloned().fold(f64::INFINITY, f64::min); + let max = counts.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + let domain = if min.is_finite() && max.is_finite() 
{ + [min, if max > min { max } else { min + 1.0 }] + } else { + [0.0, 1.0] + }; + y = Some(AxisMeta { + field: "count".to_string(), + label: Some("count".to_string()), + scale: "linear".to_string(), + domain, + tick_values: vec![], + tick_labels: vec![], + value_type: "integer".to_string(), + tick_label_placement: TickLabelPlacement::OnTick, + tick_label_stride: 1, + tick_label_max_length: None, + }); + } + } + PlotReportType::Sources => { + // Sources returns buckets; treat as categorical x + numeric y + if let Some(buckets) = report_data.get("buckets").and_then(|v| v.as_array()) { + x = Some(make_axis_meta( + "source", + Some("ordinal"), + None, + Some("keyword"), + )); + let counts: Vec = buckets + .iter() + .map(|b| b.get("count").and_then(|c| c.as_f64()).unwrap_or(0.0)) + .collect(); + let min = counts.iter().cloned().fold(f64::INFINITY, f64::min); + let max = counts.iter().cloned().fold(f64::NEG_INFINITY, f64::max); + let domain = if min.is_finite() && max.is_finite() { + [min, if max > min { max } else { min + 1.0 }] + } else { + [0.0, 1.0] + }; + y = Some(AxisMeta { + field: "count".to_string(), + label: Some("count".to_string()), + scale: "linear".to_string(), + domain, + tick_values: vec![], + tick_labels: vec![], + value_type: "integer".to_string(), + tick_label_placement: TickLabelPlacement::OnTick, + tick_label_stride: 1, + tick_label_max_length: None, + }); + } + } + PlotReportType::Arc => { + dbg!(&report_data); + } + PlotReportType::Tree + | PlotReportType::Map + | PlotReportType::Oxford + | PlotReportType::Ribbon + | PlotReportType::Painting => { + // Positional / complex reports: rely on display/data only. Axis + // metadata for these are highly report-specific and are handled by + // the positional endpoint's own PlotSpec builder. Here we provide + // a conservative default: embed the full report JSON as data and + // leave axes empty. + } + } + + // Build `cat` AxisMeta from report_data["cat"] when present. 
This keeps + // categorical metadata (field, value_type, scale, tick labels) in the + // canonical PlotSpec so converters can deterministically render legends + // and category axes. + let mut cat_meta: Option = None; + if let Some(cat_obj) = report_data.get("cat") { + if let Some(field) = cat_obj.get("field").and_then(|v| v.as_str()) { + let scale = cat_obj.get("scale").and_then(|v| v.as_str()); + let domain = cat_obj.get("domain"); + let value_type_hint = cat_obj.get("value_type").and_then(|v| v.as_str()); + let mut cm = make_axis_meta(field, scale, domain, value_type_hint); + // Apply any top-level display label for categories if provided + if let Some(label) = display_spec.cat_label.as_ref() { + cm.label = Some(label.clone()); + } + // Prefer explicit tick labels supplied under `report_data["cat"]["tick_labels"]`. + if let Some(lbls) = cat_obj.get("tick_labels").and_then(|v| v.as_array()) { + let labels: Vec = lbls + .iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect(); + if !labels.is_empty() { + cm.tick_labels = labels; + } + } + + // Populate tick labels for categorical cat axes from report_data["cats"] + if cm.value_type == "keyword" { + if let Some(cats_arr) = report_data.get("cats").and_then(|v| v.as_array()) { + let labels: Vec = cats_arr + .iter() + .filter_map(|v| v.as_str().map(|s| s.to_string())) + .collect(); + if !labels.is_empty() { + cm.tick_labels = labels; + } + } + } else { + // Numeric cat axes: prefer explicit numeric `tick_values` supplied + // in `report_data["cat"]["tick_values"]`. Fall back to parsing + // `report_data["cats"]` when not present. 
+ if let Some(vals) = report_data + .get("cat") + .and_then(|c| c.get("tick_values")) + .and_then(|v| v.as_array()) + { + let nums: Vec = vals.iter().filter_map(|v| v.as_f64()).collect(); + if !nums.is_empty() { + cm.tick_values = nums; + } + } else if let Some(cats_arr) = report_data.get("cats").and_then(|v| v.as_array()) { + let mut nums: Vec = Vec::new(); + for v in cats_arr.iter() { + if let Some(n) = v.as_f64() { + nums.push(n); + } else if let Some(s) = v.as_str() { + if let Ok(n) = s.parse::() { + nums.push(n); + } + } + } + if !nums.is_empty() { + nums.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let width = if nums.len() >= 2 { + nums[1] - nums[0] + } else { + 1.0 + }; + let mut boundaries = nums.clone(); + let last = nums[nums.len() - 1] + width; + boundaries.push(last); + cm.tick_values = boundaries; + } + } + } + cat_meta = Some(cm); + } + } + + // If the server supplied human-readable category tick labels, apply + // them to the series labels so the legend displays friendly names while + // the underlying series keys remain the raw category keys used in + // `data.by_cat`. 
+ if let Some(ref cm) = cat_meta { + if !cm.tick_labels.is_empty() { + for (i, lbl) in cm.tick_labels.iter().enumerate() { + if let Some(s) = series.get_mut(i) { + s.label = lbl.clone(); + } + } + } + } + + let plot_spec = PlotSpec { + report_type: pr, + x, + y, + cat: cat_meta, + z, + series, + display: display_spec, + data: report_data.clone(), + }; + + Ok(plot_spec) +} diff --git a/crates/genomehubs-api/src/routes/count_batch.rs b/crates/genomehubs-api/src/routes/count_batch.rs index 7e4a0c0..e56d9b8 100644 --- a/crates/genomehubs-api/src/routes/count_batch.rs +++ b/crates/genomehubs-api/src/routes/count_batch.rs @@ -1,6 +1,6 @@ use axum::{extract::Json, Extension}; use serde::{Deserialize, Deserializer, Serialize}; -use serde_json::Value; +use serde_json::{json, Value}; use std::sync::Arc; use super::deserialize_helpers; @@ -12,10 +12,14 @@ fn combine_es_bodies( combine_with: &genomehubs_query::query::CombineStrategy, ) -> serde_json::Value { if bodies.is_empty() { - return serde_json::json!({ "query": { "match_all": {} }, "size": 0 }); + return serde_json::json!({ "query": { "match_all": {} }, "size": 0, "track_total_hits": true }); } if bodies.len() == 1 { - return bodies.into_iter().next().unwrap(); + let mut count_query = bodies.into_iter().next().unwrap(); + if let Some(obj) = count_query.as_object_mut() { + obj.insert("track_total_hits".to_string(), serde_json::json!(true)); + } + return count_query; } // Extract the "query" clause from each body; combine with bool.should/must @@ -46,6 +50,7 @@ fn combine_es_bodies( let mut result = bodies.into_iter().next().unwrap(); if let Some(obj) = result.as_object_mut() { obj.insert("query".to_string(), combined_query); + obj.insert("track_total_hits".to_string(), serde_json::json!(true)); } result } @@ -118,9 +123,13 @@ fn build_msearch_body(searches: &[(String, serde_json::Value)]) -> String { .iter() .flat_map(|(index, body)| { let header = serde_json::json!({ "index": index }); + let count_body = json!({ + 
"query": body.get("query").cloned().unwrap_or_else(|| serde_json::json!({"match_all": {}})), + "track_total_hits": true + }); vec![ serde_json::to_string(&header).unwrap(), - serde_json::to_string(body).unwrap(), + serde_json::to_string(&count_body).unwrap(), ] }) .collect::>() diff --git a/crates/genomehubs-api/src/routes/mod.rs b/crates/genomehubs-api/src/routes/mod.rs index 98d888a..ac529eb 100644 --- a/crates/genomehubs-api/src/routes/mod.rs +++ b/crates/genomehubs-api/src/routes/mod.rs @@ -61,6 +61,7 @@ pub mod positional; pub mod record; pub mod record_batch; pub mod report; +pub mod report_batch; pub mod result_fields; pub mod search; pub mod search_batch; diff --git a/crates/genomehubs-api/src/routes/positional.rs b/crates/genomehubs-api/src/routes/positional.rs index 77c8099..97122f3 100644 --- a/crates/genomehubs-api/src/routes/positional.rs +++ b/crates/genomehubs-api/src/routes/positional.rs @@ -580,6 +580,7 @@ pub async fn post_positional( report_type: pr, x: None, y: None, + cat: None, z: None, series: Vec::new(), display: genomehubs_query::report::DisplaySpec::default(), diff --git a/crates/genomehubs-api/src/routes/report.rs b/crates/genomehubs-api/src/routes/report.rs index 27454ec..08fe472 100644 --- a/crates/genomehubs-api/src/routes/report.rs +++ b/crates/genomehubs-api/src/routes/report.rs @@ -8,11 +8,15 @@ use genomehubs_query::query::{QueryParams, SearchQuery}; use crate::{index_name, report::report_types, routes::ApiStatus, AppState}; -#[derive(utoipa::ToSchema)] +#[derive(Serialize, utoipa::ToSchema)] pub struct ReportRequest { pub query_yaml: String, pub params_yaml: String, pub report_yaml: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub include_plot_spec: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub display: Option, } impl<'de> Deserialize<'de> for ReportRequest { @@ -55,10 +59,15 @@ impl<'de> Deserialize<'de> for ReportRequest { return Err(de::Error::missing_field("report or 
report_yaml")); }; + let include_plot_spec = map.get("include_plot_spec").and_then(|v| v.as_bool()); + let display = map.get("display").cloned(); + Ok(ReportRequest { query_yaml, params_yaml, report_yaml, + include_plot_spec, + display, }) } } @@ -67,6 +76,8 @@ impl<'de> Deserialize<'de> for ReportRequest { pub struct ReportResponse { pub status: ApiStatus, pub report: Value, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub plot_spec: Option, } #[utoipa::path( @@ -88,6 +99,7 @@ pub async fn post_report( return Json(ReportResponse { status: ApiStatus::error($msg), report: Value::Null, + plot_spec: None, }) }; } @@ -209,15 +221,33 @@ pub async fn post_report( unknown => Err(format!("unknown report type: {unknown}")), }; - // Return response + // Return response (optionally include a minimal PlotSpec when requested) match result { - Ok((hits, took, report_data)) => Json(ReportResponse { - status: ApiStatus::query_ok(hits, took), - report: report_data, - }), + Ok((hits, took, report_data)) => { + let plot_spec_value: Option = + if req.include_plot_spec.unwrap_or(false) || req.display.is_some() { + match crate::report::spec_builder::build_plot_spec( + report_type, + &report_data, + req.display.as_ref(), + ) { + Ok(spec) => serde_json::to_value(&spec).ok(), + Err(_) => None, + } + } else { + None + }; + + Json(ReportResponse { + status: ApiStatus::query_ok(hits, took), + report: report_data, + plot_spec: plot_spec_value, + }) + } Err(e) => Json(ReportResponse { status: ApiStatus::error(e), report: Value::Null, + plot_spec: None, }), } } diff --git a/crates/genomehubs-api/src/routes/report_batch.rs b/crates/genomehubs-api/src/routes/report_batch.rs new file mode 100644 index 0000000..7abf6d3 --- /dev/null +++ b/crates/genomehubs-api/src/routes/report_batch.rs @@ -0,0 +1,137 @@ +use axum::{extract::Json, Extension}; +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::sync::Arc; + +use crate::{routes::ApiStatus, AppState}; + +/// Batch 
request for running multiple reports in one HTTP call. +#[derive(Deserialize, utoipa::ToSchema)] +pub struct ReportBatchRequest { + /// Array of report requests to execute in batch (max 50). + pub reports: Vec, + /// Optional concurrency limit (1..=32). + #[serde(default, skip_serializing_if = "Option::is_none")] + pub concurrency: Option, + /// Optionally request a combined PlotSpec for the batch and provide + /// display hints that apply to the combined spec. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub include_plot_spec: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub display: Option, +} + +#[derive(Serialize, utoipa::ToSchema)] +pub struct ReportBatchResultItem { + pub status: ApiStatus, + pub report: Value, + #[serde(skip_serializing_if = "Option::is_none")] + pub plot_spec: Option, +} + +#[derive(Serialize, utoipa::ToSchema)] +pub struct ReportBatchResponse { + pub status: ApiStatus, + /// Per-request results in the same order as the input `reports`. 
+ pub results: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + pub plot_spec: Option, +} + +#[utoipa::path( + post, + path = "/api/v3/report/batch", + tag = "Data", + summary = "Generate multiple reports in a single request", + description = "Execute multiple report requests concurrently; returns per-item report responses.", + request_body(content = ReportBatchRequest), + responses((status = 200, description = "Batch report results", body = ReportBatchResponse)) +)] +#[axum::debug_handler] +pub async fn post_report_batch( + Extension(state): Extension>, + Json(req): Json, +) -> Json { + if req.reports.len() > 50 { + return Json(ReportBatchResponse { + status: ApiStatus::error("maximum 50 reports per request".to_string()), + results: vec![], + plot_spec: None, + }); + } + + let concurrency = req.concurrency.unwrap_or(8).clamp(1, 32); + let semaphore = Arc::new(tokio::sync::Semaphore::new(concurrency)); + + // Spawn a task per report; each task acquires a semaphore permit so we bound + // the number of concurrently-executing handlers. + let mut handles = Vec::with_capacity(req.reports.len()); + for report_req in req.reports.into_iter() { + let sem = semaphore.clone(); + let st = state.clone(); + let handle = tokio::spawn(async move { + let _permit = sem.acquire_owned().await.expect("semaphore closed"); + // Call the existing single-report handler directly so we reuse + // the same parsing, chain resolution, and dispatch logic. + let resp = crate::routes::report::post_report(Extension(st), Json(report_req)).await; + let Json(report_resp) = resp; + ReportBatchResultItem { + status: report_resp.status, + report: report_resp.report, + plot_spec: report_resp.plot_spec, + } + }); + handles.push(handle); + } + + // Await all tasks and preserve input order. 
+ let mut results: Vec = Vec::with_capacity(handles.len()); + for h in handles { + match h.await { + Ok(item) => results.push(item), + Err(e) => results.push(ReportBatchResultItem { + status: ApiStatus::error(format!("task join failed: {e}")), + report: Value::Null, + plot_spec: None, + }), + } + } + + // If the caller requested a batch-level PlotSpec (or supplied a top-level + // `display`), attempt to build a combined arc PlotSpec from any arc + // reports in the results. We only produce a combined spec when there + // are arc-type reports present. + let top_plot_spec: Option = + if req.include_plot_spec.unwrap_or(false) || req.display.is_some() { + let arc_reports: Vec = results + .iter() + .filter_map(|r| { + r.report + .get("type") + .and_then(|v| v.as_str()) + .filter(|s| *s == "arc") + .map(|_| r.report.clone()) + }) + .collect(); + + if !arc_reports.is_empty() { + match crate::report::spec_builder::build_arc_plot_spec_from_reports( + &arc_reports, + req.display.as_ref(), + ) { + Ok(spec) => serde_json::to_value(&spec).ok(), + Err(_) => None, + } + } else { + None + } + } else { + None + }; + + Json(ReportBatchResponse { + status: ApiStatus::ok(), + results, + plot_spec: top_plot_spec, + }) +} diff --git a/crates/genomehubs-api/target/openapi.json b/crates/genomehubs-api/target/openapi.json index 02cb627..125309a 100644 --- a/crates/genomehubs-api/target/openapi.json +++ b/crates/genomehubs-api/target/openapi.json @@ -588,6 +588,38 @@ } } }, + "/api/v3/report/batch": { + "post": { + "tags": [ + "Data" + ], + "summary": "Generate multiple reports in a single request", + "description": "Execute multiple report requests concurrently; returns per-item report responses.", + "operationId": "post_report_batch", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ReportBatchRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Batch report results", + "content": { + 
"application/json": { + "schema": { + "$ref": "#/components/schemas/ReportBatchResponse" + } + } + } + } + } + } + }, "/api/v3/search": { "get": { "tags": [ @@ -1390,6 +1422,16 @@ "positional_yaml" ], "properties": { + "display": { + "description": "Optional display spec (JSON) applied to the returned PlotSpec." + }, + "include_plot_spec": { + "type": [ + "boolean", + "null" + ], + "description": "When true, produce a serialisable `PlotSpec` alongside the report." + }, "positional_yaml": { "type": "string" }, @@ -1409,6 +1451,7 @@ "report" ], "properties": { + "plot_spec": {}, "report": {}, "status": { "$ref": "#/components/schemas/ApiStatus" @@ -1531,6 +1574,72 @@ } } }, + "ReportBatchRequest": { + "type": "object", + "description": "Batch request for running multiple reports in one HTTP call.", + "required": [ + "reports" + ], + "properties": { + "concurrency": { + "type": [ + "integer", + "null" + ], + "description": "Optional concurrency limit (1..=32).", + "minimum": 0 + }, + "display": {}, + "include_plot_spec": { + "type": [ + "boolean", + "null" + ], + "description": "Optionally request a combined PlotSpec for the batch and provide\ndisplay hints that apply to the combined spec." + }, + "reports": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ReportRequest" + }, + "description": "Array of report requests to execute in batch (max 50)." + } + } + }, + "ReportBatchResponse": { + "type": "object", + "required": [ + "status", + "results" + ], + "properties": { + "plot_spec": {}, + "results": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ReportBatchResultItem" + }, + "description": "Per-request results in the same order as the input `reports`." 
+ }, + "status": { + "$ref": "#/components/schemas/ApiStatus" + } + } + }, + "ReportBatchResultItem": { + "type": "object", + "required": [ + "status", + "report" + ], + "properties": { + "plot_spec": {}, + "report": {}, + "status": { + "$ref": "#/components/schemas/ApiStatus" + } + } + }, "ReportRequest": { "type": "object", "required": [ @@ -1539,6 +1648,13 @@ "report_yaml" ], "properties": { + "display": {}, + "include_plot_spec": { + "type": [ + "boolean", + "null" + ] + }, "params_yaml": { "type": "string" }, @@ -1557,6 +1673,7 @@ "report" ], "properties": { + "plot_spec": {}, "report": {}, "status": { "$ref": "#/components/schemas/ApiStatus" diff --git a/crates/genomehubs-api/tests/e2e_scatter_axis_types.rs b/crates/genomehubs-api/tests/e2e_scatter_axis_types.rs new file mode 100644 index 0000000..ebbdeec --- /dev/null +++ b/crates/genomehubs-api/tests/e2e_scatter_axis_types.rs @@ -0,0 +1,113 @@ +use serde_json::{json, Value}; + +use reqwest::Client; + +use genomehubs_query::report::plot_spec_to_vega_lite_json; + +// This is an end-to-end test that posts to a running API at localhost:3000. +// It iterates a small set of axis-type combinations in both raw and binned +// modes and asserts the server-provided `plot_spec` includes axis `value_type` +// and that the converter produces a valid Vega-Lite spec (no error payload). 
+ +#[tokio::test] +async fn e2e_scatter_axis_type_permutations() -> Result<(), Box> { + let client = Client::new(); + let base_url = + std::env::var("GH_API_URL").unwrap_or_else(|_| "http://localhost:3000".to_string()); + let url = format!("{}/api/v3/report", base_url); + + // Axis candidates: rank-like (genus), numeric, keyword, date + let axes = vec!["genus", "assembly_span", "assembly_level", "assembly_date"]; + + for x in &axes { + for y in &axes { + for &threshold in &[1000_i64, 10_i64] { + let req_body = json!({ + "query": {"index":"taxon", "taxa": ["canidae"], "taxon_filter_type": "tree"}, + "params": {}, + "report": {"report":"scatter", "x": x, "y": y, "scatter_threshold": threshold}, + "include_plot_spec": true, + "display": {"title": format!("scatter {} vs {} thresh {}", x, y, threshold)} + }); + + let resp = client + .post(&url) + .header("accept", "application/json") + .json(&req_body) + .send() + .await?; + + let status = resp.status(); + if !status.is_success() { + let body = resp.text().await.unwrap_or_default(); + panic!( + "API returned non-success for x={} y={} threshold={}: status={} body={}", + x, y, threshold, status, body + ); + } + + let resp_json: Value = resp.json().await?; + + dbg!(&resp_json); + + let plot_spec = resp_json + .get("plot_spec") + .cloned() + .ok_or_else(|| format!("no plot_spec in response for x={} y={}", x, y))?; + + // Server must provide authoritative axis value types + let x_vt = plot_spec + .get("x") + .and_then(|v| v.get("value_type")) + .and_then(|v| v.as_str()) + .ok_or_else(|| format!("plot_spec.x.value_type missing for x={} y={}", x, y))?; + let y_vt = plot_spec + .get("y") + .and_then(|v| v.get("value_type")) + .and_then(|v| v.as_str()) + .ok_or_else(|| format!("plot_spec.y.value_type missing for x={} y={}", x, y))?; + + eprintln!( + "Testing x={} ({}) y={} ({}) threshold={}", + x, x_vt, y, y_vt, threshold + ); + + // Convert plot_spec to Vega-Lite using the workspace converter + let ps_str = 
serde_json::to_string(&plot_spec)?; + let vl_json_str = plot_spec_to_vega_lite_json(&ps_str); + let vl_val: Value = serde_json::from_str(&vl_json_str).map_err(|e| { + format!( + "converter returned invalid JSON: {} -- payload: {}", + e, vl_json_str + ) + })?; + + if vl_val.get("error").is_some() { + panic!( + "converter returned error for x={} y={} threshold={}: {}", + x, y, threshold, vl_json_str + ); + } + + // Determine mark type (supports either string or object `mark` forms) + let mark_type = match vl_val.get("mark") { + Some(Value::String(s)) => s.clone(), + Some(Value::Object(obj)) => obj + .get("type") + .and_then(|t| t.as_str()) + .unwrap_or("") + .to_string(), + _ => "".to_string(), + }; + + if threshold >= 1000 { + assert!(mark_type == "point" || mark_type == "circle" || mark_type == "symbol", "expected point-like mark for raw mode but got {:?} for x={} y={} threshold={}", mark_type, x, y, threshold); + } else { + assert!(mark_type == "rect" || mark_type == "bar", "expected rect/bar mark for binned mode but got {:?} for x={} y={} threshold={}", mark_type, x, y, threshold); + } + } + } + } + + Ok(()) +} diff --git a/crates/genomehubs-query/src/local_report/builder.rs b/crates/genomehubs-query/src/local_report/builder.rs index 9cc40e1..31da2e8 100644 --- a/crates/genomehubs-query/src/local_report/builder.rs +++ b/crates/genomehubs-query/src/local_report/builder.rs @@ -121,6 +121,7 @@ pub fn local_plot_spec( x: Some(x_meta), y: y_meta, z: None, + cat: None, series: vec![], display, data, diff --git a/crates/genomehubs-query/src/report/axis.rs b/crates/genomehubs-query/src/report/axis.rs index 456ac6b..295a675 100644 --- a/crates/genomehubs-query/src/report/axis.rs +++ b/crates/genomehubs-query/src/report/axis.rs @@ -64,7 +64,6 @@ pub enum DateInterval { Month, Quarter, Year, - Decade, } impl DateInterval { @@ -74,9 +73,8 @@ impl DateInterval { DateInterval::Day => "1d", DateInterval::Week => "1w", DateInterval::Month => "1M", - DateInterval::Quarter => 
"3M", + DateInterval::Quarter => "1q", DateInterval::Year => "1y", - DateInterval::Decade => "10y", } } } @@ -356,9 +354,8 @@ fn parse_date_interval(s: &str) -> Option { "day" | "1d" => Some(DateInterval::Day), "week" | "1w" => Some(DateInterval::Week), "month" | "1M" | "1m" => Some(DateInterval::Month), - "quarter" | "3M" | "3m" => Some(DateInterval::Quarter), + "quarter" | "3M" | "3m" | "1q" => Some(DateInterval::Quarter), "year" | "1y" => Some(DateInterval::Year), - "decade" | "10y" => Some(DateInterval::Decade), _ => None, } } @@ -700,10 +697,9 @@ mod tests { ("1M", DateInterval::Month), ("quarter", DateInterval::Quarter), ("3M", DateInterval::Quarter), + ("1q", DateInterval::Quarter), ("year", DateInterval::Year), ("1y", DateInterval::Year), - ("decade", DateInterval::Decade), - ("10y", DateInterval::Decade), ]; for (interval_str, expected_interval) in intervals { let opts = AxisOpts::parse(&format!(";;;;;{}", interval_str)); @@ -721,9 +717,8 @@ mod tests { assert_eq!(DateInterval::Day.to_es_interval(), "1d"); assert_eq!(DateInterval::Week.to_es_interval(), "1w"); assert_eq!(DateInterval::Month.to_es_interval(), "1M"); - assert_eq!(DateInterval::Quarter.to_es_interval(), "3M"); + assert_eq!(DateInterval::Quarter.to_es_interval(), "1q"); assert_eq!(DateInterval::Year.to_es_interval(), "1y"); - assert_eq!(DateInterval::Decade.to_es_interval(), "10y"); } // ── AxisSpec tests ── diff --git a/crates/genomehubs-query/src/report/display.rs b/crates/genomehubs-query/src/report/display.rs index b76721f..a7288ff 100644 --- a/crates/genomehubs-query/src/report/display.rs +++ b/crates/genomehubs-query/src/report/display.rs @@ -94,6 +94,9 @@ pub struct AxisOptions { pub struct HistogramOptions { /// Stack category series instead of overlaying them. pub stacked: Option, + /// Display mode for categorized histograms: "stacked", "grouped", or "facet". + /// When present, overrides `stacked` where applicable. 
+ pub mode: Option, /// Cumulative sum mode: each bar shows the sum of all preceding bars. pub cumulative: Option, /// Y-axis scale: `"linear"` (default), `"log10"`, or `"proportion"`. @@ -203,6 +206,11 @@ pub struct TreeOptions { pub struct ArcOptions { /// Show percentage labels inside arc segments (default: `true`). pub show_labels: Option, + /// Display mode for multiple arcs: "grouped", or "facet". + pub mode: Option, + /// Shape of the arcs: "auto" (default), "ring", "pie", "rainbow", "horizontal" or "vertical". + /// "auto" resolves to "ring" for single arc, and "rainbow" for 2 or more. + pub shape: Option, } /// Sources data-attribution bar chart display options. diff --git a/crates/genomehubs-query/src/report/mod.rs b/crates/genomehubs-query/src/report/mod.rs index a98d98c..634cd98 100644 --- a/crates/genomehubs-query/src/report/mod.rs +++ b/crates/genomehubs-query/src/report/mod.rs @@ -207,7 +207,7 @@ pub fn report_yaml_from_url_params(url: &str) -> Result<(String, String, String) // ── Vega-Lite conversion ────────────────────────────────────────────────────── -/// Convert a `PlotSpec` JSON string into a Vega-Lite v5 specification JSON string. +/// Convert a `PlotSpec` JSON string into a Vega-Lite v6 specification JSON string. /// /// Accepts the full `/report` response envelope (extracts `plot_spec` automatically) /// or a bare `PlotSpec` object. Returns an error JSON on failure. 
@@ -244,7 +244,7 @@ pub fn plot_spec_to_vega_lite_json(input: &str) -> String { .unwrap_or(12.0); let mut base = serde_json::json!({ - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "$schema": "https://vega.github.io/schema/vega-lite/v6.json", "width": width, "height": height, "config": { @@ -277,8 +277,20 @@ pub fn plot_spec_to_vega_lite_json(input: &str) -> String { base } "arc" => { - base["mark"] = serde_json::Value::String("arc".to_string()); - base + // Special-case batch arc payloads produced by the API: these use + // data.type == "arc_batch" and contain `entries` with per-ring + // `scaled` values (0..1) and `report_index`/`report_label`. + if spec_val + .get("data") + .and_then(|d| d.get("type")) + .and_then(|t| t.as_str()) + == Some("arc_batch") + { + vl_arc_batch(spec_val, base) + } else { + base["mark"] = serde_json::Value::String("arc".to_string()); + base + } } _ => base, }; @@ -289,26 +301,273 @@ pub fn plot_spec_to_vega_lite_json(input: &str) -> String { } } -fn vl_histogram(spec: &serde_json::Value, mut base: serde_json::Value) -> serde_json::Value { - let x_meta = spec.get("x").unwrap_or(&serde_json::Value::Null); - let x_field = x_meta - .get("field") +/// Build a Vega-Lite encoding object for an axis given optional server-side +/// axis metadata and optional numeric boundaries (bin edges). +fn make_vl_axis_encoding( + axis_meta_opt: Option<&serde_json::Value>, + data_field: &str, + label_hint: Option<&str>, + boundaries_opt: Option<&[f64]>, + prefer_nominal: bool, + z_index: Option, +) -> Result { + // Require server-provided axis metadata including `value_type`. 
+ let meta = axis_meta_opt.ok_or_else(|| { + format!( + "missing axis metadata for field '{}' — server must provide axis.value_type", + data_field + ) + })?; + + let value_type = meta + .get("value_type") .and_then(|v| v.as_str()) - .unwrap_or("key"); - let x_label = x_meta + .ok_or_else(|| format!("axis metadata for '{}' lacks 'value_type'", data_field))?; + + let label = meta .get("label") .and_then(|v| v.as_str()) - .unwrap_or(x_field); - let x_scale_str = x_meta + .or(label_hint) + .unwrap_or(data_field); + + let scale_hint = meta .get("scale") .and_then(|v| v.as_str()) .unwrap_or("linear"); - let x_vl_scale = if x_scale_str == "log10" { + + // Map canonical server `value_type` strings to Vega-Lite types deterministically. + let vl_type = if prefer_nominal || value_type == "keyword" { + "nominal" + } else if value_type == "date" { + "temporal" + } else if value_type == "float" + || value_type == "integer" + || value_type == "number" + || value_type == "coordinate" + { + "quantitative" + } else { + return Err(format!( + "unknown axis value_type '{}' for field '{}'", + value_type, data_field + )); + }; + + let scale_type = if vl_type == "nominal" { + "band" + } else if vl_type == "temporal" { + "time" + } else if scale_hint == "log10" { "log" } else { "linear" }; + // Helper to convert JSON values (number or numeric string) to f64 + fn json_to_f64(v: &serde_json::Value) -> Option { + if let Some(n) = v.as_f64() { + Some(n) + } else if let Some(s) = v.as_str() { + s.parse::().ok() + } else { + None + } + } + + // Tick values: explicit computed boundaries (if provided) because these + // are derived from bin edges and are usually the best ticks for histograms; + // otherwise use server-provided tick values if present and non-empty. 
+ let tick_values_json = if let Some(b) = boundaries_opt { + if vl_type == "temporal" { + Some(serde_json::Value::Array( + b.iter() + .map(|v| serde_json::Value::Number((*v as i64).into())) + .collect(), + )) + } else { + Some(serde_json::Value::Array( + b.iter().map(|v| serde_json::Value::from(*v)).collect(), + )) + } + } else { + meta.get("tick_values") + .and_then(|tv| tv.as_array()) + .and_then(|arr| { + if arr.is_empty() { + None + } else if vl_type == "temporal" { + // Convert numeric tick values to datetime signals; leave strings alone + Some(serde_json::Value::Array( + arr.iter() + .map(|v| { + if let Some(n) = v.as_f64() { + serde_json::json!({"signal": format!("datetime({})", n as i64)}) + } else if let Some(s) = v.as_str() { + serde_json::Value::String(s.to_string()) + } else { + v.clone() + } + }) + .collect(), + )) + } else { + Some(serde_json::Value::Array(arr.clone())) + } + }) + }; + + // Domain: prefer explicit computed boundaries (if provided) because these + // are derived from bin edges and are usually the correct visual domain; + // otherwise fall back to a server-provided domain (robustly parsed). 
+ let domain_opt = boundaries_opt + .and_then(|b| { + if !b.is_empty() { + Some((b[0], *b.last().unwrap())) + } else { + None + } + }) + .or_else(|| { + meta.get("domain") + .and_then(|d| d.as_array()) + .and_then(|arr| { + if arr.len() >= 2 { + let lo = json_to_f64(&arr[0]).unwrap_or(0.0); + let hi = json_to_f64(&arr[1]).unwrap_or(lo + 1.0); + Some((lo, hi)) + } else { + None + } + }) + }); + + // Build scale object + let mut scale_obj = serde_json::Map::new(); + scale_obj.insert( + "type".to_string(), + serde_json::Value::String(scale_type.to_string()), + ); + if let Some((lo, hi)) = domain_opt { + if vl_type == "temporal" { + scale_obj.insert( + "domain".to_string(), + serde_json::Value::Array(vec![ + serde_json::Value::Number((lo as i64).into()), + serde_json::Value::Number((hi as i64).into()), + ]), + ); + } else { + scale_obj.insert("domain".to_string(), serde_json::json!([lo, hi])); + } + } + if vl_type == "nominal" { + scale_obj.insert( + "paddingOuter".to_string(), + serde_json::Value::Number((0).into()), + ); + // Remove inner padding so adjacent categorical bars fill the full + // width between ticks (useful for histogram-style categorical axes). + scale_obj.insert( + "paddingInner".to_string(), + serde_json::Value::Number((0).into()), + ); + // If the server provided explicit tick values for a nominal axis, + // use them as the scale domain to preserve ordering (e.g. taxon id + // list or human-readable bucket labels). 
+ #[allow(clippy::collapsible_match)] + if let Some(tv) = &tick_values_json { + if let serde_json::Value::Array(arr) = tv { + if !arr.is_empty() { + scale_obj.insert("domain".to_string(), serde_json::Value::Array(arr.clone())); + } + } + } + } + + // Build axis object + let mut axis_obj = serde_json::Map::new(); + axis_obj.insert( + "title".to_string(), + serde_json::Value::String(label.to_string()), + ); + if let Some(tv) = tick_values_json { + axis_obj.insert("values".to_string(), tv); + } + if vl_type == "temporal" { + // Choose a sensible date format. Prefer server-declared interval when + // present (e.g. "year" -> show year only). Otherwise heuristically + // infer from computed boundaries if available. + let mut date_fmt = "%Y-%m-%d".to_string(); + if let Some(interval_str) = meta.get("interval").and_then(|v| v.as_str()) { + match interval_str { + "year" | "decade" => date_fmt = "%Y".to_string(), + "month" | "quarter" => date_fmt = "%Y-%m".to_string(), + _ => date_fmt = "%Y-%m-%d".to_string(), + } + } else if let Some(b) = boundaries_opt { + if b.len() >= 2 { + let width = (b[1] - b[0]).abs(); + let day_ms = 86400.0 * 1000.0; + let year_ms = 365.0 * day_ms; + if width >= year_ms { + date_fmt = "%Y".to_string(); + } else if width >= 28.0 * day_ms { + date_fmt = "%Y-%m".to_string(); + } else { + date_fmt = "%Y-%m-%d".to_string(); + } + } + } + axis_obj.insert("format".to_string(), serde_json::Value::String(date_fmt)); + } else if vl_type == "quantitative" { + axis_obj.insert( + "format".to_string(), + serde_json::Value::String(".3s".to_string()), + ); + } else if vl_type == "nominal" { + axis_obj.insert("grid".to_string(), serde_json::Value::Bool(true)); + axis_obj.insert( + "tickBand".to_string(), + serde_json::Value::String("extent".to_string()), + ); + } + if let Some(z) = z_index { + axis_obj.insert("zindex".to_string(), serde_json::Value::Number(z.into())); + } + + Ok(serde_json::json!({ + "field": data_field, + "type": vl_type, + "scale": 
serde_json::Value::Object(scale_obj), + "axis": serde_json::Value::Object(axis_obj) + })) +} + +/// Compute the pixel width for a single bar in a grouped histogram or scatter bar chart. +/// +/// Divides the available `plot_width_px` evenly across `n_bins` bins, reserves 90 % +/// of each bin for bar content, then splits that space evenly among `n_cats` categories. +/// The result is clamped to a minimum of 2 px so bars remain visible for large datasets. +fn grouped_bar_size_px(n_bins: f64, n_cats: f64, plot_width_px: f64) -> f64 { + let bin_pixel = if n_bins > 0.0 { + plot_width_px / n_bins + } else { + 10.0 + }; + ((bin_pixel * 0.9) / n_cats.max(1.0)).max(2.0) +} + +fn vl_histogram(spec: &serde_json::Value, mut base: serde_json::Value) -> serde_json::Value { + let x_meta = spec.get("x").unwrap_or(&serde_json::Value::Null); + let x_field = x_meta + .get("field") + .and_then(|v| v.as_str()) + .unwrap_or("key"); + let x_label = x_meta + .get("label") + .and_then(|v| v.as_str()) + .unwrap_or(x_field); + let display = spec.get("display").unwrap_or(&serde_json::Value::Null); let hist = display.get("histogram").unwrap_or(&serde_json::Value::Null); let y_scale_str = hist @@ -320,37 +579,875 @@ fn vl_histogram(spec: &serde_json::Value, mut base: serde_json::Value) -> serde_ } else { "linear" }; + let y_min = match y_vl_scale { + "log" => 1.0, // avoid log(0) issues + _ => 0.0, + }; let y_label = display .get("y_label") .and_then(|v| v.as_str()) .unwrap_or("Count"); - let buckets = spec + // Transform ES-style buckets (key, doc_count) into left/right bar values + // with explicit `x` (left) and `x2` (right) so Vega-Lite draws bars with + // bin boundaries. Also compute axis `values` ticks at each boundary. 
+ let raw_buckets = spec .get("data") .and_then(|d| d.get("buckets")) .cloned() .unwrap_or_else(|| serde_json::json!([])); - base["data"] = serde_json::json!({"values": buckets}); + // If the server provided per-category breakdowns (`by_cat`) produce + // a long-form dataset so Vega-Lite can render stacked/grouped/faceted + // histograms. Preserve axis metadata for numeric/date axes by keeping + // `x` as numeric/temporal values when possible so tick formatting is + // delegated to `make_vl_axis_encoding` (server-side metadata remains + // authoritative). + if let Some(by_cat_val) = spec.get("data").and_then(|d| d.get("by_cat")).cloned() { + if by_cat_val.is_object() { + let hist_mode = hist + .get("mode") + .and_then(|v| v.as_str()) + .unwrap_or("grouped"); + let hist_cumulative = hist + .get("cumulative") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + // Category order preference: explicit `data.cats` else object keys order + let cats: Vec = spec + .get("data") + .and_then(|d| d.get("cats")) + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .map(|s| s.as_str().unwrap_or("").to_string()) + .collect() + }) + .unwrap_or_else(|| by_cat_val.as_object().unwrap().keys().cloned().collect()); + + // Optional mapping from raw cat key -> human-readable label. If + // the server provided `report_data.cat.tick_labels` and a + // parallel `report_data.cats` ordering, use that to map keys to + // friendly labels for display (legend, color). This preserves + // raw keys for lookups while presenting readable names. 
+ let mut cat_label_map: Option> = None; + if let Some(lbls) = spec + .get("data") + .and_then(|d| d.get("cat")) + .and_then(|c| c.get("tick_labels")) + .and_then(|v| v.as_array()) + { + if lbls.len() == cats.len() { + let mut m = std::collections::HashMap::new(); + for (i, k) in cats.iter().enumerate() { + if let Some(lbl) = lbls.get(i).and_then(|v| v.as_str()) { + m.insert(k.clone(), lbl.to_string()); + } + } + if !m.is_empty() { + cat_label_map = Some(m); + } + } + } + + // Decide how to treat the x axis: preserve numeric/temporal type + // when the server indicates it (so axis ticks/formatting are kept). + let x_value_type = x_meta + .get("value_type") + .and_then(|v| v.as_str()) + .unwrap_or("keyword"); + + // Helper: extract bucket count for (cat, idx) + let get_count = |cat: &str, idx: usize| -> f64 { + by_cat_val + .get(cat) + .and_then(|arr| arr.as_array()) + .and_then(|a| a.get(idx)) + .and_then(|v| v.as_f64()) + .unwrap_or(0.0) + }; + + if x_value_type == "keyword" { + // Categorical: previous behaviour — string `x` values with server + // ordering preserved; support stacked/grouped/facet via color/xOffset/facet. + let bucket_labels: Vec = raw_buckets + .as_array() + .map(|arr| { + arr.iter() + .map(|b| { + b.get("label") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .or_else(|| { + b.get("key").and_then(|k| k.as_str().map(|s| s.to_string())) + }) + .or_else(|| { + b.get("id").and_then(|k| k.as_str().map(|s| s.to_string())) + }) + .unwrap_or_else(|| b.to_string()) + }) + .collect() + }) + .unwrap_or_default(); + + // Long-form values and compute per-bucket sums. Support + // cumulative mode by maintaining running totals per category. 
+ let mut values: Vec = Vec::new(); + let mut max_sum: f64 = 0.0; + let mut running: Vec = vec![0.0; cats.len()]; + let mut cat_max: Vec = vec![0.0; cats.len()]; + for (i, bl) in bucket_labels.iter().enumerate() { + let mut bucket_sum = 0.0_f64; + for (ci, cat) in cats.iter().enumerate() { + let count = get_count(cat, i); + let display_count = if hist_cumulative { + running[ci] += count; + running[ci] + } else { + count + }; + // track per-category maximum for grouped/facet domains + if display_count > cat_max[ci] { + cat_max[ci] = display_count; + } + bucket_sum += display_count; + let mut obj = serde_json::Map::new(); + obj.insert("x".to_string(), serde_json::Value::String(bl.clone())); + let display_cat = cat_label_map + .as_ref() + .and_then(|m| m.get(cat)) + .cloned() + .unwrap_or_else(|| cat.clone()); + obj.insert("cat".to_string(), serde_json::Value::String(display_cat)); + obj.insert( + "doc_count".to_string(), + serde_json::Value::from(display_count), + ); + values.push(serde_json::Value::Object(obj)); + } + if bucket_sum > max_sum { + max_sum = bucket_sum; + } + } + let max_cat_max = cat_max.iter().cloned().fold(0.0_f64, f64::max); + + // X encoding uses nominal domain derived from bucket labels + let x_meta_override = + serde_json::json!({"tick_values": bucket_labels, "value_type": "keyword"}); + let mut x_encoding = match make_vl_axis_encoding( + Some(&x_meta_override), + "x", + Some(x_label), + None, + true, + Some(1), + ) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + + // When grouped mode is requested on a nominal x axis, allow + // some inner padding so `xOffset` can place multiple bars + // side-by-side inside each band. Default `paddingInner=0` + // would make bars occupy the full band and overlap. 
+ if hist_mode == "grouped" { + if let Some(x_enc_obj) = x_encoding.as_object() { + if let Some(scale_val) = x_enc_obj.get("scale").cloned() { + if scale_val.is_object() { + let mut scale_map = + scale_val.as_object().cloned().unwrap_or_default(); + scale_map.insert("paddingInner".to_string(), serde_json::json!(0)); + scale_map.insert("paddingOuter".to_string(), serde_json::json!(0)); + // replace the scale in x_encoding + if let Some(x_enc_obj_mut) = x_encoding.as_object_mut() { + x_enc_obj_mut.insert( + "scale".to_string(), + serde_json::Value::Object(scale_map), + ); + } + } + } + } + } + + // Y axis encoding: doc_count; prefer quantitative with sensible domain + let y_axis_meta = spec.get("y"); + let mut y_encoding = match make_vl_axis_encoding( + y_axis_meta, + "doc_count", + Some(y_label), + None, + false, + None, + ) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + // Determine y domain depending on histogram mode: + // - stacked: domain = max total per bin (max_sum) + // - grouped/facet: domain = max per-category bar height (max_cat_max) + let desired_y_max = match hist_mode { + "grouped" | "facet" => { + if max_cat_max > 0.0 { + max_cat_max + } else { + 1.0 + } + } + _ => { + if max_sum > 0.0 { + max_sum + } else { + 1.0 + } + } + }; + + if let Some(obj) = y_encoding.as_object_mut() { + obj.insert( + "aggregate".to_string(), + serde_json::Value::String("sum".to_string()), + ); + if let Some(scale_val) = obj.get_mut("scale") { + if scale_val.is_object() { + scale_val.as_object_mut().unwrap().insert( + "domain".to_string(), + serde_json::json!([0.0, desired_y_max]), + ); + } + } + } + + base["data"] = serde_json::json!({"values": values}); + + // Compute pixel-based grouped bar size for categorical bins + let plot_width_px = + base.get("width").and_then(|v| v.as_u64()).unwrap_or(600) as f64; + let grouped_bar_px = grouped_bar_size_px( + bucket_labels.len() as f64, + cats.len() as f64, + plot_width_px, + ); + + match hist_mode { + 
"grouped" => { + // Use xOffset to separate categories within each bucket. + // Also explicitly disable stacking so viewers that auto-stack + // aggregated colour channels will render grouped bars. + if let Some(y_obj) = y_encoding.as_object_mut() { + y_obj.insert("stack".to_string(), serde_json::Value::Null); + } + base["mark"] = serde_json::json!({"type": "bar", "size": grouped_bar_px}); + base["encoding"] = serde_json::json!({ + "x": x_encoding, + "y": y_encoding, + "color": {"field": "cat", "type": "nominal"}, + "xOffset": {"field": "cat", "type": "nominal"} + }); + return base; + } + "facet" => { + // Use facet: row by category (small multiples). Place the + // `mark` and `encoding` inside `spec` only; do not set a + // top-level `mark` (invalid with `facet`). + let spec_obj = serde_json::json!({ + "mark": {"type": "bar"}, + "encoding": {"x": x_encoding, "y": y_encoding, "y2": {"datum": y_min}} + }); + base["facet"] = + serde_json::json!({"row": {"field": "cat", "type": "nominal"}}); + base["spec"] = spec_obj; + // Keep colour/legend off for facet default; caller can style separately + return base; + } + _ => { + // default: stacked — let Vega-Lite perform stacking via + // the `color` encoding and aggregated `y`. + base["mark"] = serde_json::json!({"type": "bar", "size": grouped_bar_px}); + base["encoding"] = serde_json::json!({ + "x": x_encoding, + "y": y_encoding, + "color": {"field": "cat", "type": "nominal"} + }); + return base; + } + } + } else { + // Numeric / temporal buckets: preserve numeric x values so axis + // formatting and tick values computed by the server are retained. + // Compute numeric keys and boundaries similar to the non-cat path. 
+ let mut keys_num: Vec = Vec::new(); + if let Some(arr) = raw_buckets.as_array() { + for b in arr { + if let Some(k) = b.get("key").and_then(|v| v.as_f64()) { + keys_num.push(k); + } + } + } + + // Determine bin width + let width = if keys_num.len() >= 2 { + keys_num[1] - keys_num[0] + } else if let Some(domain_arr) = x_meta.get("domain").and_then(|d| d.as_array()) { + if domain_arr.len() >= 2 { + let lo = domain_arr[0].as_f64().unwrap_or(0.0); + let hi = domain_arr[1].as_f64().unwrap_or(lo + 1.0); + let tick_count = x_meta + .get("tickCount") + .and_then(|v| v.as_u64()) + .unwrap_or(10) as f64; + (hi - lo) / tick_count.max(1.0) + } else { + 1.0 + } + } else { + 1.0 + }; + + // Build numeric boundaries + let mut boundaries_f64: Vec = Vec::new(); + if !keys_num.is_empty() { + for k in &keys_num { + boundaries_f64.push(*k); + } + let last_right = if keys_num.len() >= 2 { + keys_num[keys_num.len() - 1] + (keys_num[1] - keys_num[0]) + } else { + keys_num[0] + width + }; + boundaries_f64.push(last_right); + } + + // Long-form numeric values with either (x,x2) for stacked mode + // or center-based `x` for grouped mode so bars can be narrower + // and offset with `xOffset`. 
+ let mut values: Vec = Vec::new(); + let mut max_sum: f64 = 0.0; + + // Precompute bin centers + let mut centers: Vec = Vec::new(); + for (i, left) in keys_num.iter().enumerate() { + let right = if i + 1 < keys_num.len() { + keys_num[i + 1] + } else { + left + width + }; + centers.push(left + (right - left) / 2.0); + } + + // Compute pixel-based bar size for grouped mode so bars fit side-by-side + let plot_width_px = + base.get("width").and_then(|v| v.as_u64()).unwrap_or(600) as f64; + let n_cats = cats.len() as f64; + let grouped_bar_px = + grouped_bar_size_px(keys_num.len() as f64, n_cats, plot_width_px); + + // Precompute domain span for converting pixel offsets into data units + let domain_min = boundaries_f64.first().cloned().unwrap_or(0.0); + let domain_max = boundaries_f64.last().cloned().unwrap_or(domain_min + 1.0); + let domain_span = if domain_max > domain_min { + domain_max - domain_min + } else { + 1.0 + }; + + let mut running: Vec = vec![0.0; cats.len()]; + let mut cat_max: Vec = vec![0.0; cats.len()]; + for (i, left) in keys_num.iter().enumerate() { + let right = if i + 1 < keys_num.len() { + keys_num[i + 1] + } else { + left + width + }; + let mut bucket_sum = 0.0_f64; + for (ci, cat) in cats.iter().enumerate() { + let count = get_count(cat, i); + let display_count = if hist_cumulative { + running[ci] += count; + running[ci] + } else { + count + }; + // track per-category maximum for grouped/facet scaling + if display_count > cat_max[ci] { + cat_max[ci] = display_count; + } + bucket_sum += display_count; + let mut obj = serde_json::Map::new(); + if hist_mode == "grouped" { + // Compute a small data-space shift for this category so + // bars are placed side-by-side without relying on + // viewer support for `xOffset`. 
+ let ci_f = ci as f64; + let center_index = (n_cats - 1.0) / 2.0; + let data_per_pixel = domain_span / plot_width_px.max(1.0); + let bar_data_width = grouped_bar_px * data_per_pixel; + let shift = (ci_f - center_index) * bar_data_width; + let x_val = centers[i] + shift; + obj.insert("x".to_string(), serde_json::Value::from(x_val)); + } else { + // Stacked / default: use range [x,x2] + obj.insert("x".to_string(), serde_json::Value::from(*left)); + obj.insert("x2".to_string(), serde_json::Value::from(right)); + } + let display_cat = cat_label_map + .as_ref() + .and_then(|m| m.get(cat)) + .cloned() + .unwrap_or_else(|| cat.clone()); + obj.insert("cat".to_string(), serde_json::Value::String(display_cat)); + obj.insert( + "doc_count".to_string(), + serde_json::Value::from(display_count), + ); + values.push(serde_json::Value::Object(obj)); + } + if bucket_sum > max_sum { + max_sum = bucket_sum; + } + } + + // Build x encoding using numeric boundaries so axis formatting is correct + let x_encoding = match make_vl_axis_encoding( + spec.get("x"), + "x", + Some(x_label), + Some(&boundaries_f64), + false, + Some(1), + ) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + + // Y axis encoding: doc_count; aggregate per x and ensure domain starts at 0 + let y_axis_meta = spec.get("y"); + let mut y_encoding = match make_vl_axis_encoding( + y_axis_meta, + "doc_count", + Some(y_label), + None, + false, + None, + ) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + let max_cat_max = cat_max.iter().cloned().fold(0.0_f64, f64::max); + let desired_y_max = match hist_mode { + "grouped" | "facet" => { + if max_cat_max > 0.0 { + max_cat_max + } else { + 1.0 + } + } + _ => { + if max_sum > 0.0 { + max_sum + } else { + 1.0 + } + } + }; + if let Some(obj) = y_encoding.as_object_mut() { + obj.insert( + "aggregate".to_string(), + serde_json::Value::String("sum".to_string()), + ); + if let Some(scale_val) = obj.get_mut("scale") { + if 
scale_val.is_object() { + scale_val.as_object_mut().unwrap().insert( + "domain".to_string(), + serde_json::json!([0.0, desired_y_max]), + ); + } + } + } + + base["data"] = serde_json::json!({"values": values}); + + match hist_mode { + "facet" => { + // Facet: small multiples by category; keep shared x scale + let spec_obj = serde_json::json!({ + "mark": {"type": "bar"}, + "encoding": {"x": x_encoding, "x2": {"field": "x2"}, "y": y_encoding, "y2": {"datum": y_min}} + }); + base["facet"] = + serde_json::json!({"row": {"field": "cat", "type": "nominal"}}); + base["spec"] = spec_obj; + return base; + } + "grouped" => { + // Grouped: use xOffset (Vega-Lite v5+) to offset categories within numeric bins + // and explicitly disable stacking on the y encoding so viewers + // do not aggregate into stacked bars. + if let Some(y_obj) = y_encoding.as_object_mut() { + y_obj.insert("stack".to_string(), serde_json::Value::Null); + } + base["mark"] = serde_json::json!({"type": "bar", "size": grouped_bar_px}); + base["encoding"] = serde_json::json!({ + "x": x_encoding, + "y": y_encoding, + "color": {"field": "cat", "type": "nominal"}, + "xOffset": {"field": "cat", "type": "nominal"} + }); + return base; + } + _ => { + // Default: attempt stacked. Vega-Lite stacking across numeric + // continuous axes is not universally supported; for numeric + // axes we fallback to grouped behaviour to preserve axis + // formatting. If the server prefers true stacked nominal + // bins it can provide `x.tick_labels` and the client can + // request `value_type: keyword` instead. 
+ base["mark"] = serde_json::json!({"type": "bar"}); + base["encoding"] = serde_json::json!({ + "x": x_encoding, + "x2": {"field": "x2"}, + "y": y_encoding, + "color": {"field": "cat", "type": "nominal"}, + "xOffset": {"field": "cat", "type": "nominal"} + }); + return base; + } + } + } + } + } + + let mut values: Vec = Vec::new(); + let mut keys: Vec = Vec::new(); + let x_value_type = x_meta + .get("value_type") + .and_then(|v| v.as_str()) + .unwrap_or("keyword"); + + if let Some(arr) = raw_buckets.as_array() { + if x_value_type == "keyword" { + // Categorical histogram: emit one value per category with + // a string `x` field and numeric `doc_count` so Vega-Lite can + // render nominal bars with the server-provided tick order. + for b in arr { + let label = b + .get("label") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .or_else(|| b.get("key").and_then(|k| k.as_str().map(|s| s.to_string()))) + .or_else(|| b.get("id").and_then(|k| k.as_str().map(|s| s.to_string()))) + .unwrap_or_else(|| b.to_string()); + let count = b.get("doc_count").and_then(|v| v.as_f64()).unwrap_or(0.0); + let mut obj = serde_json::Map::new(); + obj.insert("x".to_string(), serde_json::Value::String(label.clone())); + obj.insert("doc_count".to_string(), serde_json::Value::from(count)); + if let Some(kv) = b.get("key") { + obj.insert("key".to_string(), kv.clone()); + } else if let Some(idv) = b.get("id") { + obj.insert("id".to_string(), idv.clone()); + } + values.push(serde_json::Value::Object(obj)); + } + } else { + for b in arr { + if let Some(k) = b.get("key").and_then(|v| v.as_f64()) { + keys.push(k); + } + } + // Determine bin width + let width = if keys.len() >= 2 { + keys[1] - keys[0] + } else if let Some(domain_arr) = x_meta.get("domain").and_then(|d| d.as_array()) { + if domain_arr.len() >= 2 { + let lo = domain_arr[0].as_f64().unwrap_or(0.0); + let hi = domain_arr[1].as_f64().unwrap_or(lo + 1.0); + let tick_count = x_meta + .get("tickCount") + .and_then(|v| v.as_u64()) + 
.unwrap_or(10) as f64; + (hi - lo) / tick_count.max(1.0) + } else { + 1.0 + } + } else { + 1.0 + }; + + for (i, b) in arr.iter().enumerate() { + let key = b.get("key").and_then(|v| v.as_f64()).unwrap_or(0.0); + let right = if i + 1 < keys.len() { + keys[i + 1] + } else { + key + width + }; + let count = b.get("doc_count").and_then(|v| v.as_f64()).unwrap_or(0.0); + let mut obj = serde_json::Map::new(); + obj.insert("x".to_string(), serde_json::Value::from(key)); + obj.insert("x2".to_string(), serde_json::Value::from(right)); + obj.insert("doc_count".to_string(), serde_json::Value::from(count)); + // Preserve original key for backwards compatibility + obj.insert("key".to_string(), serde_json::Value::from(key)); + values.push(serde_json::Value::Object(obj)); + } + } + } + + if x_value_type == "keyword" { + // Extract category order from the buckets (labels) to use as tick values + let mut cats: Vec = Vec::new(); + if let Some(arr) = raw_buckets.as_array() { + for b in arr { + let label = b + .get("label") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .or_else(|| b.get("key").and_then(|k| k.as_str().map(|s| s.to_string()))) + .or_else(|| b.get("id").and_then(|k| k.as_str().map(|s| s.to_string()))) + .unwrap_or_default(); + cats.push(label); + } + } + + let x_meta_override = serde_json::json!({"tick_values": cats, "value_type": "keyword"}); + let x_encoding = match make_vl_axis_encoding( + Some(&x_meta_override), + "x", + Some(x_label), + None, + true, + Some(1), + ) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + + // Y axis encoding: doc_count; prefer quantitative with sensible domain + let y_axis_meta = spec.get("y"); + let mut y_encoding = + match make_vl_axis_encoding(y_axis_meta, "doc_count", Some(y_label), None, false, None) + { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + // Ensure y domain starts at zero for histograms + if let Some(scale_obj) = y_encoding.get_mut("scale") { + if 
scale_obj.is_object() { + let max_val = values + .iter() + .filter_map(|o| o.get("doc_count").and_then(|v| v.as_f64())) + .fold(0.0_f64, |a, b| a.max(b)); + scale_obj.as_object_mut().unwrap().insert( + "domain".to_string(), + serde_json::json!([0.0, if max_val > 0.0 { max_val } else { 1.0 }]), + ); + } + } + + base["data"] = serde_json::json!({"values": values}); + base["mark"] = serde_json::json!({"type": "bar"}); + base["encoding"] = serde_json::json!({ + "x": x_encoding, + "y": y_encoding, + "y2": {"datum": y_min} + }); + let _ = x_field; + return base; + } + + // Compute numeric boundaries (left edges + final right edge) + let mut boundaries_f64: Vec = Vec::new(); + if !keys.is_empty() { + for k in &keys { + boundaries_f64.push(*k); + } + // final right + let last_right = if keys.len() >= 2 { + keys[keys.len() - 1] + (keys[1] - keys[0]) + } else { + keys[0] + 1.0 + }; + boundaries_f64.push(last_right); + } + + // X axis encoding: use server axis meta + computed boundaries + let x_encoding = match make_vl_axis_encoding( + spec.get("x"), + "x", + Some(x_label), + Some(&boundaries_f64), + false, + Some(1), + ) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + + // Y axis encoding: doc_count; prefer quantitative with sensible domain + let y_axis_meta = spec.get("y"); + let y_encoding = + match make_vl_axis_encoding(y_axis_meta, "doc_count", Some(y_label), None, false, None) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + + base["data"] = serde_json::json!({"values": values}); base["mark"] = serde_json::json!({"type": "bar"}); base["encoding"] = serde_json::json!({ - "x": { - "field": "key", - "type": "quantitative", - "scale": {"type": x_vl_scale}, - "axis": {"title": x_label} - }, - "y": { - "field": "doc_count", - "type": "quantitative", - "scale": {"type": y_vl_scale}, - "axis": {"title": y_label} - } + "x": x_encoding, + "x2": {"field": "x2"}, + "y": y_encoding, + "y2": {"datum": y_min} }); let _ = x_field; 
base } +/// Vega-Lite renderer for `arc_batch` PlotSpec data. +/// Produces a layered semicircular concentric ring chart coloured by series. +fn vl_arc_batch(spec: &serde_json::Value, mut base: serde_json::Value) -> serde_json::Value { + let data_entries = spec + .get("data") + .and_then(|d| d.get("entries")) + .and_then(|v| v.as_array()) + .cloned() + .unwrap_or_default(); + + // Top-level display options (unused for now but kept for future tuning) + let _display = spec.get("display").unwrap_or(&serde_json::Value::Null); + let width = base.get("width").and_then(|v| v.as_u64()).unwrap_or(600) as f64; + let height = base.get("height").and_then(|v| v.as_u64()).unwrap_or(400) as f64; + + // Unique series (report labels) in encountered order + let mut labels: Vec = Vec::new(); + for e in &data_entries { + if let Some(lbl) = e.get("report_label").and_then(|v| v.as_str()) { + if !labels.contains(&lbl.to_string()) { + labels.push(lbl.to_string()); + } + } + } + let n = labels.len().max(1) as f64; + + // radius allocation: leave small inner padding and outer padding + let max_radius = (height.min(width) / 2.0) * 0.9; + let inner_padding = 8.0; + let slot = ((max_radius - inner_padding) / n).max(8.0); + + // Shared color encoding using a rainbow scheme + let color_encoding = serde_json::json!({ + "field": "report_label", + "type": "nominal", + "scale": {"scheme": "rainbow"} + }); + + // Build layers: one data-backed background + wedge per series + let mut layers: Vec = Vec::new(); + for (i, _lbl) in labels.iter().enumerate() { + // collect entries belonging to this report index + let mut entries_for_i: Vec = Vec::new(); + for e in &data_entries { + if let Some(idx) = e.get("report_index").and_then(|v| v.as_i64()) { + if idx as usize == i { + entries_for_i.push(e.clone()); + } + } else if let Some(idx) = e.get("report_index").and_then(|v| v.as_u64()) { + if idx as usize == i { + entries_for_i.push(e.clone()); + } + } + } + if entries_for_i.is_empty() { + continue; + } + + 
let inner = (inner_padding + (i as f64) * slot).round(); + let outer = (inner + slot * 0.8).round(); + + // Background full semicircle (light grey) values + let mut bg_vals: Vec = Vec::new(); + for ev in &entries_for_i { + let mut be = ev.clone(); + if let serde_json::Value::Object(ref mut m) = be { + m.insert( + "endAngle".to_string(), + serde_json::json!(std::f64::consts::PI), + ); + } + bg_vals.push(be); + } + + // Foreground wedge values (scaled -> endAngle) + let mut wedge_vals: Vec = Vec::new(); + for ev in &entries_for_i { + let mut we = ev.clone(); + let scaled = ev.get("scaled").and_then(|v| v.as_f64()).unwrap_or(0.0_f64); + if let serde_json::Value::Object(ref mut m) = we { + m.insert( + "endAngle".to_string(), + serde_json::json!(scaled * std::f64::consts::PI), + ); + } + wedge_vals.push(we); + } + + // Background layer + let background = serde_json::json!({ + "data": {"values": bg_vals}, + "mark": { + "type": "arc", + "innerRadius": {"value": inner}, + "outerRadius": {"value": outer}, + "cornerRadius": 6, + "opacity": 0.25 + }, + "encoding": { + "theta": { + "field": "endAngle", + "type": "quantitative", + "scale": {"domain": [0.0, std::f64::consts::PI]} + }, + "theta2": {"value": 0}, + "color": {"value": "#d9d9d9"} + } + }); + + // Wedge layer (coloured) + let mut wedge = serde_json::json!({ + "data": {"values": wedge_vals}, + "mark": { + "type": "arc", + "innerRadius": {"value": inner + 1.0}, + "outerRadius": {"value": outer - 1.0}, + "cornerRadius": 6 + }, + "encoding": { + "theta": { + "field": "endAngle", + "type": "quantitative", + "scale": {"domain": [0.0, std::f64::consts::PI]} + }, + "theta2": {"value": 0} + } + }); + + // Insert color scale into wedge encoding + if let Some(obj) = wedge.as_object_mut() { + if let Some(enc) = obj.get_mut("encoding") { + if let Some(enc_obj) = enc.as_object_mut() { + enc_obj.insert("color".to_string(), color_encoding.clone()); + } + } + } + + layers.push(background); + layers.push(wedge); + } + + base["layer"] = 
serde_json::Value::Array(layers); + base +} + fn vl_scatter(spec: &serde_json::Value, mut base: serde_json::Value) -> serde_json::Value { let x_meta = spec.get("x").unwrap_or(&serde_json::Value::Null); let x_field = x_meta.get("field").and_then(|v| v.as_str()).unwrap_or("x"); @@ -358,15 +1455,6 @@ fn vl_scatter(spec: &serde_json::Value, mut base: serde_json::Value) -> serde_js .get("label") .and_then(|v| v.as_str()) .unwrap_or(x_field); - let x_scale_str = x_meta - .get("scale") - .and_then(|v| v.as_str()) - .unwrap_or("linear"); - let x_vl_scale = if x_scale_str == "log10" { - "log" - } else { - "linear" - }; let y_meta = spec.get("y").unwrap_or(&serde_json::Value::Null); let y_field = y_meta.get("field").and_then(|v| v.as_str()).unwrap_or("y"); @@ -374,38 +1462,900 @@ fn vl_scatter(spec: &serde_json::Value, mut base: serde_json::Value) -> serde_js .get("label") .and_then(|v| v.as_str()) .unwrap_or(y_field); - let y_scale_str = y_meta - .get("scale") - .and_then(|v| v.as_str()) - .unwrap_or("linear"); - let y_vl_scale = if y_scale_str == "log10" { - "log" + + // Extract optional server-provided buckets. Support two shapes: + // 1) legacy: `buckets` is an array of primitive ids (strings/numbers) and + // `bucketLabels` may be present as a parallel array of labels. + // 2) structured: `buckets` is an array of objects `{id,label,count}`. 
+ let bucket_labels_opt: Option>; + let mut bucket_ids_opt: Option> = None; + + if let Some(buckets_arr) = spec + .get("data") + .and_then(|d| d.get("buckets")) + .and_then(|v| v.as_array()) + .cloned() + { + if !buckets_arr.is_empty() && buckets_arr[0].is_object() { + // structured array of objects + let mut ids: Vec = Vec::new(); + let mut labels: Vec = Vec::new(); + for obj in &buckets_arr { + if let Some(idv) = obj.get("id").or_else(|| obj.get("key")) { + if let Some(s) = idv.as_str() { + ids.push(s.to_string()); + } else { + ids.push(idv.to_string()); + } + } else { + ids.push(obj.to_string()); + } + if let Some(lv) = obj.get("label").or_else(|| obj.get("name")) { + if let Some(s) = lv.as_str() { + labels.push(s.to_string()); + } else { + labels.push(lv.to_string()); + } + } else { + labels.push(String::new()); + } + } + bucket_ids_opt = Some(ids); + bucket_labels_opt = Some(labels); + } else { + // legacy primitive array + bucket_ids_opt = Some( + buckets_arr + .iter() + .map(|k| { + if let Some(s) = k.as_str() { + s.to_string() + } else { + k.to_string() + } + }) + .collect(), + ); + // try separate `bucketLabels` field as fallback but treat empty + // arrays or arrays of empty strings as absent. + bucket_labels_opt = spec + .get("data") + .and_then(|d| d.get("bucketLabels")) + .and_then(|v| v.as_array()) + .and_then(|arr| { + let vec: Vec = arr + .iter() + .map(|s| s.as_str().unwrap_or("").to_string()) + .collect(); + if vec.iter().all(|s| s.is_empty()) { + None + } else { + Some(vec) + } + }); + } } else { - "linear" + // no buckets array at all; attempt to read `bucketLabels` only + bucket_labels_opt = spec + .get("data") + .and_then(|d| d.get("bucketLabels")) + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .map(|s| s.as_str().unwrap_or("").to_string()) + .collect() + }); + } + + // Build id->label map when both arrays are present and aligned. 
+ let id_to_label: Option> = + if let (Some(ids), Some(labels)) = (&bucket_ids_opt, &bucket_labels_opt) { + if ids.len() == labels.len() { + let mut m = std::collections::HashMap::new(); + for (i, id) in ids.iter().enumerate() { + m.insert(id.clone(), labels[i].clone()); + } + Some(m) + } else { + None + } + } else { + None + }; + + // Build y id->label map from `yBuckets` + `yBucketLabels` when available. + let y_id_to_label: Option> = { + let y_ids_opt: Option> = spec + .get("data") + .and_then(|d| d.get("yBuckets")) + .and_then(|v| v.as_array()) + .map(|arr| { + arr.iter() + .map(|k| k.as_str().unwrap_or(&k.to_string()).to_string()) + .collect() + }); + let y_labels_opt: Option> = spec + .get("data") + .and_then(|d| d.get("yBucketLabels")) + .and_then(|v| v.as_array()) + .and_then(|arr| { + let vec: Vec = arr + .iter() + .map(|s| s.as_str().unwrap_or("").to_string()) + .collect(); + if vec.iter().all(|s| s.is_empty()) { + None + } else { + Some(vec) + } + }); + + if let (Some(ids), Some(labels)) = (y_ids_opt, y_labels_opt) { + if ids.len() == labels.len() { + let mut m = std::collections::HashMap::new(); + for (i, id) in ids.iter().enumerate() { + m.insert(id.clone(), labels[i].clone()); + } + Some(m) + } else { + None + } + } else { + None + } }; - let cells = spec + let cells = if let Some(existing_cells) = spec.get("data").and_then(|d| d.get("cells")) { + existing_cells.clone() + } else { + // API scatter responses provide raw points grouped by category under + // data.rawData.{cat}[]; flatten them into a single values array. 
+ let mut flattened: Vec = Vec::new(); + if let Some(raw_data_obj) = spec + .get("data") + .and_then(|d| d.get("rawData")) + .and_then(|v| v.as_object()) + { + for (cat_key, points) in raw_data_obj { + if let Some(point_arr) = points.as_array() { + for point in point_arr { + let mut point_obj = point.as_object().cloned().unwrap_or_default(); + if !point_obj.contains_key("cat") { + point_obj.insert( + "cat".to_string(), + serde_json::Value::String(cat_key.clone()), + ); + } + // If we have an id->label map, attach an `x_label` field + // for this point so categorical axes can display + // human-readable labels while preserving ids. + if let Some(map) = id_to_label.as_ref() { + // find a candidate id on the point + let mut key_opt: Option = None; + if let Some(s) = point_obj.get("x").and_then(|v| v.as_str()) { + key_opt = Some(s.to_string()); + } else if let Some(s) = point_obj.get("cat").and_then(|v| v.as_str()) { + key_opt = Some(s.to_string()); + } else if let Some(n) = + point_obj.get("taxonId").and_then(|v| v.as_i64()) + { + key_opt = Some(n.to_string()); + } else if let Some(s) = + point_obj.get("taxonId").and_then(|v| v.as_str()) + { + key_opt = Some(s.to_string()); + } + if let Some(k) = key_opt { + if let Some(lbl) = map.get(&k) { + point_obj.insert( + "x_label".to_string(), + serde_json::Value::String(lbl.clone()), + ); + } + } + + // Populate `y_label` when possible so categorical Y + // encodings that expect `y_label` find a value. + if !point_obj.contains_key("y_label") { + // If we have a y id->label map prefer that. 
+ if let Some(y_map) = y_id_to_label.as_ref() { + let mut y_key_opt: Option = None; + if let Some(s) = point_obj.get("y").and_then(|v| v.as_str()) { + y_key_opt = Some(s.to_string()); + } else if let Some(n) = + point_obj.get("y").and_then(|v| v.as_i64()) + { + y_key_opt = Some(n.to_string()); + } + if let Some(yk) = y_key_opt { + if let Some(y_lbl) = y_map.get(&yk) { + point_obj.insert( + "y_label".to_string(), + serde_json::Value::String(y_lbl.clone()), + ); + } else { + // Fall back to copying the existing `y` string + point_obj.insert( + "y_label".to_string(), + serde_json::Value::String(yk), + ); + } + } + } else { + // No mapping available: if `y` is already a string + // copy it to `y_label` so encoders using that + // field render correctly. + if let Some(s) = point_obj.get("y").and_then(|v| v.as_str()) { + point_obj.insert( + "y_label".to_string(), + serde_json::Value::String(s.to_string()), + ); + } else if let Some(n) = + point_obj.get("y").and_then(|v| v.as_i64()) + { + point_obj.insert( + "y_label".to_string(), + serde_json::Value::String(n.to_string()), + ); + } + } + } + } + flattened.push(serde_json::Value::Object(point_obj)); + } + } + } + } + serde_json::Value::Array(flattened) + }; + + // Pre-compute boundaries or category labels from buckets so tick marks can + // be applied even when raw point `cells` are present. We handle numeric + // and string buckets differently: numeric buckets yield numeric + // boundaries; string buckets yield categorical tick values. + let mut x_boundaries_f64: Vec = Vec::new(); + let mut y_boundaries_f64: Vec = Vec::new(); + let mut x_categories: Option> = None; + let mut y_categories: Option> = None; + + if let Some(x_keys_arr) = spec .get("data") - .and_then(|d| d.get("cells")) + .and_then(|d| d.get("buckets")) + .and_then(|v| v.as_array()) .cloned() - .unwrap_or_else(|| serde_json::json!([])); + { + // If the buckets are structured objects, prefer the extracted ids + // from `bucket_ids_opt`. 
Otherwise fall back to primitive handling. + if !x_keys_arr.is_empty() && x_keys_arr[0].is_object() { + // Structured buckets (objects) are typed by server-provided + // axis metadata. Require `value_type` to decide how to treat ids. + if let Some(ids) = bucket_ids_opt.clone() { + match x_meta.get("value_type").and_then(|v| v.as_str()) { + Some("float") | Some("integer") | Some("date") | Some("coordinate") => { + let x_keys_num: Vec = ids + .iter() + .map(|s| s.parse::().unwrap_or(0.0)) + .collect(); + // Determine bin width + let width = if x_keys_num.len() >= 2 { + x_keys_num[1] - x_keys_num[0] + } else if let Some(domain_arr) = + x_meta.get("domain").and_then(|d| d.as_array()) + { + if domain_arr.len() >= 2 { + let lo = domain_arr[0].as_f64().unwrap_or(0.0); + let hi = domain_arr[1].as_f64().unwrap_or(lo + 1.0); + let tick_count = x_meta + .get("tickCount") + .and_then(|v| v.as_u64()) + .unwrap_or(10) + as f64; + (hi - lo) / tick_count.max(1.0) + } else { + 1.0 + } + } else { + 1.0 + }; - base["data"] = serde_json::json!({"values": cells}); - base["mark"] = serde_json::Value::String("point".to_string()); - base["encoding"] = serde_json::json!({ - "x": { - "field": "x", - "type": "quantitative", - "scale": {"type": x_vl_scale}, - "axis": {"title": x_label} - }, - "y": { - "field": "y", - "type": "quantitative", - "scale": {"type": y_vl_scale}, - "axis": {"title": y_label} + for k in &x_keys_num { + x_boundaries_f64.push(*k); + } + let last_right = if x_keys_num.len() >= 2 { + x_keys_num[x_keys_num.len() - 1] + (x_keys_num[1] - x_keys_num[0]) + } else { + x_keys_num[0] + width + }; + x_boundaries_f64.push(last_right); + } + Some("keyword") => { + x_categories = Some(ids); + } + Some(other) => { + return serde_json::json!({"error": format!("unsupported axis value_type '{}' for x buckets", other)}); + } + None => { + return serde_json::json!({"error": "missing axis value_type for x buckets; server must provide axis.value_type"}); + } + } + } else { + // No extracted 
ids available; stringify structured objects into labels + x_categories = Some( + x_keys_arr + .iter() + .map(|o| match o.get("id").or_else(|| o.get("key")) { + Some(idv) => { + if let Some(s) = idv.as_str() { + s.to_string() + } else { + idv.to_string() + } + } + None => match o.get("label").or_else(|| o.get("name")) { + Some(lv) => { + if let Some(s) = lv.as_str() { + s.to_string() + } else { + lv.to_string() + } + } + None => o.to_string(), + }, + }) + .collect(), + ); + } + } else { + // For primitive arrays, require server-provided type information. + match x_meta.get("value_type").and_then(|v| v.as_str()) { + Some("keyword") => { + x_categories = Some( + x_keys_arr + .iter() + .map(|k| k.as_str().unwrap_or("").to_string()) + .collect(), + ); + } + Some("float") | Some("integer") | Some("number") | Some("date") + | Some("coordinate") => { + // parse values to f64 as needed + let to_f64 = |v: &serde_json::Value| -> f64 { + v.as_f64() + .or_else(|| v.as_str().and_then(|s| s.parse::().ok())) + .unwrap_or(0.0) + }; + let x_keys: Vec = x_keys_arr.iter().map(to_f64).collect(); + if !x_keys.is_empty() { + let width = if x_keys.len() >= 2 { + x_keys[1] - x_keys[0] + } else if let Some(domain_arr) = spec + .get("x") + .and_then(|x| x.get("domain")) + .and_then(|d| d.as_array()) + { + if domain_arr.len() >= 2 { + let lo = domain_arr[0].as_f64().unwrap_or(0.0); + let hi = domain_arr[1].as_f64().unwrap_or(lo + 1.0); + let tick_count = + spec.get("x") + .and_then(|x| x.get("tickCount")) + .and_then(|v| v.as_u64()) + .unwrap_or(10) as f64; + (hi - lo) / tick_count.max(1.0) + } else { + 1.0 + } + } else { + 1.0 + }; + + for k in &x_keys { + x_boundaries_f64.push(*k); + } + let last_right = if x_keys.len() >= 2 { + x_keys[x_keys.len() - 1] + (x_keys[1] - x_keys[0]) + } else { + x_keys[0] + width + }; + x_boundaries_f64.push(last_right); + } + } + Some(other) => { + return serde_json::json!({"error": format!("unsupported axis value_type '{}' for x primitive buckets", other)}); 
+ } + None => { + return serde_json::json!({"error": "missing axis value_type for x primitive buckets; server must provide axis.value_type"}); + } + } } - }); + } + + if let Some(y_keys_arr) = spec + .get("data") + .and_then(|d| d.get("yBuckets")) + .and_then(|v| v.as_array()) + .cloned() + { + // Prefer explicit label array when provided by the server. This + // keeps `yBuckets` as the canonical ids used for bin alignment and + // uses `yBucketLabels` for human-readable axis categories. + let y_labels_opt: Option> = spec + .get("data") + .and_then(|d| d.get("yBucketLabels")) + .and_then(|v| v.as_array()) + .and_then(|arr| { + let vec: Vec = arr + .iter() + .map(|s| s.as_str().unwrap_or("").to_string()) + .collect(); + if vec.iter().all(|s| s.is_empty()) { + None + } else { + Some(vec) + } + }); + + // Require server-provided `value_type` for the Y axis as well. + match y_meta.get("value_type").and_then(|v| v.as_str()) { + Some("keyword") => { + if let Some(lbls) = y_labels_opt { + y_categories = Some(lbls); + } else { + y_categories = Some( + y_keys_arr + .iter() + .map(|k| k.as_str().unwrap_or("").to_string()) + .collect(), + ); + } + } + Some("float") | Some("integer") | Some("number") | Some("date") + | Some("coordinate") => { + let to_f64 = |v: &serde_json::Value| -> f64 { + v.as_f64() + .or_else(|| v.as_str().and_then(|s| s.parse::().ok())) + .unwrap_or(0.0) + }; + let y_keys: Vec = y_keys_arr.iter().map(to_f64).collect(); + if !y_keys.is_empty() { + let height = if y_keys.len() >= 2 { + y_keys[1] - y_keys[0] + } else if let Some(domain_arr) = spec + .get("y") + .and_then(|y| y.get("domain")) + .and_then(|d| d.as_array()) + { + if domain_arr.len() >= 2 { + let lo = domain_arr[0].as_f64().unwrap_or(0.0); + let hi = domain_arr[1].as_f64().unwrap_or(lo + 1.0); + let tick_count = spec + .get("y") + .and_then(|y| y.get("tickCount")) + .and_then(|v| v.as_u64()) + .unwrap_or(10) as f64; + (hi - lo) / tick_count.max(1.0) + } else { + 1.0 + } + } else { + 1.0 + }; + + 
for k in &y_keys { + y_boundaries_f64.push(*k); + } + let last_top = if y_keys.len() >= 2 { + y_keys[y_keys.len() - 1] + (y_keys[1] - y_keys[0]) + } else { + y_keys[0] + height + }; + y_boundaries_f64.push(last_top); + } + } + Some(other) => { + return serde_json::json!({"error": format!("unsupported axis value_type '{}' for y buckets", other)}); + } + None => { + return serde_json::json!({"error": "missing axis value_type for y buckets; server must provide axis.value_type"}); + } + } + } + + // If we have point data (cells/rawData) render points, otherwise check for + // binned 2D data (`allYValues` + `yBuckets`) and render as a heatmap rect grid. + let mut is_cells_empty = true; + if let serde_json::Value::Array(arr) = &cells { + is_cells_empty = arr.is_empty(); + } + + if !is_cells_empty { + // Use shared axis encoding that respects server-provided axis meta. + // Prefer computed bin boundaries (if we extracted them above) so tick + // marks align with histogram/scatter bin edges even when raw points + // (`cells`) are present. + // If we have categorical bucket labels, prefer them for axis ticks. + let mut x_meta_override_value: Option = None; + if let Some(ref cats) = x_categories { + // Prefer server-provided human-readable bucket labels when present. + if let Some(ref labels) = bucket_labels_opt { + x_meta_override_value = + Some(serde_json::json!({"tick_values": labels, "value_type": "keyword"})); + } else { + x_meta_override_value = + Some(serde_json::json!({"tick_values": cats, "value_type": "keyword"})); + } + } + let x_enc = if let Some(ref meta) = x_meta_override_value { + // When using human-readable labels, the data objects will include + // an `x_label` field; use that field for axis encoding so labels + // render in the intended order. 
+ let enc_res = if bucket_labels_opt.is_some() { + make_vl_axis_encoding(Some(meta), "x_label", Some(x_label), None, true, None) + } else { + make_vl_axis_encoding(Some(meta), "x", Some(x_label), None, true, None) + }; + match enc_res { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + } + } else { + let x_bound_opt: Option<&[f64]> = if x_boundaries_f64.is_empty() { + None + } else { + Some(x_boundaries_f64.as_slice()) + }; + match make_vl_axis_encoding(spec.get("x"), "x", Some(x_label), x_bound_opt, false, None) + { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + } + }; + + let mut y_meta_override_value: Option = None; + if let Some(ref cats) = y_categories { + // Use the y bucket categories for y-axis tick values. Do NOT reuse + // `bucket_labels_opt` which holds labels for the x-axis buckets. + y_meta_override_value = + Some(serde_json::json!({"tick_values": cats, "value_type": "keyword"})); + } + let y_enc = if let Some(ref meta) = y_meta_override_value { + match make_vl_axis_encoding(Some(meta), "y_label", Some(y_label), None, true, None) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + } + } else { + let y_bound_opt: Option<&[f64]> = if y_boundaries_f64.is_empty() { + None + } else { + Some(y_boundaries_f64.as_slice()) + }; + match make_vl_axis_encoding(spec.get("y"), "y", Some(y_label), y_bound_opt, false, None) + { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + } + }; + + base["data"] = serde_json::json!({"values": cells}); + base["mark"] = serde_json::Value::String("point".to_string()); + + // Build encoding map and add jitter offsets when axes are categorical. + let mut encoding_map = serde_json::Map::new(); + encoding_map.insert("x".to_string(), x_enc); + encoding_map.insert("y".to_string(), y_enc); + + let mut transforms: Vec = Vec::new(); + // Add a small pixel-offset jitter for categorical axes using Vega's + // `random()` expression. 
Offsets are in pixels and encoded via + // `xOffset`/`yOffset` which Vega-Lite supports for point marks. + if x_categories.is_some() { + transforms.push(serde_json::json!({"calculate": "(random()-0.5) * (random()-0.5)", "as": "_xOffset"})); + encoding_map.insert( + "xOffset".to_string(), + serde_json::json!({"field": "_xOffset", "scale":{"domain":[-1,1]}, "type": "quantitative"}), + ); + } + if y_categories.is_some() { + transforms.push(serde_json::json!({"calculate": "(random()-0.5) * (random()-0.5)", "as": "_yOffset"})); + encoding_map.insert( + "yOffset".to_string(), + serde_json::json!({"field": "_yOffset", "scale":{"domain":[-1,1]}, "type": "quantitative"}), + ); + } + + if spec.get("data").and_then(|d| d.get("cats")).is_some() { + // If the spec includes a top-level `cats` array, add a color encoding + // that maps the category field to a color scheme. + // This is a common pattern for scatter plots with categorical grouping. + encoding_map.insert( + "color".to_string(), + serde_json::json!({"field": "cat", "type": "nominal"}), + ); + } + + if !transforms.is_empty() { + base["transform"] = serde_json::Value::Array(transforms); + } + + base["encoding"] = serde_json::Value::Object(encoding_map); + } else { + // Attempt binned heatmap: x buckets + yBuckets + allYValues + let maybe_x_keys = spec + .get("data") + .and_then(|d| d.get("buckets")) + .and_then(|v| v.as_array()) + .cloned(); + let maybe_y_keys = spec + .get("data") + .and_then(|d| d.get("yBuckets")) + .and_then(|v| v.as_array()) + .cloned(); + let maybe_all_y = spec + .get("data") + .and_then(|d| d.get("allYValues")) + .and_then(|v| v.as_array()) + .cloned(); + + if let (Some(x_keys_arr), Some(y_keys_arr), Some(all_y_arr)) = + (maybe_x_keys, maybe_y_keys, maybe_all_y) + { + // Decide whether x/y buckets are categorical (strings) or numeric. + let x_is_categorical = x_categories.is_some(); + let y_is_categorical = y_categories.is_some(); + + // Prepare numeric vectors if needed. 
Support primitive numeric + // arrays as well as structured object buckets where ids were + // extracted into `bucket_ids_opt`. + let x_keys: Vec = if !x_is_categorical { + if !x_keys_arr.is_empty() && x_keys_arr[0].is_object() { + if let Some(ids) = bucket_ids_opt.clone() { + ids.iter() + .map(|s| s.parse::().unwrap_or(0.0)) + .collect() + } else { + Vec::new() + } + } else { + x_keys_arr + .iter() + .map(|k| k.as_f64().unwrap_or(0.0)) + .collect() + } + } else { + Vec::new() + }; + let y_keys: Vec = if !y_is_categorical { + y_keys_arr + .iter() + .map(|k| k.as_f64().unwrap_or(0.0)) + .collect() + } else { + Vec::new() + }; + + let x_width = if !x_is_categorical && x_keys.len() >= 2 { + x_keys[1] - x_keys[0] + } else if !x_is_categorical { + if let Some(domain_arr) = spec + .get("x") + .and_then(|x| x.get("domain")) + .and_then(|d| d.as_array()) + { + if domain_arr.len() >= 2 { + let lo = domain_arr[0].as_f64().unwrap_or(0.0); + let hi = domain_arr[1].as_f64().unwrap_or(lo + 1.0); + (hi - lo) / (x_keys.len() as f64).max(1.0) + } else { + 1.0 + } + } else { + 1.0 + } + } else { + 1.0 + }; + + let y_height = if !y_is_categorical && y_keys.len() >= 2 { + y_keys[1] - y_keys[0] + } else if !y_is_categorical { + if let Some(domain_arr) = spec + .get("y") + .and_then(|y| y.get("domain")) + .and_then(|d| d.as_array()) + { + if domain_arr.len() >= 2 { + let lo = domain_arr[0].as_f64().unwrap_or(0.0); + let hi = domain_arr[1].as_f64().unwrap_or(lo + 1.0); + (hi - lo) / (y_keys.len() as f64).max(1.0) + } else { + 1.0 + } + } else { + 1.0 + } + } else { + 1.0 + }; + + // Build rects from allYValues: outer array per x-bucket, inner per y-bucket + let mut rects: Vec = Vec::new(); + for (xi, x_bucket) in all_y_arr.iter().enumerate() { + if let Some(y_counts) = x_bucket.as_array() { + for (yi, count_val) in y_counts.iter().enumerate() { + let count_opt = count_val + .as_u64() + .or_else(|| count_val.as_i64().map(|n| n as u64)) + .and_then(|n| if n == 0 { None } else { Some(n) }); 
+ + // Only emit rects for buckets with a non-zero count. + if let Some(count) = count_opt { + let mut obj = serde_json::Map::new(); + if x_is_categorical { + let x_cat = x_categories + .as_ref() + .and_then(|v| v.get(xi)) + .cloned() + .unwrap_or_default(); + obj.insert( + "x".to_string(), + serde_json::Value::String(x_cat.clone()), + ); + if let Some(ref labels) = bucket_labels_opt { + if let Some(lbl) = labels.get(xi) { + obj.insert( + "x_label".to_string(), + serde_json::Value::String(lbl.clone()), + ); + } + } + } else { + let left = *x_keys.get(xi).unwrap_or(&0.0); + let right = if xi + 1 < x_keys.len() { + x_keys[xi + 1] + } else { + left + x_width + }; + obj.insert("x".to_string(), serde_json::Value::from(left)); + obj.insert("x2".to_string(), serde_json::Value::from(right)); + } + + if y_is_categorical { + let y_cat = y_categories + .as_ref() + .and_then(|v| v.get(yi)) + .cloned() + .unwrap_or_default(); + obj.insert("y".to_string(), serde_json::Value::String(y_cat)); + } else { + let bottom = *y_keys.get(yi).unwrap_or(&0.0); + let top = if yi + 1 < y_keys.len() { + y_keys[yi + 1] + } else { + bottom + y_height + }; + obj.insert("y".to_string(), serde_json::Value::from(bottom)); + obj.insert("y2".to_string(), serde_json::Value::from(top)); + } + + obj.insert("count".to_string(), serde_json::Value::from(count)); + rects.push(serde_json::Value::Object(obj)); + } + } + } + } + + // Colour domain from zDomain if provided (as Value) + let color_domain_value = spec + .get("data") + .and_then(|d| d.get("zDomain")) + .and_then(|v| v.as_array()) + .and_then(|arr| { + if arr.len() >= 2 { + let a = arr[0].as_f64().unwrap_or(0.0); + let b = arr[1].as_f64().unwrap_or(a + 1.0); + Some(serde_json::Value::Array(vec![ + serde_json::Value::from(a), + serde_json::Value::from(b), + ])) + } else { + None + } + }) + .unwrap_or_else(|| serde_json::Value::Array(vec![])); + + // Build axis encodings: use categorical tick_values when available, + // otherwise use numeric boundaries 
computed above. + let mut encoding_map = serde_json::Map::new(); + + if let Some(ref cats) = x_categories { + let x_meta = serde_json::json!({"tick_values": cats, "value_type": "keyword"}); + let x_enc_res = + make_vl_axis_encoding(Some(&x_meta), "x", Some(x_label), None, true, Some(1)); + let x_enc = match x_enc_res { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + encoding_map.insert("x".to_string(), x_enc); + } else { + let mut x_boundaries_num: Vec = x_keys.clone(); + if !x_boundaries_num.is_empty() { + let last_right = if x_boundaries_num.len() >= 2 { + x_boundaries_num[x_boundaries_num.len() - 1] + + (x_boundaries_num[1] - x_boundaries_num[0]) + } else { + x_boundaries_num[0] + x_width + }; + x_boundaries_num.push(last_right); + } + let x_enc = match make_vl_axis_encoding( + spec.get("x"), + "x", + Some(x_label), + Some(&x_boundaries_num), + false, + Some(1), + ) { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + encoding_map.insert("x".to_string(), x_enc); + encoding_map.insert("x2".to_string(), serde_json::json!({"field": "x2"})); + } + + if let Some(ref cats) = y_categories { + let y_meta = serde_json::json!({"tick_values": cats, "value_type": "keyword"}); + let y_enc_res = + make_vl_axis_encoding(Some(&y_meta), "y", Some(y_label), None, true, Some(1)); + let y_enc = match y_enc_res { + Ok(v) => v, + Err(e) => return serde_json::json!({"error": e}), + }; + encoding_map.insert("y".to_string(), y_enc); + } else { + let mut y_boundaries_num: Vec = y_keys.clone(); + if !y_boundaries_num.is_empty() { + let last_top = if y_boundaries_num.len() >= 2 { + y_boundaries_num[y_boundaries_num.len() - 1] + + (y_boundaries_num[1] - y_boundaries_num[0]) + } else { + y_boundaries_num[0] + y_height + }; + y_boundaries_num.push(last_top); + } + let y_enc = match make_vl_axis_encoding( + spec.get("y"), + "y", + Some(y_label), + Some(&y_boundaries_num), + false, + Some(1), + ) { + Ok(v) => v, + Err(e) => return 
serde_json::json!({"error": e}), + }; + encoding_map.insert("y".to_string(), y_enc); + encoding_map.insert("y2".to_string(), serde_json::json!({"field": "y2"})); + } + + encoding_map.insert( + "color".to_string(), + serde_json::json!({ + "field": "count", + "type": "quantitative", + "scale": {"type": "linear", "domain": color_domain_value} + }), + ); + + base["data"] = serde_json::json!({"values": rects}); + base["mark"] = serde_json::json!({"type": "rect"}); + base["encoding"] = serde_json::Value::Object(encoding_map); + } else { + // Fallback to empty points if nothing useful present + base["data"] = serde_json::json!({"values": serde_json::json!([])}); + base["mark"] = serde_json::Value::String("point".to_string()); + base["encoding"] = serde_json::json!({}); + } + } let _ = (x_field, y_field); base } @@ -496,4 +2446,70 @@ mod tests { let url = "https://goat.genomehubs.org/api/v2/search?result=taxon"; assert!(report_yaml_from_url_params(url).is_err()); } + + #[test] + fn scatter_vega_lite_uses_raw_data_when_cells_missing() { + let spec = serde_json::json!({ + "report_type": "scatter", + "x": {"field": "genome_size", "scale": "linear", "value_type": "float"}, + "y": {"field": "busco_total", "scale": "linear", "value_type": "float"}, + "data": { + "rawData": { + "all": [ + {"x": 10.0, "y": 20.0} + ] + } + } + }); + + let out = plot_spec_to_vega_lite_json(&spec.to_string()); + let parsed: serde_json::Value = serde_json::from_str(&out).unwrap(); + let values = parsed + .pointer("/data/values") + .and_then(|v| v.as_array()) + .unwrap(); + + assert_eq!(values.len(), 1); + assert_eq!(values[0].get("x").and_then(|v| v.as_f64()), Some(10.0)); + assert_eq!(values[0].get("y").and_then(|v| v.as_f64()), Some(20.0)); + assert_eq!(values[0].get("cat").and_then(|v| v.as_str()), Some("all")); + } + + #[test] + fn scatter_vega_lite_renders_heatmap_from_binned_values() { + let spec = serde_json::json!({ + "report_type": "scatter", + "x": {"field": "x", "scale": "linear", 
"value_type": "float"}, + "y": {"field": "y", "scale": "linear", "value_type": "float"}, + "data": { + "buckets": [0.0, 10.0], + "yBuckets": [0.0, 5.0], + "allYValues": [[1,2],[3,4]], + "zDomain": [1,4] + } + }); + + let out = plot_spec_to_vega_lite_json(&spec.to_string()); + let parsed: serde_json::Value = serde_json::from_str(&out).unwrap(); + assert_eq!( + parsed + .get("mark") + .and_then(|m| m.get("type")) + .and_then(|t| t.as_str()), + Some("rect") + ); + let values = parsed + .pointer("/data/values") + .and_then(|v| v.as_array()) + .unwrap(); + // 2 x-buckets * 2 y-buckets -> 4 rects + assert_eq!(values.len(), 4); + // check a sample rect has expected keys + let sample = &values[0]; + assert!(sample.get("x").is_some()); + assert!(sample.get("x2").is_some()); + assert!(sample.get("y").is_some()); + assert!(sample.get("y2").is_some()); + assert!(sample.get("count").is_some()); + } } diff --git a/crates/genomehubs-query/src/report/plot_spec.rs b/crates/genomehubs-query/src/report/plot_spec.rs index b6670f8..a593beb 100644 --- a/crates/genomehubs-query/src/report/plot_spec.rs +++ b/crates/genomehubs-query/src/report/plot_spec.rs @@ -151,6 +151,8 @@ pub struct PlotSpec { pub x: Option, /// Secondary (Y) axis metadata, if applicable. pub y: Option, + /// Category axis metadata (for series / categorical axes), if applicable. + pub cat: Option, /// Tertiary (Z / heatmap density) axis metadata, if applicable. pub z: Option, /// Series (category) metadata. Empty for non-categorised plots. 
@@ -207,6 +209,7 @@ mod tests { tick_label_max_length: None, }), y: None, + cat: None, z: None, series: vec![SeriesMeta { key: "chromosome".to_string(), diff --git a/crates/genomehubs-query/tests/fix_y_axis.rs b/crates/genomehubs-query/tests/fix_y_axis.rs new file mode 100644 index 0000000..6f3cc31 --- /dev/null +++ b/crates/genomehubs-query/tests/fix_y_axis.rs @@ -0,0 +1,28 @@ +use genomehubs_query::plot_spec_to_vega_lite_json; +use serde_json::json; + +#[test] +fn y_axis_uses_yBuckets_for_raw_points() { + let spec = json!({ + "report_type": "scatter", + "x": {"field":"assembly_span", "label":"assembly_span", "scale":"linear"}, + "y": {"field":"assembly_level","label":"assembly_level","scale":"linear"}, + "data": { + "buckets": [{"id":"1","label":"B1"},{"id":"2","label":"B2"}], + "yBuckets": ["Scaffold","Chromosome"], + "rawData": { + "all": [ + {"x":1.0,"y":"Scaffold","cat":"all"}, + {"x":2.0,"y":"Chromosome","cat":"all"} + ] + } + } + }); + + let out = plot_spec_to_vega_lite_json(&spec.to_string()); + let parsed: serde_json::Value = serde_json::from_str(&out).unwrap(); + let y_values = parsed.pointer("/encoding/y/axis/values").unwrap(); + let arr = y_values.as_array().unwrap(); + assert_eq!(arr[0].as_str().unwrap(), "Scaffold"); + assert_eq!(arr[1].as_str().unwrap(), "Chromosome"); +} diff --git a/docs/planning/phases/phase-12-plot-spec.md b/docs/planning/phases/phase-12-plot-spec.md index 9cf2bf2..aac69a7 100644 --- a/docs/planning/phases/phase-12-plot-spec.md +++ b/docs/planning/phases/phase-12-plot-spec.md @@ -318,7 +318,7 @@ Vega-Lite JSON. Called by the user when they want interactive rendering. export function plotSpecToVegaLite(plotSpec) { const display = plotSpec.display ?? {}; const base = { - $schema: "https://vega.github.io/schema/vega-lite/v5.json", + $schema: "https://vega.github.io/schema/vega-lite/v6.json", title: display.title, width: display.width ?? 600, height: display.height ?? 
400, diff --git a/python/cli_generator/query.py b/python/cli_generator/query.py index abd20f9..626aaf5 100644 --- a/python/cli_generator/query.py +++ b/python/cli_generator/query.py @@ -26,7 +26,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence if TYPE_CHECKING: import pandas @@ -1312,6 +1312,91 @@ def count_batch( counts.append(int(result.get("total") or 0)) return counts + def report_batch( + self, + reports: Sequence[ReportBuilder | tuple[QueryBuilder, ReportBuilder]], + api_base: str = "https://goat.genomehubs.org/api", + api_version: str = "v3", + ) -> list[dict[str, Any]]: + """Execute multiple reports in a single batch request. + + Args: + reports: List of ReportBuilder objects or (QueryBuilder, ReportBuilder) + pairs. When a bare ReportBuilder is provided the calling + QueryBuilder (``self``) is used as the query scope. + api_base: Base URL of the API. + api_version: API version string (default: ``"v3"``). + + Returns: + List of per-report result dicts. Each dict contains at least + ``"report"`` and ``"status"`` keys and may include ``"plot_spec"``. + + Raises: + ValueError: If more than 100 reports are provided, or items are + of an unsupported shape. + """ + import json + import urllib.request + + if len(reports) > 100: + raise ValueError("maximum 100 reports per batch request") + + url = f"{api_base}/{api_version}/report/batch" + payload_reports = [] + for item in reports: + # Accept either a bare ReportBuilder (use self as the query) + # or a (QueryBuilder, ReportBuilder) pair. + if isinstance(item, tuple) or isinstance(item, list): + if len(item) != 2: + raise ValueError("report_batch() tuple items must be (QueryBuilder, ReportBuilder)") + qb, rb = item[0], item[1] + else: + # Assume ReportBuilder and use self as the query scope. 
+ qb, rb = self, item + + # Help static type checkers: narrow dynamic unions to the expected types + from typing import cast + + qb = cast("QueryBuilder", qb) + rb = cast("ReportBuilder", rb) + + # Build per-item POST payload + payload_item: dict[str, Any] = { + "query_yaml": qb.to_query_yaml(), + "params_yaml": qb.to_params_yaml(), + "report_yaml": rb.to_report_yaml(), + } + if getattr(rb, "_display", None) is not None: + payload_item["display"] = rb._display + if getattr(rb, "_include_plot_spec", False): + payload_item["include_plot_spec"] = True + payload_reports.append(payload_item) + + req = urllib.request.Request( + url, + data=json.dumps({"reports": payload_reports}).encode("utf-8"), + headers={"Content-Type": "application/json"}, + ) + with urllib.request.urlopen(req) as resp: + body_text = resp.read().decode("utf-8") + + batch_data = json.loads(body_text) + results: list[dict[str, Any]] = [] + for item in batch_data.get("results", []): + # Preserve plot_spec wrapper when present (client may request it). 
+ if "plot_spec" in item: + results.append(item) + else: + entry: dict[str, Any] = { + "report": item.get("report", {}), + "status": item.get("status", {}), + } + if "error" in item: + entry["error"] = item["error"] + results.append(entry) + + return results + def record( self, record_id: str, @@ -2769,7 +2854,7 @@ def plot_spec_to_vega_lite(spec: dict[str, Any]) -> dict[str, Any]: """ display: dict[str, Any] = spec.get("display") or {} base: dict[str, Any] = { - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "$schema": "https://vega.github.io/schema/vega-lite/v6.json", "width": display.get("width", 600), "height": display.get("height", 400), "config": { diff --git a/scripts/generate_reports.sh b/scripts/generate_reports.sh new file mode 100755 index 0000000..72099ff --- /dev/null +++ b/scripts/generate_reports.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +OUTDIR=~/reports +mkdir -p "$OUTDIR" +FORMAT=png +AXES="assembly_level assembly_span assembly_date genus" +THRESHOLDS="10 1000" +MODES="stacked grouped facet" +CUMULATIVE="true false" + + +function vl_convert() { + local format="$1" + local output_file="$2" + if [[ "$format" == "svg" ]]; then + python3 -c "import vl_convert as vlc, json, sys; spec=json.loads(sys.stdin.read()); print(vlc.vegalite_to_svg(spec), end='')" > "$output_file" + else + python3 -c "import vl_convert as vlc, json, sys; spec=json.loads(sys.stdin.read()); sys.stdout.buffer.write(vlc.vegalite_to_png(spec))" > "$output_file" + fi +} + +# # Test histogram report with different axis and mode combinations +# category_axis="assembly_level" +# for x_axis in $AXES; do +# for mode in $MODES; do +# for cumulative in $CUMULATIVE; do +# echo "Testing histogram with x=$x_axis, mode=$mode, cumulative=$cumulative" +# curl -s -X POST 'http://localhost:3000/api/v3/report' -H 'accept: application/json' -H 'Content-Type: application/json' -d "{\"query\":{\"index\":\"taxon\", \"taxa\": [\"canidae\"], \"taxon_filter_type\": 
\"tree\"},\"params\":{},\"report\":{\"report\":\"histogram\",\"x\":\"$x_axis\",\"cat\":\"$category_axis\",\"bucket_count\":20},\"include_plot_spec\":true,\"display\":{\"title\":\"histogram test\",\"histogram\":{\"mode\":\"$mode\",\"cumulative\":$cumulative}}}" \ +# | cargo run --quiet --bin plot_to_vl \ +# | vl_convert "$FORMAT" "$OUTDIR/histogram_${x_axis}_${mode}_cumulative_${cumulative}.$FORMAT" +# done +# done +# done + + +# # Test histogram report with different axis and category combinations +# for x_axis in $AXES; do +# for cat_axis in $AXES; do +# echo "Testing histogram with x=$x_axis, category=$cat_axis" +# curl -s -X POST 'http://localhost:3000/api/v3/report' -H 'accept: application/json' -H 'Content-Type: application/json' -d "{\"query\":{\"index\":\"taxon\", \"taxa\": [\"canidae\"], \"taxon_filter_type\": \"tree\"},\"params\":{},\"report\":{\"report\":\"histogram\",\"x\":\"$x_axis\",\"cat\":\"$cat_axis\",\"bucket_count\":20},\"include_plot_spec\":true,\"display\":{\"title\":\"histogram test\"}}" \ +# | cargo run --quiet --bin plot_to_vl \ +# | vl_convert "$FORMAT" "$OUTDIR/histogram_${x_axis}_by_${cat_axis}.$FORMAT" +# done +# done + + + +# # Test scatter report with different axis combinations and thresholds + +# for x_axis in $AXES; do +# for y_axis in $AXES; do +# for threshold in $THRESHOLDS; do +# echo "Testing scatter with x=$x_axis, y=$y_axis, threshold=$threshold" +# shape=$(if [[ "$threshold" -le 10 ]]; then echo "rect"; else echo "point"; fi) +# curl -s -X POST 'http://localhost:3000/api/v3/report' -H 'accept: application/json' -H 'Content-Type: application/json' -d "{\"query\":{\"index\":\"taxon\", \"taxa\": [\"canidae\"], \"taxon_filter_type\": \"tree\"},\"params\":{},\"report\":{\"report\":\"scatter\",\"x\":\"$x_axis\",\"y\":\"$y_axis\",\"scatter_threshold\":$threshold},\"include_plot_spec\":true,\"display\":{\"title\":\"scatter test\"}}" \ +# | cargo run --quiet --bin plot_to_vl \ +# | vl_convert "$FORMAT" 
"$OUTDIR/scatter_${shape}_${x_axis}_${y_axis}.$FORMAT" +# done +# done +# done + + +# test scatter with categories +for x_axis in $AXES; do + y_axis=assembly_span + for cat_axis in $AXES; do # each configured axis in turn as the category + for threshold in $THRESHOLDS; do + echo "Testing scatter with x=$x_axis, cat=$cat_axis, threshold=$threshold" + shape=$(if [[ "$threshold" -le 10 ]]; then echo "rect"; else echo "point"; fi) + curl -s -X POST 'http://localhost:3000/api/v3/report' -H 'accept: application/json' -H 'Content-Type: application/json' -d "{\"query\":{\"index\":\"taxon\", \"taxa\": [\"canidae\"], \"taxon_filter_type\": \"tree\"},\"params\":{},\"report\":{\"report\":\"scatter\",\"x\":\"$x_axis\",\"y\":\"$y_axis\",\"cat\":\"$cat_axis\",\"scatter_threshold\":$threshold},\"include_plot_spec\":true,\"display\":{\"title\":\"scatter test\"}}" \ + | cargo run --quiet --bin plot_to_vl \ + | vl_convert "$FORMAT" "$OUTDIR/scatter_${shape}_${x_axis}_${y_axis}_by_${cat_axis}.$FORMAT" + done + done +done diff --git a/src/bin/plot_to_vl.rs b/src/bin/plot_to_vl.rs new file mode 100644 index 0000000..b5597f9 --- /dev/null +++ b/src/bin/plot_to_vl.rs @@ -0,0 +1,12 @@ +use std::io::{self, Read}; + +fn main() { + let mut input = String::new(); + if let Err(e) = io::stdin().read_to_string(&mut input) { + eprintln!("failed to read stdin: {}", e); + std::process::exit(2); + } + + let out = genomehubs_query::plot_spec_to_vega_lite_json(&input); + println!("{}", out); +} diff --git a/src/core/query_builder.rs b/src/core/query_builder.rs index cb55040..21dd6c7 100644 --- a/src/core/query_builder.rs +++ b/src/core/query_builder.rs @@ -85,6 +85,7 @@ pub fn build_search_body( "size": size, "from": offset, "query": { "bool": { "filter": [] } }, + "track_total_hits": true, "_source": { "include": ["taxon_id","scientific_name","taxon_rank","parent","taxon_names.*","lineage.*"], "exclude": [] } }); diff --git a/templates/docs/reference/query-builder.qmd.tera b/templates/docs/reference/query-builder.qmd.tera index 98f3afc..f721145
100644 --- a/templates/docs/reference/query-builder.qmd.tera +++ b/templates/docs/reference/query-builder.qmd.tera @@ -1011,6 +1011,82 @@ curl -s -X POST {{ api_base }}/v3/count/batch \ --- +### `report_batch(reports, api_base, api_version) -> list[dict]` + +Execute multiple reports in a single request to `/v3/report/batch`. Each +input item may be either a bare `ReportBuilder` (in which case the calling +`QueryBuilder` is used as the query scope) or a `(QueryBuilder, ReportBuilder)` +pair to run the report against a different query. Returns a list of per-report +result dicts in the same order as the input. Individual results from `report_batch` +may include `plot_spec` when requested by the client. + +::: {.panel-tabset group="language"} + +## Python + +```python +from {{ sdk_name }}.query import QueryBuilder, ReportBuilder + +qb = QueryBuilder("taxon").set_taxa(["Mammalia"], filter_type="tree") +rb1 = ReportBuilder("arc").set_feature("has_assembly") +rb2 = ReportBuilder("arc").set_feature("has_annotation").set_include_plot_spec() +results = qb.report_batch([rb1, rb2], api_base="{{ api_base }}", api_version="{{ api_version }}") +for res in results: + print(res.get("status")) + if "plot_spec" in res: + print("plot_spec included") +``` + +## R + +```r +rb1 <- ReportBuilder$new("arc")$set_feature("has_assembly") +rb2 <- ReportBuilder$new("arc")$set_feature("has_annotation")$set_include_plot_spec(TRUE) +results <- qb$report_batch(list(rb1, rb2)) +for (r in results) { + print(r$status) + if (!is.null(r$plot_spec)) cat("plot_spec included\n") +} +``` + +## JavaScript + +```javascript +const rb1 = new ReportBuilder('arc').setFeature('has_assembly'); +const rb2 = new ReportBuilder('arc').setFeature('has_annotation').setIncludePlotSpec(true); +const results = await qb.reportBatch([rb1, rb2]); +results.forEach(r => { + console.log(r.status); + if (r.plot_spec) console.log('plot_spec included'); +}); +``` + +## API + +```bash +curl -s -X POST {{ api_base }}/v3/report/batch \ 
+ -H "Content-Type: application/json" \ + -d '{ + "reports": [ + { + "query_yaml": "index: taxon\n...", + "params_yaml": "size: 10\npage: 1\n", + "report_yaml": "report: arc\nfeature: has_assembly\n" + }, + { + "query_yaml": "index: taxon\n...", + "params_yaml": "size: 10\npage: 1\n", + "report_yaml": "report: arc\nfeature: has_annotation\n", + "include_plot_spec": true + } + ] + }' +``` + +::: + +--- + ### `chain_query(query_key, query_string) -> QueryBuilder` Chain a new query onto the results of the previous query. Specifies that this diff --git a/templates/js/query.browser.js.tera b/templates/js/query.browser.js.tera index 8f12cc5..326d549 100644 --- a/templates/js/query.browser.js.tera +++ b/templates/js/query.browser.js.tera @@ -954,6 +954,53 @@ class QueryBuilder { return (data.results ?? []).map((r) => Number(r?.status?.hits ?? 0)); } + /** + * Execute multiple reports in a single batch request. + * @param {(ReportBuilder|[QueryBuilder,ReportBuilder]|{query:QueryBuilder,report:ReportBuilder})[]} reports + * @param {string} [apiBase=API_BASE] + * @returns {Promise} + */ + async reportBatch(reports, apiBase = API_BASE) { + if (reports.length > 100) throw new Error("maximum 100 reports per batch request"); + const url = `${apiBase}/${API_VERSION}/report/batch`; + const payload = { + reports: reports.map((item) => { + let qb, rb; + if (Array.isArray(item) && item.length === 2) { + qb = item[0]; + rb = item[1]; + } else if (item && item.query && item.report) { + qb = item.query; + rb = item.report; + } else { + qb = this; + rb = item; + } + const entry = { + query_yaml: qb.toQueryYaml(), + params_yaml: qb.toParamsYaml(), + report_yaml: rb.toReportYaml(), + }; + if (rb._display != null) entry.display = rb._display; + if (rb._includePlotSpec) entry.include_plot_spec = true; + return entry; + }), + }; + const resp = await fetch(url, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify(payload), + }); + if (!resp.ok) throw 
new Error(`API request failed: ${resp.status} ${resp.statusText}`); + const data = JSON.parse(await resp.text()); + return (data.results ?? []).map((res) => { + if (res.plot_spec != null) return res; + const out = { report: res.report ?? {}, status: res.status ?? {} }; + if (res.error != null) out.error = res.error; + return out; + }); + } + /** * Fetch a single record by ID. * @param {string} recordId diff --git a/templates/js/query.js b/templates/js/query.js index b5f7ffc..47c64ce 100644 --- a/templates/js/query.js +++ b/templates/js/query.js @@ -1092,6 +1092,51 @@ class QueryBuilder { return counts; } + /** + * Execute multiple reports in a single batch request. + * @param {(ReportBuilder|[QueryBuilder,ReportBuilder]|{query:QueryBuilder,report:ReportBuilder})[]} reports + * @param {string} [apiBase=API_BASE] + * @returns {Promise} + */ + async reportBatch(reports, apiBase = API_BASE) { + if (reports.length > 100) + throw new Error("maximum 100 reports per batch request"); + + const batchData = await this._postJson(`${apiBase}/v3/report/batch`, { + reports: reports.map((item) => { + let qb, rb; + if (Array.isArray(item) && item.length === 2) { + qb = item[0]; + rb = item[1]; + } else if (item && item.query && item.report) { + qb = item.query; + rb = item.report; + } else { + qb = this; + rb = item; + } + const entry = { + query_yaml: qb.toQueryYaml(), + params_yaml: qb.toParamsYaml(), + report_yaml: rb.toReportYaml(), + }; + if (rb._display != null) entry.display = rb._display; + if (rb._includePlotSpec) entry.include_plot_spec = true; + return entry; + }), + }); + + return (batchData.results ?? []).map((res) => { + if (res.plot_spec != null) return res; + const out = { + report: res.report ?? {}, + status: res.status ?? {}, + }; + if (res.error != null) out.error = res.error; + return out; + }); + } + /** * Fetch a single record by ID or identifier. 
* @param {string} recordId - Record ID to fetch @@ -2204,7 +2249,7 @@ function parseSearchWithLineageSummary(raw, configJson) { function plotSpecToVegaLite(plotSpec) { const display = plotSpec.display ?? {}; const base = { - $schema: "https://vega.github.io/schema/vega-lite/v5.json", + $schema: "https://vega.github.io/schema/vega-lite/v6.json", title: display.title ?? undefined, width: display.width ?? 600, height: display.height ?? 400, diff --git a/templates/python/query.py.tera b/templates/python/query.py.tera index 85df85b..659e11b 100644 --- a/templates/python/query.py.tera +++ b/templates/python/query.py.tera @@ -5,7 +5,7 @@ Generated by cli-generator. Do not edit. from __future__ import annotations -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Sequence from . import {{ sdk_name }} as _ext @@ -949,6 +949,59 @@ class QueryBuilder: return data return data.get("report", data) + def report_batch( + self, + reports: Sequence[ReportBuilder | tuple[QueryBuilder, ReportBuilder]], + max_reports: int = 100, + ) -> list[dict[str, Any]]: + """Run a batch of reports and return a list of results. + + Each entry in ``reports`` may be either a ``ReportBuilder`` (which + uses the calling builder's embedded query) or a ``(QueryBuilder, + ReportBuilder)`` tuple for per-item queries. The method posts a + `POST /v3/report/batch` request and returns the parsed results. 
+ """ + if len(reports) > max_reports: + raise ValueError(f"at most {max_reports} reports are allowed") + + payload_reports: list[dict[str, Any]] = [] + for rb in reports: + if isinstance(rb, (list, tuple)): + if len(rb) != 2: + raise ValueError("tuples in reports must be (QueryBuilder, ReportBuilder)") + qb, rb = rb # type: ignore + else: + qb = self + + # Narrow types for static checkers + from typing import cast + + rb = cast("ReportBuilder", rb) + qb = cast("QueryBuilder", qb) + + report_doc: dict[str, Any] = { + "query_yaml": qb.to_query_yaml(), + "params_yaml": qb.to_params_yaml(), + "report_yaml": rb.to_report_yaml(), + } + if getattr(rb, "_display", None) is not None: + report_doc["display"] = rb._display + if getattr(rb, "_include_plot_spec", False): + report_doc["include_plot_spec"] = True + + payload_reports.append(report_doc) + + url = f"{API_BASE}/v3/report/batch" + batch_resp = self._post_json(url, {"reports": payload_reports}) + + results: list[dict[str, Any]] = [] + for item in batch_resp.get("results", []): + if isinstance(item, dict) and "report" in item: + results.append(item) + else: + results.append(item) + return results + def search_batch( self, queries: list["QueryBuilder"], @@ -2051,7 +2104,7 @@ def plot_spec_to_vega_lite(spec: dict[str, Any]) -> dict[str, Any]: """ display: dict[str, Any] = spec.get("display") or {} base: dict[str, Any] = { - "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "$schema": "https://vega.github.io/schema/vega-lite/v6.json", "width": display.get("width", 600), "height": display.get("height", 400), "config": { diff --git a/templates/r/query.R b/templates/r/query.R index 3ff8895..d66f475 100644 --- a/templates/r/query.R +++ b/templates/r/query.R @@ -983,6 +983,52 @@ QueryBuilder <- R6::R6Class( if (length(counts) == 0) numeric(0) else counts }, + #' @description Execute multiple reports in a single batch request. + #' @param reports List of ReportBuilder objects. 
+ #' @param api_base Base URL of the API (default: from package). + #' @return List of per-report result lists, one per input report. + report_batch = function(reports, api_base = NULL) { + if (length(reports) > 100) { + stop("maximum 100 reports per batch request") + } + + if (is.null(api_base)) { + api_base <- private$api_base_url + } + + url <- paste0(api_base, "/", private$api_version, "/report/batch") + payload <- list( + reports = lapply(reports, function(rb) { + item <- list( + query_yaml = self$to_query_yaml(), + params_yaml = self$to_params_yaml(), + report_yaml = rb$to_report_yaml() + ) + if (!is.null(rb$.__enclos_env__$private$._display)) item$display <- rb$.__enclos_env__$private$._display + if (isTRUE(rb$.__enclos_env__$private$._include_plot_spec)) item$include_plot_spec <- TRUE + item + }) + ) + + resp <- httr::POST(url, + body = jsonlite::toJSON(payload, auto_unbox = TRUE), + httr::add_headers("Content-Type" = "application/json"), + httr::accept("application/json") + ) + httr::stop_for_status(resp) + raw_text <- httr::content(resp, as = "text", encoding = "UTF-8") + batch_data <- jsonlite::fromJSON(raw_text, simplifyVector = FALSE) + + lapply(batch_data$results %||% list(), function(item) { + if (!is.null(item$plot_spec)) { + return(item) + } + out <- list(report = item$report %||% list(), status = item$status %||% list()) + if (!is.null(item$error)) out$error <- item$error + out + }) + }, + #' @description Fetch a single record by ID or identifier. #' @param record_id Record ID to fetch (required). #' @param result Result type (taxon|assembly|sample); defaults to index type. diff --git a/templates/rust/main.rs.tera b/templates/rust/main.rs.tera index 3578880..301cd25 100644 --- a/templates/rust/main.rs.tera +++ b/templates/rust/main.rs.tera @@ -296,6 +296,12 @@ enum {{ index.name | capitalize }}Commands { #[arg(long, default_value = "")] query: String, + /// YAML file containing a single report mapping or a sequence of report + /// mappings. 
When supplied the CLI will run each report in the file + /// (batch mode) or the single report mapping and print JSON results. + #[arg(long, value_name = "FILE")] + file: Option, + /// Maximum number of results to return per page. #[arg(long, default_value = "50")] size: usize, @@ -994,7 +1000,7 @@ fn run(cli: Cli) -> anyhow::Result<()> { } } {{ index.name | capitalize }}Commands::Report { - report_type, taxon, taxon_filter, filter, rank, query, + report_type, taxon, taxon_filter, filter, rank, query, file, x, x_opts, y, cat, cat_rank, count_rank, collapse_monotypic, include_plot_spec, display, taxonomy, } => { @@ -1038,21 +1044,141 @@ fn run(cli: Cli) -> anyhow::Result<()> { taxonomy, ..Default::default() }; - let report_opts = generated::client::ReportOptions { - x, x_opts, y, cat, cat_rank, count_rank, - collapse_monotypic, - include_plot_spec, - display, - ..Default::default() - }; - let raw = generated::client::report( - generated::indexes::Index::{{ index.name | capitalize }}, - &full_query, - &report_type, - &opts, - &report_opts, - )?; - println!("{raw}"); + + // CLI-level report defaults are applied per-item when running a + // batch file (we construct the per-item opts from these values). 
+ + if let Some(ref path) = file { + let reports = load_report_file( + path, + &taxon_filter, + taxon.as_deref(), + rank.as_deref(), + &filter, + &full_query, + suppress_divergence_warnings, + )?; + + let mut results: Vec = Vec::new(); + for (q, cfg) in reports { + let item_report_type = cfg + .get("report") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_else(|| report_type.clone()); + + let mut per_report_opts = generated::client::ReportOptions { + x: x.clone(), + x_opts: x_opts.clone(), + y: y.clone(), + cat: cat.clone(), + cat_rank: cat_rank.clone(), + count_rank: count_rank.clone(), + collapse_monotypic, + include_plot_spec, + display: display.clone(), + ..Default::default() + }; + + if let Some(s) = cfg.get("x").and_then(|v| v.as_str()) { + per_report_opts.x = Some(s.to_string()); + } + if let Some(s) = cfg.get("x_opts").and_then(|v| v.as_str()).or(cfg.get("xOpts").and_then(|v| v.as_str())) { + per_report_opts.x_opts = Some(s.to_string()); + } + if let Some(s) = cfg.get("y").and_then(|v| v.as_str()) { + per_report_opts.y = Some(s.to_string()); + } + if let Some(s) = cfg.get("y_opts").and_then(|v| v.as_str()).or(cfg.get("yOpts").and_then(|v| v.as_str())) { + per_report_opts.y_opts = Some(s.to_string()); + } + if let Some(s) = cfg.get("cat").and_then(|v| v.as_str()) { + per_report_opts.cat = Some(s.to_string()); + } + if let Some(s) = cfg.get("cat_opts").and_then(|v| v.as_str()).or(cfg.get("catOpts").and_then(|v| v.as_str())) { + per_report_opts.cat_opts = Some(s.to_string()); + } + if let Some(s) = cfg.get("rank").and_then(|v| v.as_str()) { + per_report_opts.rank = Some(s.to_string()); + } + if let Some(arr) = cfg.get("fields").and_then(|v| v.as_array()) { + per_report_opts.fields = arr.iter().filter_map(|vv| vv.as_str().map(|s| s.to_string())).collect(); + } + if let Some(s) = cfg.get("status_filter").and_then(|v| v.as_str()) { + per_report_opts.status_filter = Some(s.to_string()); + } + if let Some(s) = cfg.get("cat_rank").and_then(|v| 
v.as_str()).or(cfg.get("catRank").and_then(|v| v.as_str())) { + per_report_opts.cat_rank = Some(s.to_string()); + } + if let Some(b) = cfg.get("collapse_monotypic").and_then(|v| v.as_bool()).or(cfg.get("collapseMonotypic").and_then(|v| v.as_bool())) { + per_report_opts.collapse_monotypic = b; + } + if let Some(s) = cfg.get("preserve_rank").and_then(|v| v.as_str()).or(cfg.get("preserveRank").and_then(|v| v.as_str())) { + per_report_opts.preserve_rank = Some(s.to_string()); + } + if let Some(s) = cfg.get("count_rank").and_then(|v| v.as_str()).or(cfg.get("countRank").and_then(|v| v.as_str())) { + per_report_opts.count_rank = Some(s.to_string()); + } + if let Some(s) = cfg.get("location_field").and_then(|v| v.as_str()).or(cfg.get("locationField").and_then(|v| v.as_str())) { + per_report_opts.location_field = Some(s.to_string()); + } + if let Some(n) = cfg.get("hex_resolution").and_then(|v| v.as_u64()).and_then(|n| u32::try_from(n).ok()) { // out-of-range values are ignored rather than wrapped by an as-cast + per_report_opts.hex_resolution = Some(n); + } + if let Some(n) = cfg.get("map_threshold").and_then(|v| v.as_u64()).and_then(|n| u32::try_from(n).ok()) { + per_report_opts.map_threshold = Some(n); + } + if let Some(n) = cfg.get("scatter_threshold").and_then(|v| v.as_u64()).and_then(|n| u32::try_from(n).ok()) { + per_report_opts.scatter_threshold = Some(n); + } + if let Some(b) = cfg.get("include_plot_spec").and_then(|v| v.as_bool()) { + per_report_opts.include_plot_spec = b; + } + if let Some(display_val) = cfg.get("display") { + if let Ok(s) = serde_json::to_string(display_val) { + per_report_opts.display = Some(s); + per_report_opts.include_plot_spec = true; + } + } + + let raw = generated::client::report( + generated::indexes::Index::{{ index.name | capitalize }}, + &q, + &item_report_type, + &opts, + &per_report_opts, + )?; + if let Ok(val) = serde_json::from_str::(&raw) { + results.push(val); + } else { + results.push(serde_json::Value::String(raw)); + } + } + + let json = serde_json::to_string(&results).context("serialising batch reports")?; + println!("{json}"); + } else { + let report_opts =
generated::client::ReportOptions { + x: x.clone(), + x_opts: x_opts.clone(), + y: y.clone(), + cat: cat.clone(), + cat_rank: cat_rank.clone(), + count_rank: count_rank.clone(), + collapse_monotypic, + include_plot_spec, + display: display.clone(), + ..Default::default() + }; + let raw = generated::client::report( + generated::indexes::Index::{{ index.name | capitalize }}, + &full_query, + &report_type, + &opts, + &report_opts, + )?; + println!("{raw}"); + } } {{ index.name | capitalize }}Commands::Lookup { search_term, size, format } => { let body = generated::client::lookup( @@ -1762,6 +1888,86 @@ fn build_queries_from_patches( .collect() } + /// Load a report file and produce a list of `(query_string, report_cfg)` pairs. + /// + /// Supported forms: + /// - Mapping with top-level `reports:` sequence (optional `shared:` mapping) + /// - Sequence of report mappings + /// - Single report mapping + fn load_report_file( + file_path: &std::path::Path, + taxon_filter: &TaxonFilter, + cli_taxon: Option<&str>, + cli_rank: Option<&str>, + cli_filters: &[String], + base_query: &str, + suppress_divergence_warnings: bool, + ) -> anyhow::Result> { + let content = std::fs::read_to_string(file_path) + .with_context(|| format!("reading report file {}", file_path.display()))?; + let trimmed = content.trim(); + + let parsed: serde_yaml::Value = serde_yaml::from_str(trimmed) + .with_context(|| format!("parsing report file {}", file_path.display()))?; // surface YAML syntax errors instead of masking them as Null + + // Extract sequence of items and optional shared mapping.
+ let (items, shared) = match &parsed { + serde_yaml::Value::Mapping(map) + if map.contains_key(&serde_yaml::Value::String("reports".into())) => + { + let shared = map + .get(&serde_yaml::Value::String("shared".into())) + .and_then(|v| v.as_mapping()) + .cloned(); + let seq = map + .get(&serde_yaml::Value::String("reports".into())) + .and_then(|v| v.as_sequence()) + .cloned() + .unwrap_or_default(); + (seq, shared) + } + serde_yaml::Value::Sequence(seq) => (seq.clone(), None), + serde_yaml::Value::Mapping(map) => (vec![serde_yaml::Value::Mapping(map.clone())], None), + _ => anyhow::bail!("report file must be a YAML mapping or sequence"), + }; + + // Build query strings for items using existing query patch logic. + let queries = build_queries_from_patches( + &items, + shared.as_ref(), + taxon_filter, + cli_taxon, + cli_rank, + cli_filters, + base_query, + suppress_divergence_warnings, + )?; + + // Convert each item mapping into a JSON object with report-specific keys + // (remove taxon/rank/filter which were applied to the query string). + let mut cfgs: Vec = Vec::new(); + for it in items.iter() { + let map = it.as_mapping().ok_or_else(|| anyhow::anyhow!("each report entry must be a YAML mapping"))?; + let mut obj = serde_json::Map::new(); + for (k, v) in map.iter() { + if let Some(ks) = k.as_str() { + if ks == "taxon" || ks == "rank" || ks == "filter" || ks == "shared" { + continue; + } + let val = serde_json::to_value(v).unwrap_or(serde_json::Value::Null); + obj.insert(ks.to_string(), val); + } + } + cfgs.push(serde_json::Value::Object(obj)); + } + + if queries.len() != cfgs.len() { + anyhow::bail!("internal error: queries/report configs length mismatch") + } + + Ok(queries.into_iter().zip(cfgs.into_iter()).collect()) + } + /// Convert raw `--filter FIELD OP VALUE` triples into expression strings. 
fn cli_filter_fragments(filter: &[String]) -> Vec<String> { filter diff --git a/templates/snippets/js_snippet.tera b/templates/snippets/js_snippet.tera index 93c8bc9..dc1fb2a 100644 --- a/templates/snippets/js_snippet.tera +++ b/templates/snippets/js_snippet.tera @@ -61,6 +61,13 @@ qb.countBatch(queries).then((counts) => counts.forEach((n, i) => console.log(`Query ${i}: ${n} records`)) ); +{% elif call_type == "report_batch" -%} +// Run a batch of reports +const reports = [rb]; // extend with ReportBuilder instances or [QueryBuilder, ReportBuilder] pairs +qb.reportBatch(reports).then((results) => + results.forEach((res, i) => console.log(`Report ${i}:`, res)) +); + {% elif call_type == "positional" -%} // Run a positional report const assemblies = [{% for asm in positional_assemblies %}"{{ asm }}", {% endfor %}]; diff --git a/templates/snippets/python_snippet.tera b/templates/snippets/python_snippet.tera index e908aca..499a206 100644 --- a/templates/snippets/python_snippet.tera +++ b/templates/snippets/python_snippet.tera @@ -64,6 +64,13 @@ counts = qb.count_batch(queries) for i, n in enumerate(counts): print(f"Query {i}: {n} records") +{% elif call_type == "report_batch" -%} +# Run a batch of reports +reports = [rb] # extend this list with ReportBuilder instances or (QueryBuilder, ReportBuilder) pairs +results = qb.report_batch(reports) +for i, res in enumerate(results): + print(f"Report {i}: {res}") + {% elif call_type == "positional" -%} # Run a positional report assemblies = [{% for asm in positional_assemblies %}"{{ asm }}", {% endfor %}] diff --git a/templates/snippets/r_snippet.tera b/templates/snippets/r_snippet.tera index e7fce3a..2a01769 100644 --- a/templates/snippets/r_snippet.tera +++ b/templates/snippets/r_snippet.tera @@ -61,6 +61,15 @@ queries <- list(qb) # extend with additional QueryBuilder instances counts <- qb$count_batch(queries) for (i in seq_along(counts)) cat("Query", i, ":", counts[[i]], "records\n") +{% elif call_type == "report_batch" -%} +# Run
a batch of reports +reports <- list(rb) # extend with ReportBuilder instances +results <- qb$report_batch(reports) +for (i in seq_along(results)) { + cat(sprintf("Report %d:\n", i)) + print(results[[i]]) +} + {% elif call_type == "positional" -%} # Run a positional report assemblies <- c({% for asm in positional_assemblies %}"{{ asm }}", {% endfor %}) diff --git a/tests/python/test_report_batch.py b/tests/python/test_report_batch.py new file mode 100644 index 0000000..4a47303 --- /dev/null +++ b/tests/python/test_report_batch.py @@ -0,0 +1,11 @@ +import pytest + +from cli_generator import QueryBuilder, ReportBuilder + + +def test_report_batch_exceeds_limit(): + qb = QueryBuilder("taxon") + rb = ReportBuilder("histogram") + reports = [rb] * 101 + with pytest.raises(ValueError): + qb.report_batch(reports) diff --git a/tests/python/test_sdk_parity.py b/tests/python/test_sdk_parity.py index 6b7a6d9..6cb95c0 100644 --- a/tests/python/test_sdk_parity.py +++ b/tests/python/test_sdk_parity.py @@ -362,6 +362,18 @@ "js_name": "report", "r_name": "report", }, + "report": { + "params": ["report"], + "python_name": "report", + "js_name": "report", + "r_name": "report", + }, + "report_batch": { + "params": ["reports", "max_reports"], + "python_name": "report_batch", + "js_name": "reportBatch", + "r_name": "report_batch", + }, "chain_query": { "params": ["query_key", "query_string"], "python_name": "chain_query",