diff --git a/fluree-db-api/src/import.rs b/fluree-db-api/src/import.rs index 3eadddfc20..c8e1bde099 100644 --- a/fluree-db-api/src/import.rs +++ b/fluree-db-api/src/import.rs @@ -3629,6 +3629,7 @@ where numbig_pool: Arc::new(SharedNumBigPool::new()), vector_pool: Arc::new(SharedVectorArenaPool::new()), ns_alloc: Arc::clone(&shared_alloc), + decimal_encoding: IMPORT_DECIMAL_ENCODING, }); // Pre-insert rdf:type so we know the predicate ID before Phase A begins. @@ -5112,6 +5113,13 @@ struct IndexUploadResult { has_annotations: bool, } +/// Decimal-encoding policy for a fresh bulk import. Like a full reindex, a new +/// import adopts the inline-decimal (v3) format. This is the single source for +/// BOTH the spool object resolution ([`SpoolConfig::decimal_encoding`]) and the +/// written root version — they must agree or decimal identity would split. +const IMPORT_DECIMAL_ENCODING: fluree_db_core::DecimalEncoding = + fluree_db_core::DecimalEncoding::InlineWhenFits; + #[allow(clippy::too_many_arguments)] async fn build_and_upload( storage: &S, @@ -5896,6 +5904,9 @@ where // and stays out of the bootstrap path. had_annotation_arena: false, ns_split_mode: input.ns_split_mode, + // Same source as the spool object resolution (SpoolConfig): the root + // version must match how the import encoded decimals. + decimal_encoding: IMPORT_DECIMAL_ENCODING, }; // Encode and upload FIR6 root. diff --git a/fluree-db-api/tests/it_decimal_exactness.rs b/fluree-db-api/tests/it_decimal_exactness.rs index c844032223..91972bf1b2 100644 --- a/fluree-db-api/tests/it_decimal_exactness.rs +++ b/fluree-db-api/tests/it_decimal_exactness.rs @@ -977,6 +977,672 @@ async fn sparql_delete_data_decimal_retracts_exactly() { ); } +// ============================================================================= +// Inline xsd:decimal encoding (v3 root format) +// ============================================================================= + +/// Run a full rebuild, publish the new index, and return the decoded index root +/// so tests can assert the on-disk decimal-encoding format. +async fn full_rebuild_publish_decode_root( + fluree: &fluree_db_api::Fluree, + ledger_id: &str, +) -> fluree_db_binary_index::format::index_root::IndexRoot { + use fluree_db_core::storage::ContentStore; + let record = fluree + .nameservice() + .lookup(ledger_id) + .await + .expect("nameservice lookup") + .expect("ledger record"); + let result = fluree_db_indexer::rebuild_index_from_commits( + fluree.content_store(ledger_id), + ledger_id, + &record, + fluree_db_indexer::IndexerConfig::default(), + ) + .await + .expect("full rebuild"); + let root_bytes = fluree + .content_store(ledger_id) + .get(&result.root_id) + .await + .expect("fetch root bytes"); + fluree + .publisher() + .expect("read-write nameservice") + .publish_index(ledger_id, result.index_t, &result.root_id) + .await + .expect("publish index"); + fluree_db_binary_index::format::index_root::IndexRoot::decode(&root_bytes).expect("decode root") +} + +#[tokio::test] +async fn full_reindex_writes_inline_decimal_v3_format_and_roundtrips() { + // A full rebuild adopts the inline-decimal format: the root is v3 + // (InlineWhenFits), small exact decimals encode inline, and a value too + // large to fit inline falls back to the arena — all round-trip exactly. + let fluree = memory_fluree(); + let ledger_id = "decimal/inline-format:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + let result = run_sparql_update( + &fluree, + ledger, + r#" + PREFIX ex: + INSERT DATA { + ex:a ex:amount 19.99 . + ex:b ex:amount 0.0000001 . + ex:c ex:amount "1234567890123456789.5"^^ . + } + "#, + ) + .await; + let _ = result; + + let root = full_rebuild_publish_decode_root(&fluree, ledger_id).await; + assert_eq!( + root.decimal_encoding(), + fluree_db_core::DecimalEncoding::InlineWhenFits, + "a full rebuild must write the inline-decimal (v3) format" + ); + + let ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + let query = r" + PREFIX ex: + SELECT ?s ?amount WHERE { ?s ex:amount ?amount . } + "; + let result = support::query_sparql(&fluree, &ledger, query) + .await + .expect("query"); + let sparql_json = result + .to_sparql_json(&ledger.snapshot) + .expect("to_sparql_json"); + + let mut amounts = binding_values(&sparql_json, "amount"); + amounts.sort(); + // Two inline-eligible decimals + one arena-overflow decimal, all exact and + // in plain (non-exponent) form. + assert_eq!( + amounts, + vec![ + "0.0000001".to_string(), + "1234567890123456789.5".to_string(), + "19.99".to_string(), + ], + "inline + arena decimals must round-trip exactly after reindex" + ); +} + +#[tokio::test] +async fn inline_decimal_equality_constant_matches_after_reindex() { + // A decimal equality constant must encode the same way as the stored inline + // row so the bound-object lookup hits it (issue #1328 narrowing). + let fluree = memory_fluree(); + let ledger_id = "decimal/inline-eq:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + run_sparql_update( + &fluree, + ledger, + r" + PREFIX ex: + INSERT DATA { + ex:a ex:price 19.99 . + ex:b ex:price 20.00 . + } + ", + ) + .await; + + let root = full_rebuild_publish_decode_root(&fluree, ledger_id).await; + assert_eq!( + root.decimal_encoding(), + fluree_db_core::DecimalEncoding::InlineWhenFits + ); + + let ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + let query = r" + PREFIX ex: + SELECT ?s WHERE { ?s ex:price 19.99 . } + "; + let result = support::query_sparql(&fluree, &ledger, query) + .await + .expect("query"); + let sparql_json = result + .to_sparql_json(&ledger.snapshot) + .expect("to_sparql_json"); + assert_eq!( + binding_values(&sparql_json, "s"), + vec!["ex:a".to_string()], + "decimal equality constant must match the stored inline decimal" + ); +} + +/// Canonicalize a SPARQL bindings array for differential comparison: any literal +/// whose value parses as a `BigDecimal` is rewritten to its normalized form. +/// Indexing canonicalizes decimal scale (`10.50` -> `10.5`) for both the arena +/// and inline encodings, so a novelty-vs-indexed comparison must compare by +/// numeric value, not lexical form. Datatype and structure are preserved and +/// still compared exactly. +fn canon_decimal_bindings(bindings: &JsonValue) -> JsonValue { + let mut bindings = bindings.clone(); + if let Some(rows) = bindings.as_array_mut() { + for row in rows { + if let Some(obj) = row.as_object_mut() { + for (_var, cell) in obj.iter_mut() { + if let Some(v) = cell.get("value").and_then(|v| v.as_str()) { + if let Ok(bd) = v.parse::() { + cell["value"] = JsonValue::String(bd.normalized().to_plain_string()); + } + } + } + } + } + } + bindings +} + +#[tokio::test] +async fn inline_decimal_results_match_novelty_differential() { + // Differential: the same query must return identical results whether the + // decimals are unindexed (novelty, canonical FlakeValue::Decimal) or indexed + // under the inline (v3) format. Proves inline encoding is observably + // identical to the canonical representation across SELECT / ORDER BY / FILTER + // / aggregation. + let fluree = memory_fluree(); + let ledger_id = "decimal/inline-differential:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + let result = run_sparql_update( + &fluree, + ledger, + r" + PREFIX ex: + INSERT DATA { + ex:a ex:amount 19.99 . + ex:b ex:amount 0.01 . + ex:c ex:amount 10.50 . + ex:d ex:amount 100.00 . + } + ", + ) + .await; + let novelty_ledger = result.ledger; + + let queries = [ + // Plain projection + ORDER BY on the decimal value. + r"PREFIX ex: + SELECT ?amount WHERE { ?s ex:amount ?amount . } ORDER BY ?amount", + // FILTER comparison against a decimal threshold. + r"PREFIX ex: + SELECT ?amount WHERE { ?s ex:amount ?amount . FILTER(?amount > 10.0) } ORDER BY ?amount", + // Aggregation (SUM/AVG) + COUNT. + r"PREFIX ex: + SELECT (SUM(?amount) AS ?total) (COUNT(?amount) AS ?n) WHERE { ?s ex:amount ?amount . }", + ]; + + // Results from the unindexed (novelty) state. + let mut novelty_results = Vec::new(); + for q in &queries { + let r = support::query_sparql(&fluree, &novelty_ledger, q) + .await + .expect("novelty query"); + novelty_results.push( + r.to_sparql_json(&novelty_ledger.snapshot) + .expect("to_sparql_json")["results"]["bindings"] + .clone(), + ); + } + + // Reindex into the inline (v3) format. + let root = full_rebuild_publish_decode_root(&fluree, ledger_id).await; + assert_eq!( + root.decimal_encoding(), + fluree_db_core::DecimalEncoding::InlineWhenFits + ); + let indexed_ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + + for (q, novelty_bindings) in queries.iter().zip(novelty_results) { + let r = support::query_sparql(&fluree, &indexed_ledger, q) + .await + .expect("indexed query"); + let indexed_bindings = r + .to_sparql_json(&indexed_ledger.snapshot) + .expect("to_sparql_json")["results"]["bindings"] + .clone(); + assert_eq!( + canon_decimal_bindings(&indexed_bindings), + canon_decimal_bindings(&novelty_bindings), + "inline-indexed results must match novelty results (by value + datatype) for query:\n{q}" + ); + } +} + +#[tokio::test] +async fn inline_decimal_order_by_and_range_are_numeric_after_reindex() { + // Order-preserving inline decimal keys: ORDER BY and range filters on a + // decimal predicate must use NUMERIC order, not scale-broken key order. + // 0.05 vs 0.5 (different scales) and negatives are the cases the old + // equality-keyed layout got wrong. + let fluree = memory_fluree(); + let ledger_id = "decimal/inline-order:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + run_sparql_update( + &fluree, + ledger, + r" + PREFIX ex: + INSERT DATA { + ex:a ex:v 0.5 . + ex:b ex:v 0.05 . + ex:c ex:v -1 . + ex:d ex:v 2 . + ex:e ex:v 19.99 . + ex:f ex:v -0.01 . + ex:g ex:v 1000.5 . + } + ", + ) + .await; + + let root = full_rebuild_publish_decode_root(&fluree, ledger_id).await; + assert_eq!( + root.decimal_encoding(), + fluree_db_core::DecimalEncoding::InlineWhenFits + ); + let ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + + // 1. Plain ORDER BY ascending — full numeric order across signs and scales. + let asc = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?v WHERE { ?s ex:v ?v } ORDER BY ?v", + ) + .await + .expect("order by asc"); + let asc_json = asc.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&asc_json, "v"), + vec!["-1", "-0.01", "0.05", "0.5", "2", "19.99", "1000.5"], + "ORDER BY must be numeric (0.05 < 0.5, negatives first)" + ); + + // 2. ORDER BY DESC LIMIT — exercises the reverse-POST top-k fast path. + let desc = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?v WHERE { ?s ex:v ?v } ORDER BY DESC(?v) LIMIT 3", + ) + .await + .expect("order by desc limit"); + let desc_json = desc.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&desc_json, "v"), + vec!["1000.5", "19.99", "2"], + "ORDER BY DESC LIMIT must return the numerically largest values" + ); + + // 3. SELECT with a range FILTER — numeric subset. + let filtered = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?v WHERE { ?s ex:v ?v FILTER(?v > 0.1) } ORDER BY ?v", + ) + .await + .expect("range filter"); + let filtered_json = filtered.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&filtered_json, "v"), + vec!["0.5", "2", "19.99", "1000.5"], + "FILTER(?v > 0.1) must exclude 0.05 and the negatives" + ); + + // 4. COUNT with a range FILTER — exercises the numeric-compare COUNT fast path. + let counted = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT (COUNT(?s) AS ?n) WHERE { ?s ex:v ?v FILTER(?v > 0.1) }", + ) + .await + .expect("count filter"); + let counted_json = counted.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&counted_json, "n"), + vec!["4"], + "COUNT over a decimal range filter must match the four values > 0.1" + ); + + // 5. COUNT with an integer threshold against decimal rows (cross-form). + let counted_int = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT (COUNT(?s) AS ?n) WHERE { ?s ex:v ?v FILTER(?v >= 2) }", + ) + .await + .expect("count int threshold"); + let counted_int_json = counted_int.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&counted_int_json, "n"), + vec!["3"], + "FILTER(?v >= 2) over decimals must count 2, 19.99, 1000.5" + ); + + // 6. MIN / MAX — exercises the boundary-key numeric MIN/MAX fast path. + let minmax = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT (MIN(?v) AS ?lo) (MAX(?v) AS ?hi) WHERE { ?s ex:v ?v }", + ) + .await + .expect("min/max"); + let minmax_json = minmax.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&minmax_json, "lo"), + vec!["-1"], + "MIN over decimals must be the most negative value" + ); + assert_eq!( + binding_values(&minmax_json, "hi"), + vec!["1000.5"], + "MAX over decimals must be the largest value" + ); +} + +#[tokio::test] +async fn mixed_int_decimal_predicate_range_filter_is_correct() { + // Correctness guard for decimal range-scan narrowing: a predicate with BOTH + // integer and inline-decimal values spans two o_types, so the uniform-extent + // precondition fails and the scan must NOT narrow to the decimal key range — + // doing so would drop the integer rows. The general post-filter must return + // every numerically-matching value regardless of type. + let fluree = memory_fluree(); + let ledger_id = "decimal/mixed-range:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + run_sparql_update( + &fluree, + ledger, + r" + PREFIX ex: + INSERT DATA { + ex:a ex:v 5 . + ex:b ex:v 10.5 . + ex:c ex:v 2 . + ex:d ex:v 7.5 . + ex:e ex:v 3 . + } + ", + ) + .await; + + full_rebuild_publish_decode_root(&fluree, ledger_id).await; + let ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + + // FILTER(?v > 4): must match the integer 5 AND the decimals 7.5, 10.5 — + // three values across two o_types. A decimal-only narrowed scan would miss 5. + let filtered = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?v WHERE { ?s ex:v ?v FILTER(?v > 4) } ORDER BY ?v", + ) + .await + .expect("mixed range filter"); + let json = filtered.to_sparql_json(&ledger.snapshot).expect("json"); + let mut got = binding_values(&json, "v"); + got.sort(); + assert_eq!( + got, + vec!["10.5", "5", "7.5"], + "range filter over a mixed int+decimal predicate must keep matches of both types" + ); + + // Full ORDER BY must still interleave both types numerically. + let ordered = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?v WHERE { ?s ex:v ?v } ORDER BY ?v", + ) + .await + .expect("mixed order by"); + let oj = ordered.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&oj, "v"), + vec!["2", "3", "5", "7.5", "10.5"], + "ORDER BY over mixed int+decimal must be numerically interleaved" + ); +} + +#[tokio::test] +async fn inline_integer_range_pushdown_is_correct() { + // Integer keys are order-preserving (encode_i64), so a uniform-integer + // predicate gets the same range/ORDER BY/COUNT pushdown as decimals — and it + // needs no format change (works on any index). This checks correctness of + // the generalized path on a uniform xsd:integer predicate. + let fluree = memory_fluree(); + let ledger_id = "decimal/inline-int-range:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + run_sparql_update( + &fluree, + ledger, + r" + PREFIX ex: + INSERT DATA { + ex:a ex:n 5 . + ex:b ex:n 100 . + ex:c ex:n -3 . + ex:d ex:n 42 . + ex:e ex:n 0 . + } + ", + ) + .await; + + full_rebuild_publish_decode_root(&fluree, ledger_id).await; + let ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + + // ORDER BY: numeric order across negatives. + let asc = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?n WHERE { ?s ex:n ?n } ORDER BY ?n", + ) + .await + .expect("order by"); + let asc_json = asc.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&asc_json, "n"), + vec!["-3", "0", "5", "42", "100"], + "integer ORDER BY must be numeric" + ); + + // Range FILTER (the new narrowing path) + COUNT. + let filtered = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?n WHERE { ?s ex:n ?n FILTER(?n > 4) } ORDER BY ?n", + ) + .await + .expect("range filter"); + let fj = filtered.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&fj, "n"), + vec!["5", "42", "100"], + "integer range filter must narrow to values > 4" + ); + + let counted = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT (COUNT(?s) AS ?c) WHERE { ?s ex:n ?n FILTER(?n >= 0) }", + ) + .await + .expect("count"); + let cj = counted.to_sparql_json(&ledger.snapshot).expect("json"); + assert_eq!( + binding_values(&cj, "c"), + vec!["4"], + "COUNT(?n >= 0) over integers must be 4 (0, 5, 42, 100)" + ); +} + +#[tokio::test] +async fn range_narrowing_keeps_cross_type_novelty() { + // Regression guard for the overlay/novelty hazard in numeric range + // narrowing. The base predicate is uniformly inline-decimal (so narrowing + // WOULD fire on a clean index), but novelty then adds a matching value of a + // DIFFERENT type (an integer) for the same predicate. The integer's overlay + // op sorts outside the decimal o_type/o_key window, so narrowing must be + // disabled while overlay is present — otherwise the integer is dropped + // before the post-filter sees it. With the overlay gate, the full scan + + // merge + post-filter keeps it. + let fluree = memory_fluree(); + let ledger_id = "decimal/xtype-novelty:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + run_sparql_update( + &fluree, + ledger, + r" + PREFIX ex: + INSERT DATA { + ex:a ex:v 0.5 . + ex:b ex:v 100.5 . + ex:c ex:v 10.5 . + } + ", + ) + .await; + + // Reindex: the base predicate is now uniformly inline-decimal (v3). + let root = full_rebuild_publish_decode_root(&fluree, ledger_id).await; + assert_eq!( + root.decimal_encoding(), + fluree_db_core::DecimalEncoding::InlineWhenFits + ); + let ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + + // Add a NOVELTY integer (different o_type) that matches the filter, without + // reindexing — it lives in the overlay as XSD_INTEGER. + let result = run_sparql_update( + &fluree, + ledger, + r"PREFIX ex: INSERT DATA { ex:d ex:v 100 . }", + ) + .await; + let ledger = result.ledger; + + // FILTER(?v > 50): the indexed decimal 100.5 AND the novelty integer 100. + let r = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?v WHERE { ?s ex:v ?v FILTER(?v > 50) } ORDER BY ?v", + ) + .await + .expect("cross-type range filter"); + let json = r.to_sparql_json(&ledger.snapshot).expect("json"); + let mut got = binding_values(&json, "v"); + got.sort(); + assert_eq!( + got, + vec!["100", "100.5"], + "range filter must keep a cross-type novelty match (integer 100) on a \ + uniform-decimal base predicate — narrowing must not drop it" + ); +} + +#[tokio::test] +async fn integer_range_narrowing_keeps_cross_type_novelty() { + // The overlay gate is type-agnostic: a uniform-INTEGER base predicate with a + // matching DECIMAL novelty value must not drop the decimal. (Mirror of + // range_narrowing_keeps_cross_type_novelty with the base/overlay types + // swapped, covering the generalized integer/double pushdown path.) + let fluree = memory_fluree(); + let ledger_id = "decimal/int-xtype-novelty:main"; + let ledger = genesis_ledger(&fluree, ledger_id); + + run_sparql_update( + &fluree, + ledger, + r" + PREFIX ex: + INSERT DATA { + ex:a ex:v 5 . + ex:b ex:v 100 . + ex:c ex:v 50 . + } + ", + ) + .await; + + full_rebuild_publish_decode_root(&fluree, ledger_id).await; + let ledger = fluree + .ledger(ledger_id) + .await + .expect("load reindexed ledger"); + + // Novelty decimal (different o_type) that matches the filter. + let result = run_sparql_update( + &fluree, + ledger, + r"PREFIX ex: INSERT DATA { ex:d ex:v 75.5 . }", + ) + .await; + let ledger = result.ledger; + + let r = support::query_sparql( + &fluree, + &ledger, + r"PREFIX ex: + SELECT ?v WHERE { ?s ex:v ?v FILTER(?v > 60) } ORDER BY ?v", + ) + .await + .expect("cross-type range filter"); + let json = r.to_sparql_json(&ledger.snapshot).expect("json"); + let mut got = binding_values(&json, "v"); + got.sort(); + assert_eq!( + got, + vec!["100", "75.5"], + "integer-base range filter must keep a cross-type novelty decimal (75.5)" + ); +} + #[tokio::test] async fn integer_valued_double_over_indexed_predicate_is_not_corrupted() { // Regression (fluree/db-r#142): an integer-valued double inserted into a @@ -1086,7 +1752,7 @@ async fn integer_valued_double_over_indexed_predicate_is_not_corrupted() { /// Issue #1329: JSON-LD decimal rendering must be consistent regardless of /// whether the value is served from the binary index (arena-decoded) or from /// novelty (raw flake merge). The reported bug rendered index-served decimals -/// as `{"@value": "19.99", "@type": ""}` (empty type) and novelty-served ones +/// as `{"@value": "19.90", "@type": ""}` (empty type) and novelty-served ones /// as a bare string with no `@type`. #[tokio::test] async fn jsonld_decimal_renders_consistently_across_index_and_novelty() { @@ -1108,13 +1774,15 @@ async fn jsonld_decimal_renders_consistently_across_index_and_novelty() { .run_until(async move { let ledger = genesis_ledger(&fluree, ledger_id); - // Indexed base: ex:a is arena-backed after the index build. + // Indexed base: ex:a is arena-backed after the index build. The + // trailing zero (19.90) exercises canonicalization on the indexed + // path — the inline decimal code strips it, so it renders as 19.9. let result = run_sparql_update( &fluree, ledger, r" PREFIX ex: - INSERT DATA { ex:a ex:price 19.99 . } + INSERT DATA { ex:a ex:price 19.90 . } ", ) .await; @@ -1151,8 +1819,8 @@ async fn jsonld_decimal_renders_consistently_across_index_and_novelty() { // Both the arena-served (indexed) and novelty-served decimals must // render in the SAME shape. Before the fix the indexed copy lost its - // datatype and rendered as `{"@value":"19.99","@type":""}` while the - // novelty copy rendered as the bare string `"24.50"` (issue #1329). + // datatype and rendered as `{"@value":"19.90","@type":""}` while the + // novelty copy rendered as a bare string (issue #1329). let mut by_id = std::collections::HashMap::new(); for node in rows { let id = node["@id"].as_str().expect("@id").to_string(); @@ -1173,7 +1841,10 @@ async fn jsonld_decimal_renders_consistently_across_index_and_novelty() { } // Consistency: identical JSON shape across the two paths (xsd:decimal - // is an inferable datatype, so both render as the exact bare string). + // is an inferable datatype, so both render as a bare string). The + // inline decimal code is canonical (order-preserving, trailing zeros + // stripped), so both paths drop the trailing zero — 19.90 → 19.9 and + // 24.50 → 24.5 — matching the XSD canonical form of xsd:decimal. assert_eq!( indexed.is_object(), novel.is_object(), @@ -1182,12 +1853,12 @@ async fn jsonld_decimal_renders_consistently_across_index_and_novelty() { ); assert_eq!( indexed, - &JsonValue::String("19.99".to_string()), + &JsonValue::String("19.9".to_string()), "indexed decimal" ); assert_eq!( novel, - &JsonValue::String("24.50".to_string()), + &JsonValue::String("24.5".to_string()), "novelty decimal" ); diff --git a/fluree-db-api/tests/it_import_v3.rs b/fluree-db-api/tests/it_import_v3.rs index 9c8d64fc6c..d2bda8a6be 100644 --- a/fluree-db-api/tests/it_import_v3.rs +++ b/fluree-db-api/tests/it_import_v3.rs @@ -1320,3 +1320,89 @@ ex:remove a ex:User ; "rebuilt V3 index should only contain 'Keep' — 'Remove' should be filtered as retract-winner" ); } + +// ── Bulk import writes inline decimals (v3 root) and round-trips exactly ── +#[tokio::test] +async fn import_v3_inline_decimals_roundtrip() { + let db_dir = tempfile::tempdir().expect("db tmpdir"); + let data_dir = tempfile::tempdir().expect("data tmpdir"); + + // Two inline-eligible decimals + one too large to fit inline (mantissa + // exceeds 2^57, so it falls back to the NumBig arena). + let ttl = r#" +@prefix ex: . +@prefix xsd: . + +ex:a ex:amount 19.99 . +ex:b ex:amount 0.0000001 . +ex:c ex:amount "1234567890123456789.5"^^xsd:decimal . +"#; + let ttl_path = write_ttl(data_dir.path(), "decimals.ttl", ttl); + + let fluree = FlureeBuilder::file(db_dir.path().to_string_lossy().to_string()) + .build() + .expect("build file-backed Fluree"); + + let result = fluree + .create("test/v3-decimals:main") + .import(&ttl_path) + .threads(1) + .memory_budget_mb(128) + .cleanup(false) + .execute() + .await + .expect("decimal import should succeed"); + assert!(result.root_id.is_some(), "index should have been built"); + + // The import must write a v3 (inline-decimal) root: byte 4 of the FIR6 + // header is the version, and ROOT_V6_VERSION_INLINE_DECIMAL == 3. + let fir6_files = find_files_with_magic(db_dir.path(), b"FIR6"); + assert!(!fir6_files.is_empty(), "expected a FIR6 root file"); + let root_bytes = std::fs::read(&fir6_files[0]).expect("read FIR6 root"); + assert_eq!( + root_bytes[4], 3, + "bulk import must write a v3 inline-decimal root" + ); + + let ledger = fluree + .ledger("test/v3-decimals:main") + .await + .expect("load decimal ledger"); + + let result = support::query_sparql( + &fluree, + &ledger, + r" + PREFIX ex: + SELECT ?amount WHERE { ?s ex:amount ?amount } ORDER BY ?amount + ", + ) + .await + .expect("decimal query"); + let json = result + .to_sparql_json(&ledger.snapshot) + .expect("format sparql json"); + let bindings = json["results"]["bindings"].as_array().expect("bindings"); + let mut amounts: Vec<&str> = bindings + .iter() + .map(|b| b["amount"]["value"].as_str().unwrap()) + .collect(); + amounts.sort(); + + // All three decimals — two inline, one arena — round-trip exactly in plain + // (non-exponent) form. + assert_eq!( + amounts, + vec!["0.0000001", "1234567890123456789.5", "19.99"], + "inline + arena decimals must round-trip exactly through bulk import" + ); + + // Datatype is xsd:decimal for all (inline lane resolves the datatype). + for b in bindings { + assert_eq!( + b["amount"]["datatype"].as_str().unwrap(), + "http://www.w3.org/2001/XMLSchema#decimal", + "imported decimals must carry xsd:decimal datatype" + ); + } +} diff --git a/fluree-db-binary-index/src/format/expanded_cas.rs b/fluree-db-binary-index/src/format/expanded_cas.rs index 5526cb641d..cdef28c0a7 100644 --- a/fluree-db-binary-index/src/format/expanded_cas.rs +++ b/fluree-db-binary-index/src/format/expanded_cas.rs @@ -291,6 +291,7 @@ mod tests { had_annotation_arena: false, o_type_table: IndexRoot::build_o_type_table(&[], &[]), ns_split_mode: fluree_db_core::ns_encoding::NsSplitMode::default(), + decimal_encoding: fluree_db_core::DecimalEncoding::ArenaOnly, } } diff --git a/fluree-db-binary-index/src/format/index_root.rs b/fluree-db-binary-index/src/format/index_root.rs index 9c6bf16bce..a9aa633c9a 100644 --- a/fluree-db-binary-index/src/format/index_root.rs +++ b/fluree-db-binary-index/src/format/index_root.rs @@ -117,8 +117,40 @@ pub const ROOT_V6_MAGIC: &[u8; 4] = b"FIR6"; /// be keyed by `(g_id, p_id, lang_id)` for multi-language full-text indexing. /// Pre-v2 roots are refused outright — operators upgrading must run a full /// reindex before queries resume. +/// +/// Version 3 enables inline `xsd:decimal` encoding ([`DecimalEncoding`]): leaf +/// data may carry [`OType::XSD_DECIMAL_INLINE`] rows. The root layout is +/// byte-identical to v2; the version is the capability signal. Old binaries +/// refuse a v3 root outright (the strict version check below) rather than +/// misdecoding inline-decimal leaf rows, which is the required "upgrade code +/// first" safety property. pub const ROOT_V6_VERSION: u8 = 2; +/// Root format version that enables inline `xsd:decimal` encoding. Written only +/// by a full reindex/import under [`DecimalEncoding::InlineWhenFits`]; see +/// [`IndexRoot::decimal_encoding`]. +pub const ROOT_V6_VERSION_INLINE_DECIMAL: u8 = 3; + +/// Derive the decimal-encoding policy from a decoded root format version. +#[inline] +pub const fn decimal_encoding_for_version(version: u8) -> fluree_db_core::DecimalEncoding { + if version >= ROOT_V6_VERSION_INLINE_DECIMAL { + fluree_db_core::DecimalEncoding::InlineWhenFits + } else { + fluree_db_core::DecimalEncoding::ArenaOnly + } +} + +/// The root format version that must be written for a given decimal-encoding +/// policy. Inverse of [`decimal_encoding_for_version`]. +#[inline] +pub const fn version_for_decimal_encoding(enc: fluree_db_core::DecimalEncoding) -> u8 { + match enc { + fluree_db_core::DecimalEncoding::InlineWhenFits => ROOT_V6_VERSION_INLINE_DECIMAL, + fluree_db_core::DecimalEncoding::ArenaOnly => ROOT_V6_VERSION, + } +} + /// Binary index root (`FIR6`). /// /// Contains all sections needed to load an index: dict refs, arena refs, @@ -164,6 +196,12 @@ pub struct IndexRoot { /// this flag on the first post-import write. pub lex_sorted_string_ids: bool, + /// How this root encodes `xsd:decimal` values. Derived from the format + /// version on decode ([`decimal_encoding_for_version`]) and mapped back to + /// the version on encode ([`version_for_decimal_encoding`]). Sticky: + /// incremental writes preserve it; only a full reindex changes it. + pub decimal_encoding: fluree_db_core::DecimalEncoding, + // ── Cumulative commit stats ──────────────────────────────────── pub total_commit_size: u64, pub total_asserts: u64, @@ -399,6 +437,14 @@ impl IndexRoot { Some(geo::WKT_LITERAL), ), (OType::BLANK_NODE.as_u16(), DecodeKind::BlankNode, None), + // Inline exact xsd:decimal (v3 roots). Maps back to xsd:decimal so + // decoded values carry the correct datatype, distinct from the lossy + // f64 XSD_DECIMAL lane above. + ( + OType::XSD_DECIMAL_INLINE.as_u16(), + DecodeKind::Decimal, + Some(xsd::DECIMAL), + ), ]; for &(o_type, decode_kind, dt_iri) in embedded_types { @@ -568,6 +614,14 @@ impl IndexRoot { /// machine. const FLAG_EXT_HAD_ANNOTATION_ARENA: u8 = 1 << 0; + /// The decimal-encoding policy this root writes under. Equivalent to reading + /// [`decimal_encoding`](Self::decimal_encoding) directly; provided as the + /// stable accessor for encode-path callers. + #[inline] + pub fn decimal_encoding(&self) -> fluree_db_core::DecimalEncoding { + self.decimal_encoding + } + /// Encode to the binary FIR6 wire format. /// /// Determinism: namespaces sorted by ns_code, named graphs by g_id, @@ -593,7 +647,9 @@ impl IndexRoot { // ---- Header (24 bytes) ---- buf.extend_from_slice(ROOT_V6_MAGIC); - buf.push(ROOT_V6_VERSION); + // The version byte is the inline-decimal capability signal: a root that + // inlines decimals is written as v3 so old binaries refuse it. + buf.push(version_for_decimal_encoding(self.decimal_encoding)); let flags = (if self.stats.is_some() { Self::FLAG_HAS_STATS } else { @@ -831,9 +887,13 @@ impl IndexRoot { ))); } let version = data[4]; - if version != ROOT_V6_VERSION { + // Accept v2 (arena-only) and v3 (inline-decimal-capable); the layouts are + // byte-identical, the version only signals whether inline-decimal leaf + // rows may appear. Any other version is refused. + if version != ROOT_V6_VERSION && version != ROOT_V6_VERSION_INLINE_DECIMAL { return Err(io_err(&format!("root v6: unsupported version {version}"))); } + let decimal_encoding = decimal_encoding_for_version(version); let flags = data[5]; // Extended-flags byte at data[6]; data[7] reserved. @@ -1079,6 +1139,7 @@ impl IndexRoot { subject_watermarks, string_watermark, lex_sorted_string_ids, + decimal_encoding, total_commit_size, total_asserts, total_retracts, @@ -1408,6 +1469,7 @@ mod tests { annotation_index: None, had_annotation_arena: false, ns_split_mode: fluree_db_core::ns_encoding::NsSplitMode::default(), + decimal_encoding: fluree_db_core::DecimalEncoding::ArenaOnly, } } @@ -1421,6 +1483,10 @@ mod tests { assert_eq!(bytes[5], 0); // no optional sections let decoded = IndexRoot::decode(&bytes).unwrap(); + assert_eq!( + decoded.decimal_encoding(), + fluree_db_core::DecimalEncoding::ArenaOnly + ); assert_eq!(decoded.ledger_id, "test:main"); assert_eq!(decoded.index_t, 42); assert_eq!(decoded.base_t, 0); @@ -1675,6 +1741,44 @@ mod tests { assert!(err.to_string().contains("trailing bytes"), "got: {err}"); } + #[test] + fn fir6_decimal_encoding_version_round_trip() { + use fluree_db_core::DecimalEncoding; + + // Arena-only root is written as v2 and decodes back to ArenaOnly. + let arena = minimal_root_v6(); + let arena_bytes = arena.encode(); + assert_eq!(arena_bytes[4], ROOT_V6_VERSION); + assert_eq!( + IndexRoot::decode(&arena_bytes).unwrap().decimal_encoding(), + DecimalEncoding::ArenaOnly + ); + + // Inline-decimal root is written as v3 (the capability signal) and + // decodes back to InlineWhenFits. + let mut inline = minimal_root_v6(); + inline.decimal_encoding = DecimalEncoding::InlineWhenFits; + let inline_bytes = inline.encode(); + assert_eq!(inline_bytes[4], ROOT_V6_VERSION_INLINE_DECIMAL); + assert_eq!( + IndexRoot::decode(&inline_bytes).unwrap().decimal_encoding(), + DecimalEncoding::InlineWhenFits + ); + + // The two roots are byte-identical except for the version byte: the v3 + // capability is purely a header signal, not a layout change. + assert_eq!(arena_bytes[0..4], inline_bytes[0..4]); // magic + assert_eq!(arena_bytes[5..], inline_bytes[5..]); // everything after version + } + + #[test] + fn fir6_unknown_version_refused() { + let mut bytes = minimal_root_v6().encode(); + bytes[4] = 99; // neither v2 nor v3 + let err = IndexRoot::decode(&bytes).unwrap_err(); + assert!(err.to_string().contains("unsupported version")); + } + #[test] fn fir6_round_trip_with_default_graph() { let mut root = minimal_root_v6(); @@ -1798,8 +1902,8 @@ mod tests { #[test] fn o_type_table_built_in() { let table = IndexRoot::build_o_type_table(&[], &[]); - // Should contain all 31 embedded + 13 Fluree = 44 entries. - assert_eq!(table.len(), 44); + // Should contain all 32 embedded + 13 Fluree = 45 entries. + assert_eq!(table.len(), 45); // Spot-check a few entries. let int_entry = table @@ -1828,8 +1932,8 @@ mod tests { #[test] fn o_type_table_with_langs() { let table = IndexRoot::build_o_type_table(&[], &["en".to_string(), "fr".to_string()]); - // 44 built-in + 2 langString = 46. - assert_eq!(table.len(), 46); + // 45 built-in + 2 langString = 47. + assert_eq!(table.len(), 47); // lang_id is 1-based: first tag "en" gets lang_id=1 let en_entry = table @@ -1843,8 +1947,8 @@ mod tests { #[test] fn o_type_table_with_custom_types() { let table = IndexRoot::build_o_type_table(&["http://example.org/myType".to_string()], &[]); - // 44 built-in + 1 customer = 45. - assert_eq!(table.len(), 45); + // 45 built-in + 1 customer = 46. + assert_eq!(table.len(), 46); let custom = table.last().unwrap(); assert!(OType::from_u16(custom.o_type).is_customer_datatype()); diff --git a/fluree-db-binary-index/src/read/binary_index_store.rs b/fluree-db-binary-index/src/read/binary_index_store.rs index 6b0b4ebe77..cd72d496da 100644 --- a/fluree-db-binary-index/src/read/binary_index_store.rs +++ b/fluree-db-binary-index/src/read/binary_index_store.rs @@ -235,6 +235,9 @@ pub struct BinaryIndexStore { base_t: i64, language_tags: Vec, lex_sorted_string_ids: bool, + /// Decimal-encoding policy of the loaded root. Governs how query constants + /// encode so they match stored rows (inline vs NumBig arena). + decimal_encoding: fluree_db_core::DecimalEncoding, /// Ledger-fixed split mode for canonical IRI encoding. /// Set from the snapshot's `ns_split_mode` via `set_ns_split_mode()`. ns_split_mode: NsSplitMode, @@ -362,6 +365,7 @@ impl BinaryIndexStore { base_t: root.base_t, language_tags: root.language_tags.clone(), lex_sorted_string_ids: root.lex_sorted_string_ids, + decimal_encoding: root.decimal_encoding(), ns_split_mode: root.ns_split_mode, ns_split_mode_set: true, }) @@ -398,6 +402,13 @@ impl BinaryIndexStore { self.lex_sorted_string_ids } + /// The loaded root's decimal-encoding policy. Query constants must encode + /// under this policy so they match stored `(o_type, o_key)` rows. + #[inline] + pub fn decimal_encoding(&self) -> fluree_db_core::DecimalEncoding { + self.decimal_encoding + } + /// Get the branch manifest for a graph + sort order. pub fn branch_for_order( &self, @@ -825,6 +836,7 @@ impl BinaryIndexStore { DecodeKind::Bool => Ok(FlakeValue::Boolean(o_key != 0)), DecodeKind::I64 => Ok(FlakeValue::Long(key.decode_i64())), DecodeKind::F64 => Ok(FlakeValue::Double(key.decode_f64())), + DecodeKind::Decimal => Ok(FlakeValue::Decimal(Box::new(key.decode_decimal()))), DecodeKind::Date => { let days = key.decode_date(); let date = chrono::NaiveDate::from_num_days_from_ce_opt(days + 719_163).unwrap_or( @@ -1382,6 +1394,7 @@ impl BinaryIndexStore { OType::XSD_DOUBLE => Some(Sid::new(namespaces::XSD, xsd_names::DOUBLE)), OType::XSD_FLOAT => Some(Sid::new(namespaces::XSD, xsd_names::FLOAT)), OType::XSD_DECIMAL => Some(Sid::new(namespaces::XSD, xsd_names::DECIMAL)), + OType::XSD_DECIMAL_INLINE => Some(Sid::new(namespaces::XSD, xsd_names::DECIMAL)), OType::XSD_DATE => Some(Sid::new(namespaces::XSD, xsd_names::DATE)), OType::XSD_TIME => Some(Sid::new(namespaces::XSD, xsd_names::TIME)), OType::XSD_DATE_TIME => Some(Sid::new(namespaces::XSD, xsd_names::DATE_TIME)), @@ -2865,6 +2878,7 @@ mod tests { base_t: 0, language_tags: Vec::new(), lex_sorted_string_ids: false, + decimal_encoding: fluree_db_core::DecimalEncoding::ArenaOnly, ns_split_mode: NsSplitMode::default(), ns_split_mode_set: true, } diff --git a/fluree-db-core/src/db.rs b/fluree-db-core/src/db.rs index 78b8c60ce6..d84347092a 100644 --- a/fluree-db-core/src/db.rs +++ b/fluree-db-core/src/db.rs @@ -647,13 +647,13 @@ fn decode_fir6_metadata(bytes: &[u8]) -> std::io::Result )); } let version = bytes[4]; - // FIR6 version 2 adds `lang_id` to each `FulltextArenaRef` so fulltext - // arenas can be keyed by `(g_id, p_id, lang_id)`. This helper doesn't - // parse arena refs — it only consumes the header bits it needs — so - // both versions are accepted here. The authoritative parser - // (`IndexRoot::decode` in `fluree-db-binary-index`) enforces version - // matching for the full-root deserialization path. - if version != 1 && version != 2 { + // FIR6 version 2 adds `lang_id` to each `FulltextArenaRef`; version 3 enables + // inline xsd:decimal encoding. Neither changes the header layout this helper + // reads — it only consumes the fixed header + optional-section bits, not arena + // refs or leaf data — so all three versions are accepted here. The + // authoritative parser (`IndexRoot::decode` in `fluree-db-binary-index`) + // enforces version matching for the full-root deserialization path. + if version != 1 && version != 2 && version != 3 { return Err(std::io::Error::new( std::io::ErrorKind::InvalidData, format!("FIR6: unsupported version {version}"), diff --git a/fluree-db-core/src/lib.rs b/fluree-db-core/src/lib.rs index d3add48be1..3750ab2340 100644 --- a/fluree-db-core/src/lib.rs +++ b/fluree-db-core/src/lib.rs @@ -206,7 +206,7 @@ pub use value::{ parse_decimal, parse_decimal_string, parse_double, parse_integer, parse_integer_string, FlakeValue, GeoPointBits, }; -pub use value_id::{ObjKey, ObjKeyError, ObjKind, ObjPair, ValueTypeTag}; +pub use value_id::{DecimalEncoding, ObjKey, ObjKeyError, ObjKind, ObjPair, ValueTypeTag}; /// Prelude module for convenient imports of storage traits and common types. /// diff --git a/fluree-db-core/src/o_type.rs b/fluree-db-core/src/o_type.rs index bbbb40a14f..b5b9d8c226 100644 --- a/fluree-db-core/src/o_type.rs +++ b/fluree-db-core/src/o_type.rs @@ -125,7 +125,15 @@ impl OType { /// Blank node (`_:b{id}`) — `o_key` is the atomic bnode integer. pub const BLANK_NODE: Self = Self(0x001F); - // Tag `00` payload range 0x0020–0x3FFF reserved for future embedded types. + /// `xsd:decimal` stored **inline** as an exact, order-preserving base-10 + /// float key (see [`ObjKey::encode_decimal`]) — canonical *and* value-ordered. + /// Distinct from the lossy f64 [`XSD_DECIMAL`](Self::XSD_DECIMAL) lane: this + /// carries the exact value with no arena handle. Only written by new-format + /// index roots; large/high-precision decimals still fall back to the NumBig + /// arena ([`NUM_BIG_OVERFLOW`](Self::NUM_BIG_OVERFLOW)). + pub const XSD_DECIMAL_INLINE: Self = Self(0x0020); + + // Tag `00` payload range 0x0021–0x3FFF reserved for future embedded types. // ── Tag `10` — Fluree-reserved dictionary/arena-backed ───────────── @@ -341,6 +349,7 @@ impl OType { 0x001D => DecodeKind::Duration, 0x001E => DecodeKind::GeoPoint, 0x001F => DecodeKind::BlankNode, + 0x0020 => DecodeKind::Decimal, _ => DecodeKind::Sentinel, // future embedded types } } @@ -414,6 +423,9 @@ pub enum DecodeKind { NumBigArena, /// Spatial arena handle (per-predicate). SpatialArena, + /// Exact inline `xsd:decimal` — o_key is an order-preserving base-10 float + /// code (see [`super::value_id::ObjKey::decode_decimal`]). Not arena-backed. + Decimal, } impl DecodeKind { @@ -444,6 +456,7 @@ impl DecodeKind { 21 => Some(Self::VectorArena), 22 => Some(Self::NumBigArena), 23 => Some(Self::SpatialArena), + 24 => Some(Self::Decimal), _ => None, } } @@ -486,6 +499,7 @@ impl fmt::Debug for OType { 0x001D => write!(f, "OType::XSD_DURATION"), 0x001E => write!(f, "OType::GEO_POINT"), 0x001F => write!(f, "OType::BLANK_NODE"), + 0x0020 => write!(f, "OType::XSD_DECIMAL_INLINE"), 0x8000 => write!(f, "OType::XSD_STRING"), 0x8001 => write!(f, "OType::XSD_ANY_URI"), 0x8002 => write!(f, "OType::XSD_NORMALIZED_STRING"), diff --git a/fluree-db-core/src/o_type_registry.rs b/fluree-db-core/src/o_type_registry.rs index e918987f1d..e0616cfbde 100644 --- a/fluree-db-core/src/o_type_registry.rs +++ b/fluree-db-core/src/o_type_registry.rs @@ -84,6 +84,7 @@ impl OTypeRegistry { ObjKind::VECTOR_ID => OType::VECTOR, ObjKind::JSON_ID => OType::RDF_JSON, ObjKind::NUM_BIG => OType::NUM_BIG_OVERFLOW, + ObjKind::NUM_DEC => OType::XSD_DECIMAL_INLINE, ObjKind::G_YEAR => OType::XSD_G_YEAR, ObjKind::G_YEAR_MONTH => OType::XSD_G_YEAR_MONTH, ObjKind::G_MONTH => OType::XSD_G_MONTH, @@ -308,6 +309,10 @@ mod tests { reg.resolve(ObjKind::NUM_BIG, DatatypeDictId::DECIMAL, 0), OType::NUM_BIG_OVERFLOW ); + assert_eq!( + reg.resolve(ObjKind::NUM_DEC, DatatypeDictId::DECIMAL, 0), + OType::XSD_DECIMAL_INLINE + ); assert_eq!( reg.resolve(ObjKind::GEO_POINT, DatatypeDictId::STRING, 0), OType::GEO_POINT diff --git a/fluree-db-core/src/value_id.rs b/fluree-db-core/src/value_id.rs index 38b8e0d484..d292844d72 100644 --- a/fluree-db-core/src/value_id.rs +++ b/fluree-db-core/src/value_id.rs @@ -15,6 +15,11 @@ //! `NumInt(3)` vs `NumF64(3.0)`) is a query-layer concern resolved via //! multi-scan merge, not an index property. //! +//! [`ObjKind::NUM_DEC`] (inline `xsd:decimal`) upholds this contract too: its +//! key is canonical (equal values → identical bits) *and* order-preserving (raw +//! `u64` order == numeric order), via an order-preserving base-10 float layout. +//! See [`ObjKey::encode_decimal`]. +//! //! [`ValueTypeTag`] is a compact `u8` identifier for XSD/RDF datatypes, used as //! a tie-breaker in index sort keys so that values with the same `(ObjKind, //! ObjKey)` but different types (e.g., `xsd:integer 3` vs `xsd:long 3`) @@ -116,6 +121,14 @@ impl ObjKind { /// Precision: approximately 0.3mm at the equator. pub const GEO_POINT: Self = Self(0x14); + /// Exact inline `xsd:decimal` — `o_key` is an order-preserving base-10 float + /// code (see [`ObjKey::encode_decimal`]). Canonical (equal values → identical + /// bits) AND value-ordered (raw `u64` order == numeric order), so it supports + /// equality, dedup, joins, and range / ORDER BY pushdown. Distinct from + /// [`NUM_BIG`](Self::NUM_BIG) (arena handle) — inline decimals carry the exact + /// value with no arena. + pub const NUM_DEC: Self = Self(0x15); + /// Get the raw `u8` discriminant. #[inline] pub const fn as_u8(self) -> u8 { @@ -159,6 +172,7 @@ impl fmt::Debug for ObjKind { 0x12 => write!(f, "ObjKind::YearMonthDur"), 0x13 => write!(f, "ObjKind::DayTimeDur"), 0x14 => write!(f, "ObjKind::GeoPoint"), + 0x15 => write!(f, "ObjKind::NumDec"), 0xFF => write!(f, "ObjKind::Max"), n => write!(f, "ObjKind({n:#04x})"), } @@ -171,6 +185,48 @@ impl fmt::Display for ObjKind { } } +// ============================================================================ +// DecimalEncoding +// ============================================================================ + +/// How an index root encodes `xsd:decimal` object values. +/// +/// This is an **encode-time** policy derived from the active index root's +/// format version — it is never user-facing. Decode is always capable of both +/// schemes regardless of this policy, so new code reading any root is fully +/// backward-compatible. +/// +/// The policy is **sticky per root and preserved across incremental writes**: +/// extending an `ArenaOnly` root keeps writing decimals to the NumBig arena; +/// extending an `InlineWhenFits` root keeps inlining. Only a full reindex +/// changes the policy, because only a full reindex rewrites existing facts under +/// a new `(o_type, o_key)` identity. (Contrast `lex_sorted_string_ids`, which is +/// *cleared* on incremental writes; inline decimals are not broken by appends.) +/// +/// The hard invariant this protects: `(o_type, o_key)` is persisted fact +/// identity, so a single root must use **one** decimal encoding for all +/// inline-eligible values — never a mix — or a retract computed under one scheme +/// would miss an assert stored under the other. +#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug, Default)] +pub enum DecimalEncoding { + /// All `xsd:decimal` values route to the per-`(graph, predicate)` NumBig + /// arena. The behavior of every pre-inline index root, bit-for-bit unchanged. + #[default] + ArenaOnly, + /// Inline-eligible decimals (see [`ObjKey::encode_decimal`]) encode inline + /// under [`ObjKind::NUM_DEC`]; values that do not fit fall back to the arena, + /// exactly like overflow integers. + InlineWhenFits, +} + +impl DecimalEncoding { + /// True if this policy may emit inline decimal keys on write. + #[inline] + pub const fn inlines(self) -> bool { + matches!(self, Self::InlineWhenFits) + } +} + // ============================================================================ // ObjKey // ============================================================================ @@ -189,6 +245,54 @@ const SIGN_FLIP: u64 = 1u64 << 63; /// Sign bit mask for f64 bits. const F64_SIGN_BIT: u64 = 1u64 << 63; +// ---- Inline decimal (NumDec) — ORDER-PRESERVING base-10 float layout ---- +// +// An inline decimal key is a canonical, order-preserving code: equal values +// produce identical bits (equality), and raw `u64` order equals numeric order +// (range/ORDER BY pushdown). The magnitude is laid out as a base-10 float +// +// magnitude = significand × 10^(exp10 - (DEC_DIGITS - 1)) +// +// where `significand` is `|mantissa|` normalized to exactly `DEC_DIGITS` +// decimal digits (MSD in the leading place, in `[10^(DEC_DIGITS-1), 10^DEC_DIGITS)`) +// and `exp10` is the base-10 exponent of the most significant digit. The packed +// magnitude code places the (biased) exponent above the significand: +// +// mag = (biased_exp << DEC_SIG_BITS) | significand (0 = the value zero) +// +// `mag` is monotonic in magnitude — a larger exponent dominates, and within one +// exponent a larger significand wins. Sign is then folded so the full `u64` +// order is numeric order, with zero at the exact midpoint: +// +// value > 0 → key = 2^63 + mag +// value == 0 → key = 2^63 +// value < 0 → key = 2^63 - 1 - mag (more negative ⇒ smaller key) +// +// Negatives complement the magnitude (like the f64 lane), so they sort below +// zero and more-negative values sort lower. Canonicalization (normalize the +// mantissa, strip trailing zeros) makes `1.5`, `1.50`, and `1.500` one code. + +// Magnitude budget is 63 bits (the 64th splits sign around the pivot), packed as +// `[ exponent:6 | significand:57 ]` — all 63 bits used, no waste. + +/// Significant decimal digits an inline decimal carries. Values with more +/// significant digits spill to the NumBig arena. 17 digits matches the original +/// inline precision; `10^17 < 2^57` so the significand fits 57 bits exactly. +const DEC_DIGITS: u32 = 17; +/// Bits the `DEC_DIGITS`-digit significand occupies (low bits of the magnitude). +const DEC_SIG_BITS: u64 = 57; +/// Mask selecting the significand. +const DEC_SIG_MASK: u64 = (1u64 << DEC_SIG_BITS) - 1; +/// Exponent field width (bits), sitting just above the significand. +const DEC_EXP_BITS: u64 = 6; +/// Bias added to `exp10` so the stored exponent is non-negative. Representable +/// `exp10` range is `[-DEC_EXP_BIAS, DEC_EXP_BIAS - 1]` = `[-32, 31]`; values +/// outside (more than ~32 integer or fractional places) spill to the arena. +const DEC_EXP_BIAS: i64 = 1 << (DEC_EXP_BITS - 1); // 32 +/// Sign split point: non-negative keys are `>= DEC_SIGN_PIVOT`, negatives below. +/// Zero encodes exactly to this value (the midpoint). +const DEC_SIGN_PIVOT: u64 = 1u64 << 63; + /// Error returned when a value cannot be stored in the index. #[derive(Debug, Clone, PartialEq)] pub enum ObjKeyError { @@ -287,6 +391,102 @@ impl ObjKey { f64::from_bits(bits) } + // ---- Inline decimal encoding (NumDec) — order-preserving ---- + // + // See the layout notes above the `DEC_*` constants. The key is canonical + // (equal values → identical bits) AND order-preserving (raw `u64` order == + // numeric order), so inline decimals support equality, dedup, joins, AND + // range / ORDER BY pushdown. A value is inline-eligible iff, after + // canonicalization, it has at most `DEC_DIGITS` significant digits and a + // base-10 exponent in `[-DEC_EXP_BIAS, DEC_EXP_BIAS - 1]`. Anything else + // returns `None` and falls back to the NumBig arena, like overflow integers. + + /// Encode a canonical `xsd:decimal` inline, order-preserving, or `None` if it + /// does not fit. Numerically-equal decimals (`1.50`, `1.5`, `1.500`) encode + /// to identical bits, and `a < b` numerically iff `encode(a) < encode(b)` as + /// `u64`. + pub fn encode_decimal(value: &bigdecimal::BigDecimal) -> Option { + use num_bigint::Sign; + use num_traits::{ToPrimitive, Zero}; + + // Canonicalize: strip trailing zeros so the significant-digit count and + // significand are unique for a given value. + let normalized = value.normalized(); + let (mantissa, scale) = normalized.as_bigint_and_exponent(); + + if mantissa.is_zero() { + // Zero is the canonical midpoint between negatives and positives. + return Some(Self(DEC_SIGN_PIVOT)); + } + + let (sign, magnitude) = mantissa.into_parts(); + // Anything larger than u64 has > 19 decimal digits, well past DEC_DIGITS, + // so a `to_u64` miss is itself the spill signal — no BigUint digit walk. + let mag_u64 = magnitude.to_u64()?; + let digits = mag_u64.ilog10() + 1; // mag_u64 > 0 here + if digits > DEC_DIGITS { + // More significant digits than the inline significand holds. + return None; + } + + // Base-10 exponent of the most significant digit: with `magnitude` having + // `digits` digits and value = magnitude × 10^-scale, the MSD place is + // `(digits - 1) - scale`. + let exp10 = (digits as i64 - 1) - scale; + if !(-DEC_EXP_BIAS..DEC_EXP_BIAS).contains(&exp10) { + return None; + } + + // Left-align to exactly DEC_DIGITS digits so same-exponent significands + // compare as integers. `mag_u64 < 10^digits` and the pad is + // `DEC_DIGITS - digits`, so the product is `< 10^17 < 2^DEC_SIG_BITS` and + // well under `u64::MAX` — no overflow. + let significand = mag_u64 * 10u64.pow(DEC_DIGITS - digits); + + let biased_exp = (exp10 + DEC_EXP_BIAS) as u64; + let mag = (biased_exp << DEC_SIG_BITS) | significand; + + let key = if sign == Sign::Minus { + // More negative ⇒ larger mag ⇒ smaller key, all below the pivot. + DEC_SIGN_PIVOT - 1 - mag + } else { + DEC_SIGN_PIVOT + mag + }; + Some(Self(key)) + } + + /// Decode an inline `xsd:decimal` previously produced by [`encode_decimal`]. + /// + /// [`encode_decimal`]: Self::encode_decimal + pub fn decode_decimal(self) -> bigdecimal::BigDecimal { + use num_bigint::BigInt; + + if self.0 == DEC_SIGN_PIVOT { + return bigdecimal::BigDecimal::from(0); + } + + let (negative, mag) = if self.0 >= DEC_SIGN_PIVOT { + (false, self.0 - DEC_SIGN_PIVOT) + } else { + (true, DEC_SIGN_PIVOT - 1 - self.0) + }; + + let significand = mag & DEC_SIG_MASK; + let biased_exp = (mag >> DEC_SIG_BITS) as i64; + let exp10 = biased_exp - DEC_EXP_BIAS; + + // value = significand × 10^(exp10 - (DEC_DIGITS - 1)). + // As BigDecimal: mantissa = ±significand, scale = (DEC_DIGITS-1) - exp10. + let mut mantissa = BigInt::from(significand); + if negative { + mantissa = -mantissa; + } + let scale = (DEC_DIGITS as i64 - 1) - exp10; + // `normalized()` strips the left-alignment padding so output is minimal + // (1.5, not 1.500000000000000). + bigdecimal::BigDecimal::from_bigint(mantissa, scale).normalized() + } + // ---- Boolean encoding ---- /// Encode a boolean (false = 0, true = 1). @@ -1631,4 +1831,206 @@ mod tests { assert!(!dt.is_float_type(), "{dt} should not be float type"); } } + + // ---- Inline decimal (NumDec) encode/decode ---- + + fn bd(s: &str) -> bigdecimal::BigDecimal { + s.parse().unwrap() + } + + /// Round-trip an inline-eligible decimal: encode must succeed and decode back + /// to the numerically-equal value. + fn assert_decimal_roundtrip(s: &str) { + let v = bd(s); + let key = + ObjKey::encode_decimal(&v).unwrap_or_else(|| panic!("{s} should be inline-eligible")); + let back = key.decode_decimal(); + assert_eq!(back, v, "round-trip mismatch for {s}: got {back}"); + } + + #[test] + fn decimal_roundtrip_common_values() { + for s in [ + "0", + "1", + "-1", + "19.99", + "-19.99", + "0.01", + "-0.01", + "3.14159", + "100", + "1000000.5", + "-1000000.5", + "0.0000001", + "12345678901234.56", + ] { + assert_decimal_roundtrip(s); + } + } + + #[test] + fn decimal_zero_is_canonical() { + // All spellings of zero encode to the same key. + let keys: Vec<_> = ["0", "0.0", "-0", "-0.00", "0.000000"] + .iter() + .map(|s| ObjKey::encode_decimal(&bd(s)).unwrap()) + .collect(); + for k in &keys { + assert_eq!(*k, keys[0], "zero spellings must share one key"); + } + assert_eq!(keys[0].decode_decimal(), bd("0")); + } + + #[test] + fn decimal_scale_variants_share_key() { + // 1.50 and 1.5 are the same value -> identical key (equality identity). + assert_eq!( + ObjKey::encode_decimal(&bd("1.50")).unwrap(), + ObjKey::encode_decimal(&bd("1.5")).unwrap(), + ); + // 1.00 and 1 fold to scale 0 -> identical key. + assert_eq!( + ObjKey::encode_decimal(&bd("1.00")).unwrap(), + ObjKey::encode_decimal(&bd("1")).unwrap(), + ); + } + + #[test] + fn decimal_integer_valued_folds_to_scale_zero() { + // Trailing-zero integers (normalized form has negative exponent) fold back. + for s in ["100", "1000", "100000000", "-100"] { + assert_decimal_roundtrip(s); + } + } + + #[test] + fn decimal_significant_digit_boundary() { + // Up to DEC_DIGITS (17) significant digits fit; an 18th spills. + let d17 = bd("12345678901234567"); // 17 significant digits + assert!(ObjKey::encode_decimal(&d17).is_some()); + assert_decimal_roundtrip("12345678901234567"); + assert_decimal_roundtrip("1.2345678901234567"); // same 17 digits, fractional + + let d18 = bd("123456789012345678"); // 18 significant digits + assert!(ObjKey::encode_decimal(&d18).is_none()); + let d18_frac = bd("1.23456789012345678"); + assert!(ObjKey::encode_decimal(&d18_frac).is_none()); + + // A 20-digit value (exceeds u64) also spills via the to_u64 miss. + assert!(ObjKey::encode_decimal(&bd("12345678901234567890")).is_none()); + } + + #[test] + fn decimal_exponent_boundary() { + // exp10 in [-32, 31] fits; outside spills. A single-digit value's exp10 + // equals its power of ten. + assert!(ObjKey::encode_decimal(&bd("1e31")).is_some()); // exp10 = 31 + assert_decimal_roundtrip("1e31"); + assert!(ObjKey::encode_decimal(&bd("1e32")).is_none()); // exp10 = 32, out + + assert!(ObjKey::encode_decimal(&bd("1e-32")).is_some()); // exp10 = -32 + assert_decimal_roundtrip("1e-32"); + assert!(ObjKey::encode_decimal(&bd("1e-33")).is_none()); // exp10 = -33, out + } + + #[test] + fn decimal_sign_distinguished() { + let pos = ObjKey::encode_decimal(&bd("19.99")).unwrap(); + let neg = ObjKey::encode_decimal(&bd("-19.99")).unwrap(); + assert_ne!(pos, neg); + assert_eq!(pos.decode_decimal(), bd("19.99")); + assert_eq!(neg.decode_decimal(), bd("-19.99")); + } + + #[test] + fn decimal_zero_sits_between_signs() { + let neg = ObjKey::encode_decimal(&bd("-0.0001")).unwrap(); + let zero = ObjKey::encode_decimal(&bd("0")).unwrap(); + let pos = ObjKey::encode_decimal(&bd("0.0001")).unwrap(); + assert!(neg < zero, "negatives sort below zero"); + assert!(zero < pos, "zero sorts below positives"); + } + + /// Build a broad, deterministic spread of distinct decimal values for the + /// order-preservation property test: every (sign, coefficient, scale) combo + /// that stays inline-eligible. + fn property_test_decimals() -> Vec { + use bigdecimal::BigDecimal; + use num_bigint::BigInt; + let coeffs: [i64; 9] = [1, 2, 7, 9, 15, 100, 999, 12345, 9_999_999_999_999_999]; + let scales: [i64; 13] = [-15, -8, -3, -1, 0, 1, 2, 3, 5, 8, 12, 20, 28]; + let mut out = vec![BigDecimal::from(0)]; + for &c in &coeffs { + for &s in &scales { + let v = BigDecimal::from_bigint(BigInt::from(c), s); + out.push(v.clone()); + out.push(-v); + } + } + out + } + + #[test] + fn decimal_encoding_is_order_preserving() { + // The headline invariant: raw u64 key order == numeric order. Sort the + // value set numerically, then assert encoded keys are strictly ascending + // across distinct values (and equal across numerically-equal ones). + let mut values = property_test_decimals(); + // Numeric sort (BigDecimal Ord compares by value). + values.sort(); + + let mut prev: Option<(bigdecimal::BigDecimal, ObjKey)> = None; + for v in values { + let key = ObjKey::encode_decimal(&v) + .unwrap_or_else(|| panic!("{v} should be inline-eligible in the property set")); + // Decode must round-trip to the same numeric value. + assert_eq!(key.decode_decimal(), v, "round-trip failed for {v}"); + if let Some((pv, pk)) = prev { + use std::cmp::Ordering; + match v.cmp(&pv) { + Ordering::Greater => assert!( + key.as_u64() > pk.as_u64(), + "order broken: {pv} (key {}) !< {v} (key {})", + pk.as_u64(), + key.as_u64() + ), + Ordering::Equal => assert_eq!( + key.as_u64(), + pk.as_u64(), + "equal values must share a key: {pv} vs {v}" + ), + Ordering::Less => unreachable!("values are sorted ascending"), + } + } + prev = Some((v, key)); + } + } + + #[test] + fn decimal_encoding_pairwise_monotonic() { + // Exhaustive pairwise check: for every ordered pair, the sign of the + // numeric comparison matches the sign of the key comparison. + let values = property_test_decimals(); + let keyed: Vec<_> = values + .iter() + .map(|v| { + ( + v.clone(), + ObjKey::encode_decimal(v).expect("inline-eligible").as_u64(), + ) + }) + .collect(); + for (a, ka) in &keyed { + for (b, kb) in &keyed { + assert_eq!( + a.cmp(b), + ka.cmp(kb), + "numeric {a} vs {b} ({:?}) disagrees with key {ka} vs {kb} ({:?})", + a.cmp(b), + ka.cmp(kb), + ); + } + } + } } diff --git a/fluree-db-indexer/src/build/rebuild.rs b/fluree-db-indexer/src/build/rebuild.rs index 4e7f32c4e1..880b73c404 100644 --- a/fluree-db-indexer/src/build/rebuild.rs +++ b/fluree-db-indexer/src/build/rebuild.rs @@ -226,6 +226,10 @@ where _span_b.record("fetch_concurrency", fetch_concurrency); let mut shared = SharedResolverState::new_for_ledger(&ledger_id); + // A full rebuild writes a fresh root, so it adopts the inline-decimal + // format: small exact decimals encode inline, the rest fall back to + // the arena. Existing ledgers keep their format until reindexed. + shared.decimal_encoding = fluree_db_core::DecimalEncoding::InlineWhenFits; // Pre-insert rdf:type into predicate dictionary so class tracking // works from the very first commit. @@ -1119,6 +1123,9 @@ where db_stats: Some(db_stats), db_schema, sketch_ref, + // Same source as the resolver above: the root version must match + // how decimals were just encoded. + decimal_encoding: shared.decimal_encoding, attachment_events: config.attachment_events.clone(), }; diff --git a/fluree-db-indexer/src/build/root_assembly.rs b/fluree-db-indexer/src/build/root_assembly.rs index 909509efdd..e4eee6503c 100644 --- a/fluree-db-indexer/src/build/root_assembly.rs +++ b/fluree-db-indexer/src/build/root_assembly.rs @@ -230,6 +230,11 @@ pub(crate) struct Fir6Inputs { pub db_schema: Option, /// CAS reference for the serialized HLL sketch blob. pub sketch_ref: Option, + /// Decimal-encoding policy for this build. Must equal the resolver's policy + /// for this run (same source) so the written root version matches how the + /// resolver encoded decimals — a mismatch would split decimal identity + /// across the inline/arena boundary. + pub decimal_encoding: fluree_db_core::DecimalEncoding, /// Edge-annotation event coverage envelope (M2b slice 3g). /// /// Routed from `IndexerConfig.attachment_events` through @@ -331,6 +336,8 @@ pub(crate) async fn encode_and_write_root_v6( prev_index: None, garbage: None, sketch_ref: inputs.sketch_ref, + // Same source as the resolver's policy for this run (see Fir6Inputs). + decimal_encoding: inputs.decimal_encoding, has_annotations, annotation_index: None, // Sticky bit flipped to `true` below if the rebuild path diff --git a/fluree-db-indexer/src/drop.rs b/fluree-db-indexer/src/drop.rs index fbec1309f6..9662ddd152 100644 --- a/fluree-db-indexer/src/drop.rs +++ b/fluree-db-indexer/src/drop.rs @@ -246,6 +246,7 @@ mod tests { annotation_index, o_type_table: IndexRoot::build_o_type_table(&[], &[]), ns_split_mode: fluree_db_core::ns_encoding::NsSplitMode::default(), + decimal_encoding: fluree_db_core::DecimalEncoding::ArenaOnly, } } diff --git a/fluree-db-indexer/src/gc/collector.rs b/fluree-db-indexer/src/gc/collector.rs index b924af0f47..359dbc9928 100644 --- a/fluree-db-indexer/src/gc/collector.rs +++ b/fluree-db-indexer/src/gc/collector.rs @@ -415,6 +415,7 @@ mod tests { had_annotation_arena: false, o_type_table: IndexRoot::build_o_type_table(&[], &[]), ns_split_mode: fluree_db_core::ns_encoding::NsSplitMode::default(), + decimal_encoding: fluree_db_core::DecimalEncoding::ArenaOnly, }; root.encode() } diff --git a/fluree-db-indexer/src/run_index/build/incremental_root.rs b/fluree-db-indexer/src/run_index/build/incremental_root.rs index 074129412b..8746fd07c3 100644 --- a/fluree-db-indexer/src/run_index/build/incremental_root.rs +++ b/fluree-db-indexer/src/run_index/build/incremental_root.rs @@ -375,6 +375,7 @@ mod tests { had_annotation_arena: false, o_type_table: IndexRoot::build_o_type_table(&[], &[]), ns_split_mode: NsSplitMode::default(), + decimal_encoding: fluree_db_core::DecimalEncoding::ArenaOnly, } } diff --git a/fluree-db-indexer/src/run_index/resolve/resolver.rs b/fluree-db-indexer/src/run_index/resolve/resolver.rs index 2b0d638ac6..08fa01d0ee 100644 --- a/fluree-db-indexer/src/run_index/resolve/resolver.rs +++ b/fluree-db-indexer/src/run_index/resolve/resolver.rs @@ -55,6 +55,15 @@ pub struct ResolvedCommit { } /// Resolves commit-local ops into globally-addressed RunRecords. +/// +/// **Not on any production indexing path.** Live indexing resolves through +/// [`SharedResolverState`] (full rebuild + incremental) or `ImportSink` (bulk +/// import); `CommitResolver` is currently constructed only in tests. If it is +/// ever wired into a production path, the caller MUST call +/// [`set_decimal_encoding`](Self::set_decimal_encoding) with the target root's +/// policy — it defaults to `ArenaOnly`, so without that call it would write +/// arena decimals into a root that may be inline-decimal (v3), splitting +/// `(o_type, o_key)` identity. pub struct CommitResolver { /// namespace_code -> prefix IRI. /// Seeded from `default_namespace_codes()`, updated by commit namespace_deltas. @@ -80,6 +89,10 @@ pub struct CommitResolver { /// be collected. Empty by default so the `@fulltext`-datatype path keeps /// working without any config setup. fulltext_hook_config: crate::fulltext_hook::FulltextHookConfig, + /// Decimal-encoding policy for this run, derived from the root being + /// extended. `ArenaOnly` by default so behavior is unchanged until a caller + /// opts a run into inline decimals. + decimal_encoding: fluree_db_core::DecimalEncoding, } impl CommitResolver { @@ -92,9 +105,15 @@ impl CommitResolver { spatial_hook: None, fulltext_hook: None, fulltext_hook_config: crate::fulltext_hook::FulltextHookConfig::default(), + decimal_encoding: fluree_db_core::DecimalEncoding::default(), } } + /// Set the decimal-encoding policy for this run (sticky per root). + pub fn set_decimal_encoding(&mut self, enc: fluree_db_core::DecimalEncoding) { + self.decimal_encoding = enc; + } + /// Set the ID-based stats hook for per-op stats collection. pub fn set_stats_hook(&mut self, hook: crate::stats::IdStatsHook) { self.stats_hook = Some(hook); @@ -951,17 +970,29 @@ impl CommitResolver { } } RawObject::DecimalStr(s) => { - // All typed xsd:decimal values route to NumBig by default match s.parse::() { Ok(bd) => { - let handle = dicts - .numbigs - .entry(g_id) - .or_default() - .entry(p_id) - .or_default() - .get_or_insert_bigdec(&bd); - Ok((ObjKind::NUM_BIG, ObjKey::encode_u32_id(handle))) + // Under InlineWhenFits, small/exact decimals encode inline; + // large/high-precision values fall back to the arena. Under + // ArenaOnly every decimal routes to the arena. + let inline = self + .decimal_encoding + .inlines() + .then(|| ObjKey::encode_decimal(&bd)) + .flatten(); + match inline { + Some(key) => Ok((ObjKind::NUM_DEC, key)), + None => { + let handle = dicts + .numbigs + .entry(g_id) + .or_default() + .entry(p_id) + .or_default() + .get_or_insert_bigdec(&bd); + Ok((ObjKind::NUM_BIG, ObjKey::encode_u32_id(handle))) + } + } } Err(_) => { // Cannot parse as BigDecimal -- store as string @@ -1115,6 +1146,10 @@ pub struct SharedResolverState { /// for every `rdfs:subClassOf` / `rdfs:subPropertyOf` user-data op so rebuild /// can populate `IndexSchema` in the FIR6 root. pub schema_hook: Option, + /// Decimal-encoding policy for this rebuild, derived from the root being + /// extended. `ArenaOnly` by default so behavior is unchanged until a caller + /// opts the rebuild into inline decimals. + pub decimal_encoding: fluree_db_core::DecimalEncoding, } impl SharedResolverState { @@ -1161,6 +1196,7 @@ impl SharedResolverState { fulltext_hook: None, fulltext_hook_config: crate::fulltext_hook::FulltextHookConfig::default(), schema_hook: None, + decimal_encoding: fluree_db_core::DecimalEncoding::default(), } } @@ -1288,6 +1324,9 @@ impl SharedResolverState { fulltext_hook: None, fulltext_hook_config: crate::fulltext_hook::FulltextHookConfig::default(), schema_hook: None, + // Sticky: an incremental rebuild inherits the base root's policy so + // it never mixes inline and arena encodings under one identity. + decimal_encoding: root.decimal_encoding(), }) } @@ -1736,14 +1775,24 @@ impl SharedResolverState { } RawObject::DecimalStr(s) => match s.parse::() { Ok(bd) => { - let handle = self - .numbigs - .entry(g_id) - .or_default() - .entry(p_id) - .or_default() - .get_or_insert_bigdec(&bd); - Ok((ObjKind::NUM_BIG, ObjKey::encode_u32_id(handle))) + let inline = self + .decimal_encoding + .inlines() + .then(|| ObjKey::encode_decimal(&bd)) + .flatten(); + match inline { + Some(key) => Ok((ObjKind::NUM_DEC, key)), + None => { + let handle = self + .numbigs + .entry(g_id) + .or_default() + .entry(p_id) + .or_default() + .get_or_insert_bigdec(&bd); + Ok((ObjKind::NUM_BIG, ObjKey::encode_u32_id(handle))) + } + } } Err(_) => { let id = chunk.strings.get_or_insert(s.as_bytes()); @@ -2406,6 +2455,58 @@ mod tests { assert_eq!(collector.records.len(), 3); } + #[test] + fn resolve_decimal_inline_vs_arena() { + use fluree_db_core::value_id::{ObjKey, ObjKind}; + use fluree_db_core::DecimalEncoding; + + // Resolve a single decimal-valued flake under the given policy and return + // the emitted record's (o_kind, o_key). + let resolve = |dec: &str, enc: DecimalEncoding| -> (u8, u64) { + let flake = Flake::new( + Sid::new(101, "Item"), + Sid::new(101, "price"), + FlakeValue::Decimal(Box::new(dec.parse().unwrap())), + Sid::new(2, "decimal"), + 1, + true, + None, + ); + let blob = build_test_blob(&[flake], 1); + let commit_ops = load_commit_ops(&blob).unwrap(); + let mut dicts = GlobalDicts::new_memory("test:main"); + let mut resolver = CommitResolver::new(); + resolver + .ns_prefixes + .insert(101, "http://example.org/".to_string()); + resolver.set_decimal_encoding(enc); + let mut collector = RecordCollector::new(); + resolver + .resolve_commit_ops(&commit_ops, &mut dicts, &mut collector) + .unwrap(); + assert_eq!(collector.records.len(), 1); + let r = &collector.records[0]; + (r.o_kind, r.o_key) + }; + + // ArenaOnly: every decimal routes to the NumBig arena (handle in o_key). + let (kind, _) = resolve("19.99", DecimalEncoding::ArenaOnly); + assert_eq!(kind, ObjKind::NUM_BIG.as_u8()); + + // InlineWhenFits: a small decimal encodes inline and decodes back exactly. + let (kind, key) = resolve("19.99", DecimalEncoding::InlineWhenFits); + assert_eq!(kind, ObjKind::NUM_DEC.as_u8()); + assert_eq!( + ObjKey::from_u64(key).decode_decimal(), + "19.99".parse::().unwrap() + ); + + // InlineWhenFits: a value too large to fit (mantissa >= 2^57) falls back + // to the arena, exactly like overflow integers. + let (kind, _) = resolve("144115188075855872", DecimalEncoding::InlineWhenFits); + assert_eq!(kind, ObjKind::NUM_BIG.as_u8()); + } + #[test] fn test_resolve_ref_and_dedup() { let flakes = vec![ diff --git a/fluree-db-indexer/src/stats/id_hook.rs b/fluree-db-indexer/src/stats/id_hook.rs index 13c81aa19a..1175299252 100644 --- a/fluree-db-indexer/src/stats/id_hook.rs +++ b/fluree-db-indexer/src/stats/id_hook.rs @@ -179,6 +179,10 @@ fn otype_to_value_type_tag(ot: fluree_db_core::o_type::OType) -> ValueTypeTag { OType::XSD_DOUBLE => ValueTypeTag::DOUBLE, OType::XSD_FLOAT => ValueTypeTag::FLOAT, OType::XSD_DECIMAL => ValueTypeTag::DECIMAL, + // Inline exact decimals (v3 roots) unambiguously carry only + // `FlakeValue::Decimal` — unlike NUM_BIG_OVERFLOW below — so they map + // straight to DECIMAL here. + OType::XSD_DECIMAL_INLINE => ValueTypeTag::DECIMAL, // NUM_BIG_OVERFLOW is intentionally NOT mapped here: it carries both // `FlakeValue::Decimal` (arbitrary-precision xsd:decimal) and // `FlakeValue::BigInt` (xsd:integer overflow > i64) — they share @@ -859,4 +863,20 @@ mod tests { assert_eq!(props[&key].count, 5); assert_eq!(props[&key].last_modified_t, 3); } + + #[test] + fn inline_decimal_otype_classified_as_decimal() { + use fluree_db_core::o_type::OType; + // Both the lossy f64 decimal lane and the exact inline lane count as + // DECIMAL for datatype stats, so a reindexed (inline) ledger reports the + // same property datatype as before. + assert_eq!( + otype_to_value_type_tag(OType::XSD_DECIMAL), + ValueTypeTag::DECIMAL + ); + assert_eq!( + otype_to_value_type_tag(OType::XSD_DECIMAL_INLINE), + ValueTypeTag::DECIMAL + ); + } } diff --git a/fluree-db-query/src/binary_scan.rs b/fluree-db-query/src/binary_scan.rs index d5198c9c38..06991dba81 100644 --- a/fluree-db-query/src/binary_scan.rs +++ b/fluree-db-query/src/binary_scan.rs @@ -1934,60 +1934,130 @@ impl Operator for BinaryScanOperator { Arc::clone(branch_ref); // If this scan has range bounds on the object variable and we're scanning in POST order, - // narrow the cursor's leaf range by object-key range. - // - // IMPORTANT: SPARQL numeric comparisons are cross-type (integer bounds match double - // values), and ObjKey encodings differ between types. For correctness, we only apply - // range narrowing for temporal types where cross-type comparison does not apply. + // narrow the cursor's leaf range by object-key range. Two cases are safe: + // - **Temporal** types: comparison is within-type, so a cross-type value + // can't satisfy the filter — dropping it via narrowing is harmless. + // - **A uniform, order-preserving numeric predicate with no overlay**: + // no other-typed base rows exist and no novelty can introduce a + // cross-type match (see the numeric block below). + // SPARQL numeric comparison is otherwise cross-type (an integer bound + // matches double/decimal values under different o_types), so numeric + // narrowing is gated on those preconditions. let mut range_min_okey: Option = None; let mut range_max_okey: Option = None; let mut range_o_type: Option = None; - if order == RunSortOrder::Post && filter.p_id.is_some() && self.bound_o.is_none() { - if let Some(bounds) = self.object_bounds.as_ref() { - let supports_range = |ot: OType| -> bool { + if order == RunSortOrder::Post && self.bound_o.is_none() { + if let (Some(bounds), Some(p_id)) = (self.object_bounds.as_ref(), filter.p_id) { + // Only numeric bounds can target a numeric predicate; gate the + // manifest extent probe (which may open ≤2 boundary leaves) on + // that so temporal/string range scans don't pay for it. + let has_numeric_bound = |b: &Option<(FlakeValue, bool)>| { matches!( - ot, - OType::XSD_DATE - | OType::XSD_DATE_TIME - | OType::XSD_TIME - | OType::XSD_G_YEAR - | OType::XSD_G_YEAR_MONTH - | OType::XSD_G_MONTH - | OType::XSD_G_DAY - | OType::XSD_G_MONTH_DAY + b, + Some(( + FlakeValue::Long(_) + | FlakeValue::BigInt(_) + | FlakeValue::Decimal(_) + | FlakeValue::Double(_), + _ + )) ) }; - - let encode = |v: &FlakeValue| -> Option<(u16, u64)> { - let (ot, key) = value_to_otype_okey_simple(v, store_ref).ok()?; - supports_range(ot).then_some((ot.as_u16(), key)) + // Numeric range narrowing is unsafe in general (cross-type: + // `?o > 10` matches integer 11 AND decimal 11.5, stored under + // different o_types). It's safe only when the predicate is + // *uniformly* one order-preserving numeric type AND there is no + // overlay: + // + // - **Uniform base** (manifest extent min_o_type == max_o_type, + // o_key-ordered — any inline integer subtype, double/float, or + // inline decimal) means no other-typed *base* rows to miss. + // - **Overlay-free** is required because novelty can add a + // matching value of a *different* type (e.g. integer 100 to a + // decimal predicate). Its translated overlay op sorts outside + // the narrowed o_type/o_key window and would be dropped before + // the post-filter could rescue it. (Temporal narrowing doesn't + // need this: cross-type values can't satisfy a temporal filter, + // so dropping them is harmless.) With overlay present we fall + // back to the full base scan + overlay merge + post-filter. + // + // The post-filter below stays as the correctness backstop. + let numeric_uniform_ot = if (has_numeric_bound(&bounds.lower) + || has_numeric_bound(&bounds.upper)) + && ctx.overlay_free_single_graph() + { + crate::fast_count::predicate_uniform_o_type(store_ref, self.g_id, p_id) + .map(OType::from_u16) + .filter(|ot| crate::fast_count::otype_okey_order_comparable(*ot)) + } else { + None }; - - let mut ot: Option = None; - if let Some((v, _inclusive)) = bounds.lower.as_ref() { - if let Some((o_type, key)) = encode(v) { - ot = Some(o_type); - range_min_okey = Some(key); + if let Some(pred_ot) = numeric_uniform_ot { + let enc = |v: &FlakeValue| { + crate::fast_count::encode_numeric_threshold_for_otype(pred_ot, v) + .ok() + .flatten() + }; + if let Some((v, _inclusive)) = bounds.lower.as_ref() { + range_min_okey = enc(v); + } + if let Some((v, _inclusive)) = bounds.upper.as_ref() { + range_max_okey = enc(v); + } + if range_min_okey.is_some() || range_max_okey.is_some() { + range_o_type = Some(pred_ot.as_u16()); + filter.o_type = Some(pred_ot.as_u16()); } } - if let Some((v, _inclusive)) = bounds.upper.as_ref() { - if let Some((o_type, key)) = encode(v) { - if ot.is_some() && ot != Some(o_type) { - // Mixed type bounds; don't attempt range narrowing. - ot = None; - range_min_okey = None; - range_max_okey = None; - } else { + + // Temporal range narrowing (within-type comparison, always safe). + // Skipped if the numeric-uniform branch above already narrowed. + if numeric_uniform_ot.is_none() { + let supports_range = |ot: OType| -> bool { + matches!( + ot, + OType::XSD_DATE + | OType::XSD_DATE_TIME + | OType::XSD_TIME + | OType::XSD_G_YEAR + | OType::XSD_G_YEAR_MONTH + | OType::XSD_G_MONTH + | OType::XSD_G_DAY + | OType::XSD_G_MONTH_DAY + ) + }; + + let encode = |v: &FlakeValue| -> Option<(u16, u64)> { + let (ot, key) = value_to_otype_okey_simple(v, store_ref).ok()?; + supports_range(ot).then_some((ot.as_u16(), key)) + }; + + let mut ot: Option = None; + if let Some((v, _inclusive)) = bounds.lower.as_ref() { + if let Some((o_type, key)) = encode(v) { ot = Some(o_type); - range_max_okey = Some(key); + range_min_okey = Some(key); + } + } + if let Some((v, _inclusive)) = bounds.upper.as_ref() { + if let Some((o_type, key)) = encode(v) { + if ot.is_some() && ot != Some(o_type) { + // Mixed type bounds; don't attempt range narrowing. + ot = None; + range_min_okey = None; + range_max_okey = None; + } else { + ot = Some(o_type); + range_max_okey = Some(key); + } } } - } - if let Some(o_type) = ot { - range_o_type = Some(o_type); - // Also set the filter o_type so directory-level pre-skip can eliminate non-matching leaflets. - filter.o_type = Some(o_type); + if let Some(o_type) = ot { + range_o_type = Some(o_type); + // Also set the filter o_type so directory-level pre-skip can eliminate non-matching leaflets. + filter.o_type = Some(o_type); + } } } } @@ -2070,6 +2140,7 @@ impl Operator for BinaryScanOperator { to_t: ctx.to_t, g_id: self.g_id, index: self.index, + decimal_encoding: store_arc.decimal_encoding(), }; let entry = if let Some(hit) = global_translation_cache().get(&global_key) { hit @@ -2341,6 +2412,14 @@ pub struct GlobalTranslationKey { pub to_t: i64, pub g_id: GraphId, pub index: IndexType, + /// The base store's decimal-encoding policy. A full reindex can replace an + /// arena-only (v2) root with an inline-decimal (v3) root at the *same* + /// `index_t` (a pure re-encode of the same committed data), so `store_max_t` + /// alone can't tell the two apart. The two roots translate the same novelty + /// decimal to different `(o_type, o_key)` (NUM_BIG_OVERFLOW handle vs inline + /// XSD_DECIMAL_INLINE); keying on the policy prevents serving a stale + /// arena-keyed translation against an inline root (or vice versa). + pub decimal_encoding: fluree_db_core::DecimalEncoding, } /// Cross-query LRU of translated overlay ops. @@ -2966,7 +3045,18 @@ fn value_to_otype_okey( } find_numbig_okey(val, store, numbig_ctx) } - FlakeValue::Decimal(_) => find_numbig_okey(val, store, numbig_ctx), + FlakeValue::Decimal(bd) => { + // Mirror the resolver exactly: under InlineWhenFits a decimal that + // fits is stored inline (XSD_DECIMAL_INLINE), so the constant must + // encode the same way to match the stored row; values that don't fit + // (and every decimal under ArenaOnly) live in the NumBig arena. + if store.decimal_encoding().inlines() { + if let Some(key) = ObjKey::encode_decimal(bd) { + return Ok((OType::XSD_DECIMAL_INLINE, key.as_u64())); + } + } + find_numbig_okey(val, store, numbig_ctx) + } // Not handled: Vector (arena + HNSW identity; raw-merge is the // intended lane) and generic Duration (its V3 decode is a stub — // the raw flake preserves the value, the binary row would not). @@ -3320,6 +3410,23 @@ pub(crate) fn value_to_otype_okey_simple( OType::XSD_G_MONTH_DAY, ObjKey::encode_g_month_day(g.month(), g.day()).as_u64(), )), + FlakeValue::Decimal(bd) => { + // An inline-eligible decimal under InlineWhenFits has a + // self-describing key, so the prefilter narrows with no arena + // round-trip (issue #1328). Arena decimals (too large, or any + // decimal under ArenaOnly) need a per-(graph, predicate) handle this + // helper has no context for, so leave the scan un-narrowed + // (Unsupported) — never NotFound, since the value may still exist. + if store.decimal_encoding().inlines() { + if let Some(key) = ObjKey::encode_decimal(bd) { + return Ok((OType::XSD_DECIMAL_INLINE, key.as_u64())); + } + } + Err(Error::new( + ErrorKind::Unsupported, + "arena decimal not encodable without (graph, predicate) context", + )) + } _ => Err(std::io::Error::new( std::io::ErrorKind::Unsupported, format!("unsupported FlakeValue variant for V6 fast-path: {val:?}"), diff --git a/fluree-db-query/src/execute/operator_tree.rs b/fluree-db-query/src/execute/operator_tree.rs index 88431e6a1a..53be22a376 100644 --- a/fluree-db-query/src/execute/operator_tree.rs +++ b/fluree-db-query/src/execute/operator_tree.rs @@ -414,9 +414,15 @@ fn extract_simple_numeric_compare_threshold( if args.len() != 2 { return None; } + // Long/Double, plus exact integer (BigInt) and decimal constants — the + // numeric-compare fast paths now encode all of these into the matching + // order-preserving key space, so `FILTER(?v > 0.1)` and `FILTER(?n > big)` + // can take the pushdown rather than always deferring to the general scan. let const_to_flake = |c: &FlakeValue| match c { FlakeValue::Long(n) => Some(fluree_db_core::FlakeValue::Long(*n)), FlakeValue::Double(d) => Some(fluree_db_core::FlakeValue::Double(*d)), + FlakeValue::Decimal(d) => Some(fluree_db_core::FlakeValue::Decimal(d.clone())), + FlakeValue::BigInt(b) => Some(fluree_db_core::FlakeValue::BigInt(b.clone())), _ => None, }; let direct_op = match *func { diff --git a/fluree-db-query/src/fast_count.rs b/fluree-db-query/src/fast_count.rs index c17f9383d9..0d4e99bf58 100644 --- a/fluree-db-query/src/fast_count.rs +++ b/fluree-db-query/src/fast_count.rs @@ -366,8 +366,8 @@ fn count_rows_for_predicate_numeric_compare_post( // Same o_type at both ends ⇒ uniform o_type (POST sorts o_type before o_key). if min_ot == max_ot { let otype = OType::from_u16(min_ot); - if !matches!(otype, OType::XSD_INTEGER | OType::XSD_DOUBLE) { - // Uniformly unsupported (e.g. all-decimal predicate): the leaf + if !otype_okey_order_comparable(otype) { + // Uniformly not o_key-comparable (e.g. arena NUM_BIG): the leaf // scan below would bail on its first leaflet anyway — defer // now without opening any leaves. return Ok(None); @@ -411,13 +411,52 @@ fn count_rows_for_predicate_numeric_compare_post( /// per-leaflet directory must be consulted for this predicate's first/last key). /// Returns `None` if there are no leaves (an empty predicate — the caller's total is /// 0) or, defensively, if a boundary leaf yields no matching leaflet. -/// True if this o_type is numeric but cannot be compared by encoded o_key in -/// the numeric-COUNT lanes (non-canonical integer widths, floats, decimals, -/// arena-keyed NUM_BIG): rows of these kinds force the count to defer. +/// o_types whose `o_key` order equals numeric order, so a `?o K` scan can +/// compare encoded keys directly: +/// - **all inline integer subtypes** (`is_integer`): every inline integer is +/// `encode_i64`-ordered; values that overflow `i64` carry the arena +/// `NUM_BIG_OVERFLOW` o_type instead, so an integer-subtype o_type guarantees +/// an inline, order-preserving key. +/// - **`xsd:double` / `xsd:float`**: `encode_f64` is total-order. +/// - **inline decimals** (`XSD_DECIMAL_INLINE`): order-preserving base-10 float. +/// +/// Arena `NUM_BIG_OVERFLOW` is numeric but equality-only, so it is excluded. +pub(crate) fn otype_okey_order_comparable(ot: OType) -> bool { + ot.is_integer() + || ot == OType::XSD_DOUBLE + || ot == OType::XSD_FLOAT + || ot == OType::XSD_DECIMAL_INLINE +} + +/// True if this o_type is numeric but NOT o_key-order-comparable, so rows of it +/// force the numeric-COUNT lanes to defer. With all inline integer subtypes, +/// `xsd:double`/`xsd:float`, and inline decimals now comparable +/// ([`otype_okey_order_comparable`]), this is the arena `NUM_BIG_OVERFLOW` lane +/// (equality-only) and the dormant lossy-f64 `XSD_DECIMAL` lane. fn otype_unsupported_numeric(raw: u16) -> bool { let ot = OType::from_u16(raw); - (ot.is_numeric() || ot == OType::NUM_BIG_OVERFLOW) - && !matches!(ot, OType::XSD_INTEGER | OType::XSD_DOUBLE) + (ot.is_numeric() || ot == OType::NUM_BIG_OVERFLOW || ot == OType::XSD_DECIMAL_INLINE) + && !otype_okey_order_comparable(ot) +} + +/// The single `o_type` shared by every row of `p_id` in POST order, or `None` +/// if the predicate is empty or has mixed o_types. Read from the leaf manifest +/// (plus ≤2 boundary leaves) — cheap, no full scan. A uniform result in an +/// order-preserving numeric type (any inline integer subtype, double/float, or +/// inline decimal — see [`otype_okey_order_comparable`]) means every value +/// shares that type with no arena spill and no other types, which is the base +/// precondition for narrowing a numeric range scan by `o_key`. (The caller must +/// additionally ensure no overlay, since novelty can add a cross-type value.) +pub(crate) fn predicate_uniform_o_type( + store: &BinaryIndexStore, + g_id: GraphId, + p_id: u32, +) -> Option { + let leaves = leaf_entries_for_predicate(store, g_id, RunSortOrder::Post, p_id); + match predicate_post_global_extent(store, p_id, leaves).ok()? { + Some((min_ot, _, max_ot, _)) if min_ot == max_ot => Some(min_ot), + _ => None, + } } fn predicate_post_global_extent( @@ -521,7 +560,7 @@ fn count_numeric_compare_in_leaf_slice( return Ok(None); }; let otype = OType::from_u16(raw_otype); - if !matches!(otype, OType::XSD_INTEGER | OType::XSD_DOUBLE) { + if !otype_okey_order_comparable(otype) { return Ok(None); } let threshold_key = match encode_numeric_threshold_for_otype(otype, threshold)? { @@ -558,15 +597,47 @@ fn count_numeric_compare_in_leaf_slice( Ok(Some(total)) } -fn encode_numeric_threshold_for_otype(otype: OType, threshold: &FlakeValue) -> Result> { +pub(crate) fn encode_numeric_threshold_for_otype( + otype: OType, + threshold: &FlakeValue, +) -> Result> { + use bigdecimal::BigDecimal; + // Encode the threshold into the row o_type's key space. Inline decimals use + // the order-preserving decimal codec, so a `>`/`<` comparison of `o_key`s is + // exact: an integer/decimal threshold and a numerically-equal stored decimal + // encode identically, so cross-form (`?price > 10` over decimal rows) is + // correct. A threshold that doesn't fit inline (or a double threshold against + // decimal rows) yields `None` → the caller declines the fast path. let key = match (otype, threshold) { - (OType::XSD_INTEGER, FlakeValue::Long(n)) => ObjKey::encode_i64(*n).as_u64(), - (OType::XSD_DOUBLE, FlakeValue::Long(n)) => ObjKey::encode_f64(*n as f64) - .map_err(|_| QueryError::execution("cannot encode f64 threshold".to_string()))? - .as_u64(), - (OType::XSD_DOUBLE, FlakeValue::Double(d)) => ObjKey::encode_f64(*d) + // Integer-family rows: every inline integer subtype is encode_i64-ordered. + // A non-integer bound (decimal/double) against integer rows can't encode + // exactly here → None → caller post-filters. + (ot, FlakeValue::Long(n)) if ot.is_integer() => ObjKey::encode_i64(*n).as_u64(), + // Float-family rows: encode_f64 (total-order). + (OType::XSD_DOUBLE | OType::XSD_FLOAT, FlakeValue::Long(n)) => { + ObjKey::encode_f64(*n as f64) + .map_err(|_| QueryError::execution("cannot encode f64 threshold".to_string()))? + .as_u64() + } + (OType::XSD_DOUBLE | OType::XSD_FLOAT, FlakeValue::Double(d)) => ObjKey::encode_f64(*d) .map_err(|_| QueryError::execution("cannot encode f64 threshold".to_string()))? .as_u64(), + (OType::XSD_DECIMAL_INLINE, FlakeValue::Decimal(d)) => match ObjKey::encode_decimal(d) { + Some(k) => k.as_u64(), + None => return Ok(None), + }, + (OType::XSD_DECIMAL_INLINE, FlakeValue::Long(n)) => { + match ObjKey::encode_decimal(&BigDecimal::from(*n)) { + Some(k) => k.as_u64(), + None => return Ok(None), + } + } + (OType::XSD_DECIMAL_INLINE, FlakeValue::BigInt(b)) => { + match ObjKey::encode_decimal(&BigDecimal::from(b.as_ref().clone())) { + Some(k) => k.as_u64(), + None => return Ok(None), + } + } _ => return Ok(None), }; Ok(Some(key)) @@ -672,37 +743,56 @@ fn count_numeric_compare_overlay_parallel( compare: NumericCompareOp, threshold: &FlakeValue, ) -> Result> { - let tk_int = encode_numeric_threshold_for_otype(OType::XSD_INTEGER, threshold)?; - let tk_dbl = encode_numeric_threshold_for_otype(OType::XSD_DOUBLE, threshold)?; + // One threshold key per order-preserving family. All integer subtypes share + // the encode_i64 key; double/float share encode_f64; inline decimals their + // own codec. `None` means the threshold doesn't encode in that family. + let tk_i64 = encode_numeric_threshold_for_otype(OType::XSD_INTEGER, threshold)?; + let tk_f64 = encode_numeric_threshold_for_otype(OType::XSD_DOUBLE, threshold)?; + let tk_dec = encode_numeric_threshold_for_otype(OType::XSD_DECIMAL_INLINE, threshold)?; + + // Map a row o_type to its threshold key: `Some(tk)` if the type is + // o_key-comparable (tk may itself be `None` if the threshold didn't encode + // for that family), `None` if the type isn't comparable at all. + let tk_for = |ot: OType| -> Option> { + if ot.is_integer() { + Some(tk_i64) + } else if ot == OType::XSD_DOUBLE || ot == OType::XSD_FLOAT { + Some(tk_f64) + } else if ot == OType::XSD_DECIMAL_INLINE { + Some(tk_dec) + } else { + None + } + }; - // Pre-check the base predicate's POST extent: if the base rows are - // uniformly an unsupported o_type (e.g. all-decimal), or the threshold - // can't encode for the uniform supported type, the full scan below is + // Pre-check the base predicate's POST extent: if the base rows are uniformly + // an o_type we can't compare by o_key (e.g. arena NUM_BIG), or the threshold + // can't encode for the uniform supported family, the full scan below is // doomed — defer immediately instead of scanning every partition first. // (Unsupported values arriving only via novelty are still caught by the // per-row flag; novelty is small, so that residual pass is bounded.) let post_leaves = leaf_entries_for_predicate(store, g_id, RunSortOrder::Post, p_id); if let Some((min_ot, _, max_ot, _)) = predicate_post_global_extent(store, p_id, post_leaves)? { if min_ot == max_ot { - match OType::from_u16(min_ot) { - OType::XSD_INTEGER if tk_int.is_none() => return Ok(None), - OType::XSD_DOUBLE if tk_dbl.is_none() => return Ok(None), - OType::XSD_INTEGER | OType::XSD_DOUBLE => {} - ot if ot.is_numeric() || ot == OType::NUM_BIG_OVERFLOW => return Ok(None), - _ => {} + let ot = OType::from_u16(min_ot); + match tk_for(ot) { + Some(Some(_)) => {} // comparable, threshold encodes → proceed + Some(None) => return Ok(None), // comparable family but threshold didn't encode + None if ot.is_numeric() || ot == OType::NUM_BIG_OVERFLOW => return Ok(None), + None => {} // non-numeric uniform → every row a non-match, fine } } else if otype_unsupported_numeric(min_ot) || otype_unsupported_numeric(max_ot) { // Mixed base with an unsupported-numeric boundary (e.g. integer - // rows + decimals): doomed regardless of novelty — defer before - // scanning any partition. + // rows + arena NUM_BIG): doomed regardless of novelty — defer. return Ok(None); } } - // Numeric o_types this lane can't compare by o_key (other integer-family - // widths, floats, decimals — arena-keyed NUM_BIG has no value order at - // all) must defer the whole count: treating them as non-matches would - // silently undercount. Mirrors the base lane's per-leaflet Ok(None) bail. + // Numeric o_types this lane can't compare by o_key (arena-keyed NUM_BIG, + // which has no value order) must defer the whole count: treating them as + // non-matches would silently undercount. All inline integer subtypes, + // doubles/floats, and inline decimals ARE comparable. Mirrors the base + // lane's per-leaflet Ok(None) bail. let saw_unsupported_numeric = std::sync::atomic::AtomicBool::new(false); let count = parallel_overlay_psot_filter_count( ctx, @@ -712,26 +802,20 @@ fn count_numeric_compare_overlay_parallel( p_id, |_s, o_type, o_key| { let ot = OType::from_u16(o_type); - let tk = match ot { - OType::XSD_INTEGER => tk_int, - OType::XSD_DOUBLE => tk_dbl, - _ if ot.is_numeric() || ot == OType::NUM_BIG_OVERFLOW => { + match tk_for(ot) { + Some(Some(tk)) => okey_matches(compare, o_key, tk), + Some(None) => { + // Comparable family but the threshold didn't encode for it + // (e.g. decimal threshold vs integer rows): defer. saw_unsupported_numeric.store(true, std::sync::atomic::Ordering::Relaxed); - return false; + false } - // Genuinely non-numeric object: comparison errors => not a match - _ => return false, - }; - match tk { - Some(tk) => okey_matches(compare, o_key, tk), - None => { - // Threshold not encodable for this row's o_type (e.g. a - // decimal threshold against integer rows): the comparison - // is still numerically valid, so defer rather than - // undercount — mirrors the base lane. + None if ot.is_numeric() || ot == OType::NUM_BIG_OVERFLOW => { saw_unsupported_numeric.store(true, std::sync::atomic::Ordering::Relaxed); false } + // Genuinely non-numeric object: comparison errors => not a match. + None => false, } }, )?; diff --git a/fluree-db-query/src/fast_min_max_string.rs b/fluree-db-query/src/fast_min_max_string.rs index ab4605e933..b469fb178e 100644 --- a/fluree-db-query/src/fast_min_max_string.rs +++ b/fluree-db-query/src/fast_min_max_string.rs @@ -285,7 +285,10 @@ fn minmax_numeric_post( MinMaxMode::Max => read_ordered_key_v2(RunSortOrder::Post, &entry.last_key), }; let ot = OType::from_u16(rr.o_type); - if !ot.is_numeric() { + // Numeric kinds plus order-preserving inline decimals: their boundary + // o_key is the min/max value. (The single-o_type checks below ensure a + // predicate mixing inline and arena decimals declines.) + if !ot.is_numeric() && ot != OType::XSD_DECIMAL_INLINE { return Ok(None); } @@ -320,6 +323,10 @@ fn numeric_binding_from_otype_okey(store: &BinaryIndexStore, o_type: u16, o_key: DecodeKind::F64 => { Binding::lit(FlakeValue::Double(ObjKey::from_u64(o_key).decode_f64()), dt) } + DecodeKind::Decimal => Binding::lit( + FlakeValue::Decimal(Box::new(ObjKey::from_u64(o_key).decode_decimal())), + dt, + ), _ => Binding::Unbound, } } diff --git a/fluree-db-query/src/fast_path_common.rs b/fluree-db-query/src/fast_path_common.rs index c71423d3a1..166df9e93d 100644 --- a/fluree-db-query/src/fast_path_common.rs +++ b/fluree-db-query/src/fast_path_common.rs @@ -286,16 +286,26 @@ pub fn cursor_projection_otype_okey() -> ColumnProjection { /// - `GEO_POINT` (packed lat/long — not a linear value order) and `BLANK_NODE`; /// - overflow big numerics / JSON / vector arena handles (equality-only). /// +/// Inline decimals (`XSD_DECIMAL_INLINE`) ARE admitted: their key is an +/// order-preserving base-10 float code (raw `u64` order == numeric order), so a +/// single-`o_type` scan yields them in value order like the other numerics. +/// /// Within one `o_type`, this equals the SPARQL `ORDER BY` order; mixing -/// `o_type`s under one predicate is rejected by the operator at runtime. +/// `o_type`s under one predicate is rejected by the operator at runtime — which +/// also means a predicate with both inline and arena (NUM_BIG) decimals can't +/// use this path, so the inline-only scan never silently drops arena rows. #[inline] pub const fn is_post_desc_orderable(o_type: u16) -> bool { let ot = OType::from_u16(o_type); // XSD_BOOLEAN (0x0002), the signed/unsigned/constrained integers and floats - // (is_numeric: 0x0003..=0x0012), and the temporal + duration range - // (is_temporal: XSD_DATE 0x0013..=XSD_DURATION 0x001D). Excludes GEO_POINT - // (0x001E), BLANK_NODE (0x001F), and every dict-backed/lang/arena type. - o_type == OType::XSD_BOOLEAN.as_u16() || ot.is_numeric() || ot.is_temporal() + // (is_numeric: 0x0003..=0x0012), the temporal + duration range (is_temporal: + // XSD_DATE 0x0013..=XSD_DURATION 0x001D), and inline decimals + // (XSD_DECIMAL_INLINE 0x0020, order-preserving). Excludes GEO_POINT (0x001E), + // BLANK_NODE (0x001F), and every dict-backed/lang/arena type. + o_type == OType::XSD_BOOLEAN.as_u16() + || ot.is_numeric() + || ot.is_temporal() + || o_type == OType::XSD_DECIMAL_INLINE.as_u16() } // --------------------------------------------------------------------------- diff --git a/fluree-db-query/src/fast_star_const_order_topk.rs b/fluree-db-query/src/fast_star_const_order_topk.rs index 23f97d557a..8901519dac 100644 --- a/fluree-db-query/src/fast_star_const_order_topk.rs +++ b/fluree-db-query/src/fast_star_const_order_topk.rs @@ -75,14 +75,20 @@ pub fn star_const_ordered_limit_operator( } // Apply numeric existence filter: keep subjects with any numeric value satisfying the threshold. - let filtered_subjects = filter_subjects_by_numeric_gt( + // `None` ⇒ a row carried a numeric o_type we can't compare by o_key + // (inline decimal / arena big numeric); decline so the fallback path + // evaluates the filter correctly. + let Some(filtered_subjects) = filter_subjects_by_numeric_gt( store, g_id, numeric_p_id, &candidates, ctx.to_t, &numeric_threshold, - )?; + )? + else { + return Ok(None); + }; if filtered_subjects.is_empty() { return Ok(Some(empty_batch(schema.clone())?)); } @@ -329,6 +335,14 @@ where Ok(()) } +/// Returns `Some(subjects)` whose value satisfies `> threshold`, or `None` to +/// decline the fast path (the caller falls back) when a row carries a numeric +/// o_type this lane can't compare by `o_key` against this threshold — +/// `XSD_INTEGER`, `XSD_DOUBLE`, and order-preserving inline decimals +/// (`XSD_DECIMAL_INLINE`, when the threshold encodes to a decimal key) are +/// comparable. Arena `NUM_BIG_OVERFLOW` (and decimals under a double threshold) +/// would be silently dropped by a naive `_ => false`, undercounting the filter; +/// declining keeps the result correct via the general path. fn filter_subjects_by_numeric_gt( store: &Arc, g_id: GraphId, @@ -336,19 +350,33 @@ fn filter_subjects_by_numeric_gt( subjects_sorted: &[u64], to_t: i64, threshold: &FlakeValue, -) -> Result> { - // Only support numeric thresholds used in benchmark filters. +) -> Result>> { + use fluree_db_core::value_id::ObjKey; + // This lane derives its row-comparison keys from a Long/Double threshold. + // Any other threshold type (e.g. an xsd:decimal constant, now that the + // detector can extract them) must DECLINE to the fallback — returning an + // empty set would silently undercount instead of evaluating the filter. let (thr_i, thr_d) = match threshold { FlakeValue::Long(n) => (*n, *n as f64), FlakeValue::Double(d) => (*d as i64, *d), - _ => return Ok(Vec::new()), + _ => return Ok(None), }; - let thr_i_key = fluree_db_core::value_id::ObjKey::encode_i64(thr_i).as_u64(); - let thr_d_key = fluree_db_core::value_id::ObjKey::encode_f64(thr_d) + let thr_i_key = ObjKey::encode_i64(thr_i).as_u64(); + let thr_d_key = ObjKey::encode_f64(thr_d) .map_err(|_| QueryError::execution("cannot encode f64 threshold".to_string()))? .as_u64(); + // Decimal threshold key, for comparing inline-decimal rows. Only an integer + // threshold maps cleanly onto the decimal key space (10 == 10.00); a double + // threshold against decimal rows isn't compared here (decimal rows decline). + let thr_dec_key: Option = match threshold { + FlakeValue::Long(n) => { + ObjKey::encode_decimal(&bigdecimal::BigDecimal::from(*n)).map(ObjKey::as_u64) + } + _ => None, + }; let mut keep: FxHashSet = FxHashSet::default(); + let mut saw_uncomparable_numeric = false; for_each_subject_row_psot( store, g_id, @@ -360,6 +388,23 @@ fn filter_subjects_by_numeric_gt( let over_threshold = match ot { OType::XSD_INTEGER => batch.o_key.get(i) > thr_i_key, OType::XSD_DOUBLE => batch.o_key.get(i) > thr_d_key, + // Inline decimals are order-preserving: compare keys when the + // threshold encodes to a decimal key, else decline. + OType::XSD_DECIMAL_INLINE => match thr_dec_key { + Some(k) => batch.o_key.get(i) > k, + None => { + saw_uncomparable_numeric = true; + false + } + }, + // Numeric but not o_key-comparable (arena big numerics, other + // integer widths/floats): can't decide here. + _ if ot.is_numeric() || ot == OType::NUM_BIG_OVERFLOW => { + saw_uncomparable_numeric = true; + false + } + // Genuinely non-numeric object: `?o > number` is a type + // mismatch, so it is correctly a non-match. _ => false, }; if over_threshold { @@ -369,9 +414,13 @@ fn filter_subjects_by_numeric_gt( }, )?; + if saw_uncomparable_numeric { + return Ok(None); + } + let mut out: Vec = keep.into_iter().collect(); out.sort_unstable(); - Ok(out) + Ok(Some(out)) } fn collect_label_pairs( diff --git a/fluree-db-query/src/object_binding.rs b/fluree-db-query/src/object_binding.rs index b600c3dd0e..8c8819ae58 100644 --- a/fluree-db-query/src/object_binding.rs +++ b/fluree-db-query/src/object_binding.rs @@ -169,6 +169,13 @@ pub(crate) fn late_materialized_object_binding( t, }) } + // Inline decimals decode cheaply from `o_key` alone (no arena, unlike + // NUM_BIG), so we decline the encoded fast path and let the caller + // materialize a `FlakeValue::Decimal`. That keeps them on the ordinary + // value path for equality/aggregate surfaces, where they compare by + // canonical BigDecimal against decoded sources (VALUES, BIND, novelty, + // and arena-backed decimals on other roots). + DecodeKind::Decimal => None, _ => None, } } @@ -439,6 +446,25 @@ mod tests { )); } + #[test] + fn late_materialized_object_binding_declines_inline_decimal() { + // Inline decimals decode cheaply, so the encoded fast path declines and + // the caller materializes a FlakeValue::Decimal — they never become an + // EncodedLit (which would need every equality surface to learn NUM_DEC). + use fluree_db_core::value_id::ObjKey; + let binding = late_materialized_object_binding( + OType::XSD_DECIMAL_INLINE.as_u16(), + ObjKey::encode_decimal(&"19.99".parse().unwrap()) + .unwrap() + .as_u64(), + 7, + 0, + u32::MAX, + None, + ); + assert!(binding.is_none()); + } + #[test] fn late_materialized_object_binding_keeps_datetime_encoded() { let binding = late_materialized_object_binding( diff --git a/fluree-db-transact/src/import_sink.rs b/fluree-db-transact/src/import_sink.rs index 96c216051e..7ef0a1b710 100644 --- a/fluree-db-transact/src/import_sink.rs +++ b/fluree-db-transact/src/import_sink.rs @@ -90,6 +90,10 @@ mod inner { pub vector_pool: Arc, /// Shared namespace allocator (for prefix lookup). pub ns_alloc: Arc, + /// Decimal-encoding policy for the index this import builds. Must match + /// the version of the root written for the import (same source) so inline + /// decimals and the root format agree. + pub decimal_encoding: fluree_db_core::DecimalEncoding, } /// Result of finishing a [`SpoolContext`] via [`SpoolContext::finish`] — @@ -157,6 +161,9 @@ mod inner { next_lang_id: u16, /// Graph ID for all records in this chunk (0 = default). g_id: GraphId, + /// Decimal-encoding policy (from `SpoolConfig`): under `InlineWhenFits`, + /// small exact decimals encode inline instead of via the numbig pool. + decimal_encoding: fluree_db_core::DecimalEncoding, } impl SpoolContext { @@ -184,6 +191,7 @@ mod inner { ns_alloc: Arc::clone(&config.ns_alloc), ns_prefix_cache: FxHashMap::default(), languages: FxHashMap::default(), + decimal_encoding: config.decimal_encoding, next_lang_id: 1, // 0 = no language tag g_id, }) @@ -384,14 +392,29 @@ mod inner { } } FlakeValue::Decimal(dec) => { - // Use shared numbig pool for global handle. - let handle = - self.numbig_pool - .get_or_insert_bigdec(self.g_id, p_id, dec.as_ref()); - ( - ObjKind::NUM_BIG.as_u8(), - ObjKey::encode_u32_id(handle).as_u64(), - ) + // Under InlineWhenFits, a small exact decimal encodes inline — + // no numbig-pool handle, avoiding the shared-pool insert on the + // import hot path. Large/high-precision decimals (and every + // decimal under ArenaOnly) fall back to the pool. + let inline = self + .decimal_encoding + .inlines() + .then(|| ObjKey::encode_decimal(dec.as_ref())) + .flatten(); + match inline { + Some(key) => (ObjKind::NUM_DEC.as_u8(), key.as_u64()), + None => { + let handle = self.numbig_pool.get_or_insert_bigdec( + self.g_id, + p_id, + dec.as_ref(), + ); + ( + ObjKind::NUM_BIG.as_u8(), + ObjKey::encode_u32_id(handle).as_u64(), + ) + } + } } FlakeValue::Vector(v) => { // Use shared vector pool for global handle. @@ -823,6 +846,7 @@ mod inner { numbig_pool: Arc::new(SharedNumBigPool::new()), vector_pool: Arc::new(SharedVectorArenaPool::new()), ns_alloc: Arc::new(SharedNamespaceAllocator::from_registry(ns)), + decimal_encoding: fluree_db_core::DecimalEncoding::InlineWhenFits, } }