diff --git a/.changeset/index-optimization-partial-and-or.md b/.changeset/index-optimization-partial-and-or.md new file mode 100644 index 000000000..a67048c9d --- /dev/null +++ b/.changeset/index-optimization-partial-and-or.md @@ -0,0 +1,14 @@ +--- +'@tanstack/db': patch +--- + +Fix incorrect results from index-optimized `where` clauses that combine indexed and non-indexed conditions. + +- `OR` expressions are now only served from indexes when every disjunct can use an index; otherwise the query falls back to a full scan. Previously, rows matched only by a non-indexed disjunct were missing from the result. +- `AND` expressions still use indexes for the conditions that have them, but the remaining conditions are now enforced by re-checking each candidate row against the full expression. Previously, non-indexed conditions were silently dropped, returning rows that did not match the query. +- Compound range conditions (e.g. `age > 5 AND age < 10`) combined with conditions on other fields no longer ignore those other conditions. +- Compound range conditions sharing the same boundary value (e.g. `age >= 5 AND age > 5`) now apply the strictest bound regardless of the order the conditions appear in, using the same value comparison semantics as the indexes (dates, locale strings, ...). +- Compound range conditions that only bound one side (e.g. `age > 5 AND age >= 8`) no longer return an empty result. +- Strict range comparisons (`gt`/`lt`) on BTree-indexed fields holding normalized values such as dates now correctly exclude the boundary value. +- Compound range conditions with a `null`/`undefined` bound (e.g. `gt(score, undefined)`) now re-filter against the full expression instead of returning index-ordered rows, matching the semantics of a full scan (a comparison against `null`/`undefined` is never true). +- Index-optimized `eq`, `IN`, and range queries on a field that has rows with `null`/`undefined` values no longer leak those rows into results. BTree indexes store and return such rows (they sort as the smallest key), but a comparison against `null`/`undefined` is never true, so these results are now re-filtered against the full expression to stay equivalent to a full scan. diff --git a/packages/db/src/collection/change-events.ts b/packages/db/src/collection/change-events.ts index 6afac412c..3f4977b7b 100644 --- a/packages/db/src/collection/change-events.ts +++ b/packages/db/src/collection/change-events.ts @@ -138,11 +138,16 @@ export function currentStateAsChanges< ) if (optimizationResult.canOptimize) { - // Use index optimization + // Use index optimization. When the index lookup is inexact, the keys + // are a superset of the true result (some conditions could not be + // served by an index), so re-check each row against the full expression. + const filterFn = optimizationResult.isExact + ? undefined + : createFilterFunctionFromExpression(expression) const result: Array, TKey>> = [] for (const key of optimizationResult.matchingKeys) { const value = collection.get(key) - if (value !== undefined) { + if (value !== undefined && (filterFn?.(value) ?? true)) { result.push({ type: `insert`, key, diff --git a/packages/db/src/indexes/btree-index.ts b/packages/db/src/indexes/btree-index.ts index 17608950a..b46c70710 100644 --- a/packages/db/src/indexes/btree-index.ts +++ b/packages/db/src/indexes/btree-index.ts @@ -247,7 +247,16 @@ export class BTreeIndex< toKey, toInclusive, (indexedValue, _) => { - if (!fromInclusive && this.compareFn(indexedValue, from) === 0) { + // Only exclude the boundary when an exclusive lower bound was + // actually provided. Without a `from` bound, `fromKey` defaults to + // the minimum key and must not be dropped. Compare against the + // normalized key since indexed values are stored normalized + // (e.g. dates as timestamps), so the raw `from` would never match. + if ( + hasFrom && + !fromInclusive && + this.compareFn(indexedValue, fromKey) === 0 + ) { // the B+ tree `forRange` method does not support exclusive lower bounds // so we need to exclude it manually return diff --git a/packages/db/src/utils/index-optimization.ts b/packages/db/src/utils/index-optimization.ts index 81b111af5..103cd6355 100644 --- a/packages/db/src/utils/index-optimization.ts +++ b/packages/db/src/utils/index-optimization.ts @@ -18,6 +18,7 @@ import { DEFAULT_COMPARE_OPTIONS } from '../utils.js' import { ReverseIndex } from '../indexes/reverse-index.js' import { hasVirtualPropPath } from '../virtual-props.js' +import { makeComparator } from './comparison.js' import type { CompareOptions } from '../query/builder/types.js' import type { IndexInterface, IndexOperation } from '../indexes/base-index.js' import type { BasicExpression } from '../query/ir.js' @@ -29,6 +30,13 @@ import type { CollectionLike } from '../types.js' export interface OptimizationResult { canOptimize: boolean matchingKeys: Set + /** + * Whether `matchingKeys` is exactly the set of keys matching the expression. + * When `false`, the keys are a superset of the true result (some conditions + * could not be served by an index) and each row must be re-checked against + * the full expression before being included in the result. + */ + isExact: boolean } /** @@ -134,7 +142,7 @@ function optimizeQueryRecursive( } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** @@ -167,6 +175,14 @@ export function canOptimizeExpression< return false } +/** + * Result of compound range optimization, including which AND arguments + * were covered by the range query so the caller can process the rest. + */ +interface CompoundRangeResult extends OptimizationResult { + coveredArgIndices: Set +} + /** * Optimizes compound range queries on the same field * Example: WHERE age > 5 AND age < 10 @@ -177,9 +193,14 @@ function optimizeCompoundRangeQuery< >( expression: BasicExpression, collection: CollectionLike, -): OptimizationResult { +): CompoundRangeResult { if (expression.type !== `func` || expression.args.length < 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { + canOptimize: false, + matchingKeys: new Set(), + isExact: false, + coveredArgIndices: new Set(), + } } // Group range operations by field @@ -188,11 +209,12 @@ function optimizeCompoundRangeQuery< Array<{ operation: `gt` | `gte` | `lt` | `lte` value: any + argIndex: number }> >() // Collect all range operations from AND arguments - for (const arg of expression.args) { + for (const [argIndex, arg] of expression.args.entries()) { if (arg.type === `func` && [`gt`, `gte`, `lt`, `lte`].includes(arg.name)) { const rangeOp = arg as any if (rangeOp.args.length === 2) { @@ -238,7 +260,7 @@ function optimizeCompoundRangeQuery< if (!fieldOperations.has(fieldKey)) { fieldOperations.set(fieldKey, []) } - fieldOperations.get(fieldKey)!.push({ operation, value }) + fieldOperations.get(fieldKey)!.push({ operation, value, argIndex }) } } } @@ -251,54 +273,100 @@ function optimizeCompoundRangeQuery< const index = findIndexForField(collection, fieldPath) if (index && index.supports(`gt`) && index.supports(`lt`)) { - // Build range query options + // Compare values with the same semantics the index uses (dates, + // locale strings, ...), in ascending order since bounds are about + // value order regardless of the index direction + const compare = makeComparator({ + ...DEFAULT_COMPARE_OPTIONS, + ...collection.compareOptions, + direction: `asc`, + }) + + // Build range query options, keeping the strictest bound on each + // side: a larger lower bound (or smaller upper bound) wins, and at + // equal values the exclusive operation wins over the inclusive one. + // `hasFromBound`/`hasToBound` track whether a bound was selected, + // separately from the bound value (which may legitimately be falsy). let from: any = undefined let to: any = undefined + let hasFromBound = false + let hasToBound = false let fromInclusive = true let toInclusive = true + // A comparison against null/undefined is never true, but in an index + // those values sort as the smallest key, so a range query cannot + // represent such a bound. Track it and force a re-filter instead of + // claiming the result is exact. + let hasNullBound = false for (const { operation, value } of operations) { + if (value == null) { + hasNullBound = true + continue + } switch (operation) { case `gt`: - if (from === undefined || value > from) { + case `gte`: { + const cmp = hasFromBound ? compare(value, from) : 1 + if (cmp > 0) { from = value + hasFromBound = true + fromInclusive = operation === `gte` + } else if (cmp === 0 && operation === `gt`) { fromInclusive = false } break - case `gte`: - if (from === undefined || value > from) { - from = value - fromInclusive = true - } - break + } case `lt`: - if (to === undefined || value < to) { + case `lte`: { + const cmp = hasToBound ? compare(value, to) : -1 + if (cmp < 0) { to = value + hasToBound = true + toInclusive = operation === `lte` + } else if (cmp === 0 && operation === `lt`) { toInclusive = false } break - case `lte`: - if (to === undefined || value < to) { - to = value - toInclusive = true - } - break + } } } - const matchingKeys = (index as any).rangeQuery({ - from, - to, - fromInclusive, - toInclusive, - }) - - return { canOptimize: true, matchingKeys } + // Only pass the bounds that were selected: rangeQuery distinguishes + // an absent bound (open-ended) from an explicitly provided one + const rangeOptions: Record = {} + if (hasFromBound) { + rangeOptions.from = from + rangeOptions.fromInclusive = fromInclusive + } + if (hasToBound) { + rangeOptions.to = to + rangeOptions.toInclusive = toInclusive + } + const matchingKeys = (index as any).rangeQuery(rangeOptions) + + return { + canOptimize: true, + matchingKeys, + // The range result is exact only when it cannot include rows with a + // nullish indexed value (which a comparison would reject but the + // index returns, as they sort as the smallest key). That requires a + // non-nullish lower bound to exclude them: without `hasFromBound` + // the range is open at the bottom and captures those rows, and a + // nullish bound value (`hasNullBound`) can never bound them out. + isExact: hasFromBound && !hasNullBound, + coveredArgIndices: new Set(operations.map((op) => op.argIndex)), + } } } } - return { canOptimize: false, matchingKeys: new Set() } + return { + canOptimize: false, + matchingKeys: new Set(), + isExact: false, + coveredArgIndices: new Set(), + } } /** @@ -312,7 +380,7 @@ function optimizeSimpleComparison< collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length !== 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const leftArg = expression.args[0]! @@ -362,15 +430,29 @@ function optimizeSimpleComparison< // Check if the index supports this operation if (!index.supports(indexOperation)) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const matchingKeys = index.lookup(indexOperation, queryValue) - return { canOptimize: true, matchingKeys } + + // A comparison against null/undefined is never true, but BTree indexes + // store and return rows with a nullish indexed value (they sort as the + // smallest key). Determine whether the index result is exact or a + // superset that the caller must re-filter: + // - eq/gt/gte: a nullish query value matches nothing, while the index + // would still return nullish-keyed rows -> inexact when nullish. A + // non-nullish lower bound (gt/gte) excludes the bottom-sorted nullish + // rows, so those stay exact. + // - lt/lte: the open lower bound always includes nullish-keyed rows, + // so the result is conservatively inexact. + const isExact = + operation === `lt` || operation === `lte` ? false : queryValue != null + + return { canOptimize: true, matchingKeys, isExact } } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** @@ -412,22 +494,41 @@ function optimizeAndExpression( collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length < 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } // First, try to optimize compound range queries on the same field + // (e.g. age > 5 AND age < 10 becomes a single range query) const compoundRangeResult = optimizeCompoundRangeQuery(expression, collection) - if (compoundRangeResult.canOptimize) { - return compoundRangeResult - } + const coveredArgIndices = compoundRangeResult.canOptimize + ? compoundRangeResult.coveredArgIndices + : new Set() const results: Array> = [] + if (compoundRangeResult.canOptimize) { + results.push(compoundRangeResult) + } - // Try to optimize each part, keep the optimizable ones - for (const arg of expression.args) { + // Try to optimize the remaining conjuncts, keep the optimizable ones. + // Conjuncts that cannot use an index make the result inexact: the + // intersection is then a superset of the true result and must be + // re-filtered against the full expression by the caller. The compound + // range result may itself be inexact (e.g. a null/undefined bound). + let allConjunctsExact = !compoundRangeResult.canOptimize + ? true + : compoundRangeResult.isExact + for (const [argIndex, arg] of expression.args.entries()) { + if (coveredArgIndices.has(argIndex)) { + continue + } const result = optimizeQueryRecursive(arg, collection) if (result.canOptimize) { results.push(result) + if (!result.isExact) { + allConjunctsExact = false + } + } else { + allConjunctsExact = false } } @@ -435,10 +536,14 @@ function optimizeAndExpression( // Use intersectSets utility for AND logic const allMatchingSets = results.map((r) => r.matchingKeys) const intersectedKeys = intersectSets(allMatchingSets) - return { canOptimize: true, matchingKeys: intersectedKeys } + return { + canOptimize: true, + matchingKeys: intersectedKeys, + isExact: allConjunctsExact, + } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** @@ -464,27 +569,31 @@ function optimizeOrExpression( collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length < 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const results: Array> = [] - // Try to optimize each part, keep the optimizable ones + // Every disjunct must be optimizable: rows matched only by a disjunct + // that cannot use an index would be missing from the union, and no + // post-filtering can recover them. In that case fall back to a full scan. for (const arg of expression.args) { const result = optimizeQueryRecursive(arg, collection) - if (result.canOptimize) { - results.push(result) + if (!result.canOptimize) { + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } + results.push(result) } - if (results.length > 0) { - // Use unionSets utility for OR logic - const allMatchingSets = results.map((r) => r.matchingKeys) - const unionedKeys = unionSets(allMatchingSets) - return { canOptimize: true, matchingKeys: unionedKeys } + // Use unionSets utility for OR logic + const allMatchingSets = results.map((r) => r.matchingKeys) + const unionedKeys = unionSets(allMatchingSets) + return { + canOptimize: true, + matchingKeys: unionedKeys, + // An inexact (superset) disjunct makes the union a superset as well + isExact: results.every((r) => r.isExact), } - - return { canOptimize: false, matchingKeys: new Set() } } /** @@ -498,8 +607,9 @@ function canOptimizeOrExpression< return false } - // If any argument can be optimized, we can gain some speedup - return expression.args.some((arg) => canOptimizeExpression(arg, collection)) + // Every disjunct must be optimizable, otherwise the union would miss + // rows matched only by the non-optimizable disjuncts + return expression.args.every((arg) => canOptimizeExpression(arg, collection)) } /** @@ -513,7 +623,7 @@ function optimizeInArrayExpression< collection: CollectionLike, ): OptimizationResult { if (expression.type !== `func` || expression.args.length !== 2) { - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } const fieldArg = expression.args[0]! @@ -528,11 +638,17 @@ function optimizeInArrayExpression< const values = (arrayArg as any).value const index = findIndexForField(collection, fieldPath) + // A nullish member can never be matched by `IN` (a comparison against + // null/undefined is never true), but the index would still return rows + // with a nullish indexed value. When the list contains a nullish member + // the result is a superset that the caller must re-filter. + const isExact = !values.some((value: any) => value == null) + if (index) { // Check if the index supports IN operation if (index.supports(`in`)) { const matchingKeys = index.lookup(`in`, values) - return { canOptimize: true, matchingKeys } + return { canOptimize: true, matchingKeys, isExact } } else if (index.supports(`eq`)) { // Fallback to multiple equality lookups const matchingKeys = new Set() @@ -542,12 +658,12 @@ function optimizeInArrayExpression< matchingKeys.add(key) } } - return { canOptimize: true, matchingKeys } + return { canOptimize: true, matchingKeys, isExact } } } } - return { canOptimize: false, matchingKeys: new Set() } + return { canOptimize: false, matchingKeys: new Set(), isExact: false } } /** diff --git a/packages/db/tests/btree-index-undefined-values.test.ts b/packages/db/tests/btree-index-undefined-values.test.ts index 11e29690c..1510c02e3 100644 --- a/packages/db/tests/btree-index-undefined-values.test.ts +++ b/packages/db/tests/btree-index-undefined-values.test.ts @@ -248,6 +248,24 @@ describe(`BTreeIndex - undefined value handling`, () => { expect(withoutFrom.size).toBe(3) }) + it(`should not drop the minimum key when an upper-only range is exclusive on the (absent) lower bound`, () => { + // When no `from` bound is provided, `fromInclusive` must not cause the + // smallest key to be excluded: there is no lower bound to exclude + // against. Only an explicitly provided exclusive lower bound should + // drop its boundary value. + const index = createIndex(`value`) + index.add(`a`, { value: 1 }) + index.add(`b`, { value: 5 }) + index.add(`c`, { value: 10 }) + + const result = index.rangeQuery({ to: 10, fromInclusive: false }) + + expect(result.size).toBe(3) + expect(result).toContain(`a`) + expect(result).toContain(`b`) + expect(result).toContain(`c`) + }) + it(`should handle range query from undefined to undefined`, () => { const index = createIndex(`value`) index.add(`a`, { value: undefined })