diff --git a/.claude/skills/rationalize-deps/SKILL.md b/.claude/skills/rationalize-deps/SKILL.md new file mode 100644 index 00000000000..829a70c67ec --- /dev/null +++ b/.claude/skills/rationalize-deps/SKILL.md @@ -0,0 +1,125 @@ +--- +name: rationalize-deps +description: Analyze Cargo.toml dependencies and attempt to remove unused features to reduce compile times and binary size +--- + +# Rationalize Dependencies + +This skill analyzes Cargo.toml dependencies to identify and remove unused features. + +## Overview + +Many crates enable features by default that may not be needed. This skill: +1. Identifies dependencies with default features enabled +2. Tests if `default-features = false` works +3. Identifies which specific features are actually needed +4. Verifies compilation after changes + +## Step 1: Identify the target + +Ask the user which crate(s) to analyze: +- A specific crate name (e.g., "tokio", "serde") +- A specific workspace member (e.g., "quickwit-search") +- "all" to scan the entire workspace + +## Step 2: Analyze current dependencies + +For the workspace Cargo.toml (`quickwit/Cargo.toml`), list dependencies that: +- Do NOT have `default-features = false` +- Have default features that might be unnecessary + +Run: `cargo tree -p <crate-name> -f "{p} {f}" --edges features` to see what features are actually used. 
+ +## Step 3: For each candidate dependency + +### 3a: Check the crate's default features + +Look up the crate on crates.io or check its Cargo.toml to understand: +- What features are enabled by default +- What each feature provides + +Use: `cargo metadata --format-version=1 | jq '.packages[] | select(.name == "<crate-name>") | .features'` + +### 3b: Try disabling default features + +Modify the dependency in `quickwit/Cargo.toml`: + +From: +```toml +some-crate = { version = "1.0" } +``` + +To: +```toml +some-crate = { version = "1.0", default-features = false } +``` + +### 3c: Run cargo check + +Run: `cargo check --workspace` (or target specific packages for faster feedback) + +If compilation fails: +1. Read the error messages to identify which features are needed +2. Add only the required features explicitly: + ```toml + some-crate = { version = "1.0", default-features = false, features = ["needed-feature"] } + ``` +3. Re-run cargo check + +### 3d: Binary search for minimal features + +If there are many default features, use binary search: +1. Start with no features +2. If it fails, add half the default features +3. 
Continue until you find the minimal set + +## Step 4: Document findings + +For each dependency analyzed, report: +- Original configuration +- New configuration (if changed) +- Features that were removed +- Any features that are required + +## Step 5: Verify full build + +After all changes, run: +```bash +cargo check --workspace --all-targets +cargo test --workspace --no-run +``` + +## Common Patterns + +### Serde +Often only needs `derive`: +```toml +serde = { version = "1.0", default-features = false, features = ["derive", "std"] } +``` + +### Tokio +Identify which runtime features are actually used: +```toml +tokio = { version = "1.0", default-features = false, features = ["rt-multi-thread", "macros", "sync"] } +``` + +### Reqwest +Often doesn't need all TLS backends: +```toml +reqwest = { version = "0.11", default-features = false, features = ["rustls-tls", "json"] } +``` + +## Rollback + +If changes cause issues: +```bash +git checkout quickwit/Cargo.toml +cargo check --workspace +``` + +## Tips + +- Start with large crates that have many default features (tokio, reqwest, hyper) +- Use `cargo bloat --crates` to identify large dependencies +- Check `cargo tree -d` for duplicate dependencies that might indicate feature conflicts +- Some features are needed only for tests - consider using `[dev-dependencies]` features diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7af5fbda950..dead7aedeca 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -188,28 +188,3 @@ jobs: if: always() && steps.modified.outputs.rust_src == 'true' run: cargo +nightly fmt --all -- --check working-directory: ./quickwit - - thirdparty-license: - name: Check Datadog third-party license file - runs-on: ubuntu-latest - permissions: - contents: read - actions: write - steps: - - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 - - name: Install Rust toolchain - uses: dtolnay/rust-toolchain@f7ccc83f9ed1e5b9c81d8a67d7ad1a747e22a561 # 
master - with: - toolchain: stable - - - name: Cache cargo tools - uses: actions/cache@9255dc7a253b0ccc959486e2bca901246202afeb # v5.0.1 - with: - path: ~/.cargo/bin - key: ${{ runner.os }}-cargo-tools-${{ hashFiles('**/Cargo.lock') }} - - - name: Install dd-rust-license-tool - run: dd-rust-license-tool --help || cargo install --git https://github.com/DataDog/rust-license-tool.git --force - - - name: Check Datadog third-party license file - run: dd-rust-license-tool --config quickwit/license-tool.toml --manifest-path quickwit/Cargo.toml check diff --git a/LICENSE-3rdparty.csv b/LICENSE-3rdparty.csv index 13904cb90c2..ed79fbdb132 100644 --- a/LICENSE-3rdparty.csv +++ b/LICENSE-3rdparty.csv @@ -52,7 +52,6 @@ base16ct,https://github.com/RustCrypto/formats/tree/master/base16ct,Apache-2.0 O base64,https://github.com/marshallpierce/rust-base64,MIT OR Apache-2.0,Marshall Pierce base64-simd,https://github.com/Nugine/simd,MIT,The base64-simd Authors base64ct,https://github.com/RustCrypto/formats,Apache-2.0 OR MIT,RustCrypto Developers -bincode,https://github.com/servo/bincode,MIT,"Ty Overby , Francesco Mazzoli , David Tolnay , Zoey Riordan " bit-set,https://github.com/contain-rs/bit-set,Apache-2.0 OR MIT,Alexis Beingessner bit-vec,https://github.com/contain-rs/bit-vec,Apache-2.0 OR MIT,Alexis Beingessner bitflags,https://github.com/bitflags/bitflags,MIT OR Apache-2.0,The Rust Project Developers @@ -104,8 +103,6 @@ crossbeam-utils,https://github.com/crossbeam-rs/crossbeam,MIT OR Apache-2.0,The crunchy,https://github.com/eira-fransham/crunchy,MIT,Eira Fransham crypto-bigint,https://github.com/RustCrypto/crypto-bigint,Apache-2.0 OR MIT,RustCrypto Developers crypto-common,https://github.com/RustCrypto/traits,MIT OR Apache-2.0,RustCrypto Developers -csv,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant -csv-core,https://github.com/BurntSushi/rust-csv,Unlicense OR MIT,Andrew Gallant darling,https://github.com/TedDriggs/darling,MIT,Ted Driggs 
darling_core,https://github.com/TedDriggs/darling,MIT,Ted Driggs darling_macro,https://github.com/TedDriggs/darling,MIT,Ted Driggs @@ -130,15 +127,7 @@ elliptic-curve,https://github.com/RustCrypto/traits/tree/master/elliptic-curve,A embedded-io,https://github.com/embassy-rs/embedded-io,MIT OR Apache-2.0,The embedded-io Authors embedded-io,https://github.com/rust-embedded/embedded-hal,MIT OR Apache-2.0,The embedded-io Authors encode_unicode,https://github.com/tormol/encode_unicode,Apache-2.0 OR MIT,Torbjørn Birch Moltu -encoding,https://github.com/lifthrasiir/rust-encoding,MIT,Kang Seonghoon -encoding-index-japanese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-korean,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-simpchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-singlebyte,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding-index-tradchinese,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon -encoding_index_tests,https://github.com/lifthrasiir/rust-encoding,CC0-1.0,Kang Seonghoon encoding_rs,https://github.com/hsivonen/encoding_rs,(Apache-2.0 OR MIT) AND BSD-3-Clause,Henri Sivonen -encoding_rs_io,https://github.com/BurntSushi/encoding_rs_io,MIT OR Apache-2.0,Andrew Gallant enum-iterator,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux enum-iterator-derive,https://github.com/stephaneyfx/enum-iterator,0BSD,Stephane Raux env_filter,https://github.com/rust-cli/env_logger,MIT OR Apache-2.0,The env_filter Authors @@ -150,7 +139,6 @@ fail,https://github.com/tikv/fail-rs,Apache-2.0,The TiKV Project Developers fastdivide,https://github.com/fulmicoton/fastdivide,zlib-acknowledgement OR MIT,Paul Masurel fastrand,https://github.com/smol-rs/fastrand,Apache-2.0 OR MIT,Stjepan Glavina ff,https://github.com/zkcrypto/ff,MIT OR Apache-2.0,"Sean Bowe , Jack Grigg " 
-filetime,https://github.com/alexcrichton/filetime,MIT OR Apache-2.0,Alex Crichton find-msvc-tools,https://github.com/rust-lang/cc-rs,MIT OR Apache-2.0,The find-msvc-tools Authors fixedbitset,https://github.com/petgraph/fixedbitset,MIT OR Apache-2.0,bluss flate2,https://github.com/rust-lang/flate2-rs,MIT OR Apache-2.0,"Alex Crichton , Josh Triplett " @@ -224,8 +212,6 @@ is-terminal,https://github.com/sunfishcode/is-terminal,MIT,"softprops -jiff,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant -jiff-static,https://github.com/BurntSushi/jiff,Unlicense OR MIT,Andrew Gallant jobserver,https://github.com/rust-lang/jobserver-rs,MIT OR Apache-2.0,Alex Crichton js-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/js-sys,MIT OR Apache-2.0,The wasm-bindgen Developers json_comments,https://github.com/tmccombs/json-comments-rs,Apache-2.0,Thayne McCombs @@ -233,19 +219,6 @@ lazy_static,https://github.com/rust-lang-nursery/lazy-static.rs,MIT OR Apache-2. levenshtein_automata,https://github.com/tantivy-search/levenshtein-automata,MIT,Paul Masurel libc,https://github.com/rust-lang/libc,MIT OR Apache-2.0,The Rust Project Developers libm,https://github.com/rust-lang/compiler-builtins,MIT,Jorge Aparicio -libredox,https://gitlab.redox-os.org/redox-os/libredox,MIT,4lDO2 <4lDO2@protonmail.com> -lindera-cc-cedict,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict Authors -lindera-cc-cedict-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-cc-cedict-builder Authors -lindera-core,https://github.com/lindera-morphology/lindera,MIT,The lindera-core Authors -lindera-decompress,https://github.com/lindera-morphology/lindera,MIT,The lindera-decompress Authors -lindera-dictionary,https://github.com/lindera-morphology/lindera,MIT,The lindera-dictionary Authors -lindera-ipadic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic Authors 
-lindera-ipadic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-builder Authors -lindera-ipadic-neologd-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ipadic-neologd-builder Authors -lindera-ko-dic,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic Authors -lindera-ko-dic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-ko-dic-builder Authors -lindera-tokenizer,https://github.com/lindera-morphology/lindera,MIT,The lindera-tokenizer Authors -lindera-unidic-builder,https://github.com/lindera-morphology/lindera,MIT,The lindera-unidic-builder Authors linked-hash-map,https://github.com/contain-rs/linked-hash-map,MIT OR Apache-2.0,"Stepan Koltsov , Andrew Paseltiner " linux-raw-sys,https://github.com/sunfishcode/linux-raw-sys,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Dan Gohman litemap,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers @@ -330,7 +303,6 @@ pnet_packet,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham pnet_sys,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,"Robert Clipsham , Linus Färnstrand " pnet_transport,https://github.com/libpnet/libpnet,MIT OR Apache-2.0,Robert Clipsham portable-atomic,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic Authors -portable-atomic-util,https://github.com/taiki-e/portable-atomic,Apache-2.0 OR MIT,The portable-atomic-util Authors postcard,https://github.com/jamesmunns/postcard,MIT OR Apache-2.0,James Munns potential_utf,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers powerfmt,https://github.com/jhpratt/powerfmt,MIT OR Apache-2.0,Jacob Pratt @@ -353,8 +325,6 @@ prost,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " prost-derive,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " 
prost-types,https://github.com/tokio-rs/prost,Apache-2.0,"Dan Burkert , Lucio Franco , Casper Meijn , Tokio Contributors " -protobuf,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov -protobuf-support,https://github.com/stepancheg/rust-protobuf,MIT,Stepan Koltsov pulldown-cmark,https://github.com/raphlinus/pulldown-cmark,MIT,"Raph Levien , Marcus Klaas de Vries " pulldown-cmark-to-cmark,https://github.com/Byron/pulldown-cmark-to-cmark,Apache-2.0,"Sebastian Thiel , Dylan Owen , Alessandro Ogier , Zixian Cai <2891235+caizixian@users.noreply.github.com>, Andrew Lyjak " quanta,https://github.com/metrics-rs/quanta,MIT,Toby Lawrence @@ -388,7 +358,6 @@ roxmltree,https://github.com/RazrFalcon/roxmltree,MIT OR Apache-2.0,Evgeniy Reiz rust-embed,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-impl,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh rust-embed-utils,https://pyrossh.dev/repos/rust-embed,MIT,pyrossh -rust-stemmers,https://github.com/CurrySoftware/rust-stemmers,MIT OR BSD-3-Clause,"Jakob Demler , CurrySoftware " rustc-hash,https://github.com/rust-lang/rustc-hash,Apache-2.0 OR MIT,The Rust Project Developers rustix,https://github.com/bytecodealliance/rustix,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,"Dan Gohman , Jakub Konka " rustls,https://github.com/rustls/rustls,Apache-2.0 OR ISC OR MIT,The rustls Authors @@ -448,8 +417,6 @@ syn,https://github.com/dtolnay/syn,MIT OR Apache-2.0,David Tolnay synstructure,https://github.com/mystor/synstructure,MIT,Nika Layzell sysinfo,https://github.com/GuillaumeGomez/sysinfo,MIT,Guillaume Gomez -system-configuration,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN -system-configuration-sys,https://github.com/mullvad/system-configuration-rs,MIT OR Apache-2.0,Mullvad VPN tabled,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tabled_derive,https://github.com/zhiburt/tabled,MIT,Maxim Zhiburt tantivy,https://github.com/quickwit-oss/tantivy,MIT,Paul Masurel @@ -545,7 
+512,6 @@ wasmtimer,https://github.com/whizsid/wasmtimer-rs,MIT,"WhizSid web-sys,https://github.com/wasm-bindgen/wasm-bindgen/tree/master/crates/web-sys,MIT OR Apache-2.0,The wasm-bindgen Developers web-time,https://github.com/daxpedda/web-time,MIT OR Apache-2.0,The web-time Authors webpki-roots,https://github.com/rustls/webpki-roots,CDLA-Permissive-2.0,The webpki-roots Authors -whichlang,https://github.com/quickwit-oss/whichlang,MIT,"Quickwit, Inc. " winapi,https://github.com/retep998/winapi-rs,MIT,Peter Atashian winapi,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian winapi-i686-pc-windows-gnu,https://github.com/retep998/winapi-rs,MIT OR Apache-2.0,Peter Atashian @@ -561,7 +527,6 @@ windows-interface,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-link,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-link Authors windows-numerics,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-numerics Authors -windows-registry,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-registry Authors windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft windows-result,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,The windows-result Authors windows-strings,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Microsoft @@ -590,9 +555,7 @@ windows_x86_64_msvc,https://github.com/microsoft/windows-rs,MIT OR Apache-2.0,Th winnow,https://github.com/winnow-rs/winnow,MIT,The winnow Authors wit-bindgen,https://github.com/bytecodealliance/wit-bindgen,Apache-2.0 WITH LLVM-exception OR Apache-2.0 OR MIT,Alex Crichton writeable,https://github.com/unicode-org/icu4x,Unicode-3.0,The ICU4X Project Developers -xattr,https://github.com/Stebalien/xattr,MIT OR Apache-2.0,Steven Allen xmlparser,https://github.com/RazrFalcon/xmlparser,MIT OR Apache-2.0,Yevhenii Reizner 
-yada,https://github.com/takuyaa/yada,MIT OR Apache-2.0,Takuya Asano yansi,https://github.com/SergioBenitez/yansi,MIT OR Apache-2.0,Sergio Benitez yoke,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar yoke-derive,https://github.com/unicode-org/icu4x,Unicode-3.0,Manish Goregaokar diff --git a/docs/configuration/index-config.md b/docs/configuration/index-config.md index 24ce8677902..c8f26ded709 100644 --- a/docs/configuration/index-config.md +++ b/docs/configuration/index-config.md @@ -94,6 +94,7 @@ The doc mapping defines how a document and the fields it contains are stored and | `tag_fields` | Collection of fields* explicitly defined in `field_mappings` whose values will be stored as part of the `tags` metadata. Allowed types are: `text` (with raw tokenizer), `i64` and `u64`. [Learn more about tags](../overview/concepts/querying.md#tag-pruning). | `[]` | | `store_source` | Whether or not the original JSON document is stored or not in the index. | `false` | | `timestamp_field` | Timestamp field* used for sharding documents in splits. The field has to be of type `datetime`. [Learn more about time sharding](./../overview/architecture.md). | `None` | +| `indexation_time_field` | Field that will hold the indexation time of the document. This field is populated during indexation. The field has to be of type `datetime`. | `None` | | `partition_key` | If set, quickwit will route documents into different splits depending on the field name declared as the `partition_key`. | `null` | | `max_num_partitions` | Limits the number of splits created through partitioning. (See [Partitioning](../overview/concepts/querying.md#partitioning)) | `200` | | `index_field_presence` | `exists` queries are enabled automatically for fast fields. To enable it for all other fields set this parameter to `true`. Enabling it can have a significant CPU-cost on indexing. 
| false | diff --git a/docs/reference/es_compatible_api.md b/docs/reference/es_compatible_api.md index 32cbdafd761..885ac39e67b 100644 --- a/docs/reference/es_compatible_api.md +++ b/docs/reference/es_compatible_api.md @@ -187,11 +187,12 @@ It is also possible to not supply an order and rely on the default order using t } ``` -If no format is provided for timestamps, timestamps are returned with milliseconds precision. - -If you need nanosecond precision, you can use the `epoch_nanos_int` format. Beware this means the resulting -JSON may contain high numbers for which there is loss of precision when using languages where all numbers are -floats, such as JavaScript. +Fields explicitly specified as `datetime` in the doc mapping also support an +output format. If no format is provided, timestamps are returned with +milliseconds precision. If you need nanosecond precision, you can use the +`epoch_nanos_int` format. Beware, this means the resulting JSON may contain high +numbers for which there is loss of precision when using languages where all +numbers are floats, such as JavaScript. ```json { @@ -237,6 +238,40 @@ You can pass the `sort` value of the last hit in a subsequent request where othe This allows you to paginate your results. + +#### Note regarding multi-type pagination + +Pagination can get tricky on fields that have multiple types. In dynamic fields, multiple column types can be present for a given field within a single split. When using doc mapping updates, any type combination can be present across splits. + +First, let's take a look at the various type systems we are working with. 
+ +The JSON representation used for the sort values provides the following primitive types: +- numerical +- bool +- string + +Tantivy uses the following types: +- i64 / u64 / f64 (only one of these can be present in a split) +- datetime +- string +- bool +- ip (not supported in sort yet) +- bytes (not supported in sort yet) + +Elasticsearch can represent date field sort values in various formats. In Quickwit, only integer formats are supported (millisecond or nanosecond). Either way, the fact that datetime can live along with another type inside a split yields unreliable pagination: +- Because there isn't a simple and efficient common representation in the fast field u64 space, it's hard to represent datetime within the numerical (i64/u64/f64) order. +- To paginate separately across numerical and datetime types a strongly typed representation of the json sort key would be necessary. + +The current implementation does the following: +- If the mapping is explicitly set to datetime and never changed, pagination works as expected. +- If the mapping evolved to datetime, pagination fails for splits that contain numerical values (i64, u64, f64 columns). +- If the mapping is a json/dynamic field, pagination fails for splits that contain a datetime column. This can happen because on JSON field Tantivy automatically stores RFC3339 date strings in a datetime column. +- If other types are mixed, the sort will iterate over all values type by type + - Asc: numeric -> string -> boolean -> datetime -> null + - Desc: datetime -> boolean -> string -> numeric -> null +- Quickwit used to support specifying numbers as string in the search after value. That isn't possible anymore. 
+ + ### `_msearch`   Multi search API ``` diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index a0f47b86c7d..33e2cf38be8 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -1186,15 +1186,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d809780667f4410e7c41b07f52439b94d2bdf8528eeedc287fa38d3b7f95d82" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bindgen" version = "0.72.1" @@ -1376,9 +1367,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" dependencies = [ "serde", ] @@ -1529,9 +1520,8 @@ dependencies = [ [[package]] name = "chitchat" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "735f8a51f68b353b17e351b38317433d6afcaa9cc04f4d0f6c9e9125c49c1efe" +version = "0.9.0" +source = "git+https://github.com/quickwit-oss/chitchat.git?rev=bd54c81#bd54c810700814f83599a31a7e29f2a5eb8324b3" dependencies = [ "anyhow", "async-trait", @@ -2226,6 +2216,12 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be1e0bca6c3637f992fc1cc7cbc52a78c1ef6db076dbf1059c4323d6a2048376" +[[package]] +name = "datasketches" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c286de4e81ea2590afc24d754e0f83810c566f50a1388fa75ebd57928c0d9745" + [[package]] name = "dbl" version = "0.3.2" @@ -2312,8 +2308,6 @@ checksum = 
"25f104b501bf2364e78d0d3974cbc774f738f5865306ed128e1e0d7499c0ad96" dependencies = [ "console", "shell-words", - "tempfile", - "zeroize", ] [[package]] @@ -2598,70 +2592,6 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" -[[package]] -name = "encoding" -version = "0.2.33" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b0d943856b990d12d3b55b359144ff341533e516d94098b1d3fc1ac666d36ec" -dependencies = [ - "encoding-index-japanese", - "encoding-index-korean", - "encoding-index-simpchinese", - "encoding-index-singlebyte", - "encoding-index-tradchinese", -] - -[[package]] -name = "encoding-index-japanese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04e8b2ff42e9a05335dbf8b5c6f7567e5591d0d916ccef4e0b1710d32a0d0c91" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-korean" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dc33fb8e6bcba213fe2f14275f0963fd16f0a02c878e3095ecfdf5bee529d81" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-simpchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d87a7194909b9118fc707194baa434a4e3b0fb6a5a757c73c3adb07aa25031f7" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-singlebyte" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3351d5acffb224af9ca265f435b859c7c01537c0849754d3db3fdf2bfe2ae84a" -dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding-index-tradchinese" -version = "1.20141219.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0e20d5688ce3cab59eb3ef3a2083a5c77bf496cb798dc6fcdb75f323890c18" 
-dependencies = [ - "encoding_index_tests", -] - -[[package]] -name = "encoding_index_tests" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a246d82be1c9d791c5dfde9a2bd045fc3cbba3fa2b11ad558f27d01712f00569" - [[package]] name = "encoding_rs" version = "0.8.35" @@ -2671,15 +2601,6 @@ dependencies = [ "cfg-if", ] -[[package]] -name = "encoding_rs_io" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" -dependencies = [ - "encoding_rs", -] - [[package]] name = "enum-iterator" version = "2.3.0" @@ -2707,7 +2628,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1bf3c259d255ca70051b30e2e95b5446cdb8949ac4cd22c0d7fd634d89f568e2" dependencies = [ "log", - "regex", ] [[package]] @@ -2719,7 +2639,6 @@ dependencies = [ "anstream", "anstyle", "env_filter", - "jiff", "log", ] @@ -2892,18 +2811,6 @@ version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" -[[package]] -name = "filetime" -version = "0.2.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" -dependencies = [ - "cfg-if", - "libc", - "libredox", - "windows-sys 0.60.2", -] - [[package]] name = "find-msvc-tools" version = "0.1.6" @@ -3824,21 +3731,9 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2 0.6.1", - "system-configuration", "tokio", - "tower-layer", "tower-service", "tracing", - "windows-registry", -] - -[[package]] -name = "hyperloglogplus" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" -dependencies = [ - "serde", ] [[package]] @@ -4292,9 +4187,9 @@ dependencies = [ [[package]] name = 
"keccak" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecc2af9a1119c51f12a14607e783cb977bde58bc069ff0c3da1095e635d70654" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" dependencies = [ "cpufeatures", ] @@ -4418,219 +4313,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "lindera-cc-cedict" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7595a377b9723e837711366721b02662dac64d734af3dac1c01941e779e95a6b" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-cc-cedict-builder", - "lindera-core", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-cc-cedict-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c6fbd76a65b5df73574898e871d7cff3e34bf89f544f6e1a1087cba82e25cce" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-core" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85af015d15c25cb3b7af82ba181908f4afbec6a2636f0fdfcca6d173c1b2c7fe" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "encoding_rs", - "log", - "once_cell", - "serde", - "thiserror 1.0.69", - "yada", -] - -[[package]] -name = "lindera-decompress" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3dfc054b2f3f3eb21a24ce062a3d5f969339ddf50652038ea33993b1b97d4ba" -dependencies = [ - "anyhow", - "flate2", - "serde", -] - -[[package]] -name = "lindera-dictionary" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6b1a5d8f4cba37dcca18dc0e827233ff46695a6d878d716f16f755d264d588a" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - 
"lindera-cc-cedict", - "lindera-cc-cedict-builder", - "lindera-core", - "lindera-ipadic", - "lindera-ipadic-builder", - "lindera-ipadic-neologd-builder", - "lindera-ko-dic", - "lindera-ko-dic-builder", - "lindera-unidic-builder", - "serde", -] - -[[package]] -name = "lindera-ipadic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5f1d26aba22d8a9193dcd2d087205d89e0ffb19490bc305b341e25c037f353" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ipadic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ipadic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "184a9769b05ae857bd55f5e8a94b2ae2ba8816c5c6b78c73f161b4d7490c0461" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ipadic-neologd-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b8cd28b5402425184d0f719d5bd81af87a7e36e2032b5bcceddf55011b1b22c" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding_rs", - "encoding_rs_io", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "serde", - "yada", -] - -[[package]] -name = "lindera-ko-dic" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a6d718720a28ac5d93b449661d8844f7858b2b71595e3198bc90e437f01e5ce" -dependencies = [ - "bincode", - "byteorder", - "encoding", - "flate2", - "lindera-core", - "lindera-ko-dic-builder", - "once_cell", - "tar", - "ureq", -] - -[[package]] -name = "lindera-ko-dic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f22de1fcdc33de258037145ae86686125214206b98d04c6dfe01f36c136c0022" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - -[[package]] -name = "lindera-tokenizer" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cca45cbc1af512ce2aa9dea9a1d694430480a53bb53e37165ba143e27e81f7dd" -dependencies = [ - "bincode", - "lindera-core", - "lindera-dictionary", - "once_cell", - "serde", - "serde_json", -] - -[[package]] -name = "lindera-unidic-builder" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "359425c8dff54164ff1b068122d26df358ce18533e4771eb5c5ce68888d988f2" -dependencies = [ - "anyhow", - "bincode", - "byteorder", - "csv", - "encoding", - "env_logger", - "glob", - "lindera-core", - "lindera-decompress", - "log", - "yada", -] - [[package]] name = "linked-hash-map" version = "0.5.6" @@ -4731,6 +4413,12 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lz4_flex" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" + [[package]] name = "matchers" version = "0.2.0" @@ -5090,9 +4778,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" [[package]] name = "num-format" @@ -5297,9 +4985,9 @@ checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" [[package]] name = "oneshot" -version = "0.1.11" +version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ce411919553d3f9fa53a0880544cda985a112117a0444d5ff1e870a893d6ea" +checksum = 
"269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" [[package]] name = "onig" @@ -5598,7 +5286,8 @@ checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" [[package]] name = "ownedbytes" version = "0.9.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fbd56f7631767e61784dc43f8580f403f4475bd4aaa4da003e6295e1bab4a7e" dependencies = [ "stable_deref_trait", ] @@ -6364,7 +6053,6 @@ dependencies = [ "memchr", "parking_lot 0.12.5", "procfs", - "protobuf", "thiserror 2.0.17", ] @@ -6504,26 +6192,6 @@ dependencies = [ "prost 0.14.1", ] -[[package]] -name = "protobuf" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d65a1d4ddae7d8b5de68153b48f6aa3bba8cb002b243dbdbc55a5afbc98f99f4" -dependencies = [ - "once_cell", - "protobuf-support", - "thiserror 1.0.69", -] - -[[package]] -name = "protobuf-support" -version = "3.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e36c2f31e0a47f9280fb347ef5e461ffcd2c52dd520d8e216b52f93b0b0d7d6" -dependencies = [ - "thiserror 1.0.69", -] - [[package]] name = "psl" version = "2.1.176" @@ -6719,7 +6387,6 @@ dependencies = [ "quickwit-cluster", "quickwit-common", "quickwit-config", - "quickwit-doc-mapper", "quickwit-index-management", "quickwit-indexing", "quickwit-ingest", @@ -7328,9 +6995,6 @@ dependencies = [ "bitpacking", "criterion", "hex", - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", "once_cell", "proptest", "quickwit-common", @@ -7346,7 +7010,6 @@ dependencies = [ "thiserror 2.0.17", "time", "tracing", - "whichlang", ] [[package]] @@ -8195,16 +7858,6 @@ dependencies = [ "walkdir", ] -[[package]] -name = "rust-stemmers" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" -dependencies = [ - "serde", - "serde_derive", -] - [[package]] name = "rust_decimal" version = "1.39.0" @@ -8956,9 +8609,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "sketches-ddsketch" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1e9a774a6c28142ac54bb25d25562e6bcf957493a184f15ad4eebccb23e410a" +checksum = "05e40b6cf54d988dc1a2223531b969c9a9e30906ad90ef64890c27b4bfbb46ea" dependencies = [ "serde", ] @@ -9428,27 +9081,6 @@ dependencies = [ "nom 8.0.0", ] -[[package]] -name = "system-configuration" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" -dependencies = [ - "bitflags 2.10.0", - "core-foundation 0.9.4", - "system-configuration-sys", -] - -[[package]] -name = "system-configuration-sys" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" -dependencies = [ - "core-foundation-sys", - "libc", -] - [[package]] name = "tabled" version = "0.20.0" @@ -9484,7 +9116,8 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "tantivy" version = "0.26.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "778da245841522199d512d19511b041425d8cff3a8f262b4e1516fceb050289a" dependencies = [ "aho-corasick", "arc-swap", @@ -9495,6 +9128,7 @@ dependencies = [ "census", "crc32fast", "crossbeam-channel", + "datasketches", "downcast-rs", "fastdivide", "fnv", @@ -9502,19 +9136,17 @@ dependencies = [ "futures-channel", "futures-util", "htmlescape", - "hyperloglogplus", "itertools 0.14.0", 
"levenshtein_automata", "log", - "lru 0.12.5", - "lz4_flex", + "lru 0.16.3", + "lz4_flex 0.13.0", "measure_time", "memmap2", "once_cell", "oneshot", "rayon", "regex", - "rust-stemmers", "rustc-hash", "serde", "serde_json", @@ -9539,16 +9171,18 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.9.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fed3d674429bcd2de5d0a6d1aa5495fed8afd9c5ecce993019caf7615f53fa4" dependencies = [ "bitpacking", ] [[package]] name = "tantivy-columnar" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c57166f5bcfd478f370ab8445afb4678dce44801fa5ce5c451aaf8595583c5dc" dependencies = [ "downcast-rs", "fastdivide", @@ -9562,8 +9196,9 @@ dependencies = [ [[package]] name = "tantivy-common" -version = "0.10.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbf10915aa75da3c3b0d58b58853d2e889efbaf32d4982a4c3715dde6bba23e5" dependencies = [ "async-trait", "byteorder", @@ -9584,8 +9219,9 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.25.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfadb8526b6da90704feb293b0701a6aae62ea14983143344be2dc5ce30f1d82" dependencies = [ "fnv", "nom 7.1.3", @@ -9596,8 +9232,9 @@ dependencies = [ [[package]] name = "tantivy-sstable" -version = "0.6.0" -source = 
"git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a2cfc3ac5164cbadc28965ffb145a8f47582a60ae5897859ad8d4316596c606" dependencies = [ "futures-util", "itertools 0.14.0", @@ -9609,8 +9246,9 @@ dependencies = [ [[package]] name = "tantivy-stacker" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6cbb051742da9d53ca9e8fff43a9b10e319338b24e2c0e15d0372df19ffeb951" dependencies = [ "murmurhash32", "tantivy-common", @@ -9618,21 +9256,11 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" -version = "0.6.0" -source = "git+https://github.com/SekoiaLab/tantivy/?rev=e9aede4#e9aede4b7eea5ce578887e8c7a1194b20953be0a" -dependencies = [ - "serde", -] - -[[package]] -name = "tar" -version = "0.4.44" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a" +checksum = "eac258c2c6390673f2685813afeeafcb8c4e0ee7de8dd3fc46838dcc37263f98" dependencies = [ - "filetime", - "libc", - "xattr", + "serde", ] [[package]] @@ -9770,9 +9398,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.44" +version = "0.3.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" dependencies = [ "deranged", "itoa", @@ -9781,16 +9409,16 @@ dependencies = [ "num-conv", "num_threads", "powerfmt", - "serde", + "serde_core", "time-core", "time-macros", ] [[package]] name = "time-core" -version = "0.1.6" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"40868e7c1d2f0b8d73e4a8c7f0ff63af4f6d19be117e90bd73eb1d62cf831c6b" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-fmt" @@ -9804,9 +9432,9 @@ dependencies = [ [[package]] name = "time-macros" -version = "0.2.24" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30cfb0125f12d9c277f35663a0a33f8c30190f4e4574868a330595412d34ebf3" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" dependencies = [ "num-conv", "time-core", @@ -9950,10 +9578,7 @@ dependencies = [ "futures-core", "futures-io", "futures-sink", - "futures-util", - "hashbrown 0.15.5", "pin-project-lite", - "slab", "tokio", ] @@ -10496,21 +10121,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" -[[package]] -name = "ureq" -version = "2.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" -dependencies = [ - "base64 0.22.1", - "log", - "once_cell", - "rustls 0.23.36", - "rustls-pki-types", - "url", - "webpki-roots 0.26.11", -] - [[package]] name = "url" version = "2.5.8" @@ -10679,7 +10289,7 @@ dependencies = [ "jsonschema", "lalrpop", "lalrpop-util", - "lz4_flex", + "lz4_flex 0.11.5", "md-5", "nom 8.0.0", "nom-language", @@ -10984,12 +10594,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "whichlang" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b9aa3ad29c3d08283ac6b769e3ec15ad1ddb88af7d2e9bc402c574973b937e7" - [[package]] name = "whoami" version = "1.6.1" @@ -11146,17 +10750,6 @@ dependencies = [ "windows-link 0.1.3", ] -[[package]] -name = "windows-registry" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" -dependencies = [ - "windows-link 0.2.1", - "windows-result 0.4.1", - "windows-strings 0.5.1", -] - [[package]] name = "windows-result" version = "0.3.4" @@ -11496,16 +11089,6 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" -[[package]] -name = "xattr" -version = "1.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" -dependencies = [ - "libc", - "rustix 1.1.3", -] - [[package]] name = "xmlparser" version = "0.13.6" @@ -11518,12 +11101,6 @@ version = "0.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" -[[package]] -name = "yada" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" - [[package]] name = "yansi" version = "1.0.1" diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index 453b5850761..b4bc5843d6e 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -91,7 +91,7 @@ bitpacking = "0.9.3" bytes = { version = "1", features = ["serde"] } bytesize = { version = "1.3", features = ["serde"] } bytestring = "1.5" -chitchat = "0.10.0" +chitchat = { git = "https://github.com/quickwit-oss/chitchat.git", rev = "bd54c81" } chrono = { version = "0.4", default-features = false, features = [ "clock", "std", @@ -102,11 +102,11 @@ colored = "3.0" console-subscriber = "0.5" criterion = { version = "0.8", features = ["async_tokio"] } cron = "0.15" -dialoguer = "0.12" +dialoguer = { version = "0.12", default-features = false } dotenvy = "0.15" dyn-clone = "1.0" enum-iterator = "2.3" -env_logger = "0.11" +env_logger = { version = "0.11", default-features = false, features = 
["auto-color"] } fail = "0.5" flate2 = "1.1" flume = "0.12" @@ -131,23 +131,18 @@ http-serde = "2.1" humantime = "2.3" hyper = { version = "1.8", features = ["client", "http1", "http2", "server"] } hyper-rustls = "0.27" -hyper-util = { version = "0.1", features = ["full"] } +hyper-util = { version = "0.1", default-features = false, features = [ + "client-legacy", + "server-auto", + "server-graceful", + "service", + "tokio", +] } indexmap = { version = "2.12", features = ["serde"] } indicatif = "0.18" itertools = "0.14" json_comments = "0.2" libz-sys = "1.1" -# Lindera tokenizer 0.30+ versions (tested up to 0.32.3) are currently broken due to upstream build failures. -# The dictionary crates attempt to download artifacts from S3 URLs that return 404 Not Found. -# Version 0.29.0 is the latest version that builds correctly. It also explicitly depends on lindera-core 0.29 -# and lindera-dictionary 0.29. -lindera-core = "0.29" -lindera-dictionary = "0.29" -lindera-tokenizer = { version = "0.29", features = [ - "cc-cedict", - "ipadic", - "ko-dic", -] } lru = "0.16" matches = "0.1" md5 = "0.8" @@ -175,7 +170,7 @@ pprof = { version = "0.15", features = ["flamegraph"] } predicates = "3" prettyplease = "0.2" proc-macro2 = "1.0" -prometheus = { version = "0.14", features = ["process"] } +prometheus = { version = "0.14", default-features = false, features = ["process"] } proptest = "1" prost = { version = "0.14", default-features = false, features = [ "derive", @@ -245,7 +240,10 @@ tokio = { version = "1.48", features = ["full"] } tokio-metrics = { version = "0.4", features = ["rt"] } tokio-rustls = { version = "0.26", default-features = false } tokio-stream = { version = "0.1", features = ["sync"] } -tokio-util = { version = "0.7", features = ["full"] } +tokio-util = { version = "0.7", default-features = false, features = [ + "compat", + "io-util", +] } toml = "0.9" tonic = { version = "0.14", features = [ "_tls-any", @@ -295,9 +293,8 @@ vrl = { version = "0.29", 
default-features = false, features = [ "value", ] } warp = { version = "0.4", features = ["server", "test"] } -whichlang = "0.1" wiremock = "0.6" -zstd = "0.13" +zstd = { version = "0.13", default-features = false } aws-config = "1.8" aws-credential-types = { version = "1.2", features = ["hardcoded-credentials"] } @@ -356,7 +353,7 @@ quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/SekoiaLab/tantivy/", rev = "e9aede4", default-features = false, features = [ +tantivy = { version = "0.26.0", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git a/quickwit/quickwit-actors/src/actor.rs b/quickwit/quickwit-actors/src/actor.rs index 2fa32d7f2a5..bb5a48239a4 100644 --- a/quickwit/quickwit-actors/src/actor.rs +++ b/quickwit/quickwit-actors/src/actor.rs @@ -18,7 +18,6 @@ use std::sync::Arc; use async_trait::async_trait; use thiserror::Error; -use tracing::error; use crate::{ActorContext, QueueCapacity, SendError}; diff --git a/quickwit/quickwit-cli/Cargo.toml b/quickwit/quickwit-cli/Cargo.toml index c595cb7e90a..8819d92ec97 100644 --- a/quickwit/quickwit-cli/Cargo.toml +++ b/quickwit/quickwit-cli/Cargo.toml @@ -59,7 +59,6 @@ quickwit-actors = { workspace = true } quickwit-cluster = { workspace = true } quickwit-common = { workspace = true } quickwit-config = { workspace = true } -quickwit-doc-mapper = { workspace = true } quickwit-index-management = { workspace = true } quickwit-indexing = { workspace = true } quickwit-ingest = { workspace = true } @@ -105,7 +104,6 @@ release-feature-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-feature-vendored-set = [ "jemalloc", @@ -119,7 +117,6 @@ release-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] 
release-macos-feature-vendored-set = [ "jemalloc", @@ -132,13 +129,8 @@ release-macos-feature-vendored-set = [ "quickwit-storage/azure", "quickwit-storage/gcs", "quickwit-metastore/postgres", - "quickwit-doc-mapper/multilang", ] release-jemalloc-profiled = [ "release-feature-set", "jemalloc-profiled", ] - -[package.metadata.cargo-machete] -# used to enable the `multilang` feature -ignored = ["quickwit-doc-mapper"] diff --git a/quickwit/quickwit-cli/src/tool.rs b/quickwit/quickwit-cli/src/tool.rs index d32db8a9e45..4fa52e6b9ea 100644 --- a/quickwit/quickwit-cli/src/tool.rs +++ b/quickwit/quickwit-cli/src/tool.rs @@ -39,6 +39,7 @@ use quickwit_config::{ use quickwit_index_management::{IndexService, clear_cache_directory}; use quickwit_indexing::IndexingPipeline; use quickwit_indexing::actors::{IndexingService, MergePipeline, MergeSchedulerService}; +use quickwit_indexing::mature_merge::{MatureMergeConfig, merge_mature_all_indexes}; use quickwit_indexing::models::{ DetachIndexingPipeline, DetachMergePipeline, IndexingStatistics, SpawnPipeline, }; @@ -163,6 +164,56 @@ pub fn build_tool_command() -> Command { .required(true), ]) ) + .subcommand( + Command::new("merge-mature") + .display_order(10) + .about("Merges mature splits across all indexes and nodes.") + .long_about( + "Scans indexes for merge opportunities in mature Published splits. Considers \ + opportunities across all origin nodes and sources. Runs once and exits." 
+ ) + .args(&[ + arg!(--"dry-run" + "Prints the planned merge operations without executing them.") + .required(false), + arg!(--"max-concurrent-merges" + "Maximum number of merges to run concurrently (default: 10).") + .display_order(1) + .required(false), + arg!(--"retention-safety-buffer-days" + "Splits within this many days of the retention cutoff are excluded (default: 5).") + .display_order(2) + .required(false), + arg!(--"min-merge-group-size" + "Minimum number of splits in a group to trigger a merge (default: 5).") + .display_order(3) + .required(false), + arg!(--"input-split-max-num-docs" + "Maximum number of docs in a split for it to be eligible (default: 10_000).") + .display_order(4) + .required(false), + arg!(--"max-merge-group-size" + "Maximum number of splits per merge operation (default: 100).") + .display_order(5) + .required(false), + arg!(--"split-target-num-docs" + "Maximum total docs per merge operation (default: 5_000_000).") + .display_order(6) + .required(false), + arg!(--"index-parallelism" + "Number of indexes processed concurrently (default: 50).") + .display_order(7) + .required(false), + arg!(--"index-id-patterns" + "Comma-separated list of index ID patterns to include (default: '*').") + .display_order(8) + .required(false), + arg!(--"metrics" + "Expose Prometheus metrics on the REST listen address during the run.") + .display_order(9) + .required(false), + ]) + ) .arg_required_else_help(true) } @@ -207,6 +258,13 @@ pub struct MergeArgs { pub source_id: SourceId, } +#[derive(Debug, Eq, PartialEq)] +pub struct MatureMergeArgs { + pub config_uri: Uri, + pub merge_config: MatureMergeConfig, + pub serve_metrics: bool, +} + #[derive(Debug, Eq, PartialEq)] pub struct ExtractSplitArgs { pub config_uri: Uri, @@ -221,6 +279,7 @@ pub enum ToolCliCommand { LocalIngest(LocalIngestDocsArgs), LocalSearch(LocalSearchArgs), Merge(MergeArgs), + MatureMerge(MatureMergeArgs), ExtractSplit(ExtractSplitArgs), } @@ -234,6 +293,7 @@ impl ToolCliCommand { 
"local-ingest" => Self::parse_local_ingest_args(submatches), "local-search" => Self::parse_local_search_args(submatches), "merge" => Self::parse_merge_args(submatches), + "merge-mature" => Self::parse_mature_merge_args(submatches), "extract-split" => Self::parse_extract_split_args(submatches), _ => bail!("unknown tool subcommand `{subcommand}`"), } @@ -385,12 +445,84 @@ impl ToolCliCommand { })) } + fn parse_mature_merge_args(mut matches: ArgMatches) -> anyhow::Result { + let config_uri = matches + .remove_one::("config") + .map(|uri_str| Uri::from_str(&uri_str)) + .expect("`config` should be a required arg.")?; + let defaults = MatureMergeConfig::default(); + let dry_run = matches.get_flag("dry-run"); + let max_concurrent_merges = matches + .remove_one::("max-concurrent-merges") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.max_concurrent_merges); + let retention_safety_buffer_days = matches + .remove_one::("retention-safety-buffer-days") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.retention_safety_buffer_days); + let min_merge_group_size = matches + .remove_one::("min-merge-group-size") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.min_merge_group_size); + let input_split_max_num_docs = matches + .remove_one::("input-split-max-num-docs") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.input_split_max_num_docs); + let max_merge_group_size = matches + .remove_one::("max-merge-group-size") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.max_merge_group_size); + let split_target_num_docs = matches + .remove_one::("split-target-num-docs") + .map(|s| s.parse::()) + .transpose()? + .unwrap_or(defaults.split_target_num_docs); + let index_parallelism = matches + .remove_one::("index-parallelism") + .map(|s| s.parse::()) + .transpose()? 
+ .unwrap_or(defaults.index_parallelism); + let index_id_patterns = matches + .remove_one::("index-id-patterns") + .map(|s| s.split(',').map(|p| p.trim().to_string()).collect()) + .unwrap_or(defaults.index_id_patterns); + let serve_metrics = matches.get_flag("metrics"); + + if max_concurrent_merges == 0 { + bail!("`max-concurrent-merges` must be greater than or equal to 1."); + } + if index_parallelism == 0 { + bail!("`index-parallelism` must be greater than or equal to 1."); + } + Ok(Self::MatureMerge(MatureMergeArgs { + config_uri, + serve_metrics, + merge_config: MatureMergeConfig { + dry_run, + max_concurrent_merges, + retention_safety_buffer_days, + min_merge_group_size, + input_split_max_num_docs, + max_merge_group_size, + split_target_num_docs, + index_parallelism, + index_id_patterns, + }, + })) + } + pub async fn execute(self) -> anyhow::Result<()> { match self { Self::GarbageCollect(args) => garbage_collect_index_cli(args).await, Self::LocalIngest(args) => local_ingest_docs_cli(args).await, Self::LocalSearch(args) => local_search_cli(args).await, Self::Merge(args) => merge_cli(args).await, + Self::MatureMerge(args) => merge_mature_cli(args).await, Self::ExtractSplit(args) => extract_split_cli(args).await, } } @@ -651,6 +783,43 @@ pub async fn merge_cli(args: MergeArgs) -> anyhow::Result<()> { Ok(()) } +pub async fn merge_mature_cli(args: MatureMergeArgs) -> anyhow::Result<()> { + debug!(args=?args, "merge-mature"); + info!(merge_config=?args.merge_config, "merge-mature configuration"); + println!("❯ Scanning all indexes for mature merge opportunities..."); + let config = load_node_config(&args.config_uri).await?; + let (storage_resolver, metastore_resolver) = + get_resolvers(&config.storage_configs, &config.metastore_configs); + let metastore = metastore_resolver.resolve(&config.metastore_uri).await?; + + let runtimes_config = RuntimesConfig::default(); + start_actor_runtimes( + runtimes_config, + &HashSet::from_iter([QuickwitService::Indexer]), + )?; + + 
if args.serve_metrics { + let metrics_addr = config.rest_config.listen_addr; + tokio::spawn(serve_metrics(metrics_addr)); + } + + merge_mature_all_indexes( + metastore, + storage_resolver, + &config.data_dir_path, + args.merge_config.clone(), + config.node_id, + ) + .await?; + + if !args.merge_config.dry_run { + info!("mature splits successfully merged, waiting for explicit termination signal"); + tokio::time::sleep(Duration::MAX).await; + } + + Ok(()) +} + pub async fn garbage_collect_index_cli(args: GarbageCollectIndexArgs) -> anyhow::Result<()> { debug!(args=?args, "garbage-collect-index"); println!("❯ Garbage collecting index..."); @@ -955,3 +1124,48 @@ async fn create_empty_cluster(config: &NodeConfig) -> anyhow::Result { Ok(cluster) } + +/// A shortcut to expose the metrics without loading the whole quickwit_serve +/// machinery. +async fn serve_metrics(addr: std::net::SocketAddr) { + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + let listener = match tokio::net::TcpListener::bind(addr).await { + Ok(l) => l, + Err(err) => { + tracing::warn!("metrics server could not bind to {addr}: {err}"); + return; + } + }; + tracing::info!("metrics server listening on http://{addr}/metrics"); + loop { + let Ok((mut stream, _peer)) = listener.accept().await else { + continue; + }; + tokio::spawn(async move { + let mut buf = [0u8; 4096]; + let n = match stream.read(&mut buf).await { + Ok(n) => n, + Err(_) => return, + }; + let request = std::str::from_utf8(&buf[..n]).unwrap_or(""); + let is_metrics = request.starts_with("GET /metrics"); + let (status, body) = if is_metrics { + match quickwit_common::metrics::metrics_text_payload() { + Ok(payload) => ("200 OK", payload), + Err(e) => { + tracing::error!("failed to encode prometheus metrics: {e}"); + ("500 Internal Server Error", String::new()) + } + } + } else { + ("404 Not Found", String::new()) + }; + let response = format!( + "HTTP/1.1 {status}\r\nContent-Type: text/plain; version=0.0.4\r\nContent-Length: \ + 
{}\r\nConnection: close\r\n\r\n{body}", + body.len() + ); + let _ = stream.write_all(response.as_bytes()).await; + }); + } +} diff --git a/quickwit/quickwit-cluster/src/grpc_gossip.rs b/quickwit/quickwit-cluster/src/grpc_gossip.rs index 10be33970db..e974a975118 100644 --- a/quickwit/quickwit-cluster/src/grpc_gossip.rs +++ b/quickwit/quickwit-cluster/src/grpc_gossip.rs @@ -147,7 +147,7 @@ async fn perform_grpc_gossip_rounds( }, ) }); - chitchat_guard.reset_node_state_if_update( + chitchat_guard.reset_node_state( &chitchat_id, key_values, proto_node_state.max_version, diff --git a/quickwit/quickwit-common/src/lib.rs b/quickwit/quickwit-common/src/lib.rs index 0f3af2bc5ba..2e719193383 100644 --- a/quickwit/quickwit-common/src/lib.rs +++ b/quickwit/quickwit-common/src/lib.rs @@ -27,6 +27,7 @@ pub mod jemalloc_profiled; mod kill_switch; pub mod metrics; pub mod net; +pub mod numeric_types; mod path_hasher; pub mod pretty; mod progress; diff --git a/quickwit/quickwit-common/src/numeric_types.rs b/quickwit/quickwit-common/src/numeric_types.rs new file mode 100644 index 00000000000..cf4028f2888 --- /dev/null +++ b/quickwit/quickwit-common/src/numeric_types.rs @@ -0,0 +1,470 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! This module is copied over from Tantivy + +/// This module helps compare numerical values of different types (i64, u64 +/// and f64). 
+pub mod num_cmp { + use std::cmp::Ordering; + + pub fn cmp_i64_f64(left_i: i64, right_f: f64) -> Result { + if right_f.is_nan() { + return Err("NaN comparison is not supported".to_string()); + } + + // If right_f is < i64::MIN then left_i > right_f (i64::MIN=-2^63 can be + // exactly represented as f64) + if right_f < i64::MIN as f64 { + return Ok(Ordering::Greater); + } + // If right_f is >= i64::MAX then left_i < right_f (i64::MAX=2^63-1 cannot + // be exactly represented as f64) + if right_f >= i64::MAX as f64 { + return Ok(Ordering::Less); + } + + // Now right_f is in (i64::MIN, i64::MAX), so `right_f as i64` is + // well-defined (truncation toward 0) + let right_as_i = right_f as i64; + + let result = match left_i.cmp(&right_as_i) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => { + // they have the same integer part, compare the fraction + let rem = right_f - (right_as_i as f64); + if rem == 0.0 { + Ordering::Equal + } else if right_f > 0.0 { + Ordering::Less + } else { + Ordering::Greater + } + } + }; + Ok(result) + } + + pub fn cmp_u64_f64(left_u: u64, right_f: f64) -> Result { + if right_f.is_nan() { + return Err("NaN comparison is not supported".to_string()); + } + + // Negative floats are always less than any u64 >= 0 + if right_f < 0.0 { + return Ok(Ordering::Greater); + } + + // If right_f is >= u64::MAX then left_u < right_f (u64::MAX=2^64-1 cannot be exactly) + let max_as_f = u64::MAX as f64; + if right_f > max_as_f { + return Ok(Ordering::Less); + } + + // Now right_f is in (0, u64::MAX), so `right_f as u64` is well-defined + // (truncation toward 0) + let right_as_u = right_f as u64; + + let result = match left_u.cmp(&right_as_u) { + Ordering::Less => Ordering::Less, + Ordering::Greater => Ordering::Greater, + Ordering::Equal => { + // they have the same integer part, compare the fraction + let rem = right_f - (right_as_u as f64); + if rem == 0.0 { + Ordering::Equal + } else { + Ordering::Less + 
} + } + }; + Ok(result) + } + + pub fn cmp_i64_u64(left_i: i64, right_u: u64) -> Ordering { + if left_i < 0 { + Ordering::Less + } else { + let left_as_u = left_i as u64; + left_as_u.cmp(&right_u) + } + } +} + +/// This modules helps projecting numerical values to other numerical types. +/// When the target value space cannot exactly represent the source value, the +/// next representable value is returned (or AfterLast if the source value is +/// larger than the largest representable value). +/// +/// All functions in this module assume that f64 values are not NaN. +pub mod num_proj { + #[derive(Debug, PartialEq)] + pub enum ProjectedNumber { + Exact(T), + Next(T), + AfterLast, + } + + pub fn i64_to_u64(value: i64) -> ProjectedNumber { + if value < 0 { + ProjectedNumber::Next(0) + } else { + ProjectedNumber::Exact(value as u64) + } + } + + pub fn u64_to_i64(value: u64) -> ProjectedNumber { + if value > i64::MAX as u64 { + ProjectedNumber::AfterLast + } else { + ProjectedNumber::Exact(value as i64) + } + } + + pub fn f64_to_u64(value: f64) -> ProjectedNumber { + if value < 0.0 { + ProjectedNumber::Next(0) + } else if value > u64::MAX as f64 { + ProjectedNumber::AfterLast + } else if value.fract() == 0.0 { + ProjectedNumber::Exact(value as u64) + } else { + // casting f64 to u64 truncates toward zero + ProjectedNumber::Next(value as u64 + 1) + } + } + + pub fn f64_to_i64(value: f64) -> ProjectedNumber { + if value < (i64::MIN as f64) { + ProjectedNumber::Next(i64::MIN) + } else if value >= (i64::MAX as f64) { + ProjectedNumber::AfterLast + } else if value.fract() == 0.0 { + ProjectedNumber::Exact(value as i64) + } else if value > 0.0 { + // casting f64 to i64 truncates toward zero + ProjectedNumber::Next(value as i64 + 1) + } else { + ProjectedNumber::Next(value as i64) + } + } + + pub fn i64_to_f64(value: i64) -> ProjectedNumber { + let value_f = value as f64; + let k_roundtrip = value_f as i64; + if k_roundtrip == value { + // between -2^53 and 2^53 all i64 are 
exactly represented as f64 + ProjectedNumber::Exact(value_f) + } else { + // for very large/small i64 values, it is approximated to the closest f64 + if k_roundtrip > value { + ProjectedNumber::Next(value_f) + } else { + ProjectedNumber::Next(value_f.next_up()) + } + } + } + + pub fn u64_to_f64(value: u64) -> ProjectedNumber { + let value_f = value as f64; + let k_roundtrip = value_f as u64; + if k_roundtrip == value { + // between 0 and 2^53 all u64 are exactly represented as f64 + ProjectedNumber::Exact(value_f) + } else if k_roundtrip > value { + ProjectedNumber::Next(value_f) + } else { + ProjectedNumber::Next(value_f.next_up()) + } + } +} + +#[cfg(test)] +mod num_cmp_tests { + use std::cmp::Ordering; + + use super::num_cmp::*; + + #[test] + fn test_cmp_u64_f64() { + // Basic comparisons + assert_eq!(cmp_u64_f64(5, 5.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_u64_f64(5, 6.0).unwrap(), Ordering::Less); + assert_eq!(cmp_u64_f64(6, 5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_u64_f64(0, 0.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_u64_f64(0, 0.1).unwrap(), Ordering::Less); + + // Negative float values should always be less than any u64 + assert_eq!(cmp_u64_f64(0, -0.1).unwrap(), Ordering::Greater); + assert_eq!(cmp_u64_f64(5, -5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_u64_f64(u64::MAX, -1e20).unwrap(), Ordering::Greater); + + // Tests with extreme values + assert_eq!(cmp_u64_f64(u64::MAX, 1e20).unwrap(), Ordering::Less); + + // Precision edge cases: large u64 that loses precision when converted to f64 + // => 2^54, exactly represented as f64 + let large_f64 = 18_014_398_509_481_984.0; + let large_u64 = 18_014_398_509_481_984; + // prove that large_u64 is exactly represented as f64 + assert_eq!(large_u64 as f64, large_f64); + assert_eq!(cmp_u64_f64(large_u64, large_f64).unwrap(), Ordering::Equal); + // => (2^54 + 1) cannot be exactly represented in f64 + let large_u64_plus_1 = 18_014_398_509_481_985; + // prove that it is represented as 
f64 by large_f64 + assert_eq!(large_u64_plus_1 as f64, large_f64); + assert_eq!( + cmp_u64_f64(large_u64_plus_1, large_f64).unwrap(), + Ordering::Greater + ); + // => (2^54 - 1) cannot be exactly represented in f64 + let large_u64_minus_1 = 18_014_398_509_481_983; + // prove that it is also represented as f64 by large_f64 + assert_eq!(large_u64_minus_1 as f64, large_f64); + assert_eq!( + cmp_u64_f64(large_u64_minus_1, large_f64).unwrap(), + Ordering::Less + ); + + // NaN comparison results in an error + assert!(cmp_u64_f64(0, f64::NAN).is_err()); + } + + #[test] + fn test_cmp_i64_f64() { + // Basic comparisons + assert_eq!(cmp_i64_f64(5, 5.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_i64_f64(5, 6.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(6, 5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(-5, -5.0).unwrap(), Ordering::Equal); + assert_eq!(cmp_i64_f64(-5, -4.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-4, -5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(-5, 5.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(5, -5.0).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(0, -0.1).unwrap(), Ordering::Greater); + assert_eq!(cmp_i64_f64(0, 0.1).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-1, -0.5).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(-1, 0.0).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(0, 0.0).unwrap(), Ordering::Equal); + + // Tests with extreme values + assert_eq!(cmp_i64_f64(i64::MAX, 1e20).unwrap(), Ordering::Less); + assert_eq!(cmp_i64_f64(i64::MIN, -1e20).unwrap(), Ordering::Greater); + + // Precision edge cases: large i64 that loses precision when converted to f64 + // => 2^54, exactly represented as f64 + let large_f64 = 18_014_398_509_481_984.0; + let large_i64 = 18_014_398_509_481_984; + // prove that large_i64 is exactly represented as f64 + assert_eq!(large_i64 as f64, large_f64); + assert_eq!(cmp_i64_f64(large_i64, large_f64).unwrap(), Ordering::Equal); + // => (1_i64 << 
54) + 1 cannot be exactly represented in f64 + let large_i64_plus_1 = 18_014_398_509_481_985; + // prove that it is represented as f64 by large_f64 + assert_eq!(large_i64_plus_1 as f64, large_f64); + assert_eq!( + cmp_i64_f64(large_i64_plus_1, large_f64).unwrap(), + Ordering::Greater + ); + // => (1_i64 << 54) - 1 cannot be exactly represented in f64 + let large_i64_minus_1 = 18_014_398_509_481_983; + // prove that it is also represented as f64 by large_f64 + assert_eq!(large_i64_minus_1 as f64, large_f64); + assert_eq!( + cmp_i64_f64(large_i64_minus_1, large_f64).unwrap(), + Ordering::Less + ); + + // Same precision edge case but with negative values + // => -2^54, exactly represented as f64 + let large_neg_f64 = -18_014_398_509_481_984.0; + let large_neg_i64 = -18_014_398_509_481_984; + // prove that large_neg_i64 is exactly represented as f64 + assert_eq!(large_neg_i64 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64, large_neg_f64).unwrap(), + Ordering::Equal + ); + // => (-2^54 + 1) cannot be exactly represented in f64 + let large_neg_i64_plus_1 = -18_014_398_509_481_985; + // prove that it is represented as f64 by large_neg_f64 + assert_eq!(large_neg_i64_plus_1 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64_plus_1, large_neg_f64).unwrap(), + Ordering::Less + ); + // => (-2^54 - 1) cannot be exactly represented in f64 + let large_neg_i64_minus_1 = -18_014_398_509_481_983; + // prove that it is also represented as f64 by large_neg_f64 + assert_eq!(large_neg_i64_minus_1 as f64, large_neg_f64); + assert_eq!( + cmp_i64_f64(large_neg_i64_minus_1, large_neg_f64).unwrap(), + Ordering::Greater + ); + + // NaN comparison results in an error + assert!(cmp_i64_f64(0, f64::NAN).is_err()); + } + + #[test] + fn test_cmp_i64_u64() { + // Test with negative i64 values (should always be less than any u64) + assert_eq!(cmp_i64_u64(-1, 0), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MIN, 0), Ordering::Less); + 
assert_eq!(cmp_i64_u64(i64::MIN, u64::MAX), Ordering::Less); + + // Test with positive i64 values + assert_eq!(cmp_i64_u64(0, 0), Ordering::Equal); + assert_eq!(cmp_i64_u64(1, 0), Ordering::Greater); + assert_eq!(cmp_i64_u64(1, 1), Ordering::Equal); + assert_eq!(cmp_i64_u64(0, 1), Ordering::Less); + assert_eq!(cmp_i64_u64(5, 10), Ordering::Less); + assert_eq!(cmp_i64_u64(10, 5), Ordering::Greater); + + // Test with values near i64::MAX and u64 conversion + assert_eq!(cmp_i64_u64(i64::MAX, i64::MAX as u64), Ordering::Equal); + assert_eq!(cmp_i64_u64(i64::MAX, (i64::MAX as u64) + 1), Ordering::Less); + assert_eq!(cmp_i64_u64(i64::MAX, u64::MAX), Ordering::Less); + } +} + +#[cfg(test)] +mod num_proj_tests { + use super::num_proj::{self, ProjectedNumber}; + + #[test] + fn test_i64_to_u64() { + assert_eq!(num_proj::i64_to_u64(-1), ProjectedNumber::Next(0)); + assert_eq!(num_proj::i64_to_u64(i64::MIN), ProjectedNumber::Next(0)); + assert_eq!(num_proj::i64_to_u64(0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::i64_to_u64(42), ProjectedNumber::Exact(42)); + assert_eq!( + num_proj::i64_to_u64(i64::MAX), + ProjectedNumber::Exact(i64::MAX as u64) + ); + } + + #[test] + fn test_u64_to_i64() { + assert_eq!(num_proj::u64_to_i64(0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::u64_to_i64(42), ProjectedNumber::Exact(42)); + assert_eq!( + num_proj::u64_to_i64(i64::MAX as u64), + ProjectedNumber::Exact(i64::MAX) + ); + assert_eq!( + num_proj::u64_to_i64((i64::MAX as u64) + 1), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::u64_to_i64(u64::MAX), ProjectedNumber::AfterLast); + } + + #[test] + fn test_f64_to_u64() { + assert_eq!(num_proj::f64_to_u64(-1e25), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_u64(-0.1), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_u64(1e20), ProjectedNumber::AfterLast); + assert_eq!( + num_proj::f64_to_u64(f64::INFINITY), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::f64_to_u64(0.0), 
ProjectedNumber::Exact(0)); + assert_eq!(num_proj::f64_to_u64(42.0), ProjectedNumber::Exact(42)); + assert_eq!(num_proj::f64_to_u64(0.5), ProjectedNumber::Next(1)); + assert_eq!(num_proj::f64_to_u64(42.1), ProjectedNumber::Next(43)); + } + + #[test] + fn test_f64_to_i64() { + assert_eq!(num_proj::f64_to_i64(-1e20), ProjectedNumber::Next(i64::MIN)); + assert_eq!( + num_proj::f64_to_i64(f64::NEG_INFINITY), + ProjectedNumber::Next(i64::MIN) + ); + assert_eq!(num_proj::f64_to_i64(1e20), ProjectedNumber::AfterLast); + assert_eq!( + num_proj::f64_to_i64(f64::INFINITY), + ProjectedNumber::AfterLast + ); + assert_eq!(num_proj::f64_to_i64(0.0), ProjectedNumber::Exact(0)); + assert_eq!(num_proj::f64_to_i64(42.0), ProjectedNumber::Exact(42)); + assert_eq!(num_proj::f64_to_i64(-42.0), ProjectedNumber::Exact(-42)); + assert_eq!(num_proj::f64_to_i64(0.5), ProjectedNumber::Next(1)); + assert_eq!(num_proj::f64_to_i64(42.1), ProjectedNumber::Next(43)); + assert_eq!(num_proj::f64_to_i64(-0.5), ProjectedNumber::Next(0)); + assert_eq!(num_proj::f64_to_i64(-42.1), ProjectedNumber::Next(-42)); + } + + #[test] + fn test_i64_to_f64() { + assert_eq!(num_proj::i64_to_f64(0), ProjectedNumber::Exact(0.0)); + assert_eq!(num_proj::i64_to_f64(42), ProjectedNumber::Exact(42.0)); + assert_eq!(num_proj::i64_to_f64(-42), ProjectedNumber::Exact(-42.0)); + + let max_exact = 9_007_199_254_740_992; // 2^53 + assert_eq!( + num_proj::i64_to_f64(max_exact), + ProjectedNumber::Exact(max_exact as f64) + ); + + // Test values that cannot be exactly represented as f64 (integers above 2^53) + let large_i64 = 9_007_199_254_740_993; // 2^53 + 1 + let closest_f64 = 9_007_199_254_740_992.0; + assert_eq!(large_i64 as f64, closest_f64); + if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_i64) { + // Verify that the returned float is different from the direct cast + assert!(val > closest_f64); + assert!(val - closest_f64 < 2. 
* f64::EPSILON * closest_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_i64"); + } + + // Test with very large negative value + let large_neg_i64 = -9_007_199_254_740_993; // -(2^53 + 1) + let closest_neg_f64 = -9_007_199_254_740_992.0; + assert_eq!(large_neg_i64 as f64, closest_neg_f64); + if let ProjectedNumber::Next(val) = num_proj::i64_to_f64(large_neg_i64) { + // Verify that the returned float is the closest representable f64 + assert_eq!(val, closest_neg_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_neg_i64"); + } + } + + #[test] + fn test_u64_to_f64() { + assert_eq!(num_proj::u64_to_f64(0), ProjectedNumber::Exact(0.0)); + assert_eq!(num_proj::u64_to_f64(42), ProjectedNumber::Exact(42.0)); + + // Test the largest u64 value that can be exactly represented as f64 (2^53) + let max_exact = 9_007_199_254_740_992; // 2^53 + assert_eq!( + num_proj::u64_to_f64(max_exact), + ProjectedNumber::Exact(max_exact as f64) + ); + + // Test values that cannot be exactly represented as f64 (integers above 2^53) + let large_u64 = 9_007_199_254_740_993; // 2^53 + 1 + let closest_f64 = 9_007_199_254_740_992.0; + assert_eq!(large_u64 as f64, closest_f64); + if let ProjectedNumber::Next(val) = num_proj::u64_to_f64(large_u64) { + // Verify that the returned float is different from the direct cast + assert!(val > closest_f64); + assert!(val - closest_f64 < 2. 
* f64::EPSILON * closest_f64); + } else { + panic!("Expected ProjectedNumber::Next for large_u64"); + } + } +} diff --git a/quickwit/quickwit-common/src/rate_limited_tracing.rs b/quickwit/quickwit-common/src/rate_limited_tracing.rs index c9a323f9ec2..198c2bf8bdd 100644 --- a/quickwit/quickwit-common/src/rate_limited_tracing.rs +++ b/quickwit/quickwit-common/src/rate_limited_tracing.rs @@ -179,12 +179,13 @@ fn _check_macro_works() { #[doc(hidden)] pub use coarsetime::Instant as CoarsetimeInstant; +pub use rate_limited_debug; +pub use rate_limited_error; +pub use rate_limited_info; +pub use rate_limited_trace; #[doc(hidden)] pub use rate_limited_tracing; -pub use { - rate_limited_debug, rate_limited_error, rate_limited_info, rate_limited_trace, - rate_limited_warn, -}; +pub use rate_limited_warn; #[cfg(test)] mod tests { diff --git a/quickwit/quickwit-config/src/index_config/mod.rs b/quickwit/quickwit-config/src/index_config/mod.rs index e6e7adb3766..1f8af60aa57 100644 --- a/quickwit/quickwit-config/src/index_config/mod.rs +++ b/quickwit/quickwit-config/src/index_config/mod.rs @@ -487,6 +487,7 @@ impl crate::TestableForRegression for IndexConfig { ], timestamp_field: Some("timestamp".to_string()), secondary_timestamp_field: None, + indexation_time_field: None, tag_fields: BTreeSet::from_iter(["tenant_id".to_string(), "log_level".to_string()]), partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), diff --git a/quickwit/quickwit-config/src/node_config/mod.rs b/quickwit/quickwit-config/src/node_config/mod.rs index 31e19bce09c..0d70d0a2b77 100644 --- a/quickwit/quickwit-config/src/node_config/mod.rs +++ b/quickwit/quickwit-config/src/node_config/mod.rs @@ -59,6 +59,9 @@ pub struct RestConfig { pub struct GrpcConfig { #[serde(default = "GrpcConfig::default_max_message_size")] pub max_message_size: ByteSize, + /// Search server responses can be larger when returning many hits. 
+ #[serde(default = "GrpcConfig::default_max_search_message_size")] + pub max_search_message_size: ByteSize, #[serde(default)] pub tls: Option, // If set, keeps idle connection alive by periodically perform a @@ -104,6 +107,10 @@ impl GrpcConfig { ByteSize::mib(20) } + fn default_max_search_message_size() -> ByteSize { + ByteSize::mib(60) + } + pub fn validate(&self) -> anyhow::Result<()> { ensure!( self.max_message_size >= ByteSize::mb(1), @@ -118,6 +125,7 @@ impl Default for GrpcConfig { fn default() -> Self { Self { max_message_size: Self::default_max_message_size(), + max_search_message_size: Self::default_max_search_message_size(), tls: None, keep_alive: None, } @@ -846,6 +854,7 @@ mod tests { fn test_grpc_config_validate() { let grpc_config = GrpcConfig { max_message_size: ByteSize::mb(1), + max_search_message_size: ByteSize::mb(1), tls: None, keep_alive: None, }; @@ -853,6 +862,7 @@ mod tests { let grpc_config = GrpcConfig { max_message_size: ByteSize::kb(1), + max_search_message_size: ByteSize::kb(1), tls: None, keep_alive: None, }; diff --git a/quickwit/quickwit-config/src/storage_config.rs b/quickwit/quickwit-config/src/storage_config.rs index 52daffdb537..7a9af4b1cdf 100644 --- a/quickwit/quickwit-config/src/storage_config.rs +++ b/quickwit/quickwit-config/src/storage_config.rs @@ -425,6 +425,7 @@ impl fmt::Debug for S3StorageConfig { "disable_multi_object_delete", &self.disable_multi_object_delete, ) + .field("encryption", &self.encryption) .finish() } } diff --git a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs index 7feab6564e7..16d8bbcd737 100644 --- a/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs +++ b/quickwit/quickwit-control-plane/src/indexing_scheduler/mod.rs @@ -25,7 +25,9 @@ use fnv::{FnvHashMap, FnvHashSet}; use itertools::Itertools; use once_cell::sync::OnceCell; use quickwit_common::pretty::PrettySample; -use quickwit_config::{FileSourceParams, 
SourceParams, indexing_pipeline_params_fingerprint}; +use quickwit_config::{ + FileSourceParams, SourceParams, disable_ingest_v1, indexing_pipeline_params_fingerprint, +}; use quickwit_proto::indexing::{ ApplyIndexingPlanRequest, CpuCapacity, IndexingService, IndexingTask, PIPELINE_FULL_CAPACITY, PIPELINE_THROUGHPUT, @@ -218,7 +220,11 @@ fn get_sources_to_schedule(model: &ControlPlaneModel) -> Vec { } SourceParams::IngestApi => { - // TODO ingest v1 is scheduled differently + if disable_ingest_v1() { + // Existing indexes might still have the _ingest-api-source + continue; + } + // Note: ingest v1 is scheduled differently sources.push(SourceToSchedule { source_uid, source_type: SourceToScheduleType::IngestV1, @@ -543,7 +549,10 @@ fn format_indexing_task_map( const MAX_INDEXES: usize = 10; let mut index_displayed = 0; write!(formatter, "{{")?; - let mut indexer_iter = indexing_tasks.iter().enumerate(); + let mut indexer_iter = indexing_tasks + .iter() + .filter(|(_, tasks)| !tasks.is_empty()) + .enumerate(); for (i, (index_name, tasks)) in &mut indexer_iter { if i != 0 { write!(formatter, ", ")?; diff --git a/quickwit/quickwit-datetime/src/java_date_time_format.rs b/quickwit/quickwit-datetime/src/java_date_time_format.rs index 2ef63f32881..a0d6c1cb0f5 100644 --- a/quickwit/quickwit-datetime/src/java_date_time_format.rs +++ b/quickwit/quickwit-datetime/src/java_date_time_format.rs @@ -261,14 +261,17 @@ fn resolve_java_datetime_format_alias(java_datetime_format: &str) -> &str { OnceLock::new(); let java_datetime_format_map = JAVA_DATE_FORMAT_ALIASES.get_or_init(|| { let mut m = HashMap::new(); - m.insert("date_optional_time", "yyyy-MM-dd['T'HH:mm:ss.SSSZ]"); + m.insert( + "date_optional_time", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS][Z]]]]]]", + ); m.insert( "strict_date_optional_time", - "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS[Z]]]]]]]", + "yyyy[-MM[-dd['T'HH[:mm[:ss[.SSS][Z]]]]]]", ); m.insert( "strict_date_optional_time_nanos", - "yyyy[-MM[-dd['T'HH:mm:ss.SSSSSSZ]]]", + 
"yyyy[-MM[-dd['T'HH[:mm[:ss[.SSSSSS][Z]]]]]]", ); m.insert("basic_date", "yyyyMMdd"); @@ -660,6 +663,7 @@ mod tests { "2019-03-23T21:35:46.123+00:00", "2019-03-23T21:36:46.123+03:00", "2019-03-23T21:37:46.123+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -671,6 +675,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123 UTC), datetime!(2019-03-23 21:36:46.123 +03:00:00), datetime!(2019-03-23 21:37:46.123 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser @@ -692,6 +697,7 @@ mod tests { "2019-03-23T21:35:46.123456789+00:00", "2019-03-23T21:36:46.123456789+03:00", "2019-03-23T21:37:46.123456789+0300", + "2019-03-23T21:38:46+00:00", ]; let expected = [ datetime!(2019-01-01 00:00:00 UTC), @@ -701,6 +707,7 @@ mod tests { datetime!(2019-03-23 21:35:46.123456789 UTC), datetime!(2019-03-23 21:36:46.123456789 +03:00:00), datetime!(2019-03-23 21:37:46.123456789 +03:00:00), + datetime!(2019-03-23 21:38:46 UTC), ]; for (date_str, &expected_dt) in dates.iter().zip(expected.iter()) { let parsed_dt = parser diff --git a/quickwit/quickwit-doc-mapper/Cargo.toml b/quickwit/quickwit-doc-mapper/Cargo.toml index ae0239e53c5..92c977fe4da 100644 --- a/quickwit/quickwit-doc-mapper/Cargo.toml +++ b/quickwit/quickwit-doc-mapper/Cargo.toml @@ -42,10 +42,9 @@ serde_yaml = { workspace = true } time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -quickwit-query = { workspace = true, features = ["multilang"] } +quickwit-query = { workspace = true } [features] -multilang = ["quickwit-query/multilang"] testsuite = [] [[bench]] diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs index 1eb2cea02d9..1b5dc19b12e 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs +++ 
b/quickwit/quickwit-doc-mapper/src/doc_mapper/doc_mapper_impl.rs @@ -78,6 +78,8 @@ pub struct DocMapper { timestamp_field_path: Option>, /// Secondary timestamp field name. secondary_timestamp_field_name: Option, + /// Indexation time field name. + indexation_time_field_name: Option, /// Root node of the field mapping tree. /// See [`MappingNode`]. field_mappings: MappingNode, @@ -128,6 +130,31 @@ fn validate_timestamp_field( Ok(()) } +fn validate_indexation_time_field( + indexation_field_path: &str, + mapping_root_node: &MappingNode, +) -> anyhow::Result<()> { + if indexation_field_path.starts_with('.') || indexation_field_path.starts_with("\\.") { + bail!("indexation_time field `{indexation_field_path}` should not start with a `.`"); + } + if indexation_field_path.ends_with('.') { + bail!("indexation_time field `{indexation_field_path}` should not end with a `.`"); + } + let Some(indexation_time_field_type) = + mapping_root_node.find_field_mapping_type(indexation_field_path) + else { + bail!("could not find indexation_time field `{indexation_field_path}` in field mappings"); + }; + if let FieldMappingType::DateTime(_, cardinality) = &indexation_time_field_type { + if cardinality != &Cardinality::SingleValued { + bail!("indexation_time field `{indexation_field_path}` should be single-valued"); + } + } else { + bail!("indexation_time field `{indexation_field_path}` should be a datetime field"); + } + Ok(()) +} + impl From for DocMapperBuilder { fn from(default_doc_mapper: DocMapper) -> Self { let partition_key_str = default_doc_mapper.partition_key.to_string(); @@ -142,6 +169,7 @@ impl From for DocMapperBuilder { field_mappings: default_doc_mapper.field_mappings.into(), timestamp_field: default_doc_mapper.timestamp_field_name, secondary_timestamp_field: default_doc_mapper.secondary_timestamp_field_name, + indexation_time_field: default_doc_mapper.indexation_time_field_name, tag_fields: default_doc_mapper.tag_field_names, partition_key: partition_key_opt, 
max_num_partitions: default_doc_mapper.max_num_partitions, @@ -203,6 +231,9 @@ impl TryFrom for DocMapper { } else { None }; + if let Some(indexation_time_field_name) = &doc_mapping.indexation_time_field { + validate_indexation_time_field(indexation_time_field_name, &field_mappings)?; + } let schema = schema_builder.build(); let tokenizer_manager = create_default_quickwit_tokenizer_manager(); @@ -293,6 +324,7 @@ impl TryFrom for DocMapper { timestamp_field_name: doc_mapping.timestamp_field, timestamp_field_path, secondary_timestamp_field_name: doc_mapping.secondary_timestamp_field, + indexation_time_field_name: doc_mapping.indexation_time_field, field_mappings, concatenate_dynamic_fields, tag_field_names, @@ -681,6 +713,11 @@ impl DocMapper { self.secondary_timestamp_field_name.as_deref() } + /// Returns the indexation time field name. + pub fn indexation_time_field_name(&self) -> Option<&str> { + self.indexation_time_field_name.as_deref() + } + /// Returns the tag `NameField`s on the current schema. /// Returns an error if a tag field is not found in this schema. 
pub fn tag_named_fields(&self) -> anyhow::Result> { diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs index ae3388aee32..e69d337a616 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/field_mapping_entry.rs @@ -1152,7 +1152,7 @@ mod tests { "type": "text", "stored": true, "record": "basic", - "tokenizer": "en_stem" + "tokenizer": "lowercase" } "#, )?; @@ -1161,7 +1161,7 @@ mod tests { FieldMappingType::Text(options, _) => { assert_eq!(options.stored, true); let indexing_options = options.indexing_options.unwrap(); - assert_eq!(indexing_options.tokenizer.name(), "en_stem"); + assert_eq!(indexing_options.tokenizer.name(), "lowercase"); assert_eq!(indexing_options.record, IndexRecordOption::Basic); } _ => panic!("wrong property type"), diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs index bed4b18b90f..749dde228a7 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/mod.rs @@ -28,8 +28,6 @@ use std::ops::Bound; pub use doc_mapper_builder::DocMapperBuilder; pub use doc_mapper_impl::DocMapper; -#[cfg(all(test, feature = "multilang"))] -pub(crate) use field_mapping_entry::TextIndexingOptions; pub use field_mapping_entry::{ BinaryFormat, FastFieldOptions, FieldMappingEntry, QuickwitBytesOptions, QuickwitJsonOptions, QuickwitTextNormalizer, @@ -812,55 +810,4 @@ mod tests { warmup_info.simplify(); assert_eq!(warmup_info, expected); } - - #[test] - #[cfg(feature = "multilang")] - fn test_doc_mapper_query_with_multilang_field() { - use quickwit_query::query_ast::TermQuery; - use tantivy::schema::IndexRecordOption; - - use crate::doc_mapper::{ - QuickwitTextOptions, QuickwitTextTokenizer, TextIndexingOptions, TokenizerType, - }; - use crate::{TokenizerConfig, TokenizerEntry}; 
- let mut doc_mapper_builder = DocMapperBuilder::default(); - doc_mapper_builder - .doc_mapping - .field_mappings - .push(FieldMappingEntry { - name: "multilang".to_string(), - mapping_type: FieldMappingType::Text( - QuickwitTextOptions { - indexing_options: Some(TextIndexingOptions { - tokenizer: QuickwitTextTokenizer::from_static("multilang"), - record: IndexRecordOption::Basic, - fieldnorms: false, - }), - ..Default::default() - }, - Cardinality::SingleValued, - ), - }); - doc_mapper_builder - .doc_mapping - .tokenizers - .push(TokenizerEntry { - name: "multilang".to_string(), - config: TokenizerConfig { - tokenizer_type: TokenizerType::Multilang, - filters: Vec::new(), - }, - }); - let doc_mapper = doc_mapper_builder.try_build().unwrap(); - let schema = doc_mapper.schema(); - let query_ast = quickwit_query::query_ast::QueryAst::Term(TermQuery { - field: "multilang".to_string(), - value: "JPN:す".to_string(), - }); - let (query, _) = doc_mapper.query(schema, query_ast, false, None).unwrap(); - assert_eq!( - format!("{query:?}"), - r#"TermQuery(Term(field=2, type=Str, "JPN:す"))"# - ); - } } diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs index b9793dc9548..0488d118c9f 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapper/tokenizer_entry.rs @@ -44,10 +44,6 @@ impl TokenizerConfig { pub fn text_analyzer(&self) -> anyhow::Result { let mut text_analyzer_builder = match &self.tokenizer_type { TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(), - #[cfg(any(test, feature = "multilang"))] - TokenizerType::Multilang => { - TextAnalyzer::builder(quickwit_query::MultiLangTokenizer::default()).dynamic() - } TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(), TokenizerType::Ngram(options) => { let tokenizer = @@ -120,8 +116,6 @@ impl 
TokenFilterType { #[derive(Clone, Debug, PartialEq, Eq, Hash, Serialize, Deserialize, utoipa::ToSchema)] #[serde(tag = "type", rename_all = "snake_case")] pub enum TokenizerType { - #[cfg(any(test, feature = "multilang"))] - Multilang, Ngram(NgramTokenizerOption), Regex(RegexTokenizerOption), Simple, diff --git a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs index d8afa4b16e9..8fc1ce8096a 100644 --- a/quickwit/quickwit-doc-mapper/src/doc_mapping.rs +++ b/quickwit/quickwit-doc-mapper/src/doc_mapping.rs @@ -133,6 +133,13 @@ pub struct DocMapping { #[serde(skip_serializing_if = "Option::is_none")] pub secondary_timestamp_field: Option, + /// Declares the field which will contain the indexation time for the document. + /// This field is automatically populated by the indexer + /// with the time at which the document is indexed. + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub indexation_time_field: Option, + /// Declares the low cardinality fields for which the values ​​are recorded directly in the /// splits metadata. 
#[schema(value_type = Vec)] @@ -207,6 +214,7 @@ mod tests { ], timestamp_field: Some("timestamp".to_string()), secondary_timestamp_field: None, + indexation_time_field: None, tag_fields: BTreeSet::from_iter(["level".to_string()]), partition_key: Some("tenant_id".to_string()), max_num_partitions: NonZeroU32::new(100).unwrap(), diff --git a/quickwit/quickwit-indexing/src/actors/indexer.rs b/quickwit/quickwit-indexing/src/actors/indexer.rs index 64a08d3f5da..f4674ea84ba 100644 --- a/quickwit/quickwit-indexing/src/actors/indexer.rs +++ b/quickwit/quickwit-indexing/src/actors/indexer.rs @@ -44,6 +44,7 @@ use tantivy::schema::{Field, Schema, Value}; use tantivy::store::{Compressor, ZstdCompressor}; use tantivy::tokenizer::TokenizerManager; use tantivy::{DateTime, IndexBuilder, IndexSettings}; +use time::OffsetDateTime; use tokio::runtime::Handle; use tokio::sync::Semaphore; use tracing::{Span, info, info_span, warn}; @@ -99,6 +100,7 @@ struct IndexerState { max_num_partitions: NonZeroU32, index_settings: IndexSettings, cooperative_indexing_opt: Option, + indexation_time_field_opt: Option, } impl IndexerState { @@ -300,7 +302,15 @@ impl IndexerState { .context("batch delta does not follow indexer checkpoint")?; let mut memory_usage_delta: i64 = 0; counters.num_doc_batches_in_workbench += 1; - for doc in batch.docs { + let indexation_time_opt = self + .indexation_time_field_opt + .map(|_| DateTime::from_utc(OffsetDateTime::now_utc())); + for mut doc in batch.docs { + if let (Some(indexation_time), Some(indexation_time_field)) = + (indexation_time_opt, self.indexation_time_field_opt) + { + doc.doc.add_date(indexation_time_field, indexation_time); + } let ProcessedDoc { doc, timestamp_opt, @@ -589,6 +599,17 @@ impl Indexer { cooperative_indexing_permits, ) }); + let indexation_time_field_opt = + doc_mapper + .indexation_time_field_name() + .and_then(|name| match schema.get_field(name) { + Ok(field) => Some(field), + Err(_) => { + warn!("failed to find indexation time field 
'{}' in schema", name); + None + } + }); + Self { indexer_state: IndexerState { pipeline_id, @@ -604,6 +625,7 @@ impl Indexer { index_settings, max_num_partitions: doc_mapper.max_num_partitions(), cooperative_indexing_opt, + indexation_time_field_opt, }, index_serializer_mailbox, indexing_workbench_opt: None, @@ -743,7 +765,7 @@ mod tests { EmptyResponse, LastDeleteOpstampResponse, MockMetastoreService, }; use quickwit_proto::types::{IndexUid, NodeId, PipelineUid}; - use tantivy::{DateTime, doc}; + use tantivy::{DateTime, DocAddress, ReloadPolicy, TantivyDocument, doc}; use super::*; use crate::actors::indexer::{IndexerCounters, record_timestamp}; @@ -1851,4 +1873,161 @@ mod tests { universe.assert_quit().await; Ok(()) } + + fn doc_mapper_with_indexation_time() -> DocMapper { + const JSON_CONFIG_VALUE: &str = r#" + { + "store_source": true, + "index_field_presence": true, + "default_search_fields": ["body"], + "timestamp_field": "timestamp", + "indexation_time_field": "indexed_at", + "field_mappings": [ + { + "name": "timestamp", + "type": "datetime", + "output_format": "unix_timestamp_secs", + "fast": true + }, + { + "name": "body", + "type": "text", + "stored": true + }, + { + "name": "indexed_at", + "type": "datetime", + "output_format": "unix_timestamp_secs", + "fast": true, + "stored": true + } + ] + }"#; + serde_json::from_str::(JSON_CONFIG_VALUE).unwrap() + } + + #[tokio::test] + async fn test_indexer_sets_indexation_time() -> anyhow::Result<()> { + let index_uid = IndexUid::new_with_random_ulid("test-index"); + let pipeline_id = IndexingPipelineId { + index_uid: index_uid.clone(), + source_id: "test-source".to_string(), + node_id: NodeId::from("test-node"), + pipeline_uid: PipelineUid::default(), + }; + let doc_mapper = Arc::new(doc_mapper_with_indexation_time()); + let last_delete_opstamp = 10; + let schema = doc_mapper.schema(); + let body_field = schema.get_field("body").unwrap(); + let timestamp_field = schema.get_field("timestamp").unwrap(); + let 
indexed_at_field = schema.get_field("indexed_at").unwrap(); + let indexing_directory = TempDirectory::for_test(); + let mut indexing_settings = IndexingSettings::for_test(); + indexing_settings.split_num_docs_target = 3; + let universe = Universe::with_accelerated_time(); + let (index_serializer_mailbox, index_serializer_inbox) = universe.create_test_mailbox(); + let mut mock_metastore = MockMetastoreService::new(); + mock_metastore + .expect_last_delete_opstamp() + .times(1) + .returning(move |delete_opstamp_request| { + assert_eq!(delete_opstamp_request.index_uid(), &index_uid); + Ok(LastDeleteOpstampResponse::new(last_delete_opstamp)) + }); + mock_metastore.expect_publish_splits().never(); + let indexer = Indexer::new( + pipeline_id, + doc_mapper, + MetastoreServiceClient::from_mock(mock_metastore), + indexing_directory, + indexing_settings, + None, + index_serializer_mailbox, + ); + let (indexer_mailbox, indexer_handle) = universe.spawn_builder().spawn(indexer); + + // Send 3 docs in a single batch so they all share the same indexation timestamp + // (the timestamp is sampled once per batch in `index_batch`). 
+ indexer_mailbox + .send_message(ProcessedDocBatch::new( + vec![ + ProcessedDoc { + doc: doc!( + body_field => "document 1", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_001), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_001)), + partition: 1, + num_bytes: 30, + }, + ProcessedDoc { + doc: doc!( + body_field => "document 2", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_002), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_002)), + partition: 1, + num_bytes: 30, + }, + ProcessedDoc { + doc: doc!( + body_field => "document 3", + timestamp_field => DateTime::from_timestamp_secs(1_662_000_003), + ), + timestamp_opt: Some(DateTime::from_timestamp_secs(1_662_000_003)), + partition: 1, + num_bytes: 30, + }, + ], + SourceCheckpointDelta::from_range(0..3), + false, + )) + .await?; + + indexer_handle.process_pending_and_observe().await; + let messages: Vec = index_serializer_inbox.drain_for_test_typed(); + assert_eq!(messages.len(), 1); + let batch = messages.into_iter().next().unwrap(); + assert_eq!(batch.commit_trigger, CommitTrigger::NumDocsLimit); + assert_eq!(batch.splits.len(), 1); + assert_eq!(batch.splits[0].split_attrs.num_docs, 3); + + // Finalize the split and open the tantivy index to verify the `indexed_at` field. + let indexed_split = batch.splits.into_iter().next().unwrap().finalize()?; + let reader = indexed_split + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + + // Collect every `indexed_at` value present in the split. 
+ let mut indexed_at_values: Vec = Vec::new(); + for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() { + for doc_id in 0..segment_reader.max_doc() { + let doc_address = DocAddress::new(segment_ord as u32, doc_id); + let doc: TantivyDocument = searcher.doc(doc_address)?; + let indexed_at = doc + .get_first(indexed_at_field) + .and_then(|val| val.as_datetime()) + .expect("indexed_at field must be set on every indexed document"); + indexed_at_values.push(indexed_at); + } + } + + // All 3 documents must have been stamped with the indexation time. + assert_eq!(indexed_at_values.len(), 3); + // Because the timestamp is captured once for the whole batch, every document + // in the batch must carry exactly the same `indexed_at` value. + let first = indexed_at_values[0]; + for val in &indexed_at_values { + assert_eq!( + *val, first, + "all documents in the same batch must share the same indexed_at timestamp" + ); + } + + universe.assert_quit().await; + Ok(()) + } } diff --git a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs index 99065651db1..65bd824b1b9 100644 --- a/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_pipeline.rs @@ -159,7 +159,9 @@ impl IndexingPipeline { let indexing_pipelines_gauge = crate::metrics::INDEXER_METRICS .indexing_pipelines .with_label_values([¶ms.pipeline_id.index_uid.index_id]); - let indexing_pipelines_gauge_guard = OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + let mut indexing_pipelines_gauge_guard = + OwnedGaugeGuard::from_gauge(indexing_pipelines_gauge); + indexing_pipelines_gauge_guard.add(1); let params_fingerprint = params.params_fingerprint; IndexingPipeline { params, diff --git a/quickwit/quickwit-indexing/src/actors/indexing_service.rs b/quickwit/quickwit-indexing/src/actors/indexing_service.rs index afd2637c02c..363c9891f0c 100644 --- 
a/quickwit/quickwit-indexing/src/actors/indexing_service.rs +++ b/quickwit/quickwit-indexing/src/actors/indexing_service.rs @@ -16,6 +16,7 @@ use std::collections::{HashMap, HashSet}; use std::fmt::{Debug, Formatter}; use std::path::PathBuf; use std::sync::Arc; +use std::time::{Duration, Instant}; use anyhow::Context; use async_trait::async_trait; @@ -895,6 +896,7 @@ impl Handler for IndexingService { msg: ObservePipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("observe_pipeline"); let pipeline_uid = msg.pipeline_id.pipeline_uid; let observation = self.observe_pipeline(&pipeline_uid).await; Ok(observation) @@ -910,6 +912,7 @@ impl Handler for IndexingService { msg: DetachIndexingPipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("detach_indexing_pipeline"); let pipeline_uid = msg.pipeline_id.pipeline_uid; let detach_pipeline_result = self.detach_indexing_pipeline(&pipeline_uid).await; Ok(detach_pipeline_result) @@ -925,6 +928,7 @@ impl Handler for IndexingService { msg: DetachMergePipeline, _ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = SlowHandlerGuard::new("detach_merge_pipeline"); Ok(self.detach_merge_pipeline(&msg.pipeline_id).await) } } @@ -941,6 +945,7 @@ impl Handler for IndexingService { _message: SuperviseLoop, ctx: &ActorContext, ) -> Result<(), ActorExitStatus> { + let _slow_handler_guard = SlowHandlerGuard::new("supervise_loop"); self.handle_supervise().await?; ctx.schedule_self_msg(*quickwit_actors::HEARTBEAT, SuperviseLoop); Ok(()) @@ -969,6 +974,7 @@ impl Handler for IndexingService { message: SpawnPipeline, ctx: &ActorContext, ) -> Result, ActorExitStatus> { + let _slow_handler_guard = SlowHandlerGuard::new("spawn_pipeline"); Ok(self .spawn_pipeline( ctx, @@ -989,6 +995,7 @@ impl Handler for IndexingService { plan_request: ApplyIndexingPlanRequest, ctx: &ActorContext, ) -> Result { + let _slow_handler_guard = 
SlowHandlerGuard::new("apply_indexing_plan"); Ok(self .apply_indexing_plan(&plan_request.indexing_tasks, ctx) .await @@ -1016,6 +1023,32 @@ struct IndexingPipelineDiff { pipelines_to_spawn: Vec, } +/// Logs a warning every 5 seconds until dropped. Useful to identify slow +/// handlers that might compromise liveness checks. +pub struct SlowHandlerGuard { + _cancel_tx: oneshot::Sender<()>, +} + +impl SlowHandlerGuard { + pub fn new(handler_name: &'static str) -> Self { + let (cancel_tx, mut cancel_rx) = oneshot::channel::<()>(); + let start = Instant::now(); + tokio::spawn(async move { + loop { + tokio::select! { + _ = tokio::time::sleep(Duration::from_secs(5)) => { + warn!(handler=handler_name, elapsed_secs=start.elapsed().as_secs(), "slow indexing service handler"); + } + _ = &mut cancel_rx => { break; } + } + } + }); + Self { + _cancel_tx: cancel_tx, + } + } +} + #[cfg(test)] mod tests { use std::num::NonZeroUsize; diff --git a/quickwit/quickwit-indexing/src/actors/merge_executor.rs b/quickwit/quickwit-indexing/src/actors/merge_executor.rs index 6b753c7e13b..660a8b62d05 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_executor.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_executor.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use std::ops::RangeInclusive; use std::path::Path; use std::sync::Arc; @@ -40,20 +40,40 @@ use quickwit_query::query_ast::QueryAst; use tantivy::directory::{Advice, DirectoryClone, MmapDirectory, RamDirectory}; use tantivy::index::SegmentId; use tantivy::tokenizer::TokenizerManager; -use tantivy::{DateTime, Directory, Index, IndexMeta, IndexWriter, SegmentReader}; +use tantivy::{DateTime, Directory, DocId, Index, IndexMeta, IndexWriter, SegmentReader}; use tokio::runtime::Handle; use tracing::{debug, error, info, instrument, warn}; use crate::actors::Packager; use crate::controlled_directory::ControlledDirectory; use crate::merge_policy::MergeOperationType; -use crate::models::{IndexedSplit, IndexedSplitBatch, MergeScratch, PublishLock, SplitAttrs}; +use crate::models::{ + IndexedSplit, IndexedSplitBatch, MergeScratch, PublishLock, ReplacedSplit, SplitAttrs, +}; +use crate::soft_delete_query::SoftDeletedDocIdsQuery; + +/// The mapping resolution assiated to the merge. To perform deletes a full doc +/// mapper is required. For regular merges, we only need the tokenizer manager. +#[derive(Clone)] +enum MapperContext { + TokenizersOnly(quickwit_query::tokenizers::TokenizerManager), + DocMapper(Arc), +} + +impl MapperContext { + fn tokenizer_manager(&self) -> quickwit_query::tokenizers::TokenizerManager { + match self { + MapperContext::TokenizersOnly(tokenizer_manager) => tokenizer_manager.clone(), + MapperContext::DocMapper(doc_mapper) => doc_mapper.tokenizer_manager().clone(), + } + } +} #[derive(Clone)] pub struct MergeExecutor { pipeline_id: MergePipelineId, metastore: MetastoreServiceClient, - doc_mapper: Arc, + mapper_context: MapperContext, io_controls: IoControls, merge_packager_mailbox: Mailbox, } @@ -106,14 +126,16 @@ impl Handler for MergeExecutor { // A failure in a merge is a bit special. // // Instead of failing the pipeline, we just log it. 
- // The idea is to limit the risk associated with a potential split of death. + // The idea is to limit the risk associated with a potential split of + // death. // - // Such a split is now not tracked by the merge planner and won't undergo a - // merge until the merge pipeline is restarted. + // Such a split is now not tracked by the merge planner and won't + // undergo a merge until the merge pipeline + // is restarted. // - // With a merge policy that marks splits as mature after a day or so, this - // limits the noise associated to those failed - // merges. + // With a merge policy that marks splits as mature after a day or so, + // this limits the noise associated to those + // failed merges. error!(task=?merge_task, err=?err, "failed to merge splits"); return Ok(()); } @@ -171,21 +193,23 @@ fn combine_index_meta(mut index_metas: Vec) -> anyhow::Result>, Vec)>; + fn open_split_directories( // Directories containing the splits to merge tantivy_dirs: &[Box], tokenizer_manager: &TokenizerManager, -) -> anyhow::Result<(IndexMeta, Vec>)> { +) -> OpenSplitDirsResult { let mut directories: Vec> = Vec::new(); - let mut index_metas = Vec::new(); + let mut index_metas: Vec = Vec::new(); for tantivy_dir in tantivy_dirs { directories.push(tantivy_dir.clone()); - let index_meta = open_index(tantivy_dir.clone(), tokenizer_manager)?.load_metas()?; index_metas.push(index_meta); } + let per_split_metas = index_metas.clone(); let union_index_meta = combine_index_meta(index_metas)?; - Ok((union_index_meta, directories)) + Ok((union_index_meta, directories, per_split_metas)) } /// Creates a directory with a single `meta.json` file describe in `index_meta` @@ -278,11 +302,23 @@ pub fn merge_split_attrs( let partition_id = combine_partition_ids_aux(splits.iter().map(|split| split.partition_id)); let time_range: Option> = merge_time_range(splits); let secondary_time_range = merge_secondary_time_range_if_exists(splits); - let uncompressed_docs_size_in_bytes = 
sum_doc_sizes_in_bytes(splits); - let num_docs = sum_num_docs(splits); - let replaced_split_ids: Vec = splits + let total_soft_deleted: u64 = splits + .iter() + .map(|split| split.soft_deleted_doc_ids.len() as u64) + .sum(); + let raw_num_docs = sum_num_docs(splits); + let num_docs = raw_num_docs.saturating_sub(total_soft_deleted); + let uncompressed_docs_size_in_bytes = if raw_num_docs > 0 { + (sum_doc_sizes_in_bytes(splits) as f64 * num_docs as f64 / raw_num_docs as f64) as u64 + } else { + 0 + }; + let replaced_splits = splits .iter() - .map(|split| split.split_id().to_string()) + .map(|split| ReplacedSplit { + split_id: split.split_id.clone(), + soft_deleted_doc_ids: split.soft_deleted_doc_ids.clone(), + }) .collect(); let delete_opstamp = splits .iter() @@ -306,13 +342,13 @@ pub fn merge_split_attrs( doc_mapping_uid, split_id: merge_split_id, partition_id, - replaced_split_ids, time_range, secondary_time_range, num_docs, uncompressed_docs_size_in_bytes, delete_opstamp, num_merge_ops: max_merge_ops(splits) + 1, + replaced_splits, }) } @@ -324,6 +360,16 @@ fn max_merge_ops(splits: &[SplitMetadata]) -> usize { .unwrap_or(0) } +struct MergeDirectoriesInput { + union_index_meta: IndexMeta, + split_directories: Vec>, + delete_tasks: Vec, + /// Required when `delete_tasks` is non-empty; unused otherwise. + doc_mapper_opt: Option>, + /// Maps each segment ID to the sorted list of soft-deleted doc IDs to remove. + soft_deleted_docs: HashMap>, +} + impl MergeExecutor { pub fn new( pipeline_id: MergePipelineId, @@ -335,7 +381,24 @@ impl MergeExecutor { MergeExecutor { pipeline_id, metastore, - doc_mapper, + mapper_context: MapperContext::DocMapper(doc_mapper), + io_controls, + merge_packager_mailbox, + } + } + + /// Creates a simpler MergeExecutor that doesn't support deletes. 
+ pub fn new_with_tokenizers_only( + pipeline_id: MergePipelineId, + metastore: MetastoreServiceClient, + tokenizer_manager: quickwit_query::tokenizers::TokenizerManager, + io_controls: IoControls, + merge_packager_mailbox: Mailbox, + ) -> Self { + MergeExecutor { + pipeline_id, + metastore, + mapper_context: MapperContext::TokenizersOnly(tokenizer_manager), io_controls, merge_packager_mailbox, } @@ -349,18 +412,33 @@ impl MergeExecutor { merge_scratch_directory: TempDirectory, ctx: &ActorContext, ) -> anyhow::Result { - let (union_index_meta, split_directories) = open_split_directories( + let (union_index_meta, split_directories, per_split_metas) = open_split_directories( &tantivy_dirs, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; + // Build a mapping from each segment ID to the soft-deleted doc IDs of its parent split. + let soft_deleted_docs: HashMap> = per_split_metas + .iter() + .zip(splits.iter()) + .filter(|(_, split)| !split.soft_deleted_doc_ids.is_empty()) + .flat_map(|(meta, split)| { + let doc_ids: Vec = split.soft_deleted_doc_ids.iter().copied().collect(); + meta.segments + .iter() + .map(move |seg_meta| (seg_meta.id(), doc_ids.clone())) + }) + .collect(); // TODO it would be nice if tantivy could let us run the merge in the current thread. fail_point!("before-merge-split"); let controlled_directory = self .merge_split_directories( - union_index_meta, - split_directories, - Vec::new(), - None, + MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks: Vec::new(), + doc_mapper_opt: None, + soft_deleted_docs, + }, merge_scratch_directory.path(), ctx, ) @@ -371,17 +449,18 @@ impl MergeExecutor { // splits. 
let merged_index = open_index( controlled_directory.clone(), - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; ctx.record_progress(); let split_attrs = merge_split_attrs(self.pipeline_id.clone(), merge_split_id, &splits)?; - Ok(IndexedSplit { + let indexed_split = IndexedSplit { split_attrs, index: merged_index, split_scratch_directory: merge_scratch_directory, controlled_directory_opt: Some(controlled_directory), - }) + }; + Ok(indexed_split) } async fn process_delete_and_merge( @@ -392,6 +471,9 @@ impl MergeExecutor { merge_scratch_directory: TempDirectory, ctx: &ActorContext, ) -> anyhow::Result> { + let MapperContext::DocMapper(doc_mapper) = &self.mapper_context else { + anyhow::bail!("DocMapper is required to process delete and merge operations"); + }; let list_delete_tasks_request = ListDeleteTasksRequest::new(split.index_uid.clone(), split.delete_opstamp); let delete_tasks = ctx @@ -417,16 +499,34 @@ impl MergeExecutor { num_delete_tasks = delete_tasks.len() ); - let (union_index_meta, split_directories) = open_split_directories( + let (union_index_meta, split_directories, per_split_metas) = open_split_directories( &tantivy_dirs, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + doc_mapper.tokenizer_manager().tantivy_manager(), )?; + // Build a mapping from each segment ID to the soft-deleted doc IDs of the input split. 
+ let soft_deleted_docs: HashMap> = + if split.soft_deleted_doc_ids.is_empty() { + HashMap::new() + } else { + let doc_ids: Vec = split.soft_deleted_doc_ids.iter().copied().collect(); + per_split_metas + .iter() + .flat_map(|meta| { + meta.segments + .iter() + .map(|seg_meta| (seg_meta.id(), doc_ids.clone())) + }) + .collect() + }; let controlled_directory = self .merge_split_directories( - union_index_meta, - split_directories, - delete_tasks, - Some(self.doc_mapper.clone()), + MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks, + doc_mapper_opt: Some(doc_mapper.clone()), + soft_deleted_docs, + }, merge_scratch_directory.path(), ctx, ) @@ -435,12 +535,7 @@ impl MergeExecutor { // This will have the side effect of deleting the directory containing the downloaded split. let mut merged_index = Index::open(controlled_directory.clone())?; ctx.record_progress(); - merged_index.set_tokenizers( - self.doc_mapper - .tokenizer_manager() - .tantivy_manager() - .clone(), - ); + merged_index.set_tokenizers(doc_mapper.tokenizer_manager().tantivy_manager().clone()); merged_index.set_fast_field_tokenizers( get_quickwit_fastfield_normalizer_manager() .tantivy_manager() @@ -473,8 +568,7 @@ impl MergeExecutor { let uncompressed_docs_size_in_bytes = (num_docs as f32 * split.uncompressed_docs_size_in_bytes as f32 / split.num_docs as f32) as u64; - let time_range = if let Some(timestamp_field_name) = self.doc_mapper.timestamp_field_name() - { + let time_range = if let Some(timestamp_field_name) = doc_mapper.timestamp_field_name() { let reader = merged_segment_reader .fast_fields() .date(timestamp_field_name)?; @@ -490,13 +584,16 @@ impl MergeExecutor { doc_mapping_uid: split.doc_mapping_uid, split_id: merge_split_id, partition_id: split.partition_id, - replaced_split_ids: vec![split.split_id.clone()], time_range, secondary_time_range: None, num_docs, uncompressed_docs_size_in_bytes, delete_opstamp: last_delete_opstamp, num_merge_ops: split.num_merge_ops, + 
replaced_splits: vec![ReplacedSplit { + split_id: split.split_id.clone(), + soft_deleted_doc_ids: split.soft_deleted_doc_ids.clone(), + }], }, index: merged_index, split_scratch_directory: merge_scratch_directory, @@ -507,13 +604,17 @@ impl MergeExecutor { async fn merge_split_directories( &self, - union_index_meta: IndexMeta, - split_directories: Vec>, - delete_tasks: Vec, - doc_mapper_opt: Option>, + input: MergeDirectoriesInput, output_path: &Path, ctx: &ActorContext, ) -> anyhow::Result { + let MergeDirectoriesInput { + union_index_meta, + split_directories, + delete_tasks, + doc_mapper_opt, + soft_deleted_docs, + } = input; let shadowing_meta_json_directory = create_shadowing_meta_json_directory(union_index_meta)?; // This directory is here to receive the merged split, as well as the final meta.json file. @@ -535,7 +636,7 @@ impl MergeExecutor { let union_directory = UnionDirectory::union_of(directory_stack); let union_index = open_index( union_directory, - self.doc_mapper.tokenizer_manager().tantivy_manager(), + self.mapper_context.tokenizer_manager().tantivy_manager(), )?; ctx.record_progress(); @@ -543,6 +644,12 @@ impl MergeExecutor { let mut index_writer: IndexWriter = union_index.writer_with_num_threads(1, 15_000_000)?; let num_delete_tasks = delete_tasks.len(); + let has_soft_deletes = !soft_deleted_docs.is_empty(); + // Hard-delete soft-deleted doc IDs before applying delete-task queries so that both + // sources of deletion are committed together in a single pass. 
+ if has_soft_deletes { + index_writer.delete_query(Box::new(SoftDeletedDocIdsQuery::new(soft_deleted_docs)))?; + } if num_delete_tasks > 0 { let doc_mapper = doc_mapper_opt .ok_or_else(|| anyhow!("doc mapper must be present if there are delete tasks"))?; @@ -564,6 +671,8 @@ impl MergeExecutor { doc_mapper.query(union_index.schema(), parsed_query_ast, false, None)?; index_writer.delete_query(query)?; } + } + if has_soft_deletes || num_delete_tasks > 0 { debug!("commit-delete-operations"); index_writer.commit()?; } @@ -574,13 +683,13 @@ impl MergeExecutor { .map(|segment_meta| segment_meta.id()) .collect(); - // A merge is useless if there is no delete and only one segment. - if num_delete_tasks == 0 && segment_ids.len() <= 1 { + // A merge is useless if there are no deletions and only one segment. + if !has_soft_deletes && num_delete_tasks == 0 && segment_ids.len() <= 1 { return Ok(output_directory); } - // If after deletion there is no longer any document, don't try to merge. - if num_delete_tasks != 0 && segment_ids.is_empty() { + // If after deletion there are no remaining documents, don't try to merge. 
+ if (has_soft_deletes || num_delete_tasks != 0) && segment_ids.is_empty() { return Ok(output_directory); } @@ -713,6 +822,287 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_merge_executor_with_soft_deleted_docs() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = + TestSandbox::create("test-index-soft-delete", doc_mapping_yaml, "", &["body"]).await?; + for split_id in 0..4 { + let single_doc = std::iter::once( + serde_json::json!({"body ": format!("split{split_id}"), "ts": 1631072713u64 + split_id }), + ); + test_sandbox.add_documents(single_doc).await?; + } + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + // Load the initial split metadata to obtain split IDs. + let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!(split_metas.len(), 4); + + // Soft-delete doc_id=0 from the first split. + // Each split contains exactly one document, so doc_id=0 is the only document. + let soft_deleted_split_id = split_metas[0].split_id.clone(); + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: soft_deleted_split_id, + doc_ids: vec![0], + }], + }) + .await?; + + // Reload split metadata so that soft_deleted_doc_ids is populated. 
+ let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!( + split_metas + .iter() + .map(|s| s.soft_deleted_doc_ids.len()) + .sum::(), + 1, + "exactly one doc should be soft-deleted across all splits" + ); + + let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let mut tantivy_dirs: Vec> = Vec::new(); + for split_meta in &split_metas { + let split_filename = split_file(split_meta.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()) + } + let merge_operation = MergeOperation::new_merge_operation(split_metas); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs, + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: index_uid.clone(), + source_id: test_sandbox.source_id(), + }; + let (merge_packager_mailbox, merge_packager_inbox) = + test_sandbox.universe().create_test_mailbox(); + let merge_executor = MergeExecutor::new( + pipeline_id, + test_sandbox.metastore(), + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = test_sandbox + .universe() + .spawn_builder() + .spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!(packager_msgs.len(), 1); + let 
split_attrs_after_merge = &packager_msgs[0].splits[0].split_attrs; + // One document was soft-deleted, so only 3 docs should remain. + assert_eq!(split_attrs_after_merge.num_docs, 3); + assert_eq!(split_attrs_after_merge.uncompressed_docs_size_in_bytes, 102); + assert_eq!(split_attrs_after_merge.num_merge_ops, 1); + + let reader = packager_msgs[0].splits[0] + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + // The merged segment must contain exactly 3 live documents. + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!(num_live_docs, 3); + + test_sandbox.assert_quit().await; + Ok(()) + } + + /// Verifies that when a soft-delete lands on an input split while the + /// merge is running, the merge still succeeds. + #[tokio::test] + async fn test_merge_executor_soft_delete_race_condition() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create( + "test-index-soft-delete-race", + doc_mapping_yaml, + "", + &["body"], + ) + .await?; + for split_id in 0..4 { + let single_doc = std::iter::once( + serde_json::json!({"body": format!("split{split_id}"), "ts": 1631072713u64 + split_id}), + ); + test_sandbox.add_documents(single_doc).await?; + } + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + // Read split metadata *before* the soft-delete — this is the stale snapshot that the + // merge task will carry, simulating a race where the delete arrives after the merge + // executor already read the metadata. 
+ let stale_split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits_metadata() + .await + .unwrap(); + assert_eq!(stale_split_metas.len(), 4); + + // Soft-delete doc_id=0 from the first split *after* the stale metadata was read. + // This simulates a concurrent user action that arrives while the merge is running. + let racing_split_id = stale_split_metas[0].split_id.clone(); + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: racing_split_id.clone(), + doc_ids: vec![0], + }], + }) + .await?; + + // Build the merge scratch using the stale metadata (no soft-deletes recorded). + let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let mut tantivy_dirs: Vec> = Vec::new(); + for split_meta in &stale_split_metas { + let split_filename = split_file(split_meta.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + tantivy_dirs.push(get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap()); + } + let merge_operation = MergeOperation::new_merge_operation(stale_split_metas); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs, + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: index_uid.clone(), + source_id: test_sandbox.source_id(), + }; + let (merge_packager_mailbox, merge_packager_inbox) = + test_sandbox.universe().create_test_mailbox(); + let merge_executor = MergeExecutor::new( + 
pipeline_id, + test_sandbox.metastore(), + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = test_sandbox + .universe() + .spawn_builder() + .spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + // The merge must succeed despite the race condition. + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!( + packager_msgs.len(), + 1, + "merge must produce exactly one split batch" + ); + + let split_attrs = &packager_msgs[0].splits[0].split_attrs; + // The stale metadata had no soft-deletes, so all 4 docs are present in the merged + // segment. The racing soft-delete was missed. + assert_eq!(split_attrs.num_docs, 4); + assert_eq!(split_attrs.num_merge_ops, 1); + + // The snapshot carried in the batch reflects the stale state (no soft-deletes). + let replaced_splits = &packager_msgs[0].splits[0].split_attrs.replaced_splits; + assert_eq!( + replaced_splits.len(), + 4, + "all 4 input splits must appear in the snapshot" + ); + let racing_split_snapshot = replaced_splits + .iter() + .find(|replaced_split| replaced_split.split_id == racing_split_id) + .expect("racing split must be present in the snapshot"); + assert!( + racing_split_snapshot.soft_deleted_doc_ids.is_empty(), + "racing split had no soft-deletes at merge start (stale read)" + ); + + let reader = packager_msgs[0].splits[0] + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + // All 4 docs are physically present; the racing soft-delete was not applied. 
+ assert_eq!(num_live_docs, 4); + + test_sandbox.assert_quit().await; + Ok(()) + } + #[test] fn test_combine_partition_ids_singleton_unchanged() { assert_eq!(combine_partition_ids_aux([17]), 17); @@ -950,4 +1340,204 @@ mod tests { ) .await } + + #[tokio::test] + async fn test_delete_and_merge_with_soft_deleted_docs() -> anyhow::Result<()> { + quickwit_common::setup_logging_for_tests(); + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: + - unix_timestamp + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create( + "test-delete-and-merge-with-soft-delete", + doc_mapping_yaml, + "", + &["body"], + ) + .await?; + + // Three docs are ingested into a single split. + // doc_id=0 body="soft_delete" → removed by soft-delete + // doc_id=1 body="query_delete" → removed by the delete query + // doc_id=2 body="keep" → must survive both conditions + test_sandbox + .add_documents(vec![ + serde_json::json!({"body": "soft_delete", "ts": 1624928200}), + serde_json::json!({"body": "query_delete", "ts": 1624928201}), + serde_json::json!({"body": "keep", "ts": 1624928202}), + ]) + .await?; + + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + assert_eq!(splits.len(), 1); + let original_split_id = splits[0].split_metadata.split_id.clone(); + + // Soft-delete doc_id=0 (the "soft_delete" document). + metastore + .soft_delete_documents(quickwit_proto::metastore::SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![quickwit_proto::metastore::SplitDocIds { + split_id: original_split_id.clone(), + doc_ids: vec![0], + }], + }) + .await?; + + // Register a delete task targeting the "query_delete" document. 
+ metastore + .create_delete_task(DeleteQuery { + index_uid: Some(index_uid.clone()), + start_timestamp: None, + end_timestamp: None, + query_ast: quickwit_query::query_ast::qast_json_helper( + "body:query_delete", + &["body"], + ), + }) + .await?; + + // Reload split metadata so that soft_deleted_doc_ids is populated. + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + assert_eq!(splits.len(), 1); + assert_eq!( + splits[0].split_metadata.soft_deleted_doc_ids.len(), + 1, + "doc_id=0 must be recorded as soft-deleted before staging" + ); + + // Stage a replacement split with num_merge_ops=1. By cloning the freshly-read + // metadata the soft_deleted_doc_ids field is carried over into the merge task, + // which is exactly what process_delete_and_merge relies on. + let mut new_split_metadata = splits[0].split_metadata.clone(); + new_split_metadata.split_id = new_split_id(); + new_split_metadata.num_merge_ops = 1; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &new_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![new_split_metadata.split_id.to_string()], + replaced_split_ids: vec![original_split_id.clone()], + index_checkpoint_delta_json_opt: None, + publish_token_opt: None, + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + // Copy the original split bundle to the new split filename so the executor can open it. 
+ let merge_scratch_directory = TempDirectory::for_test(); + let downloaded_splits_directory = + merge_scratch_directory.named_temp_child("downloaded-splits-")?; + let split_filename = split_file(&original_split_id); + let new_split_filename = split_file(new_split_metadata.split_id()); + let dest_filepath = downloaded_splits_directory.path().join(&new_split_filename); + test_sandbox + .storage() + .copy_to_file(Path::new(&split_filename), &dest_filepath) + .await?; + let tantivy_dir = get_tantivy_directory_from_split_bundle(&dest_filepath).unwrap(); + let merge_operation = MergeOperation::new_delete_and_merge_operation(new_split_metadata); + let merge_task = MergeTask::from_merge_operation_for_test(merge_operation); + let merge_scratch = MergeScratch { + merge_task, + tantivy_dirs: vec![tantivy_dir], + merge_scratch_directory, + downloaded_splits_directory, + }; + let pipeline_id = MergePipelineId { + node_id: test_sandbox.node_id(), + index_uid: test_sandbox.index_uid(), + source_id: test_sandbox.source_id(), + }; + let universe = Universe::with_accelerated_time(); + let (merge_packager_mailbox, merge_packager_inbox) = universe.create_test_mailbox(); + let merge_executor = MergeExecutor::new( + pipeline_id, + metastore, + test_sandbox.doc_mapper(), + IoControls::default(), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = + universe.spawn_builder().spawn(merge_executor); + merge_executor_mailbox.send_message(merge_scratch).await?; + merge_executor_handle.process_pending_and_observe().await; + + let packager_msgs: Vec = merge_packager_inbox.drain_for_test_typed(); + assert_eq!(packager_msgs.len(), 1); + let split = &packager_msgs[0].splits[0]; + + // 3 docs − 1 soft-deleted − 1 query-deleted = 1 surviving document. + assert_eq!(split.split_attrs.num_docs, 1); + assert_eq!(split.split_attrs.delete_opstamp, 1); + // Delete operations must not increment num_merge_ops. 
+ assert_eq!(split.split_attrs.num_merge_ops, 1); + + let reader = split + .index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + + let num_live_docs: u32 = searcher + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!( + num_live_docs, 1, + "exactly one document must remain after all deletions" + ); + + // The surviving document must be the "keep" one. + let documents_left: Vec = searcher + .search( + &tantivy::query::AllQuery, + &tantivy::collector::TopDocs::with_limit(10).order_by_score(), + )? + .into_iter() + .map(|(_, doc_address)| { + let doc: TantivyDocument = searcher.doc(doc_address).unwrap(); + let doc_json = doc.to_json(searcher.schema()); + serde_json::from_str(&doc_json).unwrap() + }) + .collect(); + let expected_doc = serde_json::json!({"body": ["keep"], "ts": ["2021-06-29T00:56:42Z"]}); + assert_eq!( + documents_left, + vec![expected_doc], + "only the 'keep' document must survive both soft-delete and query-delete" + ); + + test_sandbox.assert_quit().await; + universe.assert_quit().await; + Ok(()) + } } diff --git a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs index bbe5267d514..3818edd8c73 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_scheduler_service.rs @@ -33,6 +33,15 @@ pub struct MergePermit { } impl MergePermit { + /// Creates a `MergePermit` from an owned semaphore permit, without notifying any + /// `MergeSchedulerService` on drop. Use this when managing concurrency externally. 
+ pub fn new(permit: OwnedSemaphorePermit) -> MergePermit { + MergePermit { + _semaphore_permit: Some(permit), + merge_scheduler_mailbox: None, + } + } + #[cfg(any(test, feature = "testsuite"))] pub fn for_test() -> MergePermit { MergePermit { diff --git a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs index 5d68bb59285..7d124288288 100644 --- a/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs +++ b/quickwit/quickwit-indexing/src/actors/merge_split_downloader.rs @@ -17,7 +17,7 @@ use std::path::Path; use async_trait::async_trait; use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, QueueCapacity}; use quickwit_common::io::IoControls; -use quickwit_common::temp_dir::{self, TempDirectory}; +use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::SplitMetadata; use tantivy::Directory; use tracing::{debug, info, instrument}; @@ -62,14 +62,13 @@ impl Handler for MergeSplitDownloader { merge_task: MergeTask, ctx: &ActorContext, ) -> Result<(), quickwit_actors::ActorExitStatus> { - let merge_scratch_directory = temp_dir::Builder::default() - .join("merge") - .tempdir_in(self.scratch_directory.path()) + let merge_scratch_directory = self + .scratch_directory + .named_temp_child("merge") .map_err(|error| anyhow::anyhow!(error))?; info!(dir=%merge_scratch_directory.path().display(), "download-merge-splits"); - let downloaded_splits_directory = temp_dir::Builder::default() - .join("downloaded-splits") - .tempdir_in(merge_scratch_directory.path()) + let downloaded_splits_directory = merge_scratch_directory + .named_temp_child("downloaded-splits") .map_err(|error| anyhow::anyhow!(error))?; let tantivy_dirs = self .download_splits( diff --git a/quickwit/quickwit-indexing/src/actors/packager.rs b/quickwit/quickwit-indexing/src/actors/packager.rs index 18e0bb40d73..ee43e050a5c 100644 --- a/quickwit/quickwit-indexing/src/actors/packager.rs +++ 
b/quickwit/quickwit-indexing/src/actors/packager.rs @@ -527,9 +527,9 @@ mod tests { uncompressed_docs_size_in_bytes: num_docs * 15, time_range: timerange_opt, secondary_time_range: None, - replaced_split_ids: Vec::new(), delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::new(), }, index, split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/actors/publisher.rs b/quickwit/quickwit-indexing/src/actors/publisher.rs index b05081be706..2d85ca1a1af 100644 --- a/quickwit/quickwit-indexing/src/actors/publisher.rs +++ b/quickwit/quickwit-indexing/src/actors/publisher.rs @@ -12,22 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::{BTreeSet, HashMap}; + use anyhow::Context; use async_trait::async_trait; use fail::fail_point; use quickwit_actors::{Actor, ActorContext, Handler, Mailbox, QueueCapacity}; -use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, PublishSplitsRequest}; +use quickwit_common::Progress; +use quickwit_metastore::{ListSplitsQuery, ListSplitsRequestExt, MetastoreServiceStreamSplitsExt}; +use quickwit_proto::metastore::{ + ListSplitsRequest, MetastoreService, MetastoreServiceClient, PublishSplitsRequest, +}; +use quickwit_proto::types::{IndexUid, SplitId}; use serde::Serialize; -use tracing::{info, instrument, warn}; +use tracing::{error, info, instrument, warn}; use crate::actors::MergePlanner; -use crate::models::{NewSplits, SplitsUpdate}; +use crate::models::{NewSplits, ReplacedSplit, SplitsUpdate}; use crate::source::{SourceActor, SuggestTruncate}; #[derive(Clone, Debug, Default, Serialize)] pub struct PublisherCounters { pub num_published_splits: u64, pub num_replace_operations: u64, + pub num_replaced_splits: u64, pub num_empty_splits: u64, } @@ -127,10 +135,10 @@ impl Handler for Publisher { let SplitsUpdate { index_uid, new_splits, - replaced_split_ids, checkpoint_delta_opt, publish_lock, publish_token_opt, + 
replaced_splits, .. } = split_update; @@ -143,11 +151,24 @@ impl Handler for Publisher { .iter() .map(|split| split.split_id.clone()) .collect(); + let replaced_split_ids = replaced_splits + .iter() + .map(|replaced| replaced.split_id.clone()) + .collect(); if let Some(_guard) = publish_lock.acquire().await { + if !replaced_splits.is_empty() { + warn_if_soft_deletes_changed_during_merge( + &index_uid, + &replaced_splits, + &self.metastore, + ctx.progress(), + ) + .await; + } let publish_splits_request = PublishSplitsRequest { index_uid: Some(index_uid), staged_split_ids: split_ids.clone(), - replaced_split_ids: replaced_split_ids.clone(), + replaced_split_ids, index_checkpoint_delta_json_opt, publish_token_opt: publish_token_opt.clone(), }; @@ -194,10 +215,11 @@ impl Handler for Publisher { .await; } - if replaced_split_ids.is_empty() { + if replaced_splits.is_empty() { self.counters.num_published_splits += 1; } else { self.counters.num_replace_operations += 1; + self.counters.num_replaced_splits += replaced_splits.len() as u64; } } else { self.counters.num_empty_splits += 1; @@ -207,6 +229,73 @@ impl Handler for Publisher { } } +/// Re-reads the soft-deleted doc IDs for all input splits from the metastore and logs an +/// error for each split whose soft-delete set grew while the merge was running. 
+async fn warn_if_soft_deletes_changed_during_merge( + index_uid: &IndexUid, + replaced_splits: &[ReplacedSplit], + metastore: &MetastoreServiceClient, + progress: &Progress, +) { + let query = ListSplitsQuery::for_index(index_uid.clone()).with_split_ids( + replaced_splits + .iter() + .map(|replaced| replaced.split_id.clone()) + .collect(), + ); + + let list_splits_request = match ListSplitsRequest::try_from_list_splits_query(&query) { + Ok(request) => request, + Err(err) => { + warn!(error = ?err, "failed to build list_splits request for soft-delete race detection"); + return; + } + }; + let splits_stream = match progress + .protect_future(metastore.list_splits(list_splits_request)) + .await + { + Ok(stream) => stream, + Err(err) => { + warn!(error = ?err, "failed to list splits for soft-delete race detection"); + return; + } + }; + let fresh_splits = match progress + .protect_future(splits_stream.collect_splits_metadata()) + .await + { + Ok(splits) => splits, + Err(err) => { + warn!(error = ?err, "failed to collect split metadata for soft-delete race detection"); + return; + } + }; + let snapshot: HashMap<&SplitId, &BTreeSet> = replaced_splits + .iter() + .map(|n| (&n.split_id, &n.soft_deleted_doc_ids)) + .collect(); + for fresh_split in &fresh_splits { + let Some(snapshot_ids) = snapshot.get(&fresh_split.split_id) else { + continue; + }; + let missed: BTreeSet = fresh_split + .soft_deleted_doc_ids + .difference(snapshot_ids) + .copied() + .collect(); + if !missed.is_empty() { + // TODO: this means that the merge didn't include some committed + // soft deletes. Those are lost. 
+ error!( + split_id = %fresh_split.split_id, + num_missed_soft_deletes = missed.len(), + "soft-delete race condition detected", + ); + } + } +} + #[cfg(test)] mod tests { use quickwit_actors::Universe; @@ -262,7 +351,6 @@ mod tests { split_id: "split".to_string(), ..Default::default() }], - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(IndexCheckpointDelta { source_id: "source".to_string(), source_delta: SourceCheckpointDelta::from_range(1..3), @@ -271,6 +359,7 @@ mod tests { publish_token_opt: None, merge_task: None, parent_span: tracing::Span::none(), + replaced_splits: Vec::new(), }) .await .is_ok() @@ -278,6 +367,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 1); + assert_eq!(publisher_observation.num_replaced_splits, 0); let suggest_truncate_checkpoints: Vec = source_inbox .drain_for_test_typed::() @@ -337,7 +427,6 @@ mod tests { .send_message(SplitsUpdate { index_uid: ref_index_uid.clone(), new_splits: Vec::new(), - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(IndexCheckpointDelta { source_id: "source".to_string(), source_delta: SourceCheckpointDelta::from_range(1..3), @@ -346,6 +435,7 @@ mod tests { publish_token_opt: None, merge_task: None, parent_span: tracing::Span::none(), + replaced_splits: Vec::new(), }) .await .is_ok() @@ -354,6 +444,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); assert_eq!(publisher_observation.num_replace_operations, 0); + assert_eq!(publisher_observation.num_replaced_splits, 0); assert_eq!(publisher_observation.num_empty_splits, 1); let suggest_truncate_checkpoints: Vec = source_inbox @@ -381,12 +472,21 @@ mod tests { let mut mock_metastore = MockMetastoreService::new(); let ref_index_uid: IndexUid = IndexUid::for_test("index", 1); let ref_index_uid_clone = 
ref_index_uid.clone(); + mock_metastore.expect_list_splits().times(1).returning(|_| { + use quickwit_common::ServiceStream; + use quickwit_metastore::ListSplitsResponseExt; + use quickwit_proto::metastore::ListSplitsResponse; + let response = ListSplitsResponse::try_from_splits(vec![]).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); mock_metastore .expect_publish_splits() .withf(move |publish_splits_requests| { + let mut replaced_split_ids = publish_splits_requests.replaced_split_ids.clone(); + replaced_split_ids.sort(); publish_splits_requests.index_uid() == &ref_index_uid_clone && publish_splits_requests.staged_split_ids[..] == ["split3"] - && publish_splits_requests.replaced_split_ids[..] == ["split1", "split2"] + && replaced_split_ids[..] == ["split1", "split2"] && publish_splits_requests .index_checkpoint_delta_json_opt() .is_empty() @@ -407,12 +507,21 @@ mod tests { split_id: "split3".to_string(), ..Default::default() }], - replaced_split_ids: vec!["split1".to_string(), "split2".to_string()], checkpoint_delta_opt: None, publish_lock: PublishLock::default(), publish_token_opt: None, merge_task: None, parent_span: Span::none(), + replaced_splits: vec![ + ReplacedSplit { + split_id: "split1".to_string(), + ..Default::default() + }, + ReplacedSplit { + split_id: "split2".to_string(), + ..Default::default() + }, + ], }; assert!( publisher_mailbox @@ -423,6 +532,7 @@ mod tests { let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); assert_eq!(publisher_observation.num_replace_operations, 1); + assert_eq!(publisher_observation.num_replaced_splits, 2); let merge_planner_msgs = merge_planner_inbox.drain_for_test_typed::(); assert_eq!(merge_planner_msgs.len(), 1); assert_eq!(merge_planner_msgs[0].new_splits.len(), 1); @@ -451,21 +561,99 @@ mod tests { .send_message(SplitsUpdate { index_uid: IndexUid::new_with_random_ulid("index"), new_splits: 
vec![SplitMetadata::for_test("test-split".to_string())], - replaced_split_ids: Vec::new(), checkpoint_delta_opt: None, publish_lock, publish_token_opt: None, merge_task: None, parent_span: Span::none(), + replaced_splits: Vec::new(), }) .await .unwrap(); let publisher_observation = publisher_handle.process_pending_and_observe().await.state; assert_eq!(publisher_observation.num_published_splits, 0); + assert_eq!(publisher_observation.num_replaced_splits, 0); let merger_messages = merge_planner_inbox.drain_for_test(); assert!(merger_messages.is_empty()); universe.assert_quit().await; } + + #[tokio::test] + async fn test_publisher_warns_on_soft_delete_race_condition() { + use std::collections::BTreeSet; + + use quickwit_common::ServiceStream; + use quickwit_metastore::{ListSplitsResponseExt, Split, SplitState}; + use quickwit_proto::metastore::ListSplitsResponse; + + let universe = Universe::with_accelerated_time(); + let ref_index_uid: IndexUid = IndexUid::for_test("index", 1); + let racing_split_id = "racing-split".to_string(); + + let mut mock_metastore = MockMetastoreService::new(); + + // list_splits returns the racing split with a new soft-delete absent from the snapshot. 
+ let racing_split_id_clone = racing_split_id.clone(); + mock_metastore + .expect_list_splits() + .times(1) + .returning(move |_| { + let split = Split { + split_metadata: SplitMetadata { + split_id: racing_split_id_clone.clone(), + soft_deleted_doc_ids: BTreeSet::from([0u32]), + ..Default::default() + }, + split_state: SplitState::Published, + update_timestamp: 0, + publish_timestamp: None, + }; + let response = ListSplitsResponse::try_from_splits(vec![split]).unwrap(); + Ok(ServiceStream::from(vec![Ok(response)])) + }); + + mock_metastore + .expect_publish_splits() + .times(1) + .returning(|_| Ok(EmptyResponse {})); + + let publisher = Publisher::new( + PublisherType::MergePublisher, + MetastoreServiceClient::from_mock(mock_metastore), + None, + None, + ); + let (publisher_mailbox, publisher_handle) = universe.spawn_builder().spawn(publisher); + + // Snapshot shows the racing split had no soft-deletes at merge start (stale read). + let replaced_splits = vec![ReplacedSplit { + split_id: racing_split_id.clone(), + ..Default::default() + }]; + + publisher_mailbox + .send_message(SplitsUpdate { + index_uid: ref_index_uid.clone(), + new_splits: vec![SplitMetadata { + split_id: "merged-split".to_string(), + ..Default::default() + }], + checkpoint_delta_opt: None, + publish_lock: PublishLock::default(), + publish_token_opt: None, + merge_task: None, + parent_span: Span::none(), + replaced_splits, + }) + .await + .unwrap(); + + // Publish must still succeed despite the race condition (warning is non-fatal). 
+ let observation = publisher_handle.process_pending_and_observe().await.state; + assert_eq!(observation.num_replace_operations, 1); + assert_eq!(observation.num_replaced_splits, 1); + universe.assert_quit().await; + } } diff --git a/quickwit/quickwit-indexing/src/actors/uploader.rs b/quickwit/quickwit-indexing/src/actors/uploader.rs index 2a012858587..1d9e71d87ba 100644 --- a/quickwit/quickwit-indexing/src/actors/uploader.rs +++ b/quickwit/quickwit-indexing/src/actors/uploader.rs @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; -use std::iter::FromIterator; use std::mem; use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering}; @@ -27,11 +25,9 @@ use quickwit_actors::{Actor, ActorContext, ActorExitStatus, Handler, Mailbox, Qu use quickwit_common::pubsub::EventBroker; use quickwit_common::spawn_named_task; use quickwit_config::RetentionPolicy; -use quickwit_metastore::checkpoint::IndexCheckpointDelta; use quickwit_metastore::{SplitMetadata, StageSplitsRequestExt}; use quickwit_proto::metastore::{MetastoreService, MetastoreServiceClient, StageSplitsRequest}; use quickwit_proto::search::{ReportSplit, ReportSplitsRequest}; -use quickwit_proto::types::{IndexUid, PublishToken}; use quickwit_storage::SplitPayloadBuilder; use serde::Serialize; use tokio::sync::oneshot::Sender; @@ -40,10 +36,10 @@ use tracing::{Instrument, Span, debug, info, instrument, warn}; use crate::actors::Publisher; use crate::actors::sequencer::{Sequencer, SequencerCommand}; -use crate::merge_policy::{MergePolicy, MergeTask}; +use crate::merge_policy::MergePolicy; use crate::metrics::INDEXER_METRICS; use crate::models::{ - EmptySplit, PackagedSplit, PackagedSplitBatch, PublishLock, SplitsUpdate, create_split_metadata, + EmptySplit, PackagedSplit, PackagedSplitBatch, SplitsUpdate, create_split_metadata, }; use crate::split_store::IndexingSplitStore; @@ -370,6 +366,7 @@ impl Handler for 
Uploader { event_broker.publish(ReportSplitsRequest { report_splits }); + let mut replaced_splits = Vec::new(); for (packaged_split, metadata) in batch.splits.into_iter().zip(split_metadata_list) { let upload_result = upload_split( &packaged_split, @@ -385,18 +382,24 @@ impl Handler for Uploader { return; } + replaced_splits.extend(packaged_split.split_attrs.replaced_splits.iter().cloned()); packaged_splits_and_metadata.push((packaged_split, metadata)); } - let splits_update = make_publish_operation( + assert!(!packaged_splits_and_metadata.is_empty()); + let splits_update = SplitsUpdate { index_uid, - packaged_splits_and_metadata, - batch.checkpoint_delta_opt, - batch.publish_lock, - batch.publish_token_opt, - batch.merge_task_opt, - batch.batch_parent_span, - ); + new_splits: packaged_splits_and_metadata + .into_iter() + .map(|split_and_meta| split_and_meta.1) + .collect_vec(), + checkpoint_delta_opt: batch.checkpoint_delta_opt, + publish_lock: batch.publish_lock, + publish_token_opt: batch.publish_token_opt, + merge_task: batch.merge_task_opt, + parent_span: batch.batch_parent_span, + replaced_splits, + }; let target = match &split_update_sender { SplitsUpdateSender::Sequencer(_) => "sequencer", @@ -439,12 +442,12 @@ impl Handler for Uploader { let splits_update = SplitsUpdate { index_uid: empty_split.index_uid, new_splits: Vec::new(), - replaced_split_ids: Vec::new(), checkpoint_delta_opt: Some(empty_split.checkpoint_delta), publish_lock: empty_split.publish_lock, publish_token_opt: empty_split.publish_token_opt, merge_task: None, parent_span: empty_split.batch_parent_span, + replaced_splits: Vec::new(), }; split_update_sender.send(splits_update, ctx).await?; @@ -452,35 +455,6 @@ impl Handler for Uploader { } } -fn make_publish_operation( - index_uid: IndexUid, - packaged_splits_and_metadatas: Vec<(PackagedSplit, SplitMetadata)>, - checkpoint_delta_opt: Option, - publish_lock: PublishLock, - publish_token_opt: Option, - merge_task: Option, - parent_span: Span, 
-) -> SplitsUpdate { - assert!(!packaged_splits_and_metadatas.is_empty()); - let replaced_split_ids = packaged_splits_and_metadatas - .iter() - .flat_map(|(split, _)| split.split_attrs.replaced_split_ids.clone()) - .collect::>(); - SplitsUpdate { - index_uid, - new_splits: packaged_splits_and_metadatas - .into_iter() - .map(|split_and_meta| split_and_meta.1) - .collect_vec(), - replaced_split_ids: Vec::from_iter(replaced_split_ids), - checkpoint_delta_opt, - publish_lock, - publish_token_opt, - merge_task, - parent_span, - } -} - #[instrument( level = "info" name = "upload", @@ -512,6 +486,7 @@ async fn upload_split( #[cfg(test)] mod tests { + use std::collections::BTreeSet; use std::path::PathBuf; use std::time::Duration; @@ -520,14 +495,14 @@ mod tests { use quickwit_common::temp_dir::TempDirectory; use quickwit_metastore::checkpoint::{IndexCheckpointDelta, SourceCheckpointDelta}; use quickwit_proto::metastore::{EmptyResponse, MockMetastoreService}; - use quickwit_proto::types::{DocMappingUid, NodeId}; + use quickwit_proto::types::{DocMappingUid, IndexUid, NodeId}; use quickwit_storage::RamStorage; use tantivy::DateTime; use tokio::sync::oneshot; use super::*; use crate::merge_policy::{NopMergePolicy, default_merge_policy}; - use crate::models::{SplitAttrs, SplitsUpdate}; + use crate::models::{PublishLock, ReplacedSplit, SplitAttrs, SplitsUpdate}; #[tokio::test] async fn test_uploader_with_sequencer() -> anyhow::Result<()> { @@ -590,10 +565,10 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), split_id: "test-split".to_string(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, @@ -627,7 +602,6 @@ mod tests { index_uid, new_splits, checkpoint_delta_opt, - replaced_split_ids, .. 
} = publisher_message; @@ -640,7 +614,6 @@ mod tests { checkpoint_delta.source_delta, SourceCheckpointDelta::from_range(3..15) ); - assert!(replaced_split_ids.is_empty()); let mut files = ram_storage.list_files().await; files.sort(); assert_eq!(&files, &[PathBuf::from("test-split.split")]); @@ -703,12 +676,12 @@ mod tests { ..=DateTime::from_timestamp_secs(1_628_203_640), ), secondary_time_range: None, - replaced_split_ids: vec![ - "replaced-split-1".to_string(), - "replaced-split-2".to_string(), - ], delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::from([ReplacedSplit { + split_id: "replaced-split-1".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }]), }, serialized_split_fields: Vec::new(), split_scratch_directory: split_scratch_directory_1, @@ -731,12 +704,12 @@ mod tests { ..=DateTime::from_timestamp_secs(1_628_203_640), ), secondary_time_range: None, - replaced_split_ids: vec![ - "replaced-split-1".to_string(), - "replaced-split-2".to_string(), - ], delete_opstamp: 0, num_merge_ops: 0, + replaced_splits: Vec::from([ReplacedSplit { + split_id: "replaced-split-2".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }]), }, serialized_split_fields: Vec::new(), split_scratch_directory: split_scratch_directory_2, @@ -772,21 +745,26 @@ mod tests { let SplitsUpdate { index_uid, new_splits, - mut replaced_split_ids, checkpoint_delta_opt, + replaced_splits, .. } = publisher_message; assert_eq!(index_uid.index_id, "test-index"); // Sort first to avoid test failing. 
- replaced_split_ids.sort(); assert_eq!(new_splits.len(), 2); assert_eq!(new_splits[0].split_id(), "test-split-1"); assert_eq!(new_splits[1].split_id(), "test-split-2"); assert_eq!( - &replaced_split_ids, - &[ - "replaced-split-1".to_string(), - "replaced-split-2".to_string() + &replaced_splits, + &vec![ + ReplacedSplit { + split_id: "replaced-split-1".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }, + ReplacedSplit { + split_id: "replaced-split-2".to_string(), + soft_deleted_doc_ids: BTreeSet::new(), + }, ] ); assert!(checkpoint_delta_opt.is_none()); @@ -855,9 +833,9 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, @@ -879,13 +857,13 @@ mod tests { let SplitsUpdate { index_uid, new_splits, - replaced_split_ids, + replaced_splits, .. } = publisher_inbox.recv_typed_message().await.unwrap(); assert_eq!(index_uid.index_id, "test-index"); assert_eq!(new_splits.len(), 1); - assert!(replaced_split_ids.is_empty()); + assert!(replaced_splits.is_empty()); universe.assert_quit().await; Ok(()) } @@ -943,7 +921,7 @@ mod tests { index_uid, new_splits, checkpoint_delta_opt, - replaced_split_ids, + replaced_splits, .. 
} = publisher_message; @@ -955,7 +933,7 @@ mod tests { checkpoint_delta.source_delta, SourceCheckpointDelta::from_range(3..15) ); - assert!(replaced_split_ids.is_empty()); + assert!(replaced_splits.is_empty()); let files = ram_storage.list_files().await; assert!(files.is_empty()); universe.assert_quit().await; @@ -1037,10 +1015,10 @@ mod tests { secondary_time_range: None, uncompressed_docs_size_in_bytes: 1_000, num_docs: 10, - replaced_split_ids: Vec::new(), split_id: SPLIT_ULID_STR.to_string(), delete_opstamp: 10, num_merge_ops: 0, + replaced_splits: Vec::new(), }, serialized_split_fields: Vec::new(), split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/controlled_directory.rs b/quickwit/quickwit-indexing/src/controlled_directory.rs index b209b4888d6..86e4a5fce0e 100644 --- a/quickwit/quickwit-indexing/src/controlled_directory.rs +++ b/quickwit/quickwit-indexing/src/controlled_directory.rs @@ -93,7 +93,7 @@ impl Directory for ControlledDirectory { self.check_if_alive() .map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?; - let underlying_wrt: Box = self + let underlying_wrt: Box = self .underlying .open_write(path)? .into_inner() @@ -154,7 +154,9 @@ impl IoControlsAccess for HotswappableIoControls { } // Wrapper to work around the orphan rule. (hence the word "Adopted"). 
-struct AdoptedControlledWrite(ControlledWrite>); +struct AdoptedControlledWrite( + ControlledWrite>, +); impl io::Write for AdoptedControlledWrite { fn write(&mut self, buf: &[u8]) -> io::Result { diff --git a/quickwit/quickwit-indexing/src/lib.rs b/quickwit/quickwit-indexing/src/lib.rs index 2c2b28a09d7..9183fda3890 100644 --- a/quickwit/quickwit-indexing/src/lib.rs +++ b/quickwit/quickwit-indexing/src/lib.rs @@ -35,9 +35,12 @@ pub use crate::split_store::{IndexingSplitStore, get_tantivy_directory_from_spli pub mod actors; mod controlled_directory; +pub mod mature_merge; +pub mod mature_merge_plan; pub mod merge_policy; mod metrics; pub mod models; +mod soft_delete_query; pub mod source; mod split_store; #[cfg(any(test, feature = "testsuite"))] diff --git a/quickwit/quickwit-indexing/src/mature_merge.rs b/quickwit/quickwit-indexing/src/mature_merge.rs new file mode 100644 index 00000000000..de04b9c4ae2 --- /dev/null +++ b/quickwit/quickwit-indexing/src/mature_merge.rs @@ -0,0 +1,844 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use anyhow::{Context, bail}; +use bytesize::ByteSize; +use futures::StreamExt; +use quickwit_actors::{ActorExitStatus, Universe}; +use quickwit_common::io::IoControls; +use quickwit_common::{KillSwitch, temp_dir}; +use quickwit_metastore::{ + IndexMetadata, ListIndexesMetadataResponseExt, ListSplitsQuery, ListSplitsRequestExt, + MetastoreServiceStreamSplitsExt, SplitState, +}; +use quickwit_proto::indexing::MergePipelineId; +use quickwit_proto::metastore::{ + ListIndexesMetadataRequest, ListSplitsRequest, MetastoreService, MetastoreServiceClient, +}; +use quickwit_proto::types::NodeId; +use quickwit_storage::StorageResolver; +use tantivy::Inventory; +use time::OffsetDateTime; +use tokio::sync::Semaphore; +use tracing::{info, warn}; + +use crate::actors::{ + MergeExecutor, MergePermit, MergeSplitDownloader, Packager, Publisher, PublisherType, Uploader, + UploaderType, +}; +use crate::mature_merge_plan::{MATURITY_BUFFER, plan_merge_operations_for_index}; +use crate::merge_policy::{MergeOperation, MergeTask, NopMergePolicy}; +use crate::split_store::{IndexingSplitCache, IndexingSplitStore}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct MatureMergeConfig { + /// Splits within this many days of the retention cutoff are left untouched. + pub retention_safety_buffer_days: u64, + /// Minimum number of splits in a group before a merge operation is emitted. + pub min_merge_group_size: usize, + /// Maximum number of docs in a split for it to be eligible for mature merging. + pub input_split_max_num_docs: usize, + /// Maximum number of splits per merge operation. + pub max_merge_group_size: usize, + /// Maximum total number of documents per merge operation. + pub split_target_num_docs: usize, + /// Number of indexes processed concurrently. Lower to avoid fetching splits + /// metadata too eagerly. + pub index_parallelism: usize, + /// Maximum number of merges running concurrently across all indexes. 
+ pub max_concurrent_merges: usize, + /// Print planned operations without executing them. + pub dry_run: bool, + /// List of index patterns to include in the mature merge process. + pub index_id_patterns: Vec, +} + +impl Default for MatureMergeConfig { + fn default() -> Self { + Self { + retention_safety_buffer_days: 5, + min_merge_group_size: 5, + input_split_max_num_docs: 10_000, + max_merge_group_size: 100, + split_target_num_docs: 5_000_000, + index_parallelism: 50, + max_concurrent_merges: 10, + dry_run: false, + index_id_patterns: vec!["*".to_string()], + } + } +} + +/// Statistics for the merges performed on a single index. +#[derive(Debug, Default)] +struct IndexMergeOutcome { + num_published_merges: u64, + num_replaced_splits: u64, +} + +struct IndexMergeSummary { + num_merges_planned: usize, + num_input_splits: usize, + total_input_bytes: u64, + outcome: IndexMergeOutcome, +} + +/// Fetches all published splits for the given index from the metastore (no +/// node-id filter) and calls [`plan_merge_operations_for_index`]. 
+async fn fetch_splits_and_plan( + index_metadata: &IndexMetadata, + metastore: &MetastoreServiceClient, + now: OffsetDateTime, + config: &MatureMergeConfig, +) -> anyhow::Result> { + let index_uid = index_metadata.index_uid.clone(); + let list_splits_query = ListSplitsQuery::for_index(index_uid) + .with_split_state(SplitState::Published) + .retain_mature(now - MATURITY_BUFFER); + let list_splits_request = ListSplitsRequest::try_from_list_splits_query(&list_splits_query)?; + let splits_stream = metastore.list_splits(list_splits_request).await?; + let splits = splits_stream.collect_splits_metadata().await?; + + if splits.iter().any(|s| !s.tags.is_empty()) { + // with tags and doc mapping evolutions, we might have weird edge cases + // -> just refuse them for now + bail!("tags not supported in mature merges") + } + + let total_splits = splits.len(); + let operations = + plan_merge_operations_for_index(&index_metadata.index_config, splits, now, config); + + info!( + index_id = %index_metadata.index_config.index_id, + total_splits, + num_planned_merges = operations.len(), + "fetched splits for mature merge planning" + ); + Ok(operations) +} + +/// Executes the given merge operations for a single index using the standard +/// actor pipeline: `MergeSplitDownloader -> MergeExecutor -> Packager -> +/// Uploader -> Publisher`. +/// +/// Tags are not supported and we use the default tokenizer manager. In practice +/// we could use the tags and custom tokenizers from the current doc mapping, +/// but schema evolutions could lead to un-anticipated edge cases. 
+#[allow(clippy::too_many_arguments)] +async fn run_mature_merges_for_index( + index_metadata: &IndexMetadata, + operations: Vec, + metastore: MetastoreServiceClient, + split_store: IndexingSplitStore, + semaphore: Arc, + data_dir_path: &std::path::Path, + config: &MatureMergeConfig, + node_id: NodeId, +) -> anyhow::Result { + if operations.is_empty() { + return Ok(IndexMergeOutcome { + num_published_merges: 0, + num_replaced_splits: 0, + }); + } + + let index_config = &index_metadata.index_config; + let index_uid = index_metadata.index_uid.clone(); + + let indexing_directory = temp_dir::Builder::default() + .join("mature-merge") + .tempdir_in(data_dir_path) + .context("failed to create temp directory for mature merge")?; + + let pipeline_id = MergePipelineId { + node_id, + index_uid, + source_id: "_mature_merge".to_string(), + }; + + let universe = Universe::new(); + let kill_switch = KillSwitch::default(); + + // Build chain from publisher inward (each actor gets the next actor's mailbox). + + let merge_publisher = Publisher::new( + PublisherType::MergePublisher, + metastore.clone(), + // No feedback loop to a merge planner. 
+ None, + None, + ); + let (merge_publisher_mailbox, merge_publisher_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_publisher); + + let merge_uploader = Uploader::new( + UploaderType::MergeUploader, + metastore.clone(), + Arc::new(NopMergePolicy), + index_config.retention_policy_opt.clone(), + split_store.clone(), + merge_publisher_mailbox.into(), + config.max_concurrent_merges, + Default::default(), + ); + let (merge_uploader_mailbox, merge_uploader_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_uploader); + + // Tag fields not supported for now + let tag_fields = Vec::new(); + let merge_packager = Packager::new("MaturePackager", tag_fields, merge_uploader_mailbox); + let (merge_packager_mailbox, merge_packager_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_packager); + + let merge_executor = MergeExecutor::new_with_tokenizers_only( + pipeline_id, + metastore, + // we only support the default tokenizer manager + quickwit_query::create_default_quickwit_tokenizer_manager(), + IoControls::default().set_component("mature_merger"), + merge_packager_mailbox, + ); + let (merge_executor_mailbox, merge_executor_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_executor); + + let merge_split_downloader = MergeSplitDownloader { + scratch_directory: indexing_directory, + split_store, + executor_mailbox: merge_executor_mailbox, + io_controls: IoControls::default().set_component("mature_split_downloader"), + }; + let (merge_split_downloader_mailbox, merge_split_downloader_handle) = universe + .spawn_builder() + .set_kill_switch(kill_switch.clone()) + .spawn(merge_split_downloader); + + // Send all merge tasks to the downloader, gated by the concurrency semaphore. 
+ let inventory: Inventory = Inventory::default(); + for operation in operations { + let permit = Arc::clone(&semaphore) + .acquire_owned() + .await + .expect("semaphore should not be closed"); + let merge_task = MergeTask { + merge_operation: inventory.track(operation), + _merge_permit: MergePermit::new(permit), + }; + if merge_split_downloader_mailbox + .send_message(merge_task) + .await + .is_err() + { + anyhow::bail!("merge split downloader actor died unexpectedly"); + } + } + + // Dropping the downloader mailbox signals no more tasks are coming. + // The pipeline will cascade-exit once all pending tasks are processed. + drop(merge_split_downloader_mailbox); + + let (downloader_status, _) = merge_split_downloader_handle.join().await; + let (executor_status, _) = merge_executor_handle.join().await; + let (packager_status, _) = merge_packager_handle.join().await; + let (uploader_status, _) = merge_uploader_handle.join().await; + let (publisher_status, publisher_counters) = merge_publisher_handle.join().await; + + universe.quit().await; + + for (name, status) in [ + ("downloader", downloader_status), + ("executor", executor_status), + ("packager", packager_status), + ("uploader", uploader_status), + ("publisher", publisher_status), + ] { + if !matches!(status, ActorExitStatus::Success | ActorExitStatus::Quit) { + anyhow::bail!( + "mature merge actor `{}` exited with unexpected status: {:?}", + name, + status + ); + } + } + + Ok(IndexMergeOutcome { + num_published_merges: publisher_counters.num_replace_operations, + num_replaced_splits: publisher_counters.num_replaced_splits, + }) +} + +/// Plans and optionally executes mature merges for a single index +#[allow(clippy::too_many_arguments)] +async fn merge_mature_single_index( + index_metadata: IndexMetadata, + metastore: &MetastoreServiceClient, + storage_resolver: &StorageResolver, + semaphore: Arc, + data_dir_path: &std::path::Path, + config: &MatureMergeConfig, + node_id: NodeId, + now: OffsetDateTime, +) -> 
anyhow::Result { + let index_id = index_metadata.index_config.index_id.clone(); + let operations = fetch_splits_and_plan(&index_metadata, metastore, now, config).await?; + let num_merges_planned = operations.len(); + let num_input_splits: usize = operations.iter().map(|op| op.splits.len()).sum(); + let total_input_bytes: u64 = operations + .iter() + .flat_map(|op| op.splits.iter()) + .map(|s| s.uncompressed_docs_size_in_bytes) + .sum(); + + if config.dry_run { + for op in &operations { + log_op_for_dry_run(op, &index_metadata.index_config.index_id); + } + return Ok(IndexMergeSummary { + num_merges_planned, + num_input_splits, + total_input_bytes, + outcome: IndexMergeOutcome::default(), + }); + } + + if operations.is_empty() { + return Ok(IndexMergeSummary { + num_merges_planned: 0, + total_input_bytes: 0, + num_input_splits: 0, + outcome: IndexMergeOutcome::default(), + }); + } + + let index_uri = index_metadata.index_uri(); + let remote_storage = storage_resolver + .resolve(index_uri) + .await + .context("failed to resolve index storage")?; + let split_store = + IndexingSplitStore::new(remote_storage, Arc::new(IndexingSplitCache::no_caching())); + + let outcome = run_mature_merges_for_index( + &index_metadata, + operations, + metastore.clone(), + split_store, + semaphore, + data_dir_path, + config, + node_id, + ) + .await?; + + if num_merges_planned > 0 { + info!( + index_id = %index_id, + planned = num_merges_planned, + published_merges = outcome.num_published_merges, + replaced_splits = outcome.num_replaced_splits, + input_splits = num_input_splits, + input_bytes = total_input_bytes, + "mature split merges complete for index" + ); + } + + Ok(IndexMergeSummary { + num_merges_planned, + num_input_splits, + total_input_bytes, + outcome, + }) +} + +/// Aggregates per-index results, logs per-index and global summary lines, and warns on errors. 
+fn log_merge_results(results: Vec>) { + let mut total_planned_merges = 0usize; + let mut total_input_splits = 0usize; + let mut total_input_bytes = 0u64; + let mut total_successfully_published_merges = 0u64; + let mut total_successfully_replaced_splits = 0u64; + + let mut num_indexes_successfully_merged = 0usize; + let mut num_indexes_partially_merged = 0usize; + let mut num_indexes_without_opportunity = 0usize; + + for result in results { + match result { + Ok(summary) => { + total_planned_merges += summary.num_merges_planned; + total_input_splits += summary.num_input_splits; + total_input_bytes += summary.total_input_bytes; + total_successfully_published_merges += summary.outcome.num_published_merges; + total_successfully_replaced_splits += summary.outcome.num_replaced_splits; + + if summary.num_merges_planned == 0 { + num_indexes_without_opportunity += 1; + } else if summary.outcome.num_published_merges + == (summary.num_merges_planned as u64) + { + num_indexes_successfully_merged += 1; + } else { + num_indexes_partially_merged += 1; + } + } + Err(err) => { + warn!(err = ?err, "error processing index during mature merge"); + } + } + } + info!( + num_indexes_successfully_merged, + num_indexes_partially_merged, + num_indexes_without_opportunity, + total_planned_merges, + total_successfully_published_merges, + total_successfully_replaced_splits, + total_input_splits, + total_input_bytes, + "mature merge complete" + ); +} + +fn log_op_for_dry_run(op: &MergeOperation, index_id: &str) { + let start_time = op + .splits + .iter() + .filter_map(|s| s.time_range.as_ref().map(|r| r.start())) + .min() + .unwrap_or(&0); + let end_time = op + .splits + .iter() + .filter_map(|s| s.time_range.as_ref().map(|r| r.end())) + .max() + .unwrap_or(&0); + let fmt_ts = |ts: i64| { + OffsetDateTime::from_unix_timestamp(ts) + .map(|dt| { + format!( + "{}-{:02}-{:02}T{:02}", + dt.year(), + dt.month() as u8, + dt.day(), + dt.hour() + ) + }) + .unwrap_or_else(|_| ts.to_string()) + }; + // 
print is better than log because dry-run will be used interactively from the CLI + println!( + "[dry-run] {index_id}: {} splits | {} docs | {} | {} → {}", + op.splits.len(), + op.splits.iter().map(|s| s.num_docs).sum::(), + ByteSize(op.splits.iter().map(|s| s.footer_offsets.end).sum::()), + fmt_ts(*start_time), + fmt_ts(*end_time), + ); +} + +/// Processes all indexes from the metastore, discovering and running mature +/// merge opportunities. +/// +/// If `dry_run` is `true`, the planned operations are printed but not executed. +pub async fn merge_mature_all_indexes( + metastore: MetastoreServiceClient, + storage_resolver: StorageResolver, + data_dir_path: &std::path::Path, + config: MatureMergeConfig, + node_id: NodeId, +) -> anyhow::Result<()> { + let indexes_metadata = metastore + .list_indexes_metadata(ListIndexesMetadataRequest { + index_id_patterns: config.index_id_patterns.clone(), + }) + .await + .context("failed to list indexes")? + .deserialize_indexes_metadata() + .await + .context("failed to deserialize indexes metadata")?; + + info!( + num_indexes = indexes_metadata.len(), + "starting mature merge" + ); + + let semaphore = Arc::new(Semaphore::new(config.max_concurrent_merges)); + let metastore_ref = &metastore; + let storage_resolver_ref = &storage_resolver; + let config_ref = &config; + + if indexes_metadata + .iter() + .any(|m| !m.index_config.doc_mapping.tag_fields.is_empty()) + { + // with tags and doc mapping evolutions, we might have weird edge cases + // -> just refuse them for now + bail!("tags not supported in mature merges"); + } + + let results: Vec> = futures::stream::iter(indexes_metadata) + .map(|index_metadata| { + let node_id = node_id.clone(); + let semaphore = Arc::clone(&semaphore); + async move { + let now = OffsetDateTime::now_utc(); + merge_mature_single_index( + index_metadata, + metastore_ref, + storage_resolver_ref, + semaphore, + data_dir_path, + config_ref, + node_id, + now, + ) + .await + } + }) + 
.buffer_unordered(config.index_parallelism) + .collect() + .await; + + log_merge_results(results); + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use quickwit_common::temp_dir::TempDirectory; + use quickwit_config::ConfigFormat; + use quickwit_metastore::{ + IndexMetadata, IndexMetadataResponseExt, SplitMaturity, SplitMetadata, + UpdateIndexRequestExt, + }; + use quickwit_proto::metastore::{ + IndexMetadataRequest, ListSplitsRequest, MetastoreService, MetastoreServiceClient, + MockMetastoreService, UpdateIndexRequest, + }; + use quickwit_proto::types::NodeId; + use quickwit_storage::RamStorage; + + use super::*; + use crate::TestSandbox; + + /// Tests the short-circuit path: when no merge operations are planned, + /// `run_mature_merges_for_index` returns 0 immediately without spawning any actors. + #[tokio::test] + async fn test_run_mature_merges_for_index_no_operations() -> anyhow::Result<()> { + let mock_metastore = MockMetastoreService::new(); + let storage = Arc::new(RamStorage::default()); + let split_store = IndexingSplitStore::create_without_local_store_for_test(storage); + let index_metadata = IndexMetadata::for_test("test-index", "ram:///test-index"); + let data_dir = TempDirectory::for_test(); + let node_id = NodeId::from("test-node"); + + let semaphore = Arc::new(Semaphore::new(2)); + let outcome = run_mature_merges_for_index( + &index_metadata, + vec![], + MetastoreServiceClient::from_mock(mock_metastore), + split_store, + semaphore, + data_dir.path(), + &MatureMergeConfig::default(), + node_id, + ) + .await?; + + assert_eq!(outcome.num_published_merges, 0); + assert_eq!(outcome.num_replaced_splits, 0); + Ok(()) + } + + /// Tests the full per index pipeline end-to-end with a single merge operation + #[tokio::test] + async fn test_run_mature_merges_for_index_merges_real_splits() -> anyhow::Result<()> { + let doc_mapping_yaml = r#" + field_mappings: + - name: body + type: text + - name: ts + type: datetime + input_formats: 
[unix_timestamp] + fast: true + timestamp_field: ts + "#; + let test_sandbox = + TestSandbox::create("test-index-mature2", doc_mapping_yaml, "", &["body"]).await?; + + // each add_documents() call produces 1 split + for i in 0..4u64 { + test_sandbox + .add_documents(std::iter::once( + serde_json::json!({"body": format!("doc{i}"), "ts": 1_631_072_713u64 + i}), + )) + .await?; + } + + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + let split_metas: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await? + .collect_splits_metadata() + .await?; + assert_eq!(split_metas.len(), 4); + + let index_metadata = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await? + .deserialize_index_metadata()?; + + let merge_op = MergeOperation::new_merge_operation(split_metas); + let split_store = + IndexingSplitStore::create_without_local_store_for_test(test_sandbox.storage()); + let data_dir = TempDirectory::for_test(); + let semaphore = Arc::new(Semaphore::new(2)); + + let outcome = run_mature_merges_for_index( + &index_metadata, + vec![merge_op], + metastore.clone(), + split_store, + semaphore, + data_dir.path(), + &MatureMergeConfig::default(), + test_sandbox.node_id(), + ) + .await?; + + assert_eq!(outcome.num_published_merges, 1); + assert_eq!(outcome.num_replaced_splits, 4); + + // The 4 input splits are now MarkedForDeletion; 1 merged Published split should remain. + let published_after: Vec = metastore + .list_splits(ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid).with_split_state(SplitState::Published), + )?) + .await? 
+ .collect_splits_metadata() + .await?; + assert_eq!(published_after.len(), 1); + assert_eq!(published_after[0].num_docs, 4); + assert_eq!(published_after[0].maturity, SplitMaturity::Mature); + assert_eq!( + published_after[0].time_range, + Some(1_631_072_713..=1_631_072_716) + ); + + test_sandbox.assert_quit().await; + Ok(()) + } + + #[tokio::test] + async fn test_merge_mature_single_index_schema_evolution() -> anyhow::Result<()> { + let doc_mapping_v1_yaml = r#" + field_mappings: + - name: ts + type: datetime + input_formats: [unix_timestamp] + fast: true + - name: label + type: text + fast: true + tokenizer: lowercase + timestamp_field: ts + "#; + let test_sandbox = + TestSandbox::create("test-index-schema-evo", doc_mapping_v1_yaml, "", &["label"]) + .await?; + + let base_time = 1_631_072_713i64; // Wednesday, September 8, 2021 at 3:45:13 AM UTC + + // create 3 splits with v1 mapping + for i in 0..3i64 { + test_sandbox + .add_documents(std::iter::once( + serde_json::json!({"label": format!("Doc{i}"), "ts": base_time + i}), + )) + .await?; + } + + let metastore = test_sandbox.metastore(); + let index_uid = test_sandbox.index_uid(); + + let v1_splits: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await? + .collect_splits_metadata() + .await?; + assert_eq!(v1_splits.len(), 3); + let v1_doc_mapping_uid = v1_splits[0].doc_mapping_uid; + + // Update the index config: change tokenizer to `default` and add a secondary timestamp. + let index_metadata_v1 = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await? 
+ .deserialize_index_metadata()?; + let doc_mapping_v2 = ConfigFormat::Yaml.parse( + r#" + field_mappings: + - name: ts + type: datetime + input_formats: [unix_timestamp] + fast: true + - name: label + type: text + fast: true + tokenizer: default + - name: ts2 + type: datetime + input_formats: [unix_timestamp] + fast: true + timestamp_field: ts + secondary_timestamp_field: ts2 + "# + .as_bytes(), + )?; + let update_request = UpdateIndexRequest::try_from_updates( + index_uid.clone(), + &doc_mapping_v2, + &index_metadata_v1.index_config.indexing_settings, + &index_metadata_v1.index_config.ingest_settings, + &index_metadata_v1.index_config.search_settings, + &index_metadata_v1.index_config.retention_policy_opt, + )?; + metastore.update_index(update_request).await?; + + // create 3 more splits with v2 mapping + for i in 3..6i64 { + test_sandbox + .add_documents(std::iter::once(serde_json::json!({ + "label": format!("Doc{i}"), + "ts": base_time + i, + "ts2": base_time + i + 1000, + }))) + .await?; + } + + let all_splits: Vec = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await? + .collect_splits_metadata() + .await?; + assert_eq!(all_splits.len(), 6); + let v2_doc_mapping_uid = all_splits + .iter() + .find(|s| s.doc_mapping_uid != v1_doc_mapping_uid) + .unwrap() + .doc_mapping_uid; + assert_eq!( + all_splits + .iter() + .filter(|s| s.doc_mapping_uid == v1_doc_mapping_uid) + .count(), + 3 + ); + assert_eq!( + all_splits + .iter() + .filter(|s| s.doc_mapping_uid == v2_doc_mapping_uid) + .count(), + 3 + ); + + let index_metadata_v2 = metastore + .index_metadata(IndexMetadataRequest::for_index_id( + index_uid.index_id.to_string(), + )) + .await? + .deserialize_index_metadata()?; + let data_dir = TempDirectory::for_test(); + let semaphore = Arc::new(Semaphore::new(2)); + // Splits have the default 48h maturation period. 
Pass a `now` far enough in the future + // so all splits (both v1 and v2) are mature at `now - MATURITY_BUFFER (6h)`. + let now = OffsetDateTime::now_utc() + time::Duration::days(3); + // Override min_merge_group_size to 2 so that 3-split groups qualify. + let config = MatureMergeConfig { + min_merge_group_size: 2, + ..MatureMergeConfig::default() + }; + + let summary = merge_mature_single_index( + index_metadata_v2, + &metastore, + &test_sandbox.storage_resolver(), + semaphore, + data_dir.path(), + &config, + test_sandbox.node_id(), + now, + ) + .await?; + + // Both the v1 and v2 groups (3 splits each, different doc_mapping_uid) get merged. + assert_eq!(summary.num_merges_planned, 2); + assert_eq!(summary.outcome.num_published_merges, 2); + assert_eq!(summary.outcome.num_replaced_splits, 6); + + let published_after: Vec = metastore + .list_splits(ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid).with_split_state(SplitState::Published), + )?) + .await? + .collect_splits_metadata() + .await?; + assert_eq!(published_after.len(), 2); + + // The merged v1 split preserves the original doc_mapping_uid, time range, and has no + // secondary_time_range because the v1 schema had no secondary timestamp field. + let merged_v1 = published_after + .iter() + .find(|s| s.doc_mapping_uid == v1_doc_mapping_uid) + .expect("merged v1 split must exist"); + assert_eq!(merged_v1.num_docs, 3); + assert_eq!(merged_v1.maturity, SplitMaturity::Mature); + assert_eq!(merged_v1.time_range, Some(base_time..=base_time + 2)); + assert_eq!(merged_v1.secondary_time_range, None); + + // The merged v2 split has the updated doc_mapping_uid and a secondary_time_range + // derived from the ts2 field. 
+ let merged_v2 = published_after + .iter() + .find(|s| s.doc_mapping_uid == v2_doc_mapping_uid) + .expect("merged v2 split must exist"); + assert_eq!(merged_v2.num_docs, 3); + assert_eq!(merged_v2.maturity, SplitMaturity::Mature); + assert_eq!(merged_v2.time_range, Some(base_time + 3..=base_time + 5)); + assert_eq!( + merged_v2.secondary_time_range, + Some(base_time + 1003..=base_time + 1005) + ); + + test_sandbox.assert_quit().await; + Ok(()) + } +} diff --git a/quickwit/quickwit-indexing/src/mature_merge_plan.rs b/quickwit/quickwit-indexing/src/mature_merge_plan.rs new file mode 100644 index 00000000000..92a15a2fed9 --- /dev/null +++ b/quickwit/quickwit-indexing/src/mature_merge_plan.rs @@ -0,0 +1,459 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::time::Duration; + +use quickwit_config::IndexConfig; +use quickwit_metastore::SplitMetadata; +use time::OffsetDateTime; + +use crate::mature_merge::MatureMergeConfig; +use crate::merge_policy::MergeOperation; + +pub const SECS_PER_DAY: i64 = 60 * 60 * 24; + +/// Wait a couple of hours after the split got mature to be extra sure no merge +/// process is still running on it. +pub const MATURITY_BUFFER: Duration = Duration::from_hours(6); + +/// Computes the earliest UTC-day midnight (seconds since epoch) that is safe to merge, +/// given the index's retention policy and the current time. 
+fn retention_safety_cutoff_secs( + index_config: &IndexConfig, + now_secs: i64, + config: &MatureMergeConfig, +) -> Option { + let retention_policy = index_config.retention_policy_opt.as_ref()?; + let period = retention_policy.retention_period().ok()?; + let retention_safety_buffer = Duration::from_hours(config.retention_safety_buffer_days * 24); + if period <= retention_safety_buffer { + // No safe window: exclude every split by returning a cutoff in the far future. + return Some(i64::MAX); + } + let cutoff_raw = now_secs - period.as_secs() as i64 + retention_safety_buffer.as_secs() as i64; + // Round up to the next day boundary so we never partially exclude a day bucket. + Some((cutoff_raw / SECS_PER_DAY + 1) * SECS_PER_DAY) +} + +/// Converts a single day-bucket group of eligible splits into one or more balanced +/// [`MergeOperation`]s respecting constraints. +fn plan_operations_for_group( + mut group_splits: Vec, + config: &MatureMergeConfig, +) -> Vec { + if group_splits.len() < config.min_merge_group_size { + return Vec::new(); + } + // Sort ascending by end time so each sub-operation covers the most compact range. + group_splits.sort_by_key(|s| s.time_range.as_ref().map(|r| *r.end()).unwrap_or(0)); + + let n = group_splits.len(); + let total_docs: usize = group_splits.iter().map(|s| s.num_docs).sum(); + + // Minimum number of balanced operations needed to respect both per-operation limits. + let k = n + .div_ceil(config.max_merge_group_size) + .max(total_docs.div_ceil(config.split_target_num_docs)) + .max(1); + + // Divide into k balanced chunks (first chunks are ≥ last chunks by at most 1 split). 
+ let chunk_size = n.div_ceil(k); + group_splits + .chunks(chunk_size) + .filter(|chunk| chunk.len() >= config.min_merge_group_size) + .map(|chunk| MergeOperation::new_merge_operation(chunk.to_vec())) + .collect() +} + +/// Groups splits by UTC day (floored to midnight in seconds) of the split's time range, +/// and returns one or more [`MergeOperation`]s per group that meets the size +/// threshold. +/// +/// Rules: +/// - Splits without a `time_range` are skipped (cannot assign a day). +/// - A split is only assigned to a bucket when *both* `time_range.start()` and `time_range.end()` +/// fall on the same UTC day (i.e., the split does not span midnight). +/// - Immature splits are excluded. +/// - Splits whose `time_range.end()` falls within the retention safety buffer are excluded. +/// +/// Important: This plan merges splits across sources. It can be problematic if +/// the IndexingSettings are different (e.g. different maturation period), which +/// was made possible on Kafka sources by specifying an override in the +/// client_params. +pub fn plan_merge_operations_for_index( + index_config: &IndexConfig, + splits: Vec, + now: OffsetDateTime, + config: &MatureMergeConfig, +) -> Vec { + let now_secs = now.unix_timestamp(); + + let earliest_cutoff_timestamp = retention_safety_cutoff_secs(index_config, now_secs, config); + + // Key: (partition_id, doc_mapping_uid_string, day_bucket_seconds, secondary_day_opt) + let mut groups: HashMap<(u64, String, i64, Option), Vec> = HashMap::new(); + + for split in splits { + // Only splits that have been mature for a while + if !split.is_mature(now - MATURITY_BUFFER) { + continue; + } + + // Enforce the max size for splits to be considered for merging. 
+ if split.num_docs > config.input_split_max_num_docs { + continue; + } + + // The timestamp field is required + let Some(ref time_range) = split.time_range else { + continue; + }; + + let start_day = time_range.start() / SECS_PER_DAY; + let end_day = time_range.end() / SECS_PER_DAY; + + // also group on secondary time range to make sure retention can still be applied + let secondary_day_opt = split + .secondary_time_range + .as_ref() + // In the nominal case, the secondary time (ingest time) is only + // slightly greater than the primary time (event time). Using + // `start()` here decreases the chances of further fragmenting the + // group at the day limits. + .map(|r| r.start() / SECS_PER_DAY); + + // Both endpoints must fall on the same UTC day. + if start_day != end_day { + continue; + } + + // Check that we are not too close to the retention cutoff. + if let Some(cutoff) = earliest_cutoff_timestamp + && *time_range.end() < cutoff + { + continue; + } + + let key = ( + split.partition_id, + split.doc_mapping_uid.to_string(), + start_day, + secondary_day_opt, + ); + groups.entry(key).or_default().push(split); + } + + let mut operations = Vec::new(); + for (_key, group_splits) in groups { + operations.extend(plan_operations_for_group(group_splits, config)); + } + operations +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use quickwit_config::{IndexConfig, RetentionPolicy}; + use quickwit_metastore::{SplitMaturity, SplitMetadata}; + use quickwit_proto::types::{DocMappingUid, IndexUid}; + use time::OffsetDateTime; + + use super::*; + + /// Builds a mature [`SplitMetadata`] for use in tests. + /// + /// - `day_bucket`: UTC day expressed as seconds-since-epoch (midnight). For example `day_bucket + /// = 0` means 1970-01-01, `day_bucket = SECS_PER_DAY` means 1970-01-02. 
+ fn mature_split_for_test( + split_id: &str, + index_uid: &IndexUid, + partition_id: u64, + doc_mapping_uid: DocMappingUid, + num_docs: usize, + day_bucket: i64, + ) -> SplitMetadata { + SplitMetadata { + split_id: split_id.to_string(), + index_uid: index_uid.clone(), + partition_id, + num_docs, + doc_mapping_uid, + // Both endpoints on the same UTC day — the split spans one hour. + time_range: Some(day_bucket..=(day_bucket + 3600)), + maturity: SplitMaturity::Mature, + ..Default::default() + } + } + + fn index_config_no_retention() -> IndexConfig { + IndexConfig::for_test("test-index", "s3://test-bucket/test-index") + } + + fn index_config_with_retention(period: &str) -> IndexConfig { + let mut config = index_config_no_retention(); + config.retention_policy_opt = Some(RetentionPolicy { + retention_period: period.to_string(), + evaluation_schedule: "daily".to_string(), + timestamp_type: Default::default(), + }); + config + } + + // UTC day 0 = 1970-01-01. Use a recent-ish day to avoid the retention buffer. + // We use day 20000 (approx 2024-10) so splits are "recent" relative to a "now" we control. + const RECENT_DAY: i64 = 20_000 * SECS_PER_DAY; + + fn now_well_after_recent_day() -> OffsetDateTime { + // 1 day after the splits' day — they are mature but not in a retention buffer. 
+ OffsetDateTime::from_unix_timestamp(RECENT_DAY + SECS_PER_DAY + 1).unwrap() + } + + #[test] + fn test_plan_basic() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert_eq!(operations.len(), 1); + assert_eq!(operations[0].splits.len(), 10); + } + + #[test] + fn test_plan_below_threshold() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + // Only 4 splits — below the min_merge_group_size (5). + let splits: Vec = (0..4) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig { + min_merge_group_size: 5, + ..Default::default() + }, + ); + + assert!(operations.is_empty(), "expected no operations for 4 splits"); + } + + #[test] + fn test_plan_immature_splits_excluded() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + let now = now_well_after_recent_day(); + let now_ts = now.unix_timestamp(); + + // All splits are immature (maturation period far in the future). + let splits: Vec = (0..10) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + split.maturity = SplitMaturity::Immature { + maturation_period: Duration::from_secs(999_999), + }; + // Make sure create_timestamp is recent so the split is truly immature. 
+ split.create_timestamp = now_ts; + split + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now, + &MatureMergeConfig::default(), + ); + + assert!(operations.is_empty(), "immature splits should be excluded"); + } + + #[test] + fn test_plan_multiday_split_skipped() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // 10 splits, but each one spans midnight (start on day N, end on day N+1). + let splits: Vec = (0..10) + .map(|i| { + let mut split = mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ); + // Extend time_range to cross midnight. + split.time_range = Some(RECENT_DAY - 3600..=RECENT_DAY + 3600); + split + }) + .collect(); + + let operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert!(operations.is_empty(), "multi-day splits should be skipped"); + } + + #[test] + fn test_plan_retention_safety_buffer() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // Retention period = 90 days. Safety buffer = 30 days. + // Splits must have time_range.end >= now - 90d + 30d = now - 60d. + // We put splits at RECENT_DAY but set "now" to be RECENT_DAY + 91 days. + // Then: cutoff_raw = (RECENT_DAY + 91d) - 90d + 30d = RECENT_DAY + 31d + // cutoff = RECENT_DAY + 32d (rounded up to next day boundary) + // Because RECENT_DAY + 3600 < cutoff, splits should be excluded. 
+ let now_ts = RECENT_DAY + 91 * SECS_PER_DAY; + let now = OffsetDateTime::from_unix_timestamp(now_ts).unwrap(); + + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let config = index_config_with_retention("90 days"); + + let merge_config = MatureMergeConfig { + retention_safety_buffer_days: 30, + ..MatureMergeConfig::default() + }; + let operations = plan_merge_operations_for_index(&config, splits, now, &merge_config); + + assert!( + operations.is_empty(), + "splits within retention safety buffer should be excluded" + ); + } + + #[test] + fn test_plan_retention_period_too_short_skipped() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + let splits: Vec = (0..10) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + 1, + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + // Retention period of 3 days is <= retention_safety_buffer_days (default 5 days) + // so the index should be skipped entirely. + let config = index_config_with_retention("3 days"); + + let operations = plan_merge_operations_for_index( + &config, + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + + assert!( + operations.is_empty(), + "index with short retention should produce no operations" + ); + } + + #[test] + fn test_plan_different_partitions_grouped_separately() { + let index_uid = IndexUid::for_test("test-index", 0); + let doc_mapping_uid = DocMappingUid::random(); + + // 6 splits per partition, two partitions => 2 separate merge operations. 
+ let splits: Vec = (0..12) + .map(|i| { + mature_split_for_test( + &format!("split-{i}"), + &index_uid, + i as u64 / 6, // partition 0 for i in 0..6, partition 1 for i in 6..12 + doc_mapping_uid, + 100, + RECENT_DAY, + ) + }) + .collect(); + + let mut operations = plan_merge_operations_for_index( + &index_config_no_retention(), + splits, + now_well_after_recent_day(), + &MatureMergeConfig::default(), + ); + operations.sort_by_key(|op| op.splits[0].partition_id); + + assert_eq!(operations.len(), 2); + assert!(operations[0].splits.iter().all(|s| s.partition_id == 0)); + assert!(operations[1].splits.iter().all(|s| s.partition_id == 1)); + } +} diff --git a/quickwit/quickwit-indexing/src/models/indexed_split.rs b/quickwit/quickwit-indexing/src/models/indexed_split.rs index e129feede9b..a622b241da9 100644 --- a/quickwit/quickwit-indexing/src/models/indexed_split.rs +++ b/quickwit/quickwit-indexing/src/models/indexed_split.rs @@ -105,12 +105,12 @@ impl IndexedSplitBuilder { partition_id, split_id, num_docs: 0, - replaced_split_ids: Vec::new(), uncompressed_docs_size_in_bytes: 0, time_range: None, secondary_time_range: None, delete_opstamp: last_delete_opstamp, num_merge_ops: 0, + replaced_splits: Vec::new(), }, index_writer, split_scratch_directory, diff --git a/quickwit/quickwit-indexing/src/models/mod.rs b/quickwit/quickwit-indexing/src/models/mod.rs index 9dfdfde1594..d1642791933 100644 --- a/quickwit/quickwit-indexing/src/models/mod.rs +++ b/quickwit/quickwit-indexing/src/models/mod.rs @@ -47,7 +47,7 @@ use quickwit_proto::types::PublishToken; pub use raw_doc_batch::RawDocBatch; pub(crate) use shard_positions::LocalShardPositionsUpdate; pub use shard_positions::ShardPositionsService; -pub use split_attrs::{SplitAttrs, create_split_metadata}; +pub use split_attrs::{ReplacedSplit, SplitAttrs, create_split_metadata}; #[derive(Debug)] pub struct NewPublishToken(pub PublishToken); diff --git a/quickwit/quickwit-indexing/src/models/publisher_message.rs 
b/quickwit/quickwit-indexing/src/models/publisher_message.rs index 13182a8f76a..e1ba9eb8ae2 100644 --- a/quickwit/quickwit-indexing/src/models/publisher_message.rs +++ b/quickwit/quickwit-indexing/src/models/publisher_message.rs @@ -22,11 +22,11 @@ use tracing::Span; use crate::merge_policy::MergeTask; use crate::models::PublishLock; +use crate::models::split_attrs::ReplacedSplit; pub struct SplitsUpdate { pub index_uid: IndexUid, pub new_splits: Vec, - pub replaced_split_ids: Vec, pub checkpoint_delta_opt: Option, pub publish_lock: PublishLock, pub publish_token_opt: Option, @@ -36,6 +36,7 @@ pub struct SplitsUpdate { /// If `None`, the split batch was built in the `IndexingPipeline`. pub merge_task: Option, pub parent_span: Span, + pub replaced_splits: Vec, } impl fmt::Debug for SplitsUpdate { diff --git a/quickwit/quickwit-indexing/src/models/split_attrs.rs b/quickwit/quickwit-indexing/src/models/split_attrs.rs index dde48fab25a..4a8076c4ed6 100644 --- a/quickwit/quickwit-indexing/src/models/split_attrs.rs +++ b/quickwit/quickwit-indexing/src/models/split_attrs.rs @@ -25,6 +25,14 @@ use time::OffsetDateTime; use crate::merge_policy::MergePolicy; +#[derive(PartialEq, Eq, Debug, Default, Clone)] +pub struct ReplacedSplit { + pub split_id: SplitId, + /// Snapshot of the split's soft-deletes. These will be consolidated into + /// the split during the merge. + pub soft_deleted_doc_ids: BTreeSet, +} + pub struct SplitAttrs { /// ID of the node that produced the split. pub node_id: NodeId, @@ -61,13 +69,13 @@ pub struct SplitAttrs { pub time_range: Option>, pub secondary_time_range: Option>, - pub replaced_split_ids: Vec, - /// Delete opstamp. pub delete_opstamp: u64, // Number of merge operation the split has been through so far. 
pub num_merge_ops: usize, + + pub replaced_splits: Vec, } impl fmt::Debug for SplitAttrs { @@ -75,7 +83,14 @@ impl fmt::Debug for SplitAttrs { f.debug_struct("SplitAttrs") .field("split_id", &self.split_id) .field("partition_id", &self.partition_id) - .field("replaced_split_ids", &self.replaced_split_ids) + .field( + "replaced_split_ids", + &self + .replaced_splits + .iter() + .map(|s| &s.split_id) + .collect::>(), + ) .field("time_range", &self.time_range) .field( "uncompressed_docs_size_in_bytes", @@ -137,6 +152,7 @@ pub fn create_split_metadata( footer_offsets, delete_opstamp: split_attrs.delete_opstamp, num_merge_ops: split_attrs.num_merge_ops, + soft_deleted_doc_ids: BTreeSet::new(), } } diff --git a/quickwit/quickwit-indexing/src/soft_delete_query.rs b/quickwit/quickwit-indexing/src/soft_delete_query.rs new file mode 100644 index 00000000000..fad5ed564b9 --- /dev/null +++ b/quickwit/quickwit-indexing/src/soft_delete_query.rs @@ -0,0 +1,377 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use tantivy::index::SegmentId; +use tantivy::query::{EmptyScorer, EnableScoring, Explanation, Query, Scorer, Weight}; +use tantivy::{DocId, DocSet, Score, SegmentReader, TERMINATED, TantivyError, Term}; + +/// A tantivy [`Query`] that matches specific doc IDs within their respective segments. 
+/// +/// Built from the `soft_deleted_doc_ids` fields of the input [`SplitMetadata`] structs, this +/// query is passed to [`IndexWriter::delete_query`] so that the matched documents are marked for +/// deletion and then physically removed during the subsequent tantivy merge. The query itself only +/// identifies which documents to remove; the actual deletion is performed by the caller. +#[derive(Clone, Debug)] +pub(crate) struct SoftDeletedDocIdsQuery { + /// Maps each segment ID to the **sorted** list of doc IDs to delete within that segment. + docs_per_segment: HashMap>, +} + +impl SoftDeletedDocIdsQuery { + pub(crate) fn new(docs_per_segment: HashMap>) -> Self { + Self { docs_per_segment } + } +} + +impl Query for SoftDeletedDocIdsQuery { + fn weight(&self, _: EnableScoring<'_>) -> tantivy::Result> { + Ok(Box::new(SoftDeletedDocIdsWeight { + docs_per_segment: self.docs_per_segment.clone(), + })) + } + + fn query_terms<'a>(&'a self, _visitor: &mut dyn FnMut(&'a Term, bool)) { + // Doc-ID–based query — no index terms to visit. + } +} + +/// Minimal `DocSet + Scorer` over a pre-sorted, deduplicated list of doc IDs. +/// +/// Starts positioned at the first document (no initial `advance()` call required). +struct SortedDocIdScorer { + doc_ids: Vec, + pos: usize, +} + +impl DocSet for SortedDocIdScorer { + fn advance(&mut self) -> DocId { + self.pos += 1; + self.doc() + } + + fn seek(&mut self, target: DocId) -> DocId { + // Binary-search to the first id >= target. 
+ self.pos = self.doc_ids.partition_point(|&id| id < target); + self.doc() + } + + fn doc(&self) -> DocId { + self.doc_ids.get(self.pos).copied().unwrap_or(TERMINATED) + } + + fn size_hint(&self) -> u32 { + self.doc_ids.len().saturating_sub(self.pos) as u32 + } +} + +impl Scorer for SortedDocIdScorer { + fn score(&mut self) -> Score { + 1.0 + } +} + +struct SoftDeletedDocIdsWeight { + docs_per_segment: HashMap>, +} + +impl Weight for SoftDeletedDocIdsWeight { + fn scorer(&self, reader: &SegmentReader, _boost: Score) -> tantivy::Result> { + let Some(doc_ids) = self.docs_per_segment.get(&reader.segment_id()) else { + return Ok(Box::new(EmptyScorer)); + }; + // Filter defensively: doc IDs must be < max_doc. The BTreeSet source guarantees + // strict ascending order, which SortedDocIdScorer requires. + let doc_ids: Vec = doc_ids + .iter() + .copied() + .filter(|&id| id < reader.max_doc()) + .collect(); + if doc_ids.is_empty() { + return Ok(Box::new(EmptyScorer)); + } + Ok(Box::new(SortedDocIdScorer { doc_ids, pos: 0 })) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + let is_deleted = self + .docs_per_segment + .get(&reader.segment_id()) + .map(|ids| ids.binary_search(&doc).is_ok()) + .unwrap_or(false); + if is_deleted { + Ok(Explanation::new("SoftDeletedDocIdsQuery", 1.0)) + } else { + Err(TantivyError::InvalidArgument(format!( + "Document #{doc} is not soft-deleted in this segment" + ))) + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use tantivy::collector::TopDocs; + use tantivy::index::SegmentId; + use tantivy::query::AllQuery; + use tantivy::schema::{STORED, Schema, TEXT, Value}; + use tantivy::{Index, IndexWriter, ReloadPolicy, TantivyDocument, doc}; + + use super::*; + + /// Build an in-RAM single-segment index where each entry in `texts` becomes + /// one stored document. All documents are committed in a single pass so + /// tantivy assigns them contiguous doc IDs starting at 0. 
+ fn make_index(texts: &[&str]) -> tantivy::Result<(Index, tantivy::schema::Field)> { + let mut schema_builder = Schema::builder(); + let body = schema_builder.add_text_field("body", TEXT | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer(15_000_000)?; + for text in texts { + writer.add_document(doc!(body => *text))?; + } + writer.commit()?; + Ok((index, body)) + } + + /// Apply `query` via `IndexWriter::delete_query`, commit, and return a + /// freshly-opened reader that reflects the resulting deletion state. + fn apply_delete_query( + index: &Index, + query: SoftDeletedDocIdsQuery, + ) -> tantivy::Result { + let mut writer: IndexWriter = index.writer(15_000_000)?; + writer.delete_query(Box::new(query))?; + writer.commit()?; + index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into() + } + + /// Collect and sort the stored body values of all live documents so that + /// tests can assert on the exact surviving content, independent of score + /// ordering. 
+ fn live_bodies( + reader: &tantivy::IndexReader, + body: tantivy::schema::Field, + ) -> tantivy::Result> { + let searcher = reader.searcher(); + let top_docs = searcher.search(&AllQuery, &TopDocs::with_limit(1_000).order_by_score())?; + let mut texts: Vec = top_docs + .iter() + .map(|(_, addr)| { + let doc: TantivyDocument = searcher.doc(*addr).unwrap(); + doc.get_first(body) + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string() + }) + .collect(); + texts.sort(); + Ok(texts) + } + + #[test] + fn test_delete_query_removes_targeted_docs() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c", "d", "e"])?; + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + let seg_readers = searcher.segment_readers(); + assert_eq!( + seg_readers.len(), + 1, + "expected a single segment after one commit" + ); + let segment_id = seg_readers[0].segment_id(); + drop(searcher); + + // Target doc IDs 1 ("b") and 3 ("d"). + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + let seg = &searcher_after.segment_readers()[0]; + + assert_eq!(seg.num_docs(), 3, "exactly 3 docs must survive"); + Ok(()) + } + + #[test] + fn test_delete_query_leaves_correct_docs_alive() -> tantivy::Result<()> { + let (index, body) = make_index(&["a", "b", "c", "d", "e"])?; + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let segment_id = { + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + // Delete docs 1 ("b") and 3 ("d"); "a", "c", "e" must survive. 
+ let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["a", "c", "e"]); + Ok(()) + } + + #[test] + fn test_delete_query_removes_all_docs() -> tantivy::Result<()> { + let (index, _) = make_index(&["x", "y", "z"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![0u32, 1u32, 2u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + let total_live_docs: u32 = searcher_after + .segment_readers() + .iter() + .map(|r| r.num_docs()) + .sum(); + assert_eq!(total_live_docs, 0, "all docs must be deleted"); + Ok(()) + } + + #[test] + fn test_delete_query_boundary_doc_ids() -> tantivy::Result<()> { + // Deleting the very first (0) and very last (3) doc IDs exercises the boundary + // positions of SortedDocIdScorer. 
+ let (index, body) = make_index(&["a", "b", "c", "d"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![0u32, 3u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["b", "c"]); + Ok(()) + } + + #[test] + fn test_delete_query_single_doc() -> tantivy::Result<()> { + let (index, body) = make_index(&["keep", "remove", "keep-too"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![1u32])])); + let reader_after = apply_delete_query(&index, query)?; + + let surviving = live_bodies(&reader_after, body)?; + assert_eq!(surviving, vec!["keep", "keep-too"]); + Ok(()) + } + + #[test] + fn test_delete_query_unknown_segment_id_has_no_effect() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c"])?; + + // Obtain a segment ID that definitely does not belong to `index` by + // creating an independent second index. + let (other_index, _) = make_index(&["z"])?; + let foreign_id: SegmentId = { + let other_reader: tantivy::IndexReader = other_index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let other_searcher = other_reader.searcher(); + other_searcher.segment_readers()[0].segment_id() + }; + + // Targeting all three doc IDs under the foreign segment must not delete anything. 
+ let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(foreign_id, vec![0u32, 1u32, 2u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 3, + "unknown segment ID must leave all docs intact" + ); + Ok(()) + } + + #[test] + fn test_delete_query_out_of_range_doc_ids_are_ignored() -> tantivy::Result<()> { + // The index has 2 docs (max_doc = 2, valid IDs are 0 and 1). + // Providing only out-of-range IDs must not delete anything. + let (index, _) = make_index(&["a", "b"])?; + let segment_id = { + let reader = index + .reader_builder() + .reload_policy(ReloadPolicy::Manual) + .try_into()?; + let searcher = reader.searcher(); + searcher.segment_readers()[0].segment_id() + }; + + let query = + SoftDeletedDocIdsQuery::new(HashMap::from([(segment_id, vec![10u32, 20u32, 100u32])])); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 2, + "out-of-range doc IDs must be silently ignored" + ); + Ok(()) + } + + #[test] + fn test_delete_query_empty_map_has_no_effect() -> tantivy::Result<()> { + let (index, _) = make_index(&["a", "b", "c"])?; + let query = SoftDeletedDocIdsQuery::new(HashMap::new()); + let reader_after = apply_delete_query(&index, query)?; + let searcher_after = reader_after.searcher(); + + assert_eq!( + searcher_after.segment_readers()[0].num_docs(), + 3, + "empty docs-per-segment map must delete nothing" + ); + Ok(()) + } +} diff --git a/quickwit/quickwit-indexing/src/source/ingest/mod.rs b/quickwit/quickwit-indexing/src/source/ingest/mod.rs index d9e21affb87..63c746aabe0 100644 --- a/quickwit/quickwit-indexing/src/source/ingest/mod.rs +++ b/quickwit/quickwit-indexing/src/source/ingest/mod.rs @@ -410,9 +410,8 @@ impl IngestSource { .assigned_shards .keys() .filter(|&shard_id| 
!new_assigned_shard_ids.contains(shard_id)) - .cloned() .any(|removed_shard_id| { - let Some(assigned_shard) = self.assigned_shards.get(&removed_shard_id) else { + let Some(assigned_shard) = self.assigned_shards.get(removed_shard_id) else { return false; }; assigned_shard.status != IndexingStatus::Complete diff --git a/quickwit/quickwit-indexing/src/source/kafka_source.rs b/quickwit/quickwit-indexing/src/source/kafka_source.rs index 5f93d0a9344..f1aca45bb98 100644 --- a/quickwit/quickwit-indexing/src/source/kafka_source.rs +++ b/quickwit/quickwit-indexing/src/source/kafka_source.rs @@ -25,7 +25,7 @@ use quickwit_actors::{ActorExitStatus, Mailbox}; use quickwit_config::KafkaSourceParams; use quickwit_metastore::checkpoint::{PartitionId, SourceCheckpoint}; use quickwit_proto::metastore::SourceType; -use quickwit_proto::types::{IndexUid, Position}; +use quickwit_proto::types::{IndexUid, NodeIdRef, Position}; use rdkafka::config::{ClientConfig, RDKafkaLogLevel}; use rdkafka::consumer::{ BaseConsumer, CommitMode, Consumer, ConsumerContext, DefaultConsumerContext, Rebalance, @@ -240,6 +240,7 @@ impl KafkaSource { let (events_tx, events_rx) = mpsc::channel(100); let (truncate_tx, truncate_rx) = watch::channel(SourceCheckpoint::default()); let (client_config, consumer, group_id) = create_consumer( + source_runtime.node_id(), source_runtime.index_uid(), source_runtime.source_id(), source_params, @@ -654,6 +655,7 @@ pub(super) async fn check_connectivity(params: KafkaSourceParams) -> anyhow::Res /// Creates a new `KafkaSourceConsumer`. 
fn create_consumer( + node_id: &NodeIdRef, index_uid: &IndexUid, source_id: &str, params: KafkaSourceParams, @@ -676,6 +678,7 @@ fn create_consumer( params.enable_backfill_mode.to_string(), ) .set("group.id", &group_id) + .set("client.id", node_id.as_str()) .set_log_level(log_level) .create_with_context(RdKafkaContext { topic: params.topic, diff --git a/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs b/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs index bcb07d79020..cddbcaa002e 100644 --- a/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/control_plane_metastore.rs @@ -32,8 +32,9 @@ use quickwit_proto::metastore::{ ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreResult, MetastoreService, MetastoreServiceClient, MetastoreServiceStream, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, PublishSplitsRequest, ResetSourceCheckpointRequest, - StageSplitsRequest, ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, - UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, + SoftDeleteDocumentsRequest, SoftDeleteDocumentsResponse, StageSplitsRequest, + ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + UpdateSplitsDeleteOpstampResponse, }; /// A [`MetastoreService`] implementation that proxies some requests to the control plane so it can @@ -188,6 +189,13 @@ impl MetastoreService for ControlPlaneMetastore { self.metastore.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + self.metastore.soft_delete_documents(request).await + } + async fn reset_source_checkpoint( &self, request: ResetSourceCheckpointRequest, diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs 
b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs index 4b53cbf648b..bd1677e89fd 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/file_backed_index/mod.rs @@ -32,7 +32,7 @@ use quickwit_proto::metastore::{ AcquireShardsRequest, AcquireShardsResponse, DeleteQuery, DeleteShardsRequest, DeleteShardsResponse, DeleteTask, EntityKind, IndexStats, ListShardsSubrequest, ListShardsSubresponse, MetastoreError, MetastoreResult, OpenShardSubrequest, - OpenShardSubresponse, PruneShardsRequest, SplitStats, + OpenShardSubresponse, PruneShardsRequest, SplitDocIds, SplitStats, }; use quickwit_proto::types::{IndexUid, PublishToken, SourceId, SplitId}; use serde::{Deserialize, Serialize}; @@ -43,7 +43,7 @@ use tracing::{info, warn}; use super::MutationOccurred; use crate::checkpoint::IndexCheckpointDelta; -use crate::metastore::{SortBy, use_shard_api}; +use crate::metastore::{MAX_SOFT_DELETED_DOCS_PER_SPLIT, SortBy, use_shard_api}; use crate::{IndexMetadata, ListSplitsQuery, Split, SplitMetadata, SplitState, split_tag_filter}; /// A `FileBackedIndex` object carries an index metadata and its split metadata. @@ -498,6 +498,63 @@ impl FileBackedIndex { Ok(()) } + /// Soft-deletes individual documents within published splits. + pub(crate) fn soft_delete_documents( + &mut self, + split_doc_ids: &[SplitDocIds], + ) -> MetastoreResult { + // First pass: validate all splits before making any changes to guarantee atomicity. 
+ for entry in split_doc_ids { + let split = self.splits.get(&entry.split_id).ok_or_else(|| { + MetastoreError::NotFound(EntityKind::Split { + split_id: entry.split_id.clone(), + }) + })?; + if split.split_state != SplitState::Published { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: entry.split_id.clone(), + }, + message: format!("split `{}` is not in Published state", entry.split_id), + }); + } + let current_count = split.split_metadata.soft_deleted_doc_ids.len(); + let new_unique_count = entry + .doc_ids + .iter() + .filter(|&&id| !split.split_metadata.soft_deleted_doc_ids.contains(&id)) + .count(); + if current_count + new_unique_count > MAX_SOFT_DELETED_DOCS_PER_SPLIT { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: entry.split_id.clone(), + }, + message: format!( + "split `{}` would exceed the maximum number of soft-deleted documents \ + ({MAX_SOFT_DELETED_DOCS_PER_SPLIT}): current={current_count}, would be={}", + entry.split_id, + current_count + new_unique_count, + ), + }); + } + } + + // Second pass: all splits are valid — apply changes. 
+ let mut num_soft_deleted = 0u64; + for entry in split_doc_ids { + let split = self + .splits + .get_mut(&entry.split_id) + .expect("split existence validated in first pass"); + for &doc_id in &entry.doc_ids { + if split.split_metadata.soft_deleted_doc_ids.insert(doc_id) { + num_soft_deleted += 1; + } + } + } + Ok(num_soft_deleted) + } + /// Gets IndexStats for this index pub(crate) fn get_stats(&self) -> MetastoreResult { let mut staged_stats = SplitStats::default(); @@ -724,6 +781,11 @@ impl Debug for Stamper { } fn split_query_predicate(split: &&Split, query: &ListSplitsQuery) -> bool { + if let Some(split_ids) = &query.split_ids + && !split_ids.contains(&split.split_metadata.split_id) + { + return false; + } if !split_tag_filter(&split.split_metadata, query.tags.as_ref()) { return false; } @@ -814,11 +876,14 @@ mod tests { use quickwit_doc_mapper::tag_pruning::TagFilterAst; use quickwit_proto::ingest::Shard; - use quickwit_proto::metastore::{ListShardsSubrequest, SplitStats}; + use quickwit_proto::metastore::{ + EntityKind, ListShardsSubrequest, MetastoreError, SplitDocIds, SplitStats, + }; use quickwit_proto::types::{IndexUid, SourceId}; use super::FileBackedIndex; use crate::file_backed::file_backed_index::split_query_predicate; + use crate::metastore::MAX_SOFT_DELETED_DOCS_PER_SPLIT; use crate::{IndexMetadata, ListSplitsQuery, Split, SplitMetadata, SplitState}; impl FileBackedIndex { @@ -949,6 +1014,15 @@ mod tests { assert!(split_query_predicate(&&split_1, &query)); assert!(split_query_predicate(&&split_2, &query)); assert!(!split_query_predicate(&&split_3, &query)); + + let query = ListSplitsQuery::for_index(IndexUid::new_with_random_ulid("test-index")) + .with_split_ids(vec![ + split_1.split_metadata.split_id.clone(), + split_2.split_metadata.split_id.clone(), + ]); + assert!(split_query_predicate(&&split_1, &query)); + assert!(split_query_predicate(&&split_2, &query)); + assert!(!split_query_predicate(&&split_3, &query)); } #[test] @@ -1019,4 +1093,151 
@@ mod tests { assert_eq!(stats.published, expected_published); assert_eq!(stats.marked_for_deletion, expected_marked_for_deletion); } + + /// Helper: creates a `FileBackedIndex` with a single published split. + fn make_index_with_published_split(split_id: &str) -> FileBackedIndex { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + let split_metadata = SplitMetadata { + split_id: split_id.to_string(), + ..Default::default() + }; + index.stage_split(split_metadata).unwrap(); + index + .publish_splits([split_id], Vec::<&str>::new(), None, None) + .unwrap(); + index + } + + #[test] + fn test_soft_delete_documents_basic() { + let mut index = make_index_with_published_split("split-a"); + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 5, 42], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + assert_eq!(num_deleted, 3); + + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids, + BTreeSet::from([1, 5, 42]) + ); + } + + #[test] + fn test_soft_delete_documents_idempotent() { + let mut index = make_index_with_published_split("split-a"); + + // First call: delete doc IDs 1, 2, 3. + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 2, 3], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + assert_eq!(num_deleted, 3); + + // Second call: same IDs plus one new one. + let split_doc_ids = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![1, 2, 3, 4], + }]; + let num_deleted = index.soft_delete_documents(&split_doc_ids).unwrap(); + // Only doc_id 4 is new. 
+ assert_eq!(num_deleted, 1); + + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids, + BTreeSet::from([1, 2, 3, 4]) + ); + } + + #[test] + fn test_soft_delete_documents_non_published_split_fails() { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + let split_metadata = SplitMetadata { + split_id: "staged-split".to_string(), + ..Default::default() + }; + index.stage_split(split_metadata).unwrap(); + // The split is still in Staged state — not Published. + + let split_doc_ids = vec![SplitDocIds { + split_id: "staged-split".to_string(), + doc_ids: vec![10], + }]; + let error = index.soft_delete_documents(&split_doc_ids).unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition error, got: {error:?}" + ); + } + + #[test] + fn test_soft_delete_documents_unknown_split_fails() { + let index_metadata = + IndexMetadata::for_test("test-index", "file:///qwdata/indexes/test-index"); + let mut index = FileBackedIndex::new(index_metadata, Vec::new(), HashMap::new(), vec![]); + + let split_doc_ids = vec![SplitDocIds { + split_id: "nonexistent-split".to_string(), + doc_ids: vec![1], + }]; + let error = index.soft_delete_documents(&split_doc_ids).unwrap_err(); + assert!( + matches!(error, MetastoreError::NotFound(EntityKind::Split { .. })), + "expected NotFound error, got: {error:?}" + ); + } + + #[test] + fn test_soft_delete_documents_limit_exceeded() { + let mut index = make_index_with_published_split("split-a"); + + // Pre-populate with MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1 soft-deleted doc IDs. 
+ let initial_ids: Vec = (0..MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1).collect(); + let initial_entries = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: initial_ids, + }]; + index.soft_delete_documents(&initial_entries).unwrap(); + + // Adding 2 more unique IDs would push the total to MAX + 1 — must fail. + let overflow_entries = vec![SplitDocIds { + split_id: "split-a".to_string(), + doc_ids: vec![ + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1, + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32, + ], + }]; + let error = index.soft_delete_documents(&overflow_entries).unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition error when limit exceeded, got: {error:?}" + ); + + // The split must be unchanged — still at MAX - 1 entries. + let split = index.splits.get("split-a").unwrap(); + assert_eq!( + split.split_metadata.soft_deleted_doc_ids.len(), + MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1 + ); + } } diff --git a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs index 2542f1db36f..af3df2a363d 100644 --- a/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/file_backed/mod.rs @@ -55,8 +55,9 @@ use quickwit_proto::metastore::{ ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, PublishSplitsRequest, ResetSourceCheckpointRequest, - StageSplitsRequest, ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, - UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, serde_utils, + SoftDeleteDocumentsRequest, SoftDeleteDocumentsResponse, StageSplitsRequest, + ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + 
UpdateSplitsDeleteOpstampResponse, serde_utils, }; use quickwit_proto::types::{IndexId, IndexUid}; use quickwit_storage::Storage; @@ -729,6 +730,23 @@ impl MetastoreService for FileBackedMetastore { Ok(EmptyResponse {}) } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + let index_uid = request.index_uid().clone(); + let num_soft_deleted_doc_ids = self + .mutate(&index_uid, |index| { + let num_soft_deleted_doc_ids = + index.soft_delete_documents(&request.split_doc_ids)?; + Ok(MutationOccurred::Yes(num_soft_deleted_doc_ids)) + }) + .await?; + Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids, + }) + } + async fn add_source(&self, request: AddSourceRequest) -> MetastoreResult { let source_config = request.deserialize_source_config()?; let index_uid = request.index_uid(); diff --git a/quickwit/quickwit-metastore/src/metastore/mod.rs b/quickwit/quickwit-metastore/src/metastore/mod.rs index 98f2f1d5039..187ad1676d9 100644 --- a/quickwit/quickwit-metastore/src/metastore/mod.rs +++ b/quickwit/quickwit-metastore/src/metastore/mod.rs @@ -49,6 +49,10 @@ use crate::{Split, SplitMetadata, SplitState}; /// Splits batch size returned by the stream splits API pub(crate) const STREAM_SPLITS_CHUNK_SIZE: usize = 100; +/// Maximum number of soft-deleted document IDs allowed per split. +/// Attempts to soft-delete documents that would push the total above this limit will fail. +pub(crate) const MAX_SOFT_DELETED_DOCS_PER_SPLIT: usize = 10_000; + /// An extended trait for [`MetastoreService`]. #[async_trait] pub trait MetastoreServiceExt: MetastoreService { @@ -640,6 +644,10 @@ pub struct ListSplitsQuery { /// A specific node ID to filter by. pub node_id: Option, + /// A non-empty list of split IDs to fetch, or + /// None to ignore this filter. + pub split_ids: Option>, + /// The maximum number of splits to retrieve. 
pub limit: Option, @@ -739,6 +747,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, } } @@ -765,6 +774,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, }) } @@ -787,6 +797,7 @@ impl ListSplitsQuery { mature: Bound::Unbounded, sort_by: SortBy::None, after_split: None, + split_ids: None, } } @@ -796,6 +807,12 @@ impl ListSplitsQuery { self } + /// Selects only splits with the specified IDs. + pub fn with_split_ids(mut self, split_ids: Vec) -> Self { + self.split_ids = Some(split_ids); + self + } + /// Sets the maximum number of splits to retrieve. pub fn with_limit(mut self, n: usize) -> Self { self.limit = Some(n); diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs index d4296fb7ee6..08c6e378254 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/metastore.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::fmt::{self, Write}; use std::str::FromStr; use std::time::Duration; @@ -43,8 +43,9 @@ use quickwit_proto::metastore::{ ListSplitsRequest, ListSplitsResponse, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, MetastoreResult, MetastoreService, MetastoreServiceStream, OpenShardSubrequest, OpenShardSubresponse, OpenShardsRequest, OpenShardsResponse, PruneShardsRequest, - PublishSplitsRequest, ResetSourceCheckpointRequest, SplitStats, StageSplitsRequest, - ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, + PublishSplitsRequest, ResetSourceCheckpointRequest, SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, SplitStats, StageSplitsRequest, ToggleSourceRequest, + UpdateIndexRequest, UpdateSourceRequest, UpdateSplitsDeleteOpstampRequest, UpdateSplitsDeleteOpstampResponse, serde_utils, }; use quickwit_proto::types::{IndexId, IndexUid, Position, PublishToken, ShardId, SourceId}; @@ -72,13 +73,14 @@ use crate::file_backed::MutationOccurred; use crate::metastore::postgres::model::Shards; use crate::metastore::postgres::utils::split_maturity_timestamp; use crate::metastore::{ - IndexesMetadataResponseExt, PublishSplitsRequestExt, STREAM_SPLITS_CHUNK_SIZE, - UpdateSourceRequestExt, use_shard_api, + IndexesMetadataResponseExt, MAX_SOFT_DELETED_DOCS_PER_SPLIT, PublishSplitsRequestExt, + STREAM_SPLITS_CHUNK_SIZE, UpdateSourceRequestExt, use_shard_api, }; use crate::{ AddSourceRequestExt, CreateIndexRequestExt, IndexMetadata, IndexMetadataResponseExt, ListIndexesMetadataResponseExt, ListSplitsRequestExt, ListSplitsResponseExt, - MetastoreServiceExt, Split, SplitState, StageSplitsRequestExt, UpdateIndexRequestExt, + MetastoreServiceExt, Split, SplitMetadata, SplitState, StageSplitsRequestExt, + UpdateIndexRequestExt, }; /// PostgreSQL metastore implementation. 
@@ -1165,6 +1167,124 @@ impl MetastoreService for PostgresqlMetastore { Ok(EmptyResponse {}) } + #[instrument(skip(self))] + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> MetastoreResult { + let index_uid: IndexUid = request.index_uid().clone(); + let split_doc_ids = request.split_doc_ids; + + if split_doc_ids.is_empty() { + return Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids: 0, + }); + } + + // Fetches current metadata for all requested splits in a single round-trip, locking + // the rows for the duration of the transaction. + const FETCH_SPLITS_METADATA_QUERY: &str = r#" + SELECT split_id, split_metadata_json + FROM splits + WHERE + index_uid = $1 + AND split_id = ANY($2) + AND split_state = 'Published' + FOR UPDATE + "#; + + // Updates all modified splits in a single round-trip via UNNEST. + const UPDATE_SPLITS_METADATA_QUERY: &str = r#" + UPDATE splits + SET + split_metadata_json = updates.split_metadata_json, + update_timestamp = (CURRENT_TIMESTAMP AT TIME ZONE 'UTC') + FROM UNNEST($1::TEXT[], $2::TEXT[]) AS updates(split_id, split_metadata_json) + WHERE + splits.index_uid = $3 + AND splits.split_id = updates.split_id + AND splits.split_state = 'Published' + "#; + + // Build a lookup map: split_id → new doc IDs to add. + let mut new_ids_by_split: HashMap<&str, BTreeSet> = HashMap::new(); + for split in &split_doc_ids { + let entry = new_ids_by_split.entry(split.split_id.as_str()).or_default(); + entry.extend(split.doc_ids.iter().copied()); + } + + let requested_split_ids: Vec<&str> = + split_doc_ids.iter().map(|s| s.split_id.as_str()).collect(); + + run_with_tx!(self.connection_pool, tx, "soft delete documents", { + // Phase 1: fetch and lock all relevant splits, merge new doc IDs, validate limits. + // Any error here causes the transaction to roll back, so no split is modified. 
+ let rows: Vec<(String, String)> = sqlx::query_as(FETCH_SPLITS_METADATA_QUERY) + .bind(&index_uid) + .bind(&requested_split_ids) + .fetch_all(tx.as_mut()) + .await + .map_err(|sqlx_error| convert_sqlx_err(&index_uid.index_id, sqlx_error))?; + + let mut updated_split_ids: Vec = Vec::with_capacity(rows.len()); + let mut updated_metadata_jsons: Vec = Vec::with_capacity(rows.len()); + let mut total_soft_deleted: u64 = 0; + + for (split_id, split_metadata_json) in rows { + let new_ids = new_ids_by_split + .get(split_id.as_str()) + .cloned() + .unwrap_or_default(); + + let mut split_metadata = serde_json::from_str::( + &split_metadata_json, + ) + .map_err(|error| MetastoreError::JsonDeserializeError { + struct_name: "SplitMetadata".to_string(), + message: error.to_string(), + })?; + + let old_count = split_metadata.soft_deleted_doc_ids.len(); + split_metadata.soft_deleted_doc_ids.extend(new_ids); + let new_count = split_metadata.soft_deleted_doc_ids.len(); + if old_count == new_count { + continue; + } + + if new_count > MAX_SOFT_DELETED_DOCS_PER_SPLIT { + return Err(MetastoreError::FailedPrecondition { + entity: EntityKind::Split { + split_id: split_id.clone(), + }, + message: format!( + "split `{split_id}` would exceed the maximum number of soft-deleted \ + documents ({MAX_SOFT_DELETED_DOCS_PER_SPLIT}): would be {new_count}", + ), + }); + } + + updated_metadata_jsons.push(serde_utils::to_json_str(&split_metadata)?); + updated_split_ids.push(split_id); + total_soft_deleted += (new_count - old_count) as u64; + } + + // Phase 2: all validations passed — apply all updates in a single query. 
+ if !updated_split_ids.is_empty() { + sqlx::query(UPDATE_SPLITS_METADATA_QUERY) + .bind(&updated_split_ids) + .bind(&updated_metadata_jsons) + .bind(&index_uid) + .execute(tx.as_mut()) + .await + .map_err(|sqlx_error| convert_sqlx_err(&index_uid.index_id, sqlx_error))?; + } + + Ok(SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids: total_soft_deleted, + }) + }) + } + #[instrument(skip(self))] async fn add_source(&self, request: AddSourceRequest) -> MetastoreResult { let source_config = request.deserialize_source_config()?; @@ -2241,6 +2361,18 @@ mod tests { sql.to_string(PostgresQueryBuilder), r#"SELECT * FROM "splits" WHERE "time_range_end" <= 42"# ); + + let mut select_statement = Query::select(); + let sql = select_statement.column(Asterisk).from(Splits::Table); + + let query = ListSplitsQuery::for_all_indexes() + .with_split_ids(vec!["split-1".to_string(), "split-2".to_string()]); + append_query_filters_and_order_by(sql, &query); + + assert_eq!( + sql.to_string(PostgresQueryBuilder), + r#"SELECT * FROM "splits" WHERE "split_id" IN ('split-1', 'split-2')"# + ); } #[test] diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/model.rs b/quickwit/quickwit-metastore/src/metastore/postgres/model.rs index 86853c531b4..8c605859f55 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/model.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/model.rs @@ -90,6 +90,7 @@ pub enum Splits { IndexUid, NodeId, DeleteOpstamp, + SoftDeletedDocIds, } pub(super) struct ToTimestampFunc; diff --git a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs index b5769201948..f0d87246c0c 100644 --- a/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs +++ b/quickwit/quickwit-metastore/src/metastore/postgres/utils.rs @@ -107,6 +107,10 @@ pub(super) fn append_query_filters_and_order_by( sql.cond_where(Expr::col(Splits::IndexUid).is_in(index_uids)); } + if let Some(split_ids) = 
&query.split_ids { + sql.cond_where(Expr::col(Splits::SplitId).is_in(split_ids)); + } + if let Some(node_id) = &query.node_id { sql.cond_where(Expr::col(Splits::NodeId).eq(node_id)); }; diff --git a/quickwit/quickwit-metastore/src/split_metadata.rs b/quickwit/quickwit-metastore/src/split_metadata.rs index 829029e5d43..3de6f9122f4 100644 --- a/quickwit/quickwit-metastore/src/split_metadata.rs +++ b/quickwit/quickwit-metastore/src/split_metadata.rs @@ -135,6 +135,9 @@ pub struct SplitMetadata { /// Doc mapping UID used when creating this split. This split may only be merged with other /// splits using the same doc mapping UID. pub doc_mapping_uid: DocMappingUid, + + /// Set of tantivy doc_ids that have been soft-deleted from this split. + pub soft_deleted_doc_ids: BTreeSet, } impl fmt::Debug for SplitMetadata { @@ -180,6 +183,9 @@ impl fmt::Debug for SplitMetadata { debug_struct.field("footer_offsets", &self.footer_offsets); debug_struct.field("delete_opstamp", &self.delete_opstamp); debug_struct.field("num_merge_ops", &self.num_merge_ops); + if !self.soft_deleted_doc_ids.is_empty() { + debug_struct.field("soft_deleted_doc_ids", &self.soft_deleted_doc_ids); + } debug_struct.finish() } } @@ -286,6 +292,7 @@ impl quickwit_config::TestableForRegression for SplitMetadata { footer_offsets: 1000..2000, num_merge_ops: 3, doc_mapping_uid: DocMappingUid::default(), + soft_deleted_doc_ids: BTreeSet::new(), } } @@ -427,6 +434,7 @@ mod tests { delete_opstamp: 0, num_merge_ops: 0, doc_mapping_uid: DocMappingUid::default(), + soft_deleted_doc_ids: BTreeSet::new(), }; let expected_output = diff --git a/quickwit/quickwit-metastore/src/split_metadata_version.rs b/quickwit/quickwit-metastore/src/split_metadata_version.rs index 5f6204c85b7..43b38542133 100644 --- a/quickwit/quickwit-metastore/src/split_metadata_version.rs +++ b/quickwit/quickwit-metastore/src/split_metadata_version.rs @@ -97,6 +97,10 @@ pub(crate) struct SplitMetadataV0_8 { // splits before when updates first appeared 
are compatible with each other. #[serde(default)] doc_mapping_uid: DocMappingUid, + + /// Set of tantivy doc_ids that have been soft-deleted from this split. + #[serde(default)] + pub soft_deleted_doc_ids: BTreeSet, } impl From for SplitMetadata { @@ -134,6 +138,7 @@ impl From for SplitMetadata { footer_offsets: v8.footer_offsets, num_merge_ops: v8.num_merge_ops, doc_mapping_uid: v8.doc_mapping_uid, + soft_deleted_doc_ids: v8.soft_deleted_doc_ids, } } } @@ -157,6 +162,7 @@ impl From for SplitMetadataV0_8 { footer_offsets: split.footer_offsets, num_merge_ops: split.num_merge_ops, doc_mapping_uid: split.doc_mapping_uid, + soft_deleted_doc_ids: split.soft_deleted_doc_ids, } } } diff --git a/quickwit/quickwit-metastore/src/tests/mod.rs b/quickwit/quickwit-metastore/src/tests/mod.rs index d6e549baf25..0c598966a6a 100644 --- a/quickwit/quickwit-metastore/src/tests/mod.rs +++ b/quickwit/quickwit-metastore/src/tests/mod.rs @@ -575,6 +575,36 @@ macro_rules! metastore_test_suite { let _ = tracing_subscriber::fmt::try_init(); $crate::tests::get_identity::test_metastore_get_identity::<$metastore_type>().await; } + + /// Soft-delete documents API tests + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_idempotent() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_idempotent::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async fn test_metastore_soft_delete_documents_non_published_split() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_non_published_split::<$metastore_type>().await; + } + + #[tokio::test] + #[serial_test::file_serial] + async 
fn test_metastore_soft_delete_documents_limit_exceeded() { + let _ = tracing_subscriber::fmt::try_init(); + $crate::tests::split::test_metastore_soft_delete_documents_limit_exceeded::<$metastore_type>().await; + } } }; } diff --git a/quickwit/quickwit-metastore/src/tests/split.rs b/quickwit/quickwit-metastore/src/tests/split.rs index 9e6d45265e3..16a96905936 100644 --- a/quickwit/quickwit-metastore/src/tests/split.rs +++ b/quickwit/quickwit-metastore/src/tests/split.rs @@ -20,7 +20,7 @@ use quickwit_config::{IndexConfig, SourceConfig, SourceParams}; use quickwit_proto::metastore::{ CreateIndexRequest, DeleteSplitsRequest, EntityKind, IndexMetadataRequest, ListSplitsRequest, ListStaleSplitsRequest, MarkSplitsForDeletionRequest, MetastoreError, PublishSplitsRequest, - StageSplitsRequest, UpdateSplitsDeleteOpstampRequest, + SoftDeleteDocumentsRequest, SplitDocIds, StageSplitsRequest, UpdateSplitsDeleteOpstampRequest, }; use quickwit_proto::types::{IndexUid, Position}; use time::OffsetDateTime; @@ -29,7 +29,7 @@ use tracing::{error, info}; use super::DefaultForTest; use crate::checkpoint::{IndexCheckpointDelta, PartitionId, SourceCheckpointDelta}; -use crate::metastore::MetastoreServiceStreamSplitsExt; +use crate::metastore::{MAX_SOFT_DELETED_DOCS_PER_SPLIT, MetastoreServiceStreamSplitsExt}; use crate::tests::cleanup_index; use crate::{ CreateIndexRequestExt, IndexMetadataResponseExt, ListSplitsQuery, ListSplitsRequestExt, @@ -1806,3 +1806,431 @@ pub async fn test_metastore_update_splits_delete_opstamp< cleanup_index(&mut metastore, index_uid).await; } } + +pub async fn test_metastore_soft_delete_documents< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-docs"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = 
CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + let split_id = format!("{index_id}--split-1"); + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata).unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 5, 42], + }], + }; + let response = metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + assert!(response.num_soft_deleted_doc_ids > 0); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&5)); + assert!(soft_deleted.contains(&42)); + assert_eq!(soft_deleted.len(), 3); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_idempotent< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-idempotent"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = 
IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + let split_id = format!("{index_id}--split-1"); + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata).unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + // First call: soft-delete doc IDs [1, 2, 3]. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + // Second call: same doc IDs — must not return an error. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + // The set of soft-deleted IDs must still be exactly {1, 2, 3}. 
+ let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert_eq!(soft_deleted.len(), 3); + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&2)); + assert!(soft_deleted.contains(&3)); + + // Third call: same IDs plus one new one — must extend the set by exactly one. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![1, 2, 3, 4], + }], + }; + metastore + .soft_delete_documents(soft_delete_request) + .await + .unwrap(); + + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(splits.len(), 1); + let soft_deleted = &splits[0].split_metadata.soft_deleted_doc_ids; + assert_eq!(soft_deleted.len(), 4); + assert!(soft_deleted.contains(&1)); + assert!(soft_deleted.contains(&2)); + assert!(soft_deleted.contains(&3)); + assert!(soft_deleted.contains(&4)); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_non_published_split< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-unpublished"); + let index_uri = format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + // Stage a split but do NOT publish it. 
+ let staged_split_id = format!("{index_id}--split1"); + let staged_split_metadata = SplitMetadata { + split_id: staged_split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &staged_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + // Stage, publish, then mark another split for deletion. + let marked_split_id = format!("{index_id}--split2"); + let marked_split_metadata = SplitMetadata { + split_id: marked_split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_splits_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &marked_split_metadata) + .unwrap(); + metastore.stage_splits(stage_splits_request).await.unwrap(); + + let publish_splits_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![marked_split_id.clone()], + ..Default::default() + }; + metastore + .publish_splits(publish_splits_request) + .await + .unwrap(); + + let mark_for_deletion_request = + MarkSplitsForDeletionRequest::new(index_uid.clone(), vec![marked_split_id.clone()]); + metastore + .mark_splits_for_deletion(mark_for_deletion_request) + .await + .unwrap(); + + // Attempt to soft-delete documents on the staged split. + // Implementations may return an error (file-backed) or silently skip (postgres) — both are + // valid. What matters is that the split's soft_deleted_doc_ids remains unmodified. 
+ let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: staged_split_id.clone(), + doc_ids: vec![10, 20], + }], + }; + let _ = metastore.soft_delete_documents(soft_delete_request).await; + + let list_staged_request = ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid.clone()).with_split_state(SplitState::Staged), + ) + .unwrap(); + let staged_splits = metastore + .list_splits(list_staged_request) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(staged_splits.len(), 1); + assert!( + staged_splits[0] + .split_metadata + .soft_deleted_doc_ids + .is_empty(), + "staged split must not have any soft-deleted doc IDs" + ); + + // Attempt to soft-delete documents on the marked-for-deletion split. + let soft_delete_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: marked_split_id.clone(), + doc_ids: vec![30, 40], + }], + }; + let _ = metastore.soft_delete_documents(soft_delete_request).await; + + let list_marked_request = ListSplitsRequest::try_from_list_splits_query( + &ListSplitsQuery::for_index(index_uid.clone()) + .with_split_state(SplitState::MarkedForDeletion), + ) + .unwrap(); + let marked_splits = metastore + .list_splits(list_marked_request) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + assert_eq!(marked_splits.len(), 1); + assert!( + marked_splits[0] + .split_metadata + .soft_deleted_doc_ids + .is_empty(), + "marked-for-deletion split must not have any soft-deleted doc IDs" + ); + + cleanup_index(&mut metastore, index_uid).await; +} + +pub async fn test_metastore_soft_delete_documents_limit_exceeded< + MetastoreToTest: MetastoreServiceExt + DefaultForTest, +>() { + let mut metastore = MetastoreToTest::default_for_test().await; + + let index_id = append_random_suffix("test-soft-delete-limit"); + let index_uri = 
format!("ram:///indexes/{index_id}"); + let index_config = IndexConfig::for_test(&index_id, &index_uri); + + let create_index_request = CreateIndexRequest::try_from_index_config(&index_config).unwrap(); + let index_uid: IndexUid = metastore + .create_index(create_index_request) + .await + .unwrap() + .index_uid() + .clone(); + + // Create and publish two splits. + let split_a_id = format!("{index_id}--split-a"); + let split_b_id = format!("{index_id}--split-b"); + + for split_id in [&split_a_id, &split_b_id] { + let split_metadata = SplitMetadata { + split_id: split_id.clone(), + index_uid: index_uid.clone(), + ..Default::default() + }; + let stage_request = + StageSplitsRequest::try_from_split_metadata(index_uid.clone(), &split_metadata) + .unwrap(); + metastore.stage_splits(stage_request).await.unwrap(); + + let publish_request = PublishSplitsRequest { + index_uid: Some(index_uid.clone()), + staged_split_ids: vec![split_id.clone()], + ..Default::default() + }; + metastore.publish_splits(publish_request).await.unwrap(); + } + + // Pre-populate split-b with MAX - 1 soft-deleted doc IDs so one more would be fine but two + // would exceed the limit. + let initial_ids: Vec = (0..MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1).collect(); + let pre_populate_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![SplitDocIds { + split_id: split_b_id.clone(), + doc_ids: initial_ids, + }], + }; + metastore + .soft_delete_documents(pre_populate_request) + .await + .unwrap(); + + // Request that would: + // - soft-delete 1 doc on split-a (valid on its own) + // - soft-delete 2 *new* docs on split-b (would push total from MAX-1 to MAX+1) + // The whole request must fail and neither split must be modified. 
+ let overflow_request = SoftDeleteDocumentsRequest { + index_uid: Some(index_uid.clone()), + split_doc_ids: vec![ + SplitDocIds { + split_id: split_a_id.clone(), + doc_ids: vec![100], + }, + SplitDocIds { + split_id: split_b_id.clone(), + doc_ids: vec![ + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32 - 1, + MAX_SOFT_DELETED_DOCS_PER_SPLIT as u32, + ], + }, + ], + }; + let error = metastore + .soft_delete_documents(overflow_request) + .await + .unwrap_err(); + assert!( + matches!( + error, + MetastoreError::FailedPrecondition { + entity: EntityKind::Split { .. }, + .. + } + ), + "expected FailedPrecondition when soft-deleted doc limit is exceeded, got: {error:?}" + ); + + // Verify atomicity: both splits must be unmodified after the failed request. + let splits = metastore + .list_splits(ListSplitsRequest::try_from_index_uid(index_uid.clone()).unwrap()) + .await + .unwrap() + .collect_splits() + .await + .unwrap(); + + let split_a = splits + .iter() + .find(|s| s.split_metadata.split_id == split_a_id) + .expect("split-a must exist"); + assert!( + split_a.split_metadata.soft_deleted_doc_ids.is_empty(), + "split-a must not have been modified (atomicity guarantee)" + ); + + let split_b = splits + .iter() + .find(|s| s.split_metadata.split_id == split_b_id) + .expect("split-b must exist"); + assert_eq!( + split_b.split_metadata.soft_deleted_doc_ids.len(), + MAX_SOFT_DELETED_DOCS_PER_SPLIT - 1, + "split-b must not have been modified (atomicity guarantee)" + ); + + cleanup_index(&mut metastore, index_uid).await; +} diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json index 9f7f0e27f23..0f708ac16b3 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.7.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + 
"delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": 
true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 1 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000000", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": 
{ - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000000", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000000", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1704067200 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": 
{ - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000000", - "update_timestamp": 1704067200 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json index 9f7f0e27f23..0f708ac16b3 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.8.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000000", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000000", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": 
"my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000000", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - 
"type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 1 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000000", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000000", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000000", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + 
"update_timestamp": 1704067200 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000000", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000000", - "update_timestamp": 1704067200 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000000", - "query_ast": 
"{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}}}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json index f9ecb6a7bcb..2d60feec007 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json +++ b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.expected.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": 
"u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": "basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 
11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 12 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": "my-index:00000000000000000000000001", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000001", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000001", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1724240908 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": 
"00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000001", - "update_timestamp": 1724240908 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json index f9ecb6a7bcb..2d60feec007 100644 --- a/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json +++ 
b/quickwit/quickwit-metastore/test-data/file-backed-index/v0.9.json @@ -1,200 +1,201 @@ { - "version": "0.9", + "delete_tasks": [ + { + "create_timestamp": 0, + "delete_query": { + "index_uid": "my-index:00000000000000000000000001", + "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" + }, + "opstamp": 10 + } + ], "index": { - "version": "0.9", - "index_uid": "my-index:00000000000000000000000001", + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, "index_config": { - "version": "0.9", - "index_id": "my-index", - "index_uri": "s3://quickwit-indexes/my-index", "doc_mapping": { "doc_mapping_uid": "00000000000000000000000001", - "mode": "dynamic", "dynamic_mapping": { - "indexed": true, - "tokenizer": "raw", - "record": "basic", - "stored": true, "expand_dots": true, "fast": { "normalizer": "raw" - } + }, + "indexed": true, + "record": "basic", + "stored": true, + "tokenizer": "raw" }, "field_mappings": [ { + "coerce": true, + "fast": true, + "indexed": true, "name": "tenant_id", - "type": "u64", + "output_format": "number", "stored": true, - "indexed": true, - "fast": true, - "coerce": true, - "output_format": "number" + "type": "u64" }, { - "name": "timestamp", - "type": "datetime", + "fast": true, + "fast_precision": "seconds", + "indexed": true, "input_formats": [ "rfc3339", "unix_timestamp" ], + "name": "timestamp", "output_format": "rfc3339", - "fast_precision": "seconds", - "indexed": true, "stored": true, - "fast": true + "type": "datetime" }, { - "name": "log_level", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "raw", + "name": "log_level", "record": 
"basic", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "raw", + "type": "text" }, { - "name": "message", - "type": "text", + "fast": false, + "fieldnorms": false, "indexed": true, - "tokenizer": "default", + "name": "message", "record": "position", - "fieldnorms": false, "stored": true, - "fast": false + "tokenizer": "default", + "type": "text" } ], - "timestamp_field": "timestamp", + "index_field_presence": true, + "max_num_partitions": 100, + "mode": "dynamic", + "partition_key": "tenant_id", + "store_document_size": false, + "store_source": true, "tag_fields": [ "log_level", "tenant_id" ], - "partition_key": "tenant_id", - "max_num_partitions": 100, - "index_field_presence": true, - "store_document_size": false, - "store_source": true, + "timestamp_field": "timestamp", "tokenizers": [ { + "filters": [], "name": "custom_tokenizer", - "type": "regex", "pattern": "[^\\p{L}\\p{N}]+", - "filters": [] + "type": "regex" } ] }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", "indexing_settings": { "commit_timeout_secs": 301, - "docstore_compression_level": 8, "docstore_blocksize": 1000000, - "split_num_docs_target": 10000001, + "docstore_compression_level": 8, "merge_policy": { - "type": "stable_log", - "min_level_num_docs": 100000, - "merge_factor": 9, + "maturation_period": "2days", "max_merge_factor": 11, - "maturation_period": "2days" + "merge_factor": 9, + "min_level_num_docs": 100000, + "type": "stable_log" }, "resources": { "heap_size": "50.0 MB" - } + }, + "split_num_docs_target": 10000001 }, "ingest_settings": { "min_shards": 12 }, + "retention": { + "period": "90 days", + "schedule": "daily" + }, "search_settings": { "default_search_fields": [ "message" ] }, - "retention": { - "period": "90 days", - "schedule": "daily" - } - }, - "checkpoint": { - "kafka-source": { - "00000000000000000000": "00000000000000000042" - } + "version": "0.9" }, - "create_timestamp": 1789, + "index_uid": 
"my-index:00000000000000000000000001", "sources": [ { - "version": "0.9", - "source_id": "kafka-source", - "num_pipelines": 2, "enabled": true, - "source_type": "kafka", + "input_format": "json", + "num_pipelines": 2, "params": { - "topic": "kafka-topic", - "client_params": {} + "client_params": {}, + "topic": "kafka-topic" }, + "source_id": "kafka-source", + "source_type": "kafka", "transform": { "script": ".message = downcase(string!(.message))", "timezone": "UTC" }, - "input_format": "json" + "version": "0.9" + } + ], + "version": "0.9" + }, + "shards": { + "_ingest-source": [ + { + "doc_mapping_uid": "00000000000000000000000001", + "follower_id": "follower-ingester", + "index_uid": "my-index:00000000000000000000000001", + "leader_id": "leader-ingester", + "publish_position_inclusive": "", + "shard_id": "00000000000000000001", + "shard_state": 1, + "source_id": "_ingest-source", + "update_timestamp": 1724240908 } ] }, "splits": [ { - "split_state": "Published", - "update_timestamp": 1789, - "publish_timestamp": 1789, - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "publish_timestamp": 1789, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", + "split_state": "Published", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 
130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "update_timestamp": 1789, + "version": "0.9" } ], - "shards": { - "_ingest-source": [ - { - "index_uid": "my-index:00000000000000000000000001", - "source_id": "_ingest-source", - "shard_id": "00000000000000000001", - "leader_id": "leader-ingester", - "follower_id": "follower-ingester", - "shard_state": 1, - "publish_position_inclusive": "", - "doc_mapping_uid": "00000000000000000000000001", - "update_timestamp": 1724240908 - } - ] - }, - "delete_tasks": [ - { - "create_timestamp": 0, - "opstamp": 10, - "delete_query": { - "index_uid": "my-index:00000000000000000000000001", - "query_ast": "{\"type\":\"bool\",\"must\":[{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Harry\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false},{\"type\":\"full_text\",\"field\":\"body\",\"text\":\"Potter\",\"params\":{\"mode\":{\"type\":\"phrase_fallback_to_intersection\"}},\"lenient\":false}]}" - } - } - ] + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json index 248baebc68e..fc54c8b931c 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.7.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": 
"my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json index 248baebc68e..fc54c8b931c 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.8.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000000", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000000", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": 
"00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json index 85bdfca81e0..3e2d37292d0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.expected.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - "partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json index 85bdfca81e0..3e2d37292d0 100644 --- a/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json +++ b/quickwit/quickwit-metastore/test-data/split-metadata/v0.9.json @@ -1,30 +1,31 @@ { - "version": "0.9", - "split_id": "split", - "index_uid": "my-index:00000000000000000000000001", - 
"partition_id": 7, - "source_id": "source", - "node_id": "node", - "num_docs": 12303, - "uncompressed_docs_size_in_bytes": 234234, - "time_range": { - "start": 121000, - "end": 130198 - }, "create_timestamp": 3, + "delete_opstamp": 10, + "doc_mapping_uid": "00000000000000000000000000", + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "index_uid": "my-index:00000000000000000000000001", "maturity": { - "type": "immature", - "maturation_period_millis": 4000 + "maturation_period_millis": 4000, + "type": "immature" }, + "node_id": "node", + "num_docs": 12303, + "num_merge_ops": 3, + "partition_id": 7, + "soft_deleted_doc_ids": [], + "source_id": "source", + "split_id": "split", "tags": [ "234", "aaa" ], - "footer_offsets": { - "start": 1000, - "end": 2000 + "time_range": { + "end": 130198, + "start": 121000 }, - "delete_opstamp": 10, - "num_merge_ops": 3, - "doc_mapping_uid": "00000000000000000000000000" + "uncompressed_docs_size_in_bytes": 234234, + "version": "0.9" } diff --git a/quickwit/quickwit-proto/protos/quickwit/metastore.proto b/quickwit/quickwit-proto/protos/quickwit/metastore.proto index 00680da02d0..712538f193a 100644 --- a/quickwit/quickwit-proto/protos/quickwit/metastore.proto +++ b/quickwit/quickwit-proto/protos/quickwit/metastore.proto @@ -125,6 +125,9 @@ service MetastoreService { // Deletes splits. rpc DeleteSplits(DeleteSplitsRequest) returns (EmptyResponse); + // Soft-deletes individual documents within published splits. + rpc SoftDeleteDocuments(SoftDeleteDocumentsRequest) returns (SoftDeleteDocumentsResponse); + // Adds a source. 
rpc AddSource(AddSourceRequest) returns (EmptyResponse); @@ -348,6 +351,20 @@ message DeleteSplitsRequest { repeated string split_ids = 3; } +message SplitDocIds { + string split_id = 1; + repeated uint32 doc_ids = 2; +} + +message SoftDeleteDocumentsRequest { + quickwit.common.IndexUid index_uid = 1; + repeated SplitDocIds split_doc_ids = 2; +} + +message SoftDeleteDocumentsResponse { + uint64 num_soft_deleted_doc_ids = 1; +} + message AddSourceRequest { quickwit.common.IndexUid index_uid = 1; string source_config_json = 2; diff --git a/quickwit/quickwit-proto/protos/quickwit/search.proto b/quickwit/quickwit-proto/protos/quickwit/search.proto index 7b543e9ed25..f50f79c0d73 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search.proto @@ -264,7 +264,7 @@ message SortField { SortOrder sort_order = 2; // Optional sort value format for datetime field only. // If none, the default output format for datetime field is - // unix_timestamp_nanos. + // unix_timestamp_millis. optional SortDatetimeFormat sort_datetime_format = 3; } @@ -386,6 +386,8 @@ message SplitIdAndFooterOffsets { optional int64 timestamp_end = 5; // The number of docs in the split uint64 num_docs = 6; + // Tantivy doc IDs that have been soft-deleted from this split + repeated uint32 soft_deleted_doc_ids = 7; } // Hits returned by a FetchDocRequest. @@ -461,9 +463,11 @@ message SortByValue { int64 i64 = 2; double f64 = 3; bool boolean = 4; + string str = 5; + int64 datetime = 6; } // Room for eventual future sorted key types. 
- reserved 5 to 20; + reserved 7 to 20; } message LeafSearchResponse { diff --git a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs index 6736d97c7e2..afa08ca3c9d 100644 --- a/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs +++ b/quickwit/quickwit-proto/src/codegen/jaeger/opentelemetry.proto.trace.v1.rs @@ -120,10 +120,12 @@ pub struct Span { /// attributes is a collection of key/value pairs. Note, global attributes /// like server name can be set using the resource API. Examples of attributes: /// - /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" - /// "/http/server_latency": 300 - /// "abc.com/myattribute": true - /// "abc.com/score": 10.239 + /// ```text + /// "/http/user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36" + /// "/http/server_latency": 300 + /// "abc.com/myattribute": true + /// "abc.com/score": 10.239 + /// ``` /// /// The OpenTelemetry API specification further restricts the allowed value types: /// @@ -276,7 +278,7 @@ pub mod span { } /// The Status type defines a logical error model that is suitable for different /// programming environments, including REST APIs and RPC APIs. -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct Status { /// A developer-facing human readable error message. 
#[prost(string, tag = "2")] diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs index ab6d1ddc236..d503a940f44 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.metastore.rs @@ -210,6 +210,28 @@ pub struct DeleteSplitsRequest { } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SplitDocIds { + #[prost(string, tag = "1")] + pub split_id: ::prost::alloc::string::String, + #[prost(uint32, repeated, tag = "2")] + pub doc_ids: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SoftDeleteDocumentsRequest { + #[prost(message, optional, tag = "1")] + pub index_uid: ::core::option::Option, + #[prost(message, repeated, tag = "2")] + pub split_doc_ids: ::prost::alloc::vec::Vec, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, Copy, PartialEq, Eq, Hash, ::prost::Message)] +pub struct SoftDeleteDocumentsResponse { + #[prost(uint64, tag = "1")] + pub num_soft_deleted_doc_ids: u64, +} +#[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] +#[derive(Clone, PartialEq, Eq, Hash, ::prost::Message)] pub struct AddSourceRequest { #[prost(message, optional, tag = "1")] pub index_uid: ::core::option::Option, @@ -693,6 +715,11 @@ impl RpcName for DeleteSplitsRequest { "delete_splits" } } +impl RpcName for SoftDeleteDocumentsRequest { + fn rpc_name() -> &'static str { + "soft_delete_documents" + } +} impl RpcName for AddSourceRequest { fn rpc_name() -> &'static str { "add_source" @@ -867,6 +894,11 @@ pub trait MetastoreService: std::fmt::Debug + Send + Sync + 'static { &self, request: DeleteSplitsRequest, ) -> crate::metastore::MetastoreResult; + ///Soft-deletes 
individual documents within published splits. + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult; ///Adds a source. async fn add_source( &self, @@ -1167,6 +1199,12 @@ impl MetastoreService for MetastoreServiceClient { ) -> crate::metastore::MetastoreResult { self.inner.0.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.0.soft_delete_documents(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -1383,6 +1421,12 @@ pub mod mock_metastore_service { ) -> crate::metastore::MetastoreResult { self.inner.lock().await.delete_splits(request).await } + async fn soft_delete_documents( + &self, + request: super::SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner.lock().await.soft_delete_documents(request).await + } async fn add_source( &self, request: super::AddSourceRequest, @@ -1714,6 +1758,22 @@ impl tower::Service for InnerMetastoreServiceClient { Box::pin(fut) } } +impl tower::Service for InnerMetastoreServiceClient { + type Response = SoftDeleteDocumentsResponse; + type Error = crate::metastore::MetastoreError; + type Future = BoxFuture; + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + std::task::Poll::Ready(Ok(())) + } + fn call(&mut self, request: SoftDeleteDocumentsRequest) -> Self::Future { + let svc = self.clone(); + let fut = async move { svc.0.soft_delete_documents(request).await }; + Box::pin(fut) + } +} impl tower::Service for InnerMetastoreServiceClient { type Response = EmptyResponse; type Error = crate::metastore::MetastoreError; @@ -2115,6 +2175,11 @@ struct MetastoreServiceTowerServiceStack { EmptyResponse, crate::metastore::MetastoreError, >, + soft_delete_documents_svc: quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + 
crate::metastore::MetastoreError, + >, add_source_svc: quickwit_common::tower::BoxService< AddSourceRequest, EmptyResponse, @@ -2295,6 +2360,12 @@ impl MetastoreService for MetastoreServiceTowerServiceStack { ) -> crate::metastore::MetastoreResult { self.delete_splits_svc.clone().ready().await?.call(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.soft_delete_documents_svc.clone().ready().await?.call(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -2548,6 +2619,16 @@ type DeleteSplitsLayer = quickwit_common::tower::BoxLayer< EmptyResponse, crate::metastore::MetastoreError, >; +type SoftDeleteDocumentsLayer = quickwit_common::tower::BoxLayer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, +>; type AddSourceLayer = quickwit_common::tower::BoxLayer< quickwit_common::tower::BoxService< AddSourceRequest, @@ -2772,6 +2853,7 @@ pub struct MetastoreServiceTowerLayerStack { publish_splits_layers: Vec, mark_splits_for_deletion_layers: Vec, delete_splits_layers: Vec, + soft_delete_documents_layers: Vec, add_source_layers: Vec, update_source_layers: Vec, toggle_source_layers: Vec, @@ -3101,6 +3183,33 @@ impl MetastoreServiceTowerLayerStack { crate::metastore::MetastoreError, >, >>::Service as tower::Service>::Future: Send + 'static, + L: tower::Layer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + Clone + Send + Sync + 'static, + , + >>::Service: tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + <, + >>::Service as tower::Service< + 
SoftDeleteDocumentsRequest, + >>::Future: Send + 'static, L: tower::Layer< quickwit_common::tower::BoxService< AddSourceRequest, @@ -3665,6 +3774,8 @@ impl MetastoreServiceTowerLayerStack { .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.delete_splits_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); + self.soft_delete_documents_layers + .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.add_source_layers .push(quickwit_common::tower::BoxLayer::new(layer.clone())); self.update_source_layers @@ -3943,6 +4054,28 @@ impl MetastoreServiceTowerLayerStack { self.delete_splits_layers.push(quickwit_common::tower::BoxLayer::new(layer)); self } + pub fn stack_soft_delete_documents_layer(mut self, layer: L) -> Self + where + L: tower::Layer< + quickwit_common::tower::BoxService< + SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + Send + Sync + 'static, + L::Service: tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + > + Clone + Send + Sync + 'static, + >::Future: Send + 'static, + { + self.soft_delete_documents_layers + .push(quickwit_common::tower::BoxLayer::new(layer)); + self + } pub fn stack_add_source_layer(mut self, layer: L) -> Self where L: tower::Layer< @@ -4522,6 +4655,14 @@ impl MetastoreServiceTowerLayerStack { quickwit_common::tower::BoxService::new(inner_client.clone()), |svc, layer| layer.layer(svc), ); + let soft_delete_documents_svc = self + .soft_delete_documents_layers + .into_iter() + .rev() + .fold( + quickwit_common::tower::BoxService::new(inner_client.clone()), + |svc, layer| layer.layer(svc), + ); let add_source_svc = self .add_source_layers .into_iter() @@ -4704,6 +4845,7 @@ impl MetastoreServiceTowerLayerStack { publish_splits_svc, mark_splits_for_deletion_svc, delete_splits_svc, + soft_delete_documents_svc, add_source_svc, update_source_svc, toggle_source_svc, @@ 
-4879,6 +5021,15 @@ where Error = crate::metastore::MetastoreError, Future = BoxFuture, > + + tower::Service< + SoftDeleteDocumentsRequest, + Response = SoftDeleteDocumentsResponse, + Error = crate::metastore::MetastoreError, + Future = BoxFuture< + SoftDeleteDocumentsResponse, + crate::metastore::MetastoreError, + >, + > + tower::Service< AddSourceRequest, Response = EmptyResponse, @@ -5096,6 +5247,12 @@ where ) -> crate::metastore::MetastoreResult { self.clone().call(request).await } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.clone().call(request).await + } async fn add_source( &self, request: AddSourceRequest, @@ -5445,6 +5602,20 @@ where DeleteSplitsRequest::rpc_name(), )) } + async fn soft_delete_documents( + &self, + request: SoftDeleteDocumentsRequest, + ) -> crate::metastore::MetastoreResult { + self.inner + .clone() + .soft_delete_documents(request) + .await + .map(|response| response.into_inner()) + .map_err(|status| crate::error::grpc_status_to_service_error( + status, + SoftDeleteDocumentsRequest::rpc_name(), + )) + } async fn add_source( &self, request: AddSourceRequest, @@ -5909,6 +6080,17 @@ for MetastoreServiceGrpcServerAdapter { .map(tonic::Response::new) .map_err(crate::error::grpc_error_to_grpc_status) } + async fn soft_delete_documents( + &self, + request: tonic::Request, + ) -> Result, tonic::Status> { + self.inner + .0 + .soft_delete_documents(request.into_inner()) + .await + .map(tonic::Response::new) + .map_err(crate::error::grpc_error_to_grpc_status) + } async fn add_source( &self, request: tonic::Request, @@ -6619,6 +6801,36 @@ pub mod metastore_service_grpc_client { ); self.inner.unary(req, path, codec).await } + /// Soft-deletes individual documents within published splits. 
+ pub async fn soft_delete_documents( + &mut self, + request: impl tonic::IntoRequest, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + > { + self.inner + .ready() + .await + .map_err(|e| { + tonic::Status::unknown( + format!("Service was not ready: {}", e.into()), + ) + })?; + let codec = tonic_prost::ProstCodec::default(); + let path = http::uri::PathAndQuery::from_static( + "/quickwit.metastore.MetastoreService/SoftDeleteDocuments", + ); + let mut req = request.into_request(); + req.extensions_mut() + .insert( + GrpcMethod::new( + "quickwit.metastore.MetastoreService", + "SoftDeleteDocuments", + ), + ); + self.inner.unary(req, path, codec).await + } /// Adds a source. pub async fn add_source( &mut self, @@ -7325,6 +7537,14 @@ pub mod metastore_service_grpc_server { &self, request: tonic::Request, ) -> std::result::Result, tonic::Status>; + /// Soft-deletes individual documents within published splits. + async fn soft_delete_documents( + &self, + request: tonic::Request, + ) -> std::result::Result< + tonic::Response, + tonic::Status, + >; /// Adds a source. 
async fn add_source( &self, @@ -8176,6 +8396,55 @@ pub mod metastore_service_grpc_server { }; Box::pin(fut) } + "/quickwit.metastore.MetastoreService/SoftDeleteDocuments" => { + #[allow(non_camel_case_types)] + struct SoftDeleteDocumentsSvc(pub Arc); + impl< + T: MetastoreServiceGrpc, + > tonic::server::UnaryService + for SoftDeleteDocumentsSvc { + type Response = super::SoftDeleteDocumentsResponse; + type Future = BoxFuture< + tonic::Response, + tonic::Status, + >; + fn call( + &mut self, + request: tonic::Request, + ) -> Self::Future { + let inner = Arc::clone(&self.0); + let fut = async move { + ::soft_delete_documents( + &inner, + request, + ) + .await + }; + Box::pin(fut) + } + } + let accept_compression_encodings = self.accept_compression_encodings; + let send_compression_encodings = self.send_compression_encodings; + let max_decoding_message_size = self.max_decoding_message_size; + let max_encoding_message_size = self.max_encoding_message_size; + let inner = self.inner.clone(); + let fut = async move { + let method = SoftDeleteDocumentsSvc(inner); + let codec = tonic_prost::ProstCodec::default(); + let mut grpc = tonic::server::Grpc::new(codec) + .apply_compression_config( + accept_compression_encodings, + send_compression_encodings, + ) + .apply_max_message_size_config( + max_decoding_message_size, + max_encoding_message_size, + ); + let res = grpc.unary(method, req).await; + Ok(res) + }; + Box::pin(fut) + } "/quickwit.metastore.MetastoreService/AddSource" => { #[allow(non_camel_case_types)] struct AddSourceSvc(pub Arc); diff --git a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs index 9c6f7f5b70d..baad891ea58 100644 --- a/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs +++ b/quickwit/quickwit-proto/src/codegen/quickwit/quickwit.search.rs @@ -197,7 +197,7 @@ pub struct SortField { pub sort_order: i32, /// Optional sort value format for datetime field only. 
/// If none, the default output format for datetime field is - /// unix_timestamp_nanos. + /// unix_timestamp_millis. #[prost(enumeration = "SortDatetimeFormat", optional, tag = "3")] pub sort_datetime_format: ::core::option::Option, } @@ -327,6 +327,9 @@ pub struct SplitIdAndFooterOffsets { /// The number of docs in the split #[prost(uint64, tag = "6")] pub num_docs: u64, + /// Tantivy doc IDs that have been soft-deleted from this split + #[prost(uint32, repeated, tag = "7")] + pub soft_deleted_doc_ids: ::prost::alloc::vec::Vec, } /// Hits returned by a FetchDocRequest. /// @@ -407,16 +410,16 @@ pub struct PartialHit { } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[derive(Ord, PartialOrd)] -#[derive(Clone, Copy, PartialEq, ::prost::Message)] +#[derive(Clone, PartialEq, ::prost::Message)] pub struct SortByValue { - #[prost(oneof = "sort_by_value::SortValue", tags = "1, 2, 3, 4")] + #[prost(oneof = "sort_by_value::SortValue", tags = "1, 2, 3, 4, 5, 6")] pub sort_value: ::core::option::Option, } /// Nested message and enum types in `SortByValue`. pub mod sort_by_value { #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] #[serde(rename_all = "snake_case")] - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] + #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum SortValue { #[prost(uint64, tag = "1")] U64(u64), @@ -426,6 +429,10 @@ pub mod sort_by_value { F64(f64), #[prost(bool, tag = "4")] Boolean(bool), + #[prost(string, tag = "5")] + Str(::prost::alloc::string::String), + #[prost(int64, tag = "6")] + Datetime(i64), } } #[derive(serde::Serialize, serde::Deserialize, utoipa::ToSchema)] diff --git a/quickwit/quickwit-proto/src/getters.rs b/quickwit/quickwit-proto/src/getters.rs index a327c1717a7..73847554b7c 100644 --- a/quickwit/quickwit-proto/src/getters.rs +++ b/quickwit/quickwit-proto/src/getters.rs @@ -136,6 +136,7 @@ generate_getters! 
{ ToggleSourceRequest, UpdateIndexRequest, UpdateSourceRequest, + SoftDeleteDocumentsRequest, UpdateSplitsDeleteOpstampRequest } diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index f4ddb734d2a..f89fdb97687 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -28,7 +28,8 @@ use tracing_opentelemetry::OpenTelemetrySpanExt; pub mod cluster; pub mod control_plane; -pub use {bytes, tonic}; +pub use bytes; +pub use tonic; pub mod developer; pub mod error; mod getters; diff --git a/quickwit/quickwit-proto/src/search/mod.rs b/quickwit/quickwit-proto/src/search/mod.rs index 307de262a70..caba73828cd 100644 --- a/quickwit/quickwit-proto/src/search/mod.rs +++ b/quickwit/quickwit-proto/src/search/mod.rs @@ -17,6 +17,8 @@ use std::fmt; use std::io::{self, Read}; use prost::Message; +use quickwit_common::numeric_types::num_proj::ProjectedNumber; +use quickwit_common::numeric_types::{num_cmp, num_proj}; pub use sort_by_value::SortValue; include!("../codegen/quickwit/quickwit.search.rs"); @@ -83,6 +85,8 @@ impl SortByValue { } } Some(SortValue::Boolean(b)) => Bool(b), + Some(SortValue::Str(s)) => String(s), + Some(SortValue::Datetime(dt)) => Number(dt.into()), None => Null, } } @@ -104,18 +108,7 @@ impl SortByValue { return None; } } - // Strings that can be converted to a number are accepted. - // Some clients (like JS clients) can't easily handle large integers - // without losing precision, so we accept them as strings. 
- String(value) => { - if let Ok(number) = value.parse::() { - Some(SortValue::I64(number)) - } else if let Ok(number) = value.parse::() { - Some(SortValue::U64(number)) - } else { - return None; - } - } + String(value) => Some(SortValue::Str(value)), Array(_) | Object(_) => return None, }; Some(SortByValue { sort_value }) @@ -132,25 +125,33 @@ impl Eq for SortValue {} impl Ord for SortValue { #[inline] fn cmp(&self, other: &Self) -> Ordering { - // We make sure to end up with a total order. - match (*self, *other) { + match (self, other) { // Same types. - (SortValue::U64(left), SortValue::U64(right)) => left.cmp(&right), - (SortValue::I64(left), SortValue::I64(right)) => left.cmp(&right), - (SortValue::Boolean(left), SortValue::Boolean(right)) => left.cmp(&right), - // We half the logic by making sure we keep - // the "stronger" type on the left. + (SortValue::U64(left), SortValue::U64(right)) => left.cmp(right), + (SortValue::I64(left), SortValue::I64(right)) => left.cmp(right), + (SortValue::Boolean(left), SortValue::Boolean(right)) => left.cmp(right), + (SortValue::Str(left), SortValue::Str(right)) => left.cmp(right), + (SortValue::F64(left), SortValue::F64(right)) => left.total_cmp(right), + (SortValue::Datetime(left), SortValue::Datetime(right)) => left.cmp(right), + // Different numeric types but can still be compared. 
+ (SortValue::U64(left), SortValue::F64(right)) => { + num_cmp::cmp_u64_f64(*left, *right).expect("unexpected float comparison") + } + (SortValue::F64(left), SortValue::U64(right)) => num_cmp::cmp_u64_f64(*right, *left) + .expect("unexpected float comparison") + .reverse(), + (SortValue::I64(left), SortValue::F64(right)) => { + num_cmp::cmp_i64_f64(*left, *right).expect("unexpected float comparison") + } + (SortValue::F64(left), SortValue::I64(right)) => num_cmp::cmp_i64_f64(*right, *left) + .expect("unexpected float comparison") + .reverse(), + (SortValue::I64(left), SortValue::U64(right)) => num_cmp::cmp_i64_u64(*left, *right), (SortValue::U64(left), SortValue::I64(right)) => { - if left > i64::MAX as u64 { - return Ordering::Greater; - } - (left as i64).cmp(&right) + num_cmp::cmp_i64_u64(*right, *left).reverse() } - (SortValue::F64(left), SortValue::F64(right)) => left.total_cmp(&right), - (SortValue::F64(left), SortValue::U64(right)) => left.total_cmp(&(right as f64)), - (SortValue::F64(left), SortValue::I64(right)) => left.total_cmp(&(right as f64)), - (SortValue::Boolean(left), right) => SortValue::U64(left as u64).cmp(&right), - (left, right) => right.cmp(&left).reverse(), + // Incompatible types, they are sorted one after another. + (left, right) => left.type_sort_key().cmp(&right.type_sort_key()), } } } @@ -165,7 +166,7 @@ impl std::hash::Hash for SortValue { fn hash(&self, state: &mut H) { let this = self.normalize(); std::mem::discriminant(&this).hash(state); - match this { + match &this { SortValue::U64(number) => { number.hash(state); } @@ -178,6 +179,12 @@ impl std::hash::Hash for SortValue { SortValue::Boolean(b) => { b.hash(state); } + SortValue::Str(s) => { + s.hash(state); + } + SortValue::Datetime(dt) => { + dt.hash(state); + } } } } @@ -188,27 +195,36 @@ impl SortValue { /// For number, we prefer to represent them, in order, as i64, then as u64 and finally as f64. 
pub fn normalize(&self) -> Self { match self { - SortValue::I64(_) => *self, - SortValue::Boolean(_) => *self, - SortValue::U64(number) => { - if let Ok(number) = (*number).try_into() { - SortValue::I64(number) - } else { - *self - } - } - SortValue::F64(number) => { - let number = *number; - if number.ceil() == number { - // number is not NaN, and is a natural number - if number >= i64::MIN as f64 && number <= i64::MAX as f64 { - return SortValue::I64(number as i64); - } else if number.is_sign_positive() && number <= u64::MAX as f64 { - return SortValue::U64(number as u64); + SortValue::I64(_) => self.clone(), + SortValue::Boolean(_) => self.clone(), + SortValue::Str(_) => self.clone(), + SortValue::U64(number) => match num_proj::u64_to_i64(*number) { + ProjectedNumber::Exact(number) => SortValue::I64(number), + _ => self.clone(), + }, + SortValue::F64(float) => match num_proj::f64_to_i64(*float) { + ProjectedNumber::Exact(number) => SortValue::I64(number), + ProjectedNumber::AfterLast => { + if let ProjectedNumber::Exact(number) = num_proj::f64_to_u64(*float) { + SortValue::U64(number) + } else { + self.clone() } } - *self - } + _ => self.clone(), + }, + SortValue::Datetime(_) => self.clone(), + } + } + + pub fn type_sort_key(&self) -> TypeSortKey { + match self { + SortValue::U64(_) => TypeSortKey::Numeric, + SortValue::I64(_) => TypeSortKey::Numeric, + SortValue::F64(_) => TypeSortKey::Numeric, + SortValue::Boolean(_) => TypeSortKey::Boolean, + SortValue::Str(_) => TypeSortKey::Str, + SortValue::Datetime(_) => TypeSortKey::DateTime, } } } @@ -216,14 +232,26 @@ impl SortValue { impl PartialHit { /// Helper to get access to the 1st sort value pub fn sort_value(&self) -> Option { - if let Some(sort_value) = self.sort_value { - sort_value.sort_value + if let Some(sort_value) = &self.sort_value { + sort_value.sort_value.clone() } else { None } } } +/// Defines the order between types when sorting on a field with multiple types. 
+/// Expected order: +/// - Asc: numeric -> string -> boolean -> datetime +/// - Desc: datetime -> boolean -> string -> numeric +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum TypeSortKey { + Numeric, + Str, + Boolean, + DateTime, +} + /// Serializes the Split fields. /// /// `fields_metadata` has to be sorted. diff --git a/quickwit/quickwit-query/Cargo.toml b/quickwit/quickwit-query/Cargo.toml index 066c00c0ff7..f24d8662715 100644 --- a/quickwit/quickwit-query/Cargo.toml +++ b/quickwit/quickwit-query/Cargo.toml @@ -15,9 +15,6 @@ anyhow = { workspace = true } base64 = { workspace = true } bitpacking = { workspace = true } hex = { workspace = true } -lindera-core = { workspace = true, optional = true } -lindera-dictionary = { workspace = true, optional = true } -lindera-tokenizer = { workspace = true, optional = true } once_cell = { workspace = true } regex = { workspace = true } serde = { workspace = true } @@ -29,7 +26,6 @@ tracing = { workspace = true } time = { workspace = true } thiserror = { workspace = true } rustc-hash = { workspace = true } -whichlang = { workspace = true, optional = true } quickwit-common = { workspace = true } quickwit-datetime = { workspace = true } @@ -42,19 +38,6 @@ time = { workspace = true } quickwit-common = { workspace = true, features = ["testsuite"] } -[features] -multilang = [ - "lindera-core", - "lindera-dictionary", - "lindera-tokenizer", - "whichlang", - "tantivy/stemmer", -] - [[bench]] name = "tokenizers_bench" harness = false - -[[bench]] -name = "multilang_tokenizers_bench" -harness = false diff --git a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs b/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs deleted file mode 100644 index 61755dea556..00000000000 --- a/quickwit/quickwit-query/benches/multilang_tokenizers_bench.rs +++ /dev/null @@ -1,167 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use criterion::{Criterion, Throughput, black_box, criterion_group, criterion_main}; -use quickwit_query::create_default_quickwit_tokenizer_manager; -use tantivy::tokenizer::{TextAnalyzer, Token, TokenStream}; - -// A random ascii string of length 100 chars. -const ASCII_SHORT: &str = "It is a long established fact"; -static ASCII_LONG: &str = r#"It is a long established fact that a reader will be distracted by the readable content of a - page when looking at its layout. The point of using Lorem Ipsum is that it has a - more-or-less normal distribution of letters, as opposed to using 'Content here, content - here', making it look like readable English. Many desktop publishing packages and web page - editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will - uncover many web sites still in their infancy. 
Various versions have evolved over the years, - sometimes by accident, sometimes on purpose (injected humour and the like)."#; -const JPN_SHORT: &str = "日本ごです。 とても素敵な言葉ですね"; -const JPN_LONG: &str = r#"日本ごです。 和名の由来は、 - 太陽の動きにつれてその方向を追うように花が回るといわれたことから。 - ただしこの動きは生長に伴うものであるため、 - 実際に太陽を追って動くのは生長が盛んな若い時期だけである。 - 若いヒマワリの茎の上部の葉は太陽に正対になるように動き、 - 朝には東を向いていたのが夕方には西を向く。日没後はまもなく起きあがり、 - 夜明け前にはふたたび東に向く。この運動はつぼみを付ける頃まで続くが、 - つぼみが大きくなり花が開く素敵な言葉ですね."#; -const CMN_SHORT: &str = "滚滚长江东逝水,浪花淘尽英雄。"; -const CMN_LONG: &str = r#"滚滚长江东逝水,浪花淘尽英雄。是非成败转头空,青山依旧在,几度夕阳红。 - 白发渔樵江渚上,惯看秋月春风。一壶浊酒喜相逢,古今多少事,都付笑谈中。 - 是非成败转头空,青山依旧在,惯看秋月春风。一壶浊酒喜相逢,古今多少事, - 滚滚长江东逝水,浪花淘尽英雄。 几度夕阳红。白发渔樵江渚上,都付笑谈中。"#; -const KOR_SHORT: &str = "안녕하세요. 반갑습니다."; -const KOR_LONG: &str = r#" -포근히 내려오는 눈밭속에서는 -낯이 붉은 處女아이들도 깃들이어 오는 소리… -울고 -웃고 -수구리고 -새파라니 얼어서 -運命들이 모두다 안끼어 드는 소리… -큰놈에겐 큰 눈물자국, 작은놈에겐 작은 웃음 흔적 -큰이얘기 작은이얘기들이 오부록이 도란 그리며 안끼어 오는 소리 -끊임없이 내리는 눈발 속에서는 -山도 山도 靑山도 안끼어 드는 소리 -"#; - -fn process_tokens(analyzer: &mut TextAnalyzer, text: &str) -> Vec { - let mut token_stream = analyzer.token_stream(text); - let mut tokens: Vec = Vec::new(); - token_stream.process(&mut |token: &Token| tokens.push(token.clone())); - tokens -} - -pub fn tokenizers_throughput_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("multilang"); - let tokenizer_manager = create_default_quickwit_tokenizer_manager(); - let mut default_tokenizer = tokenizer_manager.get_tokenizer("default").unwrap(); - let mut multilang_tokenizer = tokenizer_manager.get_tokenizer("multilang").unwrap(); - let mut chinese_tokenizer = tokenizer_manager - .get_tokenizer("chinese_compatible") - .unwrap(); - - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("default-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("default-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| 
process_tokens(&mut default_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input("multilang-eng-tokenize-short", ASCII_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input("multilang-eng-tokenize-long", ASCII_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - let short_with_prefix = "ENG:".to_string() + ASCII_SHORT; - group - .throughput(Throughput::Bytes(ASCII_SHORT.len() as u64)) - .bench_with_input( - "multilang-tokenize-short-with-prefix", - &short_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - let long_with_prefix = "ENG:".to_string() + ASCII_LONG; - group - .throughput(Throughput::Bytes(ASCII_LONG.len() as u64)) - .bench_with_input( - "multilang-tokenize-long-with-prefix", - &long_with_prefix, - |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(JPN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-short", JPN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(JPN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-jpn-long", JPN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-short", CMN_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-cmn-long", CMN_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - 
.throughput(Throughput::Bytes(KOR_SHORT.len() as u64)) - .bench_with_input("multilang-tokenize-kor-short", KOR_SHORT, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(KOR_LONG.len() as u64)) - .bench_with_input("multilang-tokenize-kor-long", KOR_LONG, |b, text| { - b.iter(|| process_tokens(&mut multilang_tokenizer, black_box(text))); - }); - group - .throughput(Throughput::Bytes(CMN_SHORT.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-short", - CMN_SHORT, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); - group - .throughput(Throughput::Bytes(CMN_LONG.len() as u64)) - .bench_with_input( - "chinese-compatible-tokenize-cmn-long", - CMN_LONG, - |b, text| { - b.iter(|| process_tokens(&mut chinese_tokenizer, black_box(text))); - }, - ); -} - -criterion_group!( - tokenizers_throughput_benches, - tokenizers_throughput_benchmark -); -criterion_main!(tokenizers_throughput_benches); diff --git a/quickwit/quickwit-query/src/lib.rs b/quickwit/quickwit-query/src/lib.rs index b2040f73daa..8f70e155933 100644 --- a/quickwit/quickwit-query/src/lib.rs +++ b/quickwit/quickwit-query/src/lib.rs @@ -38,8 +38,6 @@ pub(crate) use not_nan_f32::NotNaNf32; pub use query_ast::utils::find_field_or_hit_dynamic; use serde::{Deserialize, Serialize}; pub use tantivy::query::Query as TantivyQuery; -#[cfg(feature = "multilang")] -pub use tokenizers::MultiLangTokenizer; pub use tokenizers::{ CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH, create_default_quickwit_tokenizer_manager, get_quickwit_fastfield_normalizer_manager, diff --git a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs index 84176f4a4aa..7b24a66163d 100644 --- a/quickwit/quickwit-query/src/query_ast/wildcard_query.rs +++ b/quickwit/quickwit-query/src/query_ast/wildcard_query.rs @@ -247,7 +247,6 @@ mod tests { "raw_lowercase", 
"lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -290,7 +289,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -335,7 +333,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", @@ -398,7 +395,6 @@ mod tests { "raw_lowercase", "lowercase", "default", - "en_stem", "chinese_compatible", "source_code_default", "source_code_with_hex", diff --git a/quickwit/quickwit-query/src/tokenizers/mod.rs b/quickwit/quickwit-query/src/tokenizers/mod.rs index d086c36a977..5a90715075e 100644 --- a/quickwit/quickwit-query/src/tokenizers/mod.rs +++ b/quickwit/quickwit-query/src/tokenizers/mod.rs @@ -14,8 +14,6 @@ mod chinese_compatible; mod code_tokenizer; -#[cfg(feature = "multilang")] -mod multilang; mod tokenizer_manager; use once_cell::sync::Lazy; @@ -26,8 +24,6 @@ use tantivy::tokenizer::{ use self::chinese_compatible::ChineseTokenizer; pub use self::code_tokenizer::CodeTokenizer; -#[cfg(feature = "multilang")] -pub use self::multilang::MultiLangTokenizer; pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager}; pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255; @@ -58,17 +54,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .filter(LowerCaser) .build(); tokenizer_manager.register("default", default_tokenizer, true); - #[cfg(feature = "multilang")] - { - let en_stem_tokenizer = TextAnalyzer::builder(SimpleTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .filter(tantivy::tokenizer::Stemmer::new( - tantivy::tokenizer::Language::English, - )) - .build(); - tokenizer_manager.register("en_stem", en_stem_tokenizer, true); - } tokenizer_manager.register("whitespace", WhitespaceTokenizer::default(), false); let chinese_tokenizer = 
TextAnalyzer::builder(ChineseTokenizer) @@ -94,15 +79,6 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager { .build(), true, ); - #[cfg(feature = "multilang")] - tokenizer_manager.register( - "multilang_default", - TextAnalyzer::builder(MultiLangTokenizer::default()) - .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH)) - .filter(LowerCaser) - .build(), - true, - ); tokenizer_manager } diff --git a/quickwit/quickwit-query/src/tokenizers/multilang.rs b/quickwit/quickwit-query/src/tokenizers/multilang.rs deleted file mode 100644 index a62d2ff151c..00000000000 --- a/quickwit/quickwit-query/src/tokenizers/multilang.rs +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright 2021-Present Datadog, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use lindera_core::mode::Mode; -use lindera_dictionary::{DictionaryConfig, DictionaryKind, load_dictionary_from_config}; -use lindera_tokenizer::token::Token as LinderaToken; -use lindera_tokenizer::tokenizer::Tokenizer as LinderaTokenizer; -use once_cell::sync::Lazy; -use tantivy::tokenizer::{SimpleTokenStream, SimpleTokenizer, Token, TokenStream, Tokenizer}; -use whichlang::{Lang, detect_language}; - -// Note(fmassot): we use `lindera_tokenizer::tokenizer::Tokenizer` and not -// `use lindera_tantivy::tokenizer::LinderaTokenizer` to avoid -// costly copy of lindera dictionaries each time we clone the `MultiLangTokenizer`. - -/// Mandarin chinese tokenizer. 
-static CMN_TOKENIZER: Lazy = Lazy::new(|| { - let cmn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::CcCedict), - path: None, - }; - let cmn_dictionary = load_dictionary_from_config(cmn_dictionary_config) - .expect("Lindera `CcCedict` dictionary must be present"); - LinderaTokenizer::new(cmn_dictionary, None, Mode::Normal) -}); - -/// Japanese tokenizer. -static JPN_TOKENIZER: Lazy = Lazy::new(|| { - let jpn_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::IPADIC), - path: None, - }; - let jpn_dictionary = load_dictionary_from_config(jpn_dictionary_config) - .expect("Lindera `IPADIC` dictionary must be present"); - LinderaTokenizer::new(jpn_dictionary, None, Mode::Normal) -}); - -/// Korean tokenizer. -static KOR_TOKENIZER: Lazy = Lazy::new(|| { - let kor_dictionary_config = DictionaryConfig { - kind: Some(DictionaryKind::KoDic), - path: None, - }; - let kor_dictionary = load_dictionary_from_config(kor_dictionary_config) - .expect("Lindera `KoDic` dictionary must be present"); - LinderaTokenizer::new(kor_dictionary, None, Mode::Normal) -}); - -/// Multilanguage tokenizer that uses the `whichlang` to detect the language of the text -/// and uses the appropriate tokenizer for the detected language: -/// - lindera for Chinese, Japanese, and Korean. -/// - Quickwit's default tokenizer for other languages. -/// -/// It is possible to bypass the language detection by prefixing the text with the language code -/// followed by a colon. For example, `KOR:일본입니다` will be tokenized by the korean tokenizer. 
-/// Current supported prefix are: -/// - `KOR:` for Korean tokenizer -/// - `JPN:` for Japanese tokenizer -/// - `CMN:` for Chinese tokenizer -/// - `ENG:` for Quickwit's default tokenizer -#[derive(Clone, Default)] -pub struct MultiLangTokenizer { - default_tokenizer: SimpleTokenizer, - token: Token, -} - -impl Tokenizer for MultiLangTokenizer { - type TokenStream<'a> = MultiLanguageTokenStream<'a>; - fn token_stream<'a>(&'a mut self, text: &'a str) -> MultiLanguageTokenStream<'a> { - self.token.reset(); - let (language_prefix, text_to_tokenize) = get_language_from_prefix(text); - // If the text is empty, we return an empty token stream. - // `whichlang::detect_language` panicks if the text is empty. - if text.trim().is_empty() { - return MultiLanguageTokenStream::Empty; - } - let language = language_prefix.unwrap_or_else(|| detect_language(text_to_tokenize)); - match language { - Lang::Cmn => { - let lindera_token_stream = LinderaTokenStream { - tokens: CMN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Jpn => { - let lindera_token_stream = LinderaTokenStream { - tokens: JPN_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - Lang::Kor => { - let lindera_token_stream = LinderaTokenStream { - tokens: KOR_TOKENIZER - .tokenize(text_to_tokenize) - .expect("tokenize method should never fail"), - token: &mut self.token, - }; - MultiLanguageTokenStream::Lindera(lindera_token_stream) - } - _ => MultiLanguageTokenStream::Simple( - self.default_tokenizer.token_stream(text_to_tokenize), - ), - } - } -} - -/// Gets the language defined by a prefix `{ID}:text` where ID being the 3-letter language used by -/// whichlang) and returns the language and the text without the prefix. 
If the prefix is not -/// recognized, the language is `None` and the text is the original. -fn get_language_from_prefix(text: &str) -> (Option, &str) { - let prefix_bytes = &text.as_bytes()[0..std::cmp::min(4, text.len())]; - // TODO: refactor. - let prefix_language = match prefix_bytes { - b"CMN:" => Some(Lang::Cmn), - b"ENG:" => Some(Lang::Eng), - b"JPN:" => Some(Lang::Jpn), - b"KOR:" => Some(Lang::Kor), - _ => None, - }; - let text_without_prefix = if prefix_language.is_some() { - // This is safe as we know that the prefix is made of 4 ascii characters. - &text[4..] - } else { - text - }; - (prefix_language, text_without_prefix) -} -pub enum MultiLanguageTokenStream<'a> { - Empty, - Lindera(LinderaTokenStream<'a>), - Simple(SimpleTokenStream<'a>), -} - -impl TokenStream for MultiLanguageTokenStream<'_> { - fn advance(&mut self) -> bool { - match self { - MultiLanguageTokenStream::Empty => false, - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.advance(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.advance(), - } - } - - fn token(&self) -> &Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token(), - } - } - - fn token_mut(&mut self) -> &mut Token { - match self { - MultiLanguageTokenStream::Empty => { - panic!("Cannot call token_mut() on an empty token stream.") - } - MultiLanguageTokenStream::Lindera(tokenizer) => tokenizer.token_mut(), - MultiLanguageTokenStream::Simple(tokenizer) => tokenizer.token_mut(), - } - } -} - -pub struct LinderaTokenStream<'a> { - pub tokens: Vec>, - pub token: &'a mut Token, -} - -impl TokenStream for LinderaTokenStream<'_> { - fn advance(&mut self) -> bool { - if self.tokens.is_empty() { - return false; - } - let token = self.tokens.remove(0); - self.token.text = token.text.to_string(); - 
self.token.offset_from = token.byte_start; - self.token.offset_to = token.byte_end; - self.token.position = token.position; - self.token.position_length = token.position_length; - - true - } - - fn token(&self) -> &Token { - self.token - } - - fn token_mut(&mut self) -> &mut Token { - self.token - } -} - -#[cfg(test)] -mod tests { - use tantivy::tokenizer::{Token, TokenStream, Tokenizer}; - - use super::{MultiLangTokenizer, MultiLanguageTokenStream, get_language_from_prefix}; - - fn test_helper(mut tokenizer: MultiLanguageTokenStream) -> Vec { - let mut tokens: Vec = Vec::new(); - tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); - tokens - } - - #[test] - fn test_multilanguage_tokenizer_cmn() { - let mut tokenizer = MultiLangTokenizer::default(); - let tokens = test_helper( - tokenizer.token_stream("地址1,包含無效的字元 (包括符號與不標準的asci阿爾發字元"), - ); - assert_eq!(tokens.len(), 19); - { - let token = &tokens[0]; - assert_eq!(token.text, "地址"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_jpn() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("すもももももももものうち")); - assert_eq!(tokens.len(), 7); - { - let token = &tokens[0]; - assert_eq!(token.text, "すもも"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 9); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - // Force usage of JPN tokenizer. - let tokens = test_helper(tokenizer.token_stream("JPN:すもももももももものうち")); - assert_eq!(tokens.len(), 7); - } - { - // Force usage of ENG tokenizer. - // This tokenizer will return only one token. 
- let tokens = test_helper(tokenizer.token_stream("ENG:すもももももももものうち")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_kor() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - { - let token = &tokens[0]; - assert_eq!(token.text, "일본"); - assert_eq!(token.offset_from, 0); - assert_eq!(token.offset_to, 6); - assert_eq!(token.position, 0); - assert_eq!(token.position_length, 1); - } - } - { - let tokens = - test_helper(tokenizer.token_stream("KOR:일본입니다. 매우 멋진 단어입니다.")); - assert_eq!(tokens.len(), 11); - } - { - let tokens = test_helper(tokenizer.token_stream("ENG:일본입니다")); - assert_eq!(tokens.len(), 1); - } - } - - #[test] - fn test_multilanguage_tokenizer_with_empty_string() { - let mut tokenizer = MultiLangTokenizer::default(); - { - let tokens = test_helper(tokenizer.token_stream("")); - assert_eq!(tokens.len(), 0); - } - { - let tokens = test_helper(tokenizer.token_stream(" ")); - assert_eq!(tokens.len(), 0); - } - } - - #[test] - fn test_multilanguage_process_language_prefix() { - { - let (lang, text) = get_language_from_prefix("JPN:すもももももももものうち"); - assert_eq!(lang, Some(whichlang::Lang::Jpn)); - assert_eq!(text, "すもももももももものうち"); - } - { - let (lang, text) = get_language_from_prefix("CMN:地址1,包含無效的字元"); - assert_eq!(lang, Some(whichlang::Lang::Cmn)); - assert_eq!(text, "地址1,包含無效的字元"); - } - { - let (lang, text) = get_language_from_prefix("ENG:my address"); - assert_eq!(lang, Some(whichlang::Lang::Eng)); - assert_eq!(text, "my address"); - } - { - let (lang, text) = get_language_from_prefix("UNK:my address"); - assert!(lang.is_none()); - assert_eq!(text, "UNK:my address"); - } - { - let (lang, text) = get_language_from_prefix(""); - assert!(lang.is_none()); - assert_eq!(text, ""); - } - } -} diff --git a/quickwit/quickwit-search/src/cluster_client.rs b/quickwit/quickwit-search/src/cluster_client.rs index 
79f6ba81702..31c98157889 100644 --- a/quickwit/quickwit-search/src/cluster_client.rs +++ b/quickwit/quickwit-search/src/cluster_client.rs @@ -328,6 +328,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }], ..Default::default() } @@ -355,6 +356,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -363,6 +365,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, ], }], diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index ed21fd968ba..d901ed26071 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -16,12 +16,15 @@ use std::borrow::Cow; use std::cmp::Ordering; use std::collections::HashSet; -use itertools::Itertools; +use itertools::{Either, Itertools}; use quickwit_common::binary_heap::{SortKeyMapper, TopK}; +use quickwit_common::numeric_types::num_proj::{ + ProjectedNumber, f64_to_i64, f64_to_u64, i64_to_f64, i64_to_u64, u64_to_f64, u64_to_i64, +}; use quickwit_doc_mapper::{FastFieldWarmupInfo, WarmupInfo}; use quickwit_proto::search::{ LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortByValue, SortOrder, - SortValue, SplitSearchError, + SortValue, SplitSearchError, TypeSortKey, }; use quickwit_proto::types::SplitId; use serde::Deserialize; @@ -29,13 +32,18 @@ use tantivy::aggregation::agg_req::{Aggregations, get_fast_field_names}; use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::aggregation::{AggContextParams, AggregationLimitsGuard, AggregationSegmentCollector}; use tantivy::collector::{Collector, SegmentCollector}; -use tantivy::columnar::{ColumnType, MonotonicallyMappableToU64}; +use tantivy::columnar::{ + ColumnIndex, ColumnType, MonotonicallyMappableToU64, 
StrColumn, TermOrdHit, +}; use tantivy::fastfield::Column; use tantivy::tokenizer::TokenizerManager; -use tantivy::{DateTime, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError}; +use tantivy::{ + COLLECT_BLOCK_BUFFER_LEN, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError, +}; use crate::find_trace_ids_collector::{FindTraceIdsCollector, FindTraceIdsSegmentCollector, Span}; -use crate::top_k_collector::{QuickwitSegmentTopKCollector, specialized_top_k_segment_collector}; +use crate::sort_repr::{ElidableU64, InternalSortValueRepr, InternalValueRepr}; +use crate::top_k_collector::QuickwitSegmentTopKCollector; use crate::{GlobalDocAddress, merge_resource_stats, merge_resource_stats_it}; #[derive(Clone, Debug)] @@ -51,30 +59,7 @@ pub(crate) enum SortByComponent { order: SortOrder, }, } -impl From for SortByPair { - fn from(value: SortByComponent) -> Self { - Self { - first: value, - second: None, - } - } -} -#[derive(Clone)] -pub(crate) struct SortByPair { - first: SortByComponent, - second: Option, -} -impl SortByPair { - pub fn sort_orders(&self) -> (SortOrder, SortOrder) { - ( - self.first.sort_order(), - self.second - .as_ref() - .map(|sort_by| sort_by.sort_order()) - .unwrap_or(SortOrder::Desc), - ) - } -} + impl SortByComponent { fn to_sorting_field_extractor_component( &self, @@ -83,19 +68,48 @@ impl SortByComponent { match self { SortByComponent::DocId { .. } => Ok(SortingFieldExtractorComponent::DocId), SortByComponent::FastField { field_name, .. 
} => { - let sort_column_opt: Option<(Column, ColumnType)> = - segment_reader.fast_fields().u64_lenient(field_name)?; - let (sort_column, column_type) = sort_column_opt.unwrap_or_else(|| { - ( - Column::build_empty_column(segment_reader.max_doc()), - ColumnType::U64, - ) - }); - let sort_field_type = SortFieldType::try_from(column_type)?; - Ok(SortingFieldExtractorComponent::FastField { - sort_column, - sort_field_type, - }) + let allowed_column_types = [ + ColumnType::I64, + ColumnType::U64, + ColumnType::F64, + ColumnType::Str, + ColumnType::DateTime, + ColumnType::Bool, + // ColumnType::IpAddr Unsupported + // ColumnType::Bytes Unsupported + ]; + let fast_fields = segment_reader.fast_fields(); + let mut sort_columns = fast_fields + .u64_lenient_for_type_all(Some(&allowed_column_types), field_name)? + .into_iter() + .map(|(col, col_typ)| match col_typ { + ColumnType::U64 => Ok((col, SortFieldType::U64)), + ColumnType::I64 => Ok((col, SortFieldType::I64)), + ColumnType::F64 => Ok((col, SortFieldType::F64)), + ColumnType::DateTime => Ok((col, SortFieldType::DateTime)), + ColumnType::Bool => Ok((col, SortFieldType::Bool)), + ColumnType::Str => Ok(( + col, + SortFieldType::String( + fast_fields + .str(field_name)? + .expect("field with str column type should have str column"), + ), + )), + _ => panic!("unsupported"), + }) + .collect::>>()?; + + sort_columns.sort_by_key(|(_, col_typ)| col_typ.type_sort_key()); + + // TODO we could skip the columns that are before the search after + + Ok(SortingFieldExtractorComponent::FastField( + FastFieldExtractor { + sort_columns, + col_scratch: Box::new([None; COLLECT_BLOCK_BUFFER_LEN]), + }, + )) } SortByComponent::Score { .. 
} => Ok(SortingFieldExtractorComponent::Score), } @@ -125,347 +139,568 @@ impl SortByComponent { } } -#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[derive(Clone)] +pub(crate) struct SortByPair { + first: SortByComponent, + second: Option, +} +impl SortByPair { + pub fn sort_orders(&self) -> (SortOrder, SortOrder) { + ( + self.first.sort_order(), + self.second + .as_ref() + .map(|sort_by| sort_by.sort_order()) + .unwrap_or(SortOrder::Desc), + ) + } +} + +#[derive(Clone, Debug)] pub(crate) enum SortFieldType { U64, I64, F64, DateTime, Bool, + String(StrColumn), +} + +impl SortFieldType { + fn type_sort_key(&self) -> TypeSortKey { + match self { + SortFieldType::U64 => TypeSortKey::Numeric, + SortFieldType::I64 => TypeSortKey::Numeric, + SortFieldType::F64 => TypeSortKey::Numeric, + SortFieldType::DateTime => TypeSortKey::DateTime, + SortFieldType::Bool => TypeSortKey::Boolean, + SortFieldType::String(_) => TypeSortKey::Str, + } + } +} + +struct FastFieldExtractor { + /// Sort columns are sorted in the same order as types (TypeSortKey) + sort_columns: Vec<(Column, SortFieldType)>, + col_scratch: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, +} + +impl FastFieldExtractor { + fn fill_batch( + &mut self, + docs: &[DocId], + order: SortOrder, + out: &mut [InternalValueRepr], + ) { + let n = docs.len(); + let unique_column = &self.sort_columns[0].0; + if let ColumnIndex::Multivalued(_) = unique_column.index { + // TODO: first_vals() doesn't enforce zeroing for multivalued + // columns. It seems like something that should be fixed in Tantivy? 
+ self.col_scratch[..n].fill(None); + } + self.sort_columns[0] + .0 + .first_vals(docs, &mut self.col_scratch[..n]); + for (repr, val_opt) in out[..n].iter_mut().zip(self.col_scratch[..n].iter()) { + *repr = match val_opt { + Some(val) => InternalValueRepr::new(*val, 0, order), + None => InternalValueRepr::new_missing(), + }; + } + } } /// The `SortingFieldExtractor` is used to extract a score, which can either be a true score, /// a value from a fast field, or nothing (sort by DocId). -pub(crate) enum SortingFieldExtractorComponent { +enum SortingFieldExtractorComponent { /// If undefined, we simply sort by DocIds. DocId, - FastField { - sort_column: Column, - sort_field_type: SortFieldType, - }, + FastField(FastFieldExtractor), Score, } impl SortingFieldExtractorComponent { - pub fn is_score(&self) -> bool { - matches!(self, SortingFieldExtractorComponent::Score) + pub fn is_doc_id(&self) -> bool { + matches!(self, SortingFieldExtractorComponent::DocId) } - pub fn is_fast_field(&self) -> bool { - matches!(self, SortingFieldExtractorComponent::FastField { .. }) - } - /// Loads the fast field values for the given doc_ids in its u64 representation. The returned - /// u64 representation maintains the ordering of the original value. - #[inline] - pub fn extract_typed_sort_values_block(&self, doc_ids: &[DocId], values: &mut [Option]) { - // In the collect block case we don't have scores to extract - if let SortingFieldExtractorComponent::FastField { sort_column, .. } = self { - let values = &mut values[..doc_ids.len()]; - sort_column.first_vals(doc_ids, values); + + /// Currently batch extraction only has a fast path for full columns. That + /// can only happen if there is only one column for the fast field. 
+ fn extractor_for_batch_if_worthwhile(&mut self) -> Option<&mut FastFieldExtractor> { + match self { + SortingFieldExtractorComponent::FastField(extractor) + if extractor.sort_columns.len() == 1 => + { + Some(extractor) + } + _ => None, } } - /// Returns the sort value for the given element in its u64 representation. The returned u64 - /// representation maintains the ordering of the original value. - /// - /// The function returns None if the sort key is a fast field, for which we have no value - /// for the given doc_id, or we sort by DocId. + /// Returns the sort value for the given element in its u64 representation. + /// The returned u64 representation maintains the ordering of the original + /// value. #[inline] - fn extract_typed_sort_value_opt(&self, doc_id: DocId, score: Score) -> Option { + fn project_to_internal_sort_value( + &self, + doc_id: DocId, + score: Score, + order: SortOrder, + ) -> InternalValueRepr { match self { - // Tie breaks are not handled here, but in SegmentPartialHit - SortingFieldExtractorComponent::DocId => None, - SortingFieldExtractorComponent::FastField { sort_column, .. } => { - sort_column.first(doc_id) + SortingFieldExtractorComponent::DocId => { + // Doc id is handled at the compound sort value level + debug_assert!(V::is_elided()); + InternalValueRepr::new_missing() + } + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. + }) => { + for (idx, (sort_column, _)) in sort_columns.iter().enumerate() { + if let Some(value) = sort_column.first(doc_id) { + return InternalValueRepr::new(value, idx as u8, order); + } + } + InternalValueRepr::new_missing() + } + SortingFieldExtractorComponent::Score => { + InternalValueRepr::new((score as f64).to_u64(), 0, order) } - SortingFieldExtractorComponent::Score => Some((score as f64).to_u64()), } } - #[inline] - /// Converts u64 fast field values to its correct type. - /// The conversion is delayed for performance reasons. 
- /// - /// This is used to convert `search_after` sort value to a u64 representation that will respect - /// the same order as the `SortValue` representation. - pub fn convert_u64_ff_val_to_sort_value(&self, sort_value: u64) -> SortValue { - let map_fast_field_to_value = |fast_field_value, field_type| match field_type { - SortFieldType::U64 => SortValue::U64(fast_field_value), - SortFieldType::I64 => SortValue::I64(i64::from_u64(fast_field_value)), - SortFieldType::F64 => SortValue::F64(f64::from_u64(fast_field_value)), - SortFieldType::DateTime => SortValue::I64(i64::from_u64(fast_field_value)), - SortFieldType::Bool => SortValue::Boolean(fast_field_value != 0u64), - }; - match self { - SortingFieldExtractorComponent::DocId => SortValue::U64(sort_value), - SortingFieldExtractorComponent::FastField { - sort_field_type, .. - } => map_fast_field_to_value(sort_value, *sort_field_type), - SortingFieldExtractorComponent::Score => SortValue::F64(f64::from_u64(sort_value)), + fn project_from_internal_sort_value( + &self, + internal_repr: InternalValueRepr, + order: SortOrder, + ) -> tantivy::Result> { + if V::is_elided() { + return Ok(None); } + let Some((col_idx, val_as_u64)) = internal_repr.decode(order) else { + return Ok(Some(SortByValue { sort_value: None })); + }; + let sort_value = match self { + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. 
+ }) => { + let (_, field_type) = &sort_columns[col_idx as usize]; + match field_type { + SortFieldType::U64 => SortValue::U64(val_as_u64), + SortFieldType::I64 => SortValue::I64(i64::from_u64(val_as_u64)), + SortFieldType::F64 => SortValue::F64(f64::from_u64(val_as_u64)), + SortFieldType::DateTime => SortValue::Datetime(i64::from_u64(val_as_u64)), + SortFieldType::Bool => SortValue::Boolean(val_as_u64 != 0u64), + SortFieldType::String(str_column) => { + let term_dict = str_column.dictionary(); + let mut buffer = Vec::new(); + term_dict.ord_to_term(val_as_u64, &mut buffer)?; + let string_value = String::from_utf8(buffer).map_err(|_| { + tantivy::TantivyError::InternalError( + "term dictionary contains non-UTF-8 bytes".to_string(), + ) + })?; + SortValue::Str(string_value) + } + } + } + SortingFieldExtractorComponent::Score => SortValue::F64(f64::from_u64(val_as_u64)), + SortingFieldExtractorComponent::DocId => { + return Err(tantivy::TantivyError::InternalError( + "value should be elided on doc id sort".to_string(), + )); + } + }; + Ok(Some(SortByValue { + sort_value: Some(sort_value), + })) } - /// Converts fast field values into their u64 fast field representation. - /// - /// Returns None if value is out of bounds of target value. - /// None means that the search_after will be disabled and everything matches. - /// - /// What's currently missing is to signal that _nothing_ matches to generate an optimized - /// query. For now we just choose the max value of the target type. - #[inline] - pub fn convert_to_u64_ff_val( + + fn project_to_internal_search_after( &self, - sort_value: SortValue, + sort_by_value: &SortByValue, sort_order: SortOrder, - ) -> Option { - match self { - SortingFieldExtractorComponent::DocId => match sort_value { - SortValue::U64(val) => Some(val), - _ => panic!("Internal error: Got non-U64 sort value for DocId."), - }, - SortingFieldExtractorComponent::FastField { - sort_field_type, .. 
- } => { - // We need to convert a (potential user provided) value in the correct u64 - // representation of the fast field. - // This requires this weird conversion of first casting into the target type - // (if possible) and then to its u64 presentation. - // - // For the conversion into the target type it's important to know if the target - // type does not cover the whole range of the source type. In that case we need to - // add additional conversion checks, to see if it matches everything - // or nothing. (Which also depends on the sort order). - // Below are the visual representations of the value ranges of the different types. - // Note: DateTime is equal to I64 and omitted. - // - // Bool value range (0, 1): - // <-> - // - // I64 value range (signed 64-bit integer): - // <------------------------------------> - // -2^63 2^63-1 - // U64 value range (unsigned 64-bit integer): - // <------------------------------------> - // 0 2^64-1 - // F64 value range (64-bit floating point, conceptual, not to scale): - // <--------------------------------------------------------------------> - // Very negative numbers Very positive numbers - // - // Those conversions have limited target type value space: - // - [X] U64 -> I64 - // - [X] F64 -> I64 - // - [X] I64 -> U64 - // - [X] F64 -> U64 - // - // - [X] F64 -> Bool - // - [X] I64 -> Bool - // - [X] U64 -> Bool - // - let val = match (sort_value, sort_field_type) { - // Same field type, no conversion needed. - (SortValue::U64(val), SortFieldType::U64) => val, - (SortValue::F64(val), SortFieldType::F64) => val.to_u64(), - (SortValue::Boolean(val), SortFieldType::Bool) => val.to_u64(), - (SortValue::I64(val), SortFieldType::I64) => val.to_u64(), - (SortValue::U64(mut val), SortFieldType::I64) => { - if sort_order == SortOrder::Desc && val > i64::MAX as u64 { - return None; - } - // Add a limit to avoid overflow. 
- val = val.min(i64::MAX as u64); - (val as i64).to_u64() - } - (SortValue::U64(val), SortFieldType::F64) => (val as f64).to_u64(), - (SortValue::U64(mut val), SortFieldType::DateTime) => { - // Match everything - if sort_order == SortOrder::Desc && val > i64::MAX as u64 { - return None; - } - // Add a limit to avoid overflow. - val = val.min(i64::MAX as u64); - DateTime::from_timestamp_nanos(val as i64).to_u64() - } - (SortValue::I64(val), SortFieldType::U64) => { - if val < 0 && sort_order == SortOrder::Asc { - return None; - } - if val < 0 && sort_order == SortOrder::Desc { - u64::MIN // matches nothing as search_after is not inclusive - } else { - val as u64 - } - } - (SortValue::I64(val), SortFieldType::F64) => (val as f64).to_u64(), - (SortValue::I64(val), SortFieldType::DateTime) => { - DateTime::from_timestamp_nanos(val).to_u64() - } - (SortValue::F64(val), SortFieldType::U64) => { - let all_values_ahead1 = - val < u64::MIN as f64 && sort_order == SortOrder::Asc; - let all_values_ahead2 = - val > u64::MAX as f64 && sort_order == SortOrder::Desc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // f64 cast already handles under/overflow and clamps the value - (val as u64).to_u64() - } - (SortValue::F64(val), SortFieldType::I64) - | (SortValue::F64(val), SortFieldType::DateTime) => { - let all_values_ahead1 = - val < i64::MIN as f64 && sort_order == SortOrder::Asc; - let all_values_ahead2 = - val > i64::MAX as f64 && sort_order == SortOrder::Desc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // f64 cast already handles under/overflow and clamps the value - let val_i64 = val as i64; + ) -> tantivy::Result> { + let SortByValue { + sort_value: sort_value_opt, + } = sort_by_value; + match (self, sort_value_opt) { + (SortingFieldExtractorComponent::DocId, _) => { + // Doc id sorts are handled at the compound sort value level + debug_assert!(V::is_elided()); + Ok(InternalValueRepr::new_missing()) + } + 
(SortingFieldExtractorComponent::FastField(_), None) => { + Ok(InternalValueRepr::new_missing()) + } + ( + SortingFieldExtractorComponent::FastField(FastFieldExtractor { + sort_columns, .. + }), + Some(sort_value), + ) => project_search_after_sort_value(sort_columns, sort_value, sort_order), + (SortingFieldExtractorComponent::Score, Some(SortValue::F64(val))) => { + Ok(InternalValueRepr::new(val.to_u64(), 0, sort_order)) + } + (SortingFieldExtractorComponent::Score, _) => { + Err(tantivy::TantivyError::InvalidArgument( + "got non-F64 sort value for score".to_string(), + )) + } + } + } +} - if *sort_field_type == SortFieldType::DateTime { - DateTime::from_timestamp_nanos(val_i64).to_u64() - } else { - val_i64.to_u64() - } - } - // Not sure when we hit this, it's probably are very rare case. - (SortValue::Boolean(val), SortFieldType::U64) => val as u64, - (SortValue::Boolean(val), SortFieldType::F64) => (val as u64 as f64).to_u64(), - (SortValue::Boolean(val), SortFieldType::I64) => (val as i64).to_u64(), - (SortValue::Boolean(val), SortFieldType::DateTime) => { - DateTime::from_timestamp_nanos(val as i64).to_u64() +fn projected_number_internal_repr( + projected: ProjectedNumber, + order: SortOrder, + accessor_idx: u8, +) -> InternalValueRepr { + match (projected, order) { + (ProjectedNumber::Exact(val), _) => { + InternalValueRepr::new(val.to_u64(), accessor_idx, order) + } + (ProjectedNumber::AfterLast, SortOrder::Asc) => { + InternalValueRepr::new_skip_column(accessor_idx, order) + } + (ProjectedNumber::AfterLast, SortOrder::Desc) => { + InternalValueRepr::new_keep_column(accessor_idx, order) + } + (ProjectedNumber::Next(val), SortOrder::Asc) => { + let val_u64 = val.to_u64(); + if val_u64 == 0 { + InternalValueRepr::new_keep_column(accessor_idx, order) + } else { + InternalValueRepr::new(val_u64 - 1, accessor_idx, order) + } + } + (ProjectedNumber::Next(val), SortOrder::Desc) => { + let val_u64 = val.to_u64(); + if val_u64 == 0 { + 
InternalValueRepr::new_skip_column(accessor_idx, order) + } else { + InternalValueRepr::new(val_u64, accessor_idx, order) + } + } + } +} + +fn project_search_after_sort_value( + sort_columns: &[(Column, SortFieldType)], + sort_value: &SortValue, + sort_order: SortOrder, +) -> tantivy::Result> { + let col_iter = match sort_order { + SortOrder::Asc => Either::Left(sort_columns.iter().enumerate()), + SortOrder::Desc => Either::Right(sort_columns.iter().enumerate().rev()), + }; + for (idx, sort_column) in col_iter { + let internal_repr = match (&sort_column.1, sort_value) { + // project to u64 column + (SortFieldType::U64, SortValue::U64(val)) => { + InternalValueRepr::new(*val, idx as u8, sort_order) + } + (SortFieldType::U64, SortValue::F64(val)) => { + projected_number_internal_repr(f64_to_u64(*val), sort_order, idx as u8) + } + (SortFieldType::U64, SortValue::I64(val)) => { + projected_number_internal_repr(i64_to_u64(*val), sort_order, idx as u8) + } + // project to i64 column + (SortFieldType::I64, SortValue::I64(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::I64, SortValue::F64(val)) => { + projected_number_internal_repr(f64_to_i64(*val), sort_order, idx as u8) + } + (SortFieldType::I64, SortValue::U64(val)) => { + projected_number_internal_repr(u64_to_i64(*val), sort_order, idx as u8) + } + // project to f64 column + (SortFieldType::F64, SortValue::F64(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::F64, SortValue::I64(val)) => { + projected_number_internal_repr(i64_to_f64(*val), sort_order, idx as u8) + } + (SortFieldType::F64, SortValue::U64(val)) => { + projected_number_internal_repr(u64_to_f64(*val), sort_order, idx as u8) + } + // other types + (SortFieldType::DateTime, SortValue::Datetime(val)) => { + InternalValueRepr::new(val.to_u64(), idx as u8, sort_order) + } + (SortFieldType::Bool, SortValue::Boolean(val)) => { + InternalValueRepr::new(val.to_u64(), idx as 
u8, sort_order) + } + (SortFieldType::String(str_column), SortValue::Str(val)) => { + let term_dict = str_column.dictionary(); + let hit = term_dict.term_ord_or_next(val.as_str().as_bytes())?; + match (hit, sort_order) { + (TermOrdHit::Exact(ord), _) => { + InternalValueRepr::new(ord, idx as u8, sort_order) } - (SortValue::U64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; - if all_values_ahead1 { - return None; - } - // clamp value for comparison - val = val.clamp(0, 1); - (val == 1).to_u64() + (TermOrdHit::Next(ord), SortOrder::Desc) => { + InternalValueRepr::new(ord, idx as u8, sort_order) } - (SortValue::I64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1 && sort_order == SortOrder::Desc; - let all_values_ahead2 = val < 0 && sort_order == SortOrder::Asc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - // clamp value for comparison - val = val.clamp(0, 1); - (val == 1).to_u64() + (TermOrdHit::Next(0), SortOrder::Asc) => { + InternalValueRepr::new_keep_column(idx as u8, sort_order) } - (SortValue::F64(mut val), SortFieldType::Bool) => { - let all_values_ahead1 = val > 1.0 && sort_order == SortOrder::Desc; - let all_values_ahead2 = val < 0.0 && sort_order == SortOrder::Asc; - if all_values_ahead1 || all_values_ahead2 { - return None; - } - val = val.clamp(0.0, 1.0); - (val >= 0.5).to_u64() // Is this correct? 
+ (TermOrdHit::Next(ord), SortOrder::Asc) => { + InternalValueRepr::new(ord - 1, idx as u8, sort_order) } + } + } + // unsupported mixed types + // + // TODO: we need a strongly typed pagination API to support JSON + // fields with datetime and schema evolutions + ( + SortFieldType::I64 | SortFieldType::U64 | SortFieldType::F64, + SortValue::Datetime(_), + ) => { + return Err(TantivyError::SchemaError( + "search after not supported for schema updates to datetime".to_string(), + )); + } + ( + SortFieldType::DateTime, + SortValue::I64(_) | SortValue::U64(_) | SortValue::F64(_), + ) => { + return Err(TantivyError::SchemaError( + "search after not supported on multi-typed fields with datetime".to_string(), + )); + } + // supported mixed types + (sort_field_type, sort_value) => { + let column_key = sort_field_type.type_sort_key(); + let value_key = sort_value.type_sort_key(); + debug_assert_ne!(column_key, value_key); + let column_comes_after = match sort_order { + SortOrder::Desc => column_key < value_key, + SortOrder::Asc => column_key > value_key, }; - Some(val) + if column_comes_after { + InternalValueRepr::new_keep_column(idx as u8, sort_order) + } else { + continue; + } } - SortingFieldExtractorComponent::Score => match sort_value { - SortValue::F64(val) => Some(val.to_u64()), - _ => panic!("Internal error: Got non-F64 sort value for Score."), - }, - } + }; + return Ok(internal_repr); } + Ok(InternalValueRepr::new_skip_all_but_missing()) } -impl From for SortingFieldExtractorPair { - fn from(value: SortingFieldExtractorComponent) -> Self { - Self { - first: value, - second: None, - } - } +pub(crate) struct SortingFieldExtractorPair { + first: SortingFieldExtractorComponent, + second: Option, + first_order: SortOrder, + second_order: SortOrder, + sort1_scratch: Box<[InternalValueRepr; COLLECT_BLOCK_BUFFER_LEN]>, + sort2_scratch: Box<[InternalValueRepr; COLLECT_BLOCK_BUFFER_LEN]>, } -pub(crate) struct SortingFieldExtractorPair { - pub first: 
SortingFieldExtractorComponent, - pub second: Option, -} +impl SortingFieldExtractorPair { + fn doc_id_sort_order(&self) -> SortOrder { + if self.first.is_doc_id() { + self.first_order + } else if let Some(second) = &self.second + && second.is_doc_id() + { + self.second_order + } else { + // TODO this is the current behavior which is weird. QW docs for the + // native search API advertise that the sort order by default is + // reverse(doc_id). In ES _shard_doc is supposed to be always ascending. + self.first_order + } + } -impl SortingFieldExtractorPair { - pub fn is_score(&self) -> bool { - self.first.is_score() - || self - .second + pub(crate) fn search_after_from_partial_hit( + &self, + split_id: &SplitId, + segment_ord: SegmentOrdinal, + partial_hit: &PartialHit, + ) -> tantivy::Result> { + let sort_1 = if let Some(sort_by_value) = &partial_hit.sort_value { + self.first + .project_to_internal_search_after(sort_by_value, self.first_order)? + } else { + InternalValueRepr::new_missing() + }; + let sort_2 = if let Some(sort_by_value) = &partial_hit.sort_value2 { + self.second .as_ref() - .map(|second| second.is_score()) - .unwrap_or(false) + .ok_or_else(|| { + TantivyError::InvalidArgument( + "search after has 2 values but there is only 1 sort dimension".to_string(), + ) + })? + .project_to_internal_search_after(sort_by_value, self.second_order)? + } else { + InternalValueRepr::new_missing() + }; + + let internal_repr = if partial_hit.split_id.is_empty() { + // When split_id is empty, the search_after is a pure sort-value + // boundary (no doc position), any doc with the same sort value must be + // excluded otherwise we risk iterating over and over through the same + // documents.
+ InternalSortValueRepr::new_skip_doc_ids(sort_1, sort_2) + } else { + let split_cmp = split_id + .as_str() + .cmp(partial_hit.split_id.as_str()) + .then(segment_ord.cmp(&partial_hit.segment_ord)); + match (split_cmp, self.doc_id_sort_order()) { + (Ordering::Less, SortOrder::Asc) | (Ordering::Greater, SortOrder::Desc) => { + InternalSortValueRepr::new_skip_doc_ids(sort_1, sort_2) + } + (Ordering::Less, SortOrder::Desc) | (Ordering::Greater, SortOrder::Asc) => { + InternalSortValueRepr::new_keep_doc_ids(sort_1, sort_2) + } + (Ordering::Equal, doc_id_order) => { + InternalSortValueRepr::new(sort_1, sort_2, partial_hit.doc_id, doc_id_order) + } + } + }; + Ok(internal_repr) } - /// Returns the list of sort values for the given element - /// - /// See also [`SortingFieldExtractorComponent::extract_typed_sort_values_block`] for more - /// information. - #[inline] - pub(crate) fn extract_typed_sort_values( + + pub(crate) fn internal_to_partial_hit( &self, - doc_ids: &[DocId], - values1: &mut [Option], - values2: &mut [Option], - ) { - self.first - .extract_typed_sort_values_block(doc_ids, &mut values1[..doc_ids.len()]); - if let Some(second) = self.second.as_ref() { - second.extract_typed_sort_values_block(doc_ids, &mut values2[..doc_ids.len()]); - } + split_id: &SplitId, + segment_ord: SegmentOrdinal, + internal_repr: InternalSortValueRepr, + ) -> tantivy::Result { + let sort_1 = self + .first + .project_from_internal_sort_value(internal_repr.sort_1(), self.first_order)?; + let sort_2 = self + .second + .as_ref() + .map(|second| { + second.project_from_internal_sort_value(internal_repr.sort_2(), self.second_order) + }) + .transpose()? 
+ .unwrap_or_default(); + Ok(PartialHit { + sort_value: sort_1, + sort_value2: sort_2, + doc_id: internal_repr.doc_id(self.doc_id_sort_order()), + split_id: split_id.clone(), + segment_ord, + }) } + /// Returns the list of sort values for the given element /// /// See also [`SortingFieldExtractorComponent::extract_typed_sort_value_opt`] for more /// information. #[inline] - pub(crate) fn extract_typed_sort_value( + pub(crate) fn project_to_internal_sort_value( &self, doc_id: DocId, score: Score, - ) -> (Option, Option) { - let first = self.first.extract_typed_sort_value_opt(doc_id, score); + ) -> InternalSortValueRepr { + let first = self + .first + .project_to_internal_sort_value(doc_id, score, self.first_order); let second = self .second .as_ref() - .and_then(|second| second.extract_typed_sort_value_opt(doc_id, score)); - (first, second) + .map(|second| second.project_to_internal_sort_value(doc_id, score, self.second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + InternalSortValueRepr::new(first, second, doc_id, self.doc_id_sort_order()) } -} -impl TryFrom for SortFieldType { - type Error = tantivy::TantivyError; - - fn try_from(column_type: ColumnType) -> tantivy::Result { - match column_type { - ColumnType::U64 => Ok(SortFieldType::U64), - ColumnType::I64 => Ok(SortFieldType::I64), - ColumnType::F64 => Ok(SortFieldType::F64), - ColumnType::DateTime => Ok(SortFieldType::DateTime), - ColumnType::Bool => Ok(SortFieldType::Bool), - _ => Err(TantivyError::InvalidArgument(format!( - "Unsupported sort field type `{column_type:?}`." - ))), + pub(crate) fn project_to_internal_sort_value_block( + &mut self, + docs: &[DocId], + mut f: impl FnMut(InternalSortValueRepr), + ) { + let doc_id_order = self.doc_id_sort_order(); + let first_order = self.first_order; + let second_order = self.second_order; + + let n = docs.len(); + + let SortingFieldExtractorPair { + first, + second, + sort1_scratch, + sort2_scratch, + .. 
+ } = self; + + let first_extractor_opt = first.extractor_for_batch_if_worthwhile(); + let second_extractor_opt = second + .as_mut() + .and_then(|s| s.extractor_for_batch_if_worthwhile()); + match (first_extractor_opt, second_extractor_opt) { + (Some(fst_batch_extr), Some(sec_batch_extr)) => { + fst_batch_extr.fill_batch(docs, first_order, &mut sort1_scratch[..n]); + sec_batch_extr.fill_batch(docs, second_order, &mut sort2_scratch[..n]); + for i in 0..n { + f(InternalSortValueRepr::new( + sort1_scratch[i], + sort2_scratch[i], + docs[i], + doc_id_order, + )); + } + } + (Some(fst_batch_extr), None) => { + fst_batch_extr.fill_batch(docs, first_order, &mut sort1_scratch[..n]); + for i in 0..n { + let sort2 = second + .as_ref() + .map(|s| s.project_to_internal_sort_value(docs[i], 0.0, second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + f(InternalSortValueRepr::new( + sort1_scratch[i], + sort2, + docs[i], + doc_id_order, + )); + } + } + (None, Some(sec_batch_extr)) => { + sec_batch_extr.fill_batch(docs, second_order, &mut sort2_scratch[..n]); + for i in 0..n { + let sort1 = first.project_to_internal_sort_value(docs[i], 0.0, first_order); + f(InternalSortValueRepr::new( + sort1, + sort2_scratch[i], + docs[i], + doc_id_order, + )); + } + } + (None, None) => { + for &doc_id in docs { + let first = self + .first + .project_to_internal_sort_value(doc_id, 0.0, first_order); + let second = self + .second + .as_ref() + .map(|s| s.project_to_internal_sort_value(doc_id, 0.0, second_order)) + .unwrap_or_else(InternalValueRepr::new_missing); + f(InternalSortValueRepr::new( + first, + second, + doc_id, + doc_id_order, + )); + } + } } } } -/// Takes a user-defined sorting criteria and resolves it to a -/// segment specific `SortingFieldExtractorPair`. 
-fn get_score_extractor( - sort_by: &SortByPair, - segment_reader: &SegmentReader, -) -> tantivy::Result { - Ok(SortingFieldExtractorPair { - first: sort_by - .first - .to_sorting_field_extractor_component(segment_reader)?, - second: sort_by - .second - .as_ref() - .map(|first| first.to_sorting_field_extractor_component(segment_reader)) - .transpose()?, - }) -} - #[allow(clippy::large_enum_variant)] enum AggregationSegmentCollectors { FindTraceIdsSegmentCollector(Box), @@ -474,51 +709,50 @@ enum AggregationSegmentCollectors { /// Quickwit collector working at the scale of the segment. pub struct QuickwitSegmentCollector { - segment_top_k_collector: Option>, + segment_top_k_collector: Option, aggregation: Option, num_hits: u64, } -#[derive(Copy, Clone, Debug)] -pub(crate) struct SegmentPartialHit { - /// Normalized to u64, the typed value can be reconstructed with - /// SortingFieldExtractorComponent. - pub sort_value: Option, - pub sort_value2: Option, - pub doc_id: DocId, -} - -impl SegmentPartialHit { - pub fn into_partial_hit( - self, - split_id: SplitId, - segment_ord: SegmentOrdinal, - first: &SortingFieldExtractorComponent, - second: &Option, - ) -> PartialHit { - PartialHit { - sort_value: self - .sort_value - .map(|sort_value| first.convert_u64_ff_val_to_sort_value(sort_value)) - .map(|sort_value| SortByValue { - sort_value: Some(sort_value), - }), - sort_value2: self - .sort_value2 - .map(|sort_value| { - second - .as_ref() - .expect("Internal error: Got sort_value2, but no sort extractor") - .convert_u64_ff_val_to_sort_value(sort_value) - }) - .map(|sort_value| SortByValue { - sort_value: Some(sort_value), - }), - doc_id: self.doc_id, - split_id, - segment_ord, - } - } +/// Takes a user-defined sorting criteria and resolves it to a +/// segment specific `SortingFieldExtractorPair`. 
+#[allow(clippy::type_complexity)] +fn get_sorting_field_extractors( + sort_by: &SortByPair, + segment_reader: &SegmentReader, + split_id: &SplitId, + segment_ord: SegmentOrdinal, + search_after: &Option, +) -> tantivy::Result<( + SortingFieldExtractorPair, + Option>, +)> { + let extractor = SortingFieldExtractorPair { + first: sort_by + .first + .to_sorting_field_extractor_component(segment_reader)?, + second: sort_by + .second + .as_ref() + .map(|first| first.to_sorting_field_extractor_component(segment_reader)) + .transpose()?, + first_order: sort_by.first.sort_order(), + second_order: sort_by + .second + .as_ref() + .map(|second| second.sort_order()) + // value irrelevant? + .unwrap_or(SortOrder::Desc), + sort1_scratch: Box::new([InternalValueRepr::new_missing(); COLLECT_BLOCK_BUFFER_LEN]), + sort2_scratch: Box::new([InternalValueRepr::new_missing(); COLLECT_BLOCK_BUFFER_LEN]), + }; + let search_after_opt = search_after + .as_ref() + .map(|search_after| { + extractor.search_after_from_partial_hit(split_id, segment_ord, search_after) + }) + .transpose()?; + Ok((extractor, search_after_opt)) } impl SegmentCollector for QuickwitSegmentCollector { @@ -526,7 +760,6 @@ impl SegmentCollector for QuickwitSegmentCollector { #[inline] fn collect_block(&mut self, filtered_docs: &[DocId]) { - // Update results self.num_hits += filtered_docs.len() as u64; if let Some(segment_top_k_collector) = self.segment_top_k_collector.as_mut() { @@ -565,7 +798,7 @@ impl SegmentCollector for QuickwitSegmentCollector { fn harvest(self) -> Self::Fruit { let mut partial_hits: Vec = Vec::new(); if let Some(segment_top_k_collector) = self.segment_top_k_collector { - partial_hits = segment_top_k_collector.get_top_k(); + partial_hits = segment_top_k_collector.get_top_k()?; } let intermediate_aggregation_result = match self.aggregation { @@ -668,7 +901,7 @@ impl QuickwitIncrementalAggregations { let timestamp = last_elem.span_timestamp.into_timestamp_nanos(); return Some(PartialHit { sort_value: 
Some(SortByValue { - sort_value: Some(SortValue::I64(timestamp)), + sort_value: Some(SortValue::Datetime(timestamp)), }), sort_value2: None, split_id: SplitId::new(), @@ -792,22 +1025,70 @@ impl Collector for QuickwitCollector { ), None => None, }; - let score_extractor = get_score_extractor(&self.sort_by, segment_reader)?; - let (order1, order2) = self.sort_by.sort_orders(); let segment_top_k_collector = if leaf_max_hits == 0 { None } else { - let coll: Box = specialized_top_k_segment_collector( - self.split_id.clone(), - score_extractor, - leaf_max_hits, - segment_ord, - self.search_after.clone(), - order1, - order2, - ); - Some(coll) + let segment_top_k_collector = match self.sort_by { + SortByPair { + first: SortByComponent::DocId { .. }, + second: None, + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_doc_id_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + SortByPair { + first: _, + second: None | Some(SortByComponent::DocId { .. 
}), + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_one_dim_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + SortByPair { + first: _, + second: Some(_), + } => { + let (extractor, search_after_opt) = get_sorting_field_extractors( + &self.sort_by, + segment_reader, + &self.split_id, + segment_ord, + &self.search_after, + )?; + QuickwitSegmentTopKCollector::new_with_two_dim_sort( + self.split_id.clone(), + segment_ord, + extractor, + leaf_max_hits, + search_after_opt, + ) + } + }; + Some(segment_top_k_collector) }; Ok(QuickwitSegmentCollector { @@ -1008,14 +1289,20 @@ pub(crate) fn sort_by_from_request(search_request: &SearchRequest) -> SortByPair let num_sort_fields = search_request.sort_fields.len(); if num_sort_fields == 0 { - SortByComponent::DocId { - order: SortOrder::Desc, + SortByPair { + first: SortByComponent::DocId { + order: SortOrder::Desc, + }, + second: None, } - .into() } else if num_sort_fields == 1 { let sort_field = &search_request.sort_fields[0]; let order = SortOrder::try_from(sort_field.sort_order).unwrap_or(SortOrder::Desc); - to_sort_by_component(&sort_field.field_name, order).into() + let first = to_sort_by_component(&sort_field.field_name, order); + SortByPair { + first, + second: None, + } } else if num_sort_fields == 2 { let sort_field1 = &search_request.sort_fields[0]; let order1 = SortOrder::try_from(sort_field1.sort_order).unwrap_or(SortOrder::Desc); @@ -1080,44 +1367,6 @@ pub(crate) fn make_merge_collector( }) } -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -pub struct SegmentPartialHitSortingKey { - sort_value: Option, - sort_value2: Option, - doc_id: DocId, - // TODO This should not be there. - sort_order: SortOrder, - // TODO This should not be there. 
- sort_order2: SortOrder, -} - -impl Ord for SegmentPartialHitSortingKey { - fn cmp(&self, other: &SegmentPartialHitSortingKey) -> Ordering { - debug_assert_eq!( - self.sort_order, other.sort_order, - "comparing two PartialHitSortingKey of different ordering" - ); - debug_assert_eq!( - self.sort_order2, other.sort_order2, - "comparing two PartialHitSortingKey of different ordering" - ); - let order = self - .sort_order - .compare_opt(&self.sort_value, &other.sort_value); - let order2 = self - .sort_order2 - .compare_opt(&self.sort_value2, &other.sort_value2); - let order_addr = self.sort_order.compare(&self.doc_id, &other.doc_id); - order.then(order2).then(order_addr) - } -} - -impl PartialOrd for SegmentPartialHitSortingKey { - fn partial_cmp(&self, other: &SegmentPartialHitSortingKey) -> Option { - Some(self.cmp(other)) - } -} - #[derive(Clone, Debug, PartialEq, Eq)] pub(crate) struct PartialHitSortingKey { sort_value: Option, @@ -1169,8 +1418,8 @@ impl SortKeyMapper for HitSortingMapper { type Key = PartialHitSortingKey; fn get_sort_key(&self, partial_hit: &PartialHit) -> PartialHitSortingKey { PartialHitSortingKey { - sort_value: partial_hit.sort_value.and_then(|v| v.sort_value), - sort_value2: partial_hit.sort_value2.and_then(|v| v.sort_value), + sort_value: partial_hit.sort_value.clone().and_then(|v| v.sort_value), + sort_value2: partial_hit.sort_value2.clone().and_then(|v| v.sort_value), address: GlobalDocAddress::from_partial_hit(partial_hit), sort_order: self.order1, sort_order2: self.order2, @@ -1178,19 +1427,6 @@ impl SortKeyMapper for HitSortingMapper { } } -impl SortKeyMapper for HitSortingMapper { - type Key = SegmentPartialHitSortingKey; - fn get_sort_key(&self, partial_hit: &SegmentPartialHit) -> SegmentPartialHitSortingKey { - SegmentPartialHitSortingKey { - sort_value: partial_hit.sort_value, - sort_value2: partial_hit.sort_value2, - doc_id: partial_hit.doc_id, - sort_order: self.order1, - sort_order2: self.order2, - } - } -} - /// Incrementally 
merge segment results. #[derive(Clone)] pub(crate) struct IncrementalCollector { @@ -1307,9 +1543,10 @@ mod tests { use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::collector::Collector; - use super::{IncrementalCollector, make_merge_collector}; - use crate::QuickwitAggregations; - use crate::collector::{merge_intermediate_aggregation_result, top_k_partial_hits}; + use super::{ + IncrementalCollector, QuickwitAggregations, make_merge_collector, + merge_intermediate_aggregation_result, top_k_partial_hits, + }; #[test] fn test_merge_partial_hits_no_tie() { @@ -1394,66 +1631,52 @@ mod tests { ] } - fn make_request(max_hits: u64, sort_fields: &str) -> SearchRequest { - SearchRequest { - max_hits, - sort_fields: sort_fields - .split(',') - .filter(|field| !field.is_empty()) - .map(|field| { - if let Some(field) = field.strip_prefix('-') { - SortField { - field_name: field.to_string(), - sort_order: SortOrder::Asc.into(), - sort_datetime_format: None, - } - } else { - SortField { - field_name: field.to_string(), - sort_order: SortOrder::Desc.into(), - sort_datetime_format: None, - } + /// Create a list of SortField from a comma-separated list of field names. + /// Field names can be prefixed with - to indicate ascending order. + fn make_sort_fields(sort_fields: &str) -> Vec { + sort_fields + .split(',') + .filter(|field| !field.is_empty()) + .map(|field| { + if let Some(field) = field.strip_prefix('-') { + SortField { + field_name: field.to_string(), + sort_order: SortOrder::Asc.into(), + sort_datetime_format: None, } - }) - .collect(), - ..SearchRequest::default() - } + } else { + SortField { + field_name: field.to_string(), + sort_order: SortOrder::Desc.into(), + sort_datetime_format: None, + } + } + }) + .collect() } - fn make_index() -> tantivy::Index { + /// Build a tantivy index from a JSON dataset. Each element must be a JSON + /// object whose keys match field names in the pre-determined schema. 
+ fn make_index(dataset: &[serde_json::Value]) -> tantivy::Index { use tantivy::Index; use tantivy::indexer::UserOperation; - use tantivy::schema::{NumericOptions, Schema}; - - let dataset = sort_dataset(); + use tantivy::schema::{FAST, NumericOptions, Schema}; let mut schema_builder = Schema::builder(); let opts = NumericOptions::default().set_fast(); - - schema_builder.add_u64_field("sort1", opts.clone()); - schema_builder.add_u64_field("sort2", opts); + schema_builder.add_u64_field("sort_u64_1", opts.clone()); + schema_builder.add_u64_field("sort_u64_2", opts); + schema_builder.add_json_field("kv", FAST); let schema = schema_builder.build(); - let field1 = schema.get_field("sort1").unwrap(); - let field2 = schema.get_field("sort2").unwrap(); - - let index = Index::create_in_ram(schema); + let index = Index::create_in_ram(schema.clone()); let mut index_writer = index.writer(50_000_000).unwrap(); index_writer .run( dataset - .into_iter() - .map(|(val1, val2)| { - let mut doc = TantivyDocument::new(); - if let Some(val1) = val1 { - doc.add_u64(field1, val1); - } - if let Some(val2) = val2 { - doc.add_u64(field2, val2); - } - doc - }) + .iter() + .map(|obj| TantivyDocument::parse_json(&schema, &obj.to_string()).unwrap()) .map(UserOperation::Add), ) .unwrap(); @@ -1463,8 +1686,22 @@ mod tests { } #[test] - fn test_single_split_sorting() { - let index = make_index(); + fn test_single_split_sorting_single_type() { + let raw_dataset = sort_dataset(); + let json_dataset: Vec = raw_dataset + .iter() + .map(|(v1, v2)| { + let mut obj = serde_json::Map::new(); + if let Some(v) = v1 { + obj.insert("sort_u64_1".to_string(), (*v).into()); + } + if let Some(v) = v2 { + obj.insert("sort_u64_2".to_string(), (*v).into()); + } + serde_json::Value::Object(obj) + }) + .collect(); + let index = make_index(&json_dataset); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -1472,7 +1709,7 @@ mod tests { // tuple of DocId and sort value type Doc = (usize, (Option, 
Option)); - let mut dataset: Vec = sort_dataset().into_iter().enumerate().collect(); + let mut dataset: Vec = raw_dataset.into_iter().enumerate().collect(); let reverse_int = |val: &Option| val.as_ref().map(|val| u64::MAX - val); let cmp_doc_id_desc = |a: &Doc, b: &Doc| b.0.cmp(&a.0); @@ -1532,25 +1769,27 @@ assert_eq!(data, data_copy); } + // NOTE(review): the implicit doc_id tiebreaker below follows the primary sort + // direction; this differs from Elasticsearch, where _shard_doc is always ascending. #[allow(clippy::type_complexity)] let sort_orders: Vec<(_, Box Ordering>)> = vec![ ("", Box::new(cmp_doc_id_desc)), ( - "sort1", + "sort_u64_1", Box::new(|a, b| cmp_1_desc(a, b).then(cmp_doc_id_desc(a, b))), ), ( - "-sort1", + "-sort_u64_1", Box::new(|a, b| cmp_1_asc(a, b).then(cmp_doc_id_asc(a, b))), ), ( - "sort1,sort2", + "sort_u64_1,sort_u64_2", Box::new(|a, b| { cmp_1_desc(a, b).then(cmp_2_desc(a, b).then(cmp_doc_id_desc(a, b))) }), ), ( - "-sort1,sort2", + "-sort_u64_1,sort_u64_2", Box::new(|a, b| { cmp_1_asc(a, b) .then(cmp_2_desc(a, b)) @@ -1558,11 +1797,11 @@ }), ), ( - "sort1,-sort2", + "sort_u64_1,-sort_u64_2", Box::new(|a, b| cmp_1_desc(a, b).then(cmp_2_asc(a, b).then(cmp_doc_id_desc(a, b)))), ), ( - "-sort1,-sort2", + "-sort_u64_1,-sort_u64_2", Box::new(|a, b| { cmp_1_asc(a, b) .then(cmp_2_asc(a, b)) @@ -1577,7 +1816,11 @@ for slice_len in 0..dataset.len() { let collector = super::make_collector_for_split( "fake_split_id".to_string(), - &make_request(slice_len as u64, sort_str), + &SearchRequest { + max_hits: slice_len as u64, + sort_fields: make_sort_fields(sort_str), + ..SearchRequest::default() + }, Default::default(), ) .unwrap(); @@ -1604,8 +1847,8 @@ format!( "{} {:?} {:?}", hit.doc_id, - hit.sort_value.and_then(|el| el.sort_value).clone(), - hit.sort_value2.and_then(|el| el.sort_value).clone() + hit.sort_value.clone().and_then(|el| el.sort_value), + hit.sort_value2.clone().and_then(|el|
el.sort_value) ) }) .collect::>(); @@ -1619,8 +1862,22 @@ mod tests { } #[test] - fn test_search_after() { - let index = make_index(); + fn test_search_after_single_type() { + let raw_dataset = sort_dataset(); + let json_dataset: Vec = raw_dataset + .iter() + .map(|(v1, v2)| { + let mut obj = serde_json::Map::new(); + if let Some(v) = v1 { + obj.insert("sort_u64_1".to_string(), (*v).into()); + } + if let Some(v) = v2 { + obj.insert("sort_u64_2".to_string(), (*v).into()); + } + serde_json::Value::Object(obj) + }) + .collect(); + let index = make_index(&json_dataset); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -1628,7 +1885,7 @@ mod tests { // tuple of DocId and sort value type Doc = (usize, (Option, Option)); - let mut dataset: Vec = sort_dataset().into_iter().enumerate().collect(); + let mut dataset: Vec = raw_dataset.into_iter().enumerate().collect(); let reverse_int = |val: &Option| val.as_ref().map(|val| u64::MAX - val); let cmp_doc_id_desc = |a: &Doc, b: &Doc| b.0.cmp(&a.0); @@ -1658,12 +1915,12 @@ mod tests { max_hits: 1000, sort_fields: vec![ SortField { - field_name: "sort1".to_string(), + field_name: "sort_u64_1".to_string(), sort_order: SortOrder::Desc.into(), sort_datetime_format: None, }, SortField { - field_name: "sort2".to_string(), + field_name: "sort_u64_2".to_string(), sort_order: SortOrder::Asc.into(), sort_datetime_format: None, }, @@ -1752,7 +2009,256 @@ mod tests { } } - fn merge_collector_equal_results( + fn assert_search_after_results( + searcher: &tantivy::Searcher, + index_len: usize, + sort_str: &str, + search_after: PartialHit, + expected_doc_ids: impl AsRef<[u32]>, + label: &str, + ) { + let expected_doc_ids = expected_doc_ids.as_ref(); + let request = SearchRequest { + max_hits: 1000, + sort_fields: make_sort_fields(sort_str), + search_after: Some(search_after.clone()), + ..SearchRequest::default() + }; + let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &request, + 
Default::default(), + ) + .unwrap(); + let Ok(res) = searcher.search(&tantivy::query::AllQuery, &collector) else { + panic!("search failed for {label} with search_after {search_after:?}"); + }; + // num_hits counts every doc regardless of search_after. + assert_eq!( + res.num_hits, index_len as u64, + "num_hits mismatch for {label}" + ); + assert_eq!( + res.partial_hits.len(), + expected_doc_ids.len(), + "result count mismatch for {label}" + ); + for (expected_doc_id, got) in expected_doc_ids.iter().zip(res.partial_hits.iter()) { + assert_eq!( + *expected_doc_id, got.doc_id, + "doc order mismatch for {label} after {search_after:?}" + ); + } + } + + #[test] + fn test_single_split_search_after_multitype() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"sort1": false, "sort2": "b"}}), // doc 0 + serde_json::json!({"kv": {"sort1": true, "sort2": "a"}}), // doc 1 + serde_json::json!({"kv": {"sort1": "apple", "sort2": "a"}}), // doc 2 + serde_json::json!({"kv": {"sort1": "banana", "sort2": "b"}}), // doc 3 + serde_json::json!({"kv": {"sort1": 1, "sort2": "b"}}), // doc 4 + serde_json::json!({"kv": {"sort1": 5, "sort2": "a"}}), // doc 5 + serde_json::json!({}), // doc 6: missing + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + for (sort_str, expected_order) in [ + // Desc: booleans (true first) > strings (lex desc) > numbers (largest first) > missing + ("kv.sort1", &[1, 0, 3, 2, 5, 4, 6]), + // Asc: numbers (smallest first) > strings (lex asc) > booleans (false first) > + // missing + ("-kv.sort1", &[4, 5, 2, 3, 0, 1, 6]), + ("", &[6, 5, 4, 3, 2, 1, 0]), + ("_doc", &[6, 5, 4, 3, 2, 1, 0]), + ("-_doc", &[0, 1, 2, 3, 4, 5, 6]), + // sort2 with "b" first then "a" + ("kv.sort2,kv.sort1", &[0, 3, 4, 1, 2, 5, 6]), + // sort2 with "a" first then "b" + ("-kv.sort2,kv.sort1", &[1, 2, 5, 0, 3, 4, 6]), + ] { + // Step 1: full search to collect PartialHits carrying the correct typed SortValues. 
+ let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &SearchRequest { + max_hits: 1000, + sort_fields: make_sort_fields(sort_str), + ..Default::default() + }, + Default::default(), + ) + .unwrap(); + let full_res = searcher + .search(&tantivy::query::AllQuery, &collector) + .unwrap(); + assert_eq!(full_res.partial_hits.len(), dataset.len()); + for (expected_doc_id, got) in expected_order.iter().zip(full_res.partial_hits.iter()) { + assert_eq!( + *expected_doc_id, got.doc_id, + "sort order mismatch for \"{sort_str}\"" + ); + } + + // Step 2: use each PartialHit as a search_after fence and verify the returned tail. + for (i, search_after) in full_res.partial_hits.iter().enumerate() { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + search_after.clone(), + &expected_order[i + 1..], + &format!("\"{sort_str}\" search_after position {i}"), + ); + } + } + } + + #[test] + fn test_single_split_search_after_exogeneous_type() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"mixed": false, "integer": 1}}), // doc 0 + serde_json::json!({"kv": {"mixed": true, "integer": 4}}), // doc 1 + serde_json::json!({"kv": {"mixed": "banana", "integer": 3}}), // doc 2 + serde_json::json!({"kv": {"mixed": "plum", "integer": 4}}), // doc 3 + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let str_sort_val = |s: &str| SortValue::Str(s.to_string()); + for (sort_str, search_after_value, expected_order) in [ + // Desc: booleans (true first) > strings (lex desc) > numbers (search after) > missing + ("kv.mixed", SortValue::I64(-10), vec![]), + // Asc: numbers (search after) > strings (lex asc) > booleans (false first) > missing + ("-kv.mixed", SortValue::I64(-10), vec![2, 3, 0, 1]), + // project f64 to i64 + ("kv.integer", SortValue::F64(3.5), vec![2, 0]), + ("-kv.integer", SortValue::F64(3.5), vec![1, 3]), + // str not in columns dict, check all possible relative 
position + ("kv.mixed", str_sort_val("c"), vec![2]), + ("-kv.mixed", str_sort_val("c"), vec![3, 0, 1]), + ("kv.mixed", str_sort_val("a"), vec![]), + ("-kv.mixed", str_sort_val("a"), vec![2, 3, 0, 1]), + ("kv.mixed", str_sort_val("z"), vec![3, 2]), + ("-kv.mixed", str_sort_val("z"), vec![0, 1]), + ] { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + PartialHit { + sort_value: Some(search_after_value.clone().into()), + sort_value2: None, + ..Default::default() + }, + expected_order, + &format!("\"{sort_str}\""), + ); + } + } + + #[test] + fn test_single_split_search_after_exogeneous_type_with_null() { + let dataset: Vec = vec![ + serde_json::json!({"kv": {"sort": false}}), // doc 0 + serde_json::json!({"kv": {"sort": true}}), // doc 1 + serde_json::json!({"kv": {"sort": "apple"}}), // doc 2 + serde_json::json!({"kv": {"sort": "banana"}}), // doc 3 + serde_json::json!({}), // doc 4: missing + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let search_after_value = SortValue::I64(-10); + + // Desc: booleans (true first) > strings (lex desc) > numbers (search after) > missing + let desc_order: &[u32] = &[4]; + // Asc: numbers (search after) > strings (lex asc) > booleans (false first) > missing + let asc_order: &[u32] = &[2, 3, 0, 1, 4]; + + for (sort_str, expected_order) in [("kv.sort", desc_order), ("-kv.sort", asc_order)] { + assert_search_after_results( + &searcher, + dataset.len(), + sort_str, + PartialHit { + sort_value: Some(search_after_value.clone().into()), + sort_value2: None, + ..Default::default() + }, + expected_order, + &format!("\"{sort_str}\""), + ); + } + } + + #[test] + fn test_single_split_default_sort() { + let dataset: Vec = vec![ + serde_json::json!({"sort_u64_1": 15}), // doc 0 + serde_json::json!({"sort_u64_1": 13}), // doc 1 + serde_json::json!({"sort_u64_1": 10}), // doc 2 + serde_json::json!({"sort_u64_1": 12}), // doc 3 + 
serde_json::json!({"sort_u64_1": 9}), // doc 4 + ]; + + let index = make_index(&dataset); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + + let request = SearchRequest { + max_hits: 3, + sort_fields: vec![], + search_after: None, + ..SearchRequest::default() + }; + let collector = super::make_collector_for_split( + "fake_split_id".to_string(), + &request, + Default::default(), + ) + .unwrap(); + let res = searcher + .search(&tantivy::query::AllQuery, &collector) + .unwrap(); + // assert the exact hits, whereas in other tests we mostly focus on the order + assert_eq!( + res.partial_hits, + vec![ + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 4, + sort_value: None, + sort_value2: None, + }, + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 3, + sort_value: None, + sort_value2: None, + }, + PartialHit { + split_id: "fake_split_id".to_string(), + segment_ord: 0, + doc_id: 2, + sort_value: None, + sort_value2: None, + }, + ] + ); + } + + /// Merge intermediate results, asserting that both the regular and + /// incremental merge produce the same output.
+ fn merge_on_both_collectors( request: &SearchRequest, results: Vec, ) -> LeafSearchResponse { @@ -1774,7 +2280,7 @@ mod tests { #[test] fn test_merge_collectors() { - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, @@ -1822,7 +2328,7 @@ mod tests { } ); - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, @@ -1914,7 +2420,7 @@ mod tests { ); // same request, but we reverse sort order - let result = merge_collector_equal_results( + let result = merge_on_both_collectors( &SearchRequest { start_offset: 0, max_hits: 2, diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 3d9e5d00cce..02b825537da 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -52,6 +52,7 @@ use crate::metrics::{SplitSearchOutcomeCounters, queue_label}; use crate::root::is_metadata_count_request_with_ast; use crate::search_permit_provider::{SearchPermit, compute_initial_memory_allocation}; use crate::service::{SearcherContext, deserialize_doc_mapper}; +use crate::soft_delete_query::SoftDeleteQuery; use crate::{QuickwitAggregations, SearchError}; async fn get_split_footer_from_cache_or_fetch( @@ -475,7 +476,10 @@ async fn leaf_search_single_split( // if is_metadata_count_request_with_ast(&query_ast, &search_request) { leaf_search_state_guard.set_state(SplitSearchState::PrunedBeforeWarmup); - return Ok(Some(get_leaf_resp_from_count(split.num_docs))); + let effective_num_docs = split + .num_docs + .saturating_sub(split.soft_deleted_doc_ids.len() as u64); + return Ok(Some(get_leaf_resp_from_count(effective_num_docs))); } let split_id = split.split_id.to_string(); @@ -526,6 +530,14 @@ async fn leaf_search_single_split( false, predicate_cache, )?; + let query: Box = if split.soft_deleted_doc_ids.is_empty() { + query + } else { + 
Box::new(SoftDeleteQuery::new( + query, + split.soft_deleted_doc_ids.clone(), + )) + }; let collector_warmup_info = collector.warmup_info(); warmup_info.merge(collector_warmup_info); @@ -576,7 +588,10 @@ async fn leaf_search_single_split( collector.update_search_param(&simplified_search_request); let mut leaf_search_response: LeafSearchResponse = if is_metadata_count_request_with_ast(&query_ast, &simplified_search_request) { - get_leaf_resp_from_count(searcher.num_docs()) + let num_docs = searcher + .num_docs() + .saturating_sub(split_clone.soft_deleted_doc_ids.len() as u64); + get_leaf_resp_from_count(num_docs) } else if collector.is_count_only() { let count = query.count(&searcher)? as u64; get_leaf_resp_from_count(count) @@ -809,28 +824,24 @@ fn remove_redundant_timestamp_range( } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; - let final_end_timestamp = match ( - visitor.end_timestamp, - split.timestamp_end.map(DateTime::from_timestamp_secs), - ) { - (Bound::Included(query_ts), Some(split_ts)) => { - if query_ts < split_ts { - Bound::Included(query_ts) - } else { - Bound::Unbounded - } - } - (Bound::Excluded(query_ts), Some(split_ts)) => { - if query_ts <= split_ts { - Bound::Excluded(query_ts) + let final_end_timestamp = match (visitor.end_timestamp, split.timestamp_end) { + ( + query_bound @ (Bound::Included(query_ts) | Bound::Excluded(query_ts)), + Some(split_end), + ) => { + // split.timestamp_end is the truncation of the highest timestamp in the split, + // so the actual known bound for the split is split.timestamp_end+1 (exclusive) + let split_end_exclusive = DateTime::from_timestamp_secs(split_end + 1); + if query_ts < split_end_exclusive { + query_bound } else { Bound::Unbounded } } (Bound::Unbounded, Some(_)) => Bound::Unbounded, - (timestamp, None) => timestamp, + (query_bound, None) => query_bound, }; if final_start_timestamp != Bound::Unbounded || final_end_timestamp != 
Bound::Unbounded { let range = RangeQuery { @@ -1688,6 +1699,11 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, None); + let expected_upper_inclusive = RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), + }; let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { field: timestamp_field.to_string(), @@ -1697,7 +1713,7 @@ mod tests { .unwrap(), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, None); + remove_timestamp_test_case(&search_request, &split, Some(expected_upper_inclusive)); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), @@ -1740,10 +1756,10 @@ mod tests { Some(expected_upper_exclusive.clone()), ); - let expected_lower_exclusive = RangeQuery { + let expected_lower_excl_upper_incl = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Excluded((time2 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time3 * S_TO_NS).into()), }; let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1757,10 +1773,22 @@ mod tests { remove_timestamp_test_case( &search_request, &split, - Some(expected_lower_exclusive.clone()), + Some(expected_lower_excl_upper_incl.clone()), ); + } + + #[test] + fn test_remove_timestamp_range_multiple_bounds() { + // When bounds are defined both in the AST and in the search request, + // make sure we take the most restrictive ones. 
+ const S_TO_NS: i64 = 1_000_000_000; + let time1 = 1700001000; + let time2 = 1700002000; + let time3 = 1700003000; + let time4 = 1700004000; + + let timestamp_field = "timestamp".to_string(); - // we take the most restrictive bounds let split = SplitIdAndFooterOffsets { timestamp_start: Some(time1), timestamp_end: Some(time4), @@ -1803,10 +1831,10 @@ mod tests { }; remove_timestamp_test_case(&search_request, &split, Some(expected_upper_2_inc)); - let expected_lower_3 = RangeQuery { + let expected_lower_3_upper_4 = RangeQuery { field: timestamp_field.to_string(), lower_bound: Bound::Included((time3 * S_TO_NS).into()), - upper_bound: Bound::Unbounded, + upper_bound: Bound::Included((time4 * S_TO_NS).into()), }; let search_request = SearchRequest { @@ -1820,7 +1848,11 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3.clone())); + remove_timestamp_test_case( + &search_request, + &split, + Some(expected_lower_3_upper_4.clone()), + ); let search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::Range(RangeQuery { @@ -1833,7 +1865,7 @@ mod tests { end_timestamp: Some(time4 + 1), ..SearchRequest::default() }; - remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3)); + remove_timestamp_test_case(&search_request, &split, Some(expected_lower_3_upper_4)); let mut search_request = SearchRequest { query_ast: serde_json::to_string(&QueryAst::MatchAll).unwrap(), diff --git a/quickwit/quickwit-search/src/leaf_cache.rs b/quickwit/quickwit-search/src/leaf_cache.rs index abc756763ef..c93cd190c3c 100644 --- a/quickwit/quickwit-search/src/leaf_cache.rs +++ b/quickwit/quickwit-search/src/leaf_cache.rs @@ -85,6 +85,9 @@ struct CacheKey { /// The effective time range of the request, that is, the intersection of the timerange /// requested, and the timerange covered by the split. 
merged_time_range: HalfOpenRange, + /// The number of soft deleted documents in the split. + /// This assumes that the list of deleted docs is append only for a split. + soft_deleted_docs_len: usize, } impl CacheKey { @@ -106,6 +109,7 @@ impl CacheKey { split_id: split_info.split_id, request: search_request, merged_time_range, + soft_deleted_docs_len: split_info.soft_deleted_doc_ids.len(), } } } @@ -253,6 +257,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { @@ -262,6 +267,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let query_1 = SearchRequest { @@ -319,6 +325,7 @@ mod tests { timestamp_start: Some(100), timestamp_end: Some(199), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -327,6 +334,7 @@ mod tests { timestamp_start: Some(150), timestamp_end: Some(249), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_3 = SplitIdAndFooterOffsets { split_id: "split_3".to_string(), @@ -335,6 +343,7 @@ mod tests { timestamp_start: Some(150), timestamp_end: Some(249), num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let query_1 = SearchRequest { diff --git a/quickwit/quickwit-search/src/lib.rs b/quickwit/quickwit-search/src/lib.rs index 33a21664c3f..74266f42bf2 100644 --- a/quickwit/quickwit-search/src/lib.rs +++ b/quickwit/quickwit-search/src/lib.rs @@ -35,7 +35,10 @@ mod scroll_context; mod search_job_placer; mod search_response_rest; mod service; +mod soft_delete_query; +mod sort_repr; pub(crate) mod top_k_collector; +mod top_k_computer; mod metrics; mod search_permit_provider; @@ -172,6 +175,11 @@ fn extract_split_and_footer_offsets(split_metadata: &SplitMetadata) -> SplitIdAn .as_ref() .map(|time_range| *time_range.end()), num_docs: split_metadata.num_docs as u64, + soft_deleted_doc_ids: split_metadata + 
.soft_deleted_doc_ids + .iter() + .copied() + .collect(), } } diff --git a/quickwit/quickwit-search/src/list_fields_cache.rs b/quickwit/quickwit-search/src/list_fields_cache.rs index 681ce7a2e77..c940893b722 100644 --- a/quickwit/quickwit-search/src/list_fields_cache.rs +++ b/quickwit/quickwit-search/src/list_fields_cache.rs @@ -83,6 +83,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let split_2 = SplitIdAndFooterOffsets { @@ -92,6 +93,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let result = ListFieldsEntryResponse { diff --git a/quickwit/quickwit-search/src/retry/mod.rs b/quickwit/quickwit-search/src/retry/mod.rs index 996665717cf..a496159d76c 100644 --- a/quickwit/quickwit-search/src/retry/mod.rs +++ b/quickwit/quickwit-search/src/retry/mod.rs @@ -128,6 +128,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }; let client_for_retry = retry_client( &search_job_placer, diff --git a/quickwit/quickwit-search/src/retry/search.rs b/quickwit/quickwit-search/src/retry/search.rs index 696a352de94..7ae744c8625 100644 --- a/quickwit/quickwit-search/src/retry/search.rs +++ b/quickwit/quickwit-search/src/retry/search.rs @@ -93,6 +93,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, SplitIdAndFooterOffsets { split_id: "split_2".to_string(), @@ -101,6 +102,7 @@ mod tests { timestamp_start: None, timestamp_end: None, num_docs: 0, + soft_deleted_doc_ids: Vec::new(), }, ], }], diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index 246d3308636..370a6d442c7 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -161,12 +161,18 @@ pub struct IndexMetasForLeafSearch { pub(crate) type IndexesMetasForLeafSearch = HashMap; +/// Maps to `true` if the field 
mapping of all indexes is `datetime` for the +/// given sort field. Contains an entry for every sort field. Does not ensure +/// that the field is indeed a datetime in all splits (doc mapping might +/// have been updated). +type SortFieldsIsDatetime = HashMap; + #[derive(Debug)] struct RequestMetadata { timestamp_field_opt: Option, query_ast_resolved: QueryAst, indexes_meta_for_leaf_search: IndexesMetasForLeafSearch, - sort_fields_is_datetime: HashMap, + sort_fields_is_datetime: SortFieldsIsDatetime, } /// Validates request against each index's doc mapper and ensures that: @@ -189,11 +195,10 @@ fn validate_request_and_build_metadata( )?; let query_ast: QueryAst = serde_json::from_str(&search_request.query_ast) .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; - let mut indexes_meta_for_leaf_search: HashMap = - HashMap::new(); + let mut indexes_meta_for_leaf_search: IndexesMetasForLeafSearch = HashMap::new(); let mut query_ast_resolved_opt: Option = None; let mut timestamp_field_opt: Option = None; - let mut sort_fields_is_datetime: HashMap = HashMap::new(); + let mut sort_fields_is_datetime: SortFieldsIsDatetime = HashMap::new(); for index_metadata in indexes_metadata { let doc_mapper = build_doc_mapper( @@ -315,7 +320,7 @@ fn validate_secondary_time(index_metadata: &[IndexMetadata]) -> crate::Result, + sort_field_is_datetime: &mut SortFieldsIsDatetime, ) -> crate::Result<()> { for sort_field in sort_fields.iter() { if let Some(sort_field_entry) = get_sort_by_field_entry(&sort_field.field_name, schema)? { @@ -439,16 +444,10 @@ fn validate_sort_by_fields_and_search_after( } let mut search_after_sort_value_count = 0; - // TODO: we could validate if the search after sort value types of consistent with the sort - // field types. 
- if let Some(sort_by_value) = search_after_partial_hit.sort_value.as_ref() { - sort_by_value.sort_value.context("sort value must be set")?; + if search_after_partial_hit.sort_value.is_some() { search_after_sort_value_count += 1; } - if let Some(sort_by_value_2) = search_after_partial_hit.sort_value2.as_ref() { - sort_by_value_2 - .sort_value - .context("sort value must be set")?; + if search_after_partial_hit.sort_value2.is_some() { search_after_sort_value_count += 1; } if search_after_sort_value_count != sort_fields_without_doc_count { @@ -486,11 +485,6 @@ fn validate_sort_by_field_type( has_timestamp_format: bool, ) -> crate::Result<()> { let field_name = sort_by_field_entry.name(); - if matches!(sort_by_field_entry.field_type(), FieldType::Str(_)) { - return Err(SearchError::InvalidArgument(format!( - "sort by field on type text is currently not supported `{field_name}`" - ))); - } if !sort_by_field_entry.is_fast() { return Err(SearchError::InvalidArgument(format!( "sort by field must be a fast field, please add the fast property to your field \ @@ -710,7 +704,8 @@ pub fn get_count_from_metadata(split_metadatas: &[SplitMetadata]) -> Vec, query_ast_resolved: QueryAst, - sort_fields_is_datetime: HashMap, + sort_fields_is_datetime: SortFieldsIsDatetime, timestamp_field_opt: Option, secondary_timestamp_field_opt: Option, ) -> crate::Result> { @@ -1448,10 +1443,9 @@ pub async fn search_plan( /// Converts search after with datetime format to nanoseconds (representation in tantivy). /// If the sort field is a datetime field and no datetime format is set, the default format is /// milliseconds. -/// `sort_fields_are_datetime_opt` must be of the same length as `search_request.sort_fields`. 
fn convert_search_after_datetime_values( search_request: &mut SearchRequest, - sort_fields_is_datetime: &HashMap, + sort_fields_is_datetime: &SortFieldsIsDatetime, ) -> crate::Result<()> { for sort_field in search_request.sort_fields.iter_mut() { if *sort_fields_is_datetime @@ -1488,79 +1482,57 @@ fn convert_search_after_datetime_values( Ok(()) } -/// Convert sort values from input datetime format into nanoseconds. -/// The conversion is done only for U64 and I64 sort values, an error is returned for other types. +/// Converts a numerical sort value from the given input datetime format into a `Datetime` sort +/// value (nanoseconds, tantivy's internal datetime representation). +/// Only `U64` and `I64` sort values are accepted; an error is returned for other types. fn convert_sort_datetime_value_into_nanos( sort_value: &mut SortValue, input_format: SortDatetimeFormat, ) -> crate::Result<()> { - match sort_value { - SortValue::U64(value) => match input_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value = value.checked_mul(1_000_000).ok_or_else(|| { - SearchError::Internal(format!( - "sort value defined in milliseconds is too large and cannot be converted \ - into nanoseconds: {value}" - )) - })?; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing to do as the internal format is nanos. - } - }, - SortValue::I64(value) => match input_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value = value.checked_mul(1_000_000).ok_or_else(|| { - SearchError::Internal(format!( - "sort value defined in milliseconds is too large and cannot be converted \ - into nanoseconds: {value}" - )) - })?; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing to do as the internal format is nanos. - } - }, + // Normalise to i64, even though in theory the sort value should be parsed as i64 anyway. 
+ let raw: i64 = match sort_value { + SortValue::U64(value) => i64::try_from(*value).map_err(|_| { + SearchError::Internal(format!( + "sort value is too large to be represented as a datetime: {value}" + )) + })?, + SortValue::I64(value) => *value, _ => { return Err(SearchError::Internal(format!( - "datetime conversion are only support for u64 and i64 sort values, not \ + "datetime conversion is only supported for u64 and i64 sort values, not \ `{sort_value:?}`" ))); } - } + }; + let nanos: i64 = match input_format { + SortDatetimeFormat::UnixTimestampMillis => raw.checked_mul(1_000_000).ok_or_else(|| { + SearchError::Internal(format!( + "sort value defined in milliseconds is too large to be a timestamp: {raw}" + )) + })?, + SortDatetimeFormat::UnixTimestampNanos => raw, + }; + *sort_value = SortValue::Datetime(nanos); Ok(()) } -/// Convert sort values from nanoseconds to the requested output format. -/// The conversion is done only for U64 and I64 sort values, an error is returned for other types. -fn convert_sort_datetime_value( +/// Converts a `Datetime` sort value (nanoseconds, tantivy's internal representation) into the +/// requested output format, replacing the value in place. +/// +/// Only the `Datetime` variant is accepted; an error is returned for other types. +fn convert_sort_datetime_value_from_nanos( sort_value: &mut SortValue, output_format: SortDatetimeFormat, ) -> crate::Result<()> { - match sort_value { - SortValue::U64(value) => match output_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value /= 1_000_000; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing todo as the internal format is in nanos. - } - }, - SortValue::I64(value) => match output_format { - SortDatetimeFormat::UnixTimestampMillis => { - *value /= 1_000_000; - } - SortDatetimeFormat::UnixTimestampNanos => { - // Nothing todo as the internal format is in nanos. 
- } - }, - _ => { - return Err(SearchError::Internal(format!( - "datetime conversion are only support for u64 and i64 sort values, not \ - `{sort_value:?}`" - ))); - } - } + let SortValue::Datetime(nanos) = sort_value else { + return Err(SearchError::Internal(format!( + "datetime conversion is only supported for datetime sort values, not `{sort_value:?}`" + ))); + }; + *sort_value = match output_format { + SortDatetimeFormat::UnixTimestampMillis => SortValue::I64(*nanos / 1_000_000), + SortDatetimeFormat::UnixTimestampNanos => SortValue::I64(*nanos), + }; Ok(()) } @@ -2179,27 +2151,65 @@ mod tests { #[test] fn test_convert_sort_datetime_value() { - let mut sort_value = SortValue::U64(1617000000000000000); - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap(); - assert_eq!(sort_value, SortValue::U64(1617000000000)); - let mut sort_value = SortValue::I64(1617000000000000000); - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap(); + // millis output + let mut sort_value = SortValue::Datetime(1617000000000000000); + convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampMillis, + ) + .unwrap(); assert_eq!(sort_value, SortValue::I64(1617000000000)); - // conversion with float values should fail. + // nanos output + let mut sort_value = SortValue::Datetime(1617000000000000000); + convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampNanos, + ) + .unwrap(); + assert_eq!(sort_value, SortValue::I64(1617000000000000000)); + + // non-datetime values should fail. 
let mut sort_value = SortValue::F64(1617000000000000000.0); - let error = - convert_sort_datetime_value(&mut sort_value, SortDatetimeFormat::UnixTimestampMillis) - .unwrap_err(); + let error = convert_sort_datetime_value_from_nanos( + &mut sort_value, + SortDatetimeFormat::UnixTimestampMillis, + ) + .unwrap_err(); assert_eq!( error.to_string(), - "internal error: `datetime conversion are only support for u64 and i64 sort values, \ - not `F64(1.617e18)``" + "internal error: `datetime conversion is only supported for datetime sort values, not \ + `F64(1.617e18)``" ); } + #[test] + fn test_sort_datetime_value_roundtrip() { + use quickwit_proto::search::SortByValue; + let nanos: i64 = 1617000000000000000; + + for format in [ + SortDatetimeFormat::UnixTimestampMillis, + SortDatetimeFormat::UnixTimestampNanos, + ] { + let mut sort_value = SortValue::Datetime(nanos); + convert_sort_datetime_value_from_nanos(&mut sort_value, format).unwrap(); + + let json = SortByValue::from(sort_value).into_json(); + + let sort_by_value = SortByValue::try_from_json(json).unwrap(); + let mut sort_value = sort_by_value.sort_value.unwrap(); + + convert_sort_datetime_value_into_nanos(&mut sort_value, format).unwrap(); + + assert_eq!( + sort_value, + SortValue::Datetime(nanos), + "roundtrip failed for format {format:?}" + ); + } + } + #[test] fn test_convert_sort_datetime_value_into_nanos() { let mut sort_value = SortValue::U64(1617000000000); @@ -2208,39 +2218,29 @@ mod tests { SortDatetimeFormat::UnixTimestampMillis, ) .unwrap(); - assert_eq!(sort_value, SortValue::U64(1617000000000000000)); + assert_eq!(sort_value, SortValue::Datetime(1617000000000000000)); let mut sort_value = SortValue::I64(1617000000000); convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap(); - assert_eq!(sort_value, SortValue::I64(1617000000000000000)); + assert_eq!(sort_value, SortValue::Datetime(1617000000000000000)); // conversion with a too large millisecond 
value should fail. let mut sort_value = SortValue::I64(1617000000000000); - let error = convert_sort_datetime_value_into_nanos( + convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap_err(); - assert_eq!( - error.to_string(), - "internal error: `sort value defined in milliseconds is too large and cannot be \ - converted into nanoseconds: 1617000000000000`" - ); // conversion with float values should fail. let mut sort_value = SortValue::F64(1617000000000000.0); - let error = convert_sort_datetime_value_into_nanos( + convert_sort_datetime_value_into_nanos( &mut sort_value, SortDatetimeFormat::UnixTimestampMillis, ) .unwrap_err(); - assert_eq!( - error.to_string(), - "internal error: `datetime conversion are only support for u64 and i64 sort values, \ - not `F64(1617000000000000.0)``" - ); } #[test] @@ -2411,7 +2411,7 @@ mod tests { let timestamp_field = schema_builder.add_date_field("timestamp", FAST); let id_field = schema_builder.add_u64_field("id", FAST); let no_fast_field = schema_builder.add_u64_field("no_fast", STORED); - let text_field = schema_builder.add_text_field("text", STORED); + let text_field = schema_builder.add_text_field("text", FAST); let schema = schema_builder.build(); { let sort_by_field_entry = schema.get_field_entry(timestamp_field); @@ -2439,11 +2439,7 @@ mod tests { } { let sort_by_field_entry = schema.get_field_entry(text_field); - let error = validate_sort_by_field_type(sort_by_field_entry, true).unwrap_err(); - assert_eq!( - error.to_string(), - "Invalid argument: sort by field on type text is currently not supported `text`" - ); + validate_sort_by_field_type(sort_by_field_entry, false).unwrap(); } } @@ -2987,9 +2983,9 @@ mod tests { query_ast: qast_json_helper("test", &["body"]), max_hits: 10, sort_fields: vec![SortField { - field_name: "response_date".to_string(), + field_name: "response_time".to_string(), sort_order: SortOrder::Asc.into(), - sort_datetime_format: 
Some(SortDatetimeFormat::UnixTimestampNanos as i32), + ..Default::default() }], ..Default::default() }; @@ -3169,9 +3165,9 @@ mod tests { query_ast: qast_json_helper("test", &["body"]), max_hits: 10, sort_fields: vec![SortField { - field_name: "response_date".to_string(), + field_name: "response_time".to_string(), sort_order: SortOrder::Desc.into(), - sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampNanos as i32), + ..Default::default() }], ..Default::default() }; diff --git a/quickwit/quickwit-search/src/soft_delete_query.rs b/quickwit/quickwit-search/src/soft_delete_query.rs new file mode 100644 index 00000000000..8283523359d --- /dev/null +++ b/quickwit/quickwit-search/src/soft_delete_query.rs @@ -0,0 +1,321 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt; +use std::sync::Arc; + +use tantivy::query::{EnableScoring, Exclude, Explanation, Query, QueryClone, Scorer, Weight}; +use tantivy::{DocId, DocSet, Score, SegmentReader, TERMINATED}; + +/// A [`DocSet`] backed by a sorted, deduplicated vector of doc IDs. +/// +/// Used as the excluding [`DocSet`] argument passed to [`Exclude`] when +/// constructing a scorer inside [`SoftDeleteWeight`]. +/// +/// # Invariant +/// +/// The underlying slice must be sorted in strictly ascending order and free of +/// duplicates. This is guaranteed by [`SoftDeleteQuery::new`], which sorts and +/// deduplicates the input before storing it. 
+struct SortedDocIdSet { + doc_ids: Arc>, + /// Index of the current document inside `doc_ids`. + cursor: usize, +} + +impl SortedDocIdSet { + fn new(doc_ids: Arc>) -> Self { + SortedDocIdSet { doc_ids, cursor: 0 } + } +} + +impl DocSet for SortedDocIdSet { + #[inline] + fn advance(&mut self) -> DocId { + self.cursor += 1; + self.doc() + } + + fn seek(&mut self, target: DocId) -> DocId { + // The DocSet contract guarantees seek() is always called with a + // non-decreasing target, so we only need to scan forward from cursor. + let remaining = self.doc_ids.get(self.cursor..).unwrap_or(&[]); + let offset = remaining.partition_point(|&id| id < target); + self.cursor += offset; + self.doc() + } + + #[inline] + fn doc(&self) -> DocId { + self.doc_ids.get(self.cursor).copied().unwrap_or(TERMINATED) + } + + fn size_hint(&self) -> u32 { + self.doc_ids.len().saturating_sub(self.cursor) as u32 + } +} + +/// [`Weight`] produced by [`SoftDeleteQuery`]. +/// +/// Wraps the inner weight's scorer with [`Exclude`] to filter out +/// soft-deleted doc IDs transparently across all collection paths. +struct SoftDeleteWeight { + inner: Box, + deleted_doc_ids: Arc>, +} + +impl Weight for SoftDeleteWeight { + fn scorer(&self, reader: &SegmentReader, boost: Score) -> tantivy::Result> { + let inner_scorer = self.inner.scorer(reader, boost)?; + let excluded = SortedDocIdSet::new(Arc::clone(&self.deleted_doc_ids)); + Ok(Box::new(Exclude::new(inner_scorer, excluded))) + } + + fn explain(&self, reader: &SegmentReader, doc: DocId) -> tantivy::Result { + self.inner.explain(reader, doc) + } +} + +/// A tantivy [`Query`] that wraps another query and excludes a fixed set of +/// soft-deleted doc IDs from every result set it produces. +pub(crate) struct SoftDeleteQuery { + inner: Box, + /// Sorted, deduplicated tantivy doc IDs to exclude. + deleted_doc_ids: Arc>, +} + +impl SoftDeleteQuery { + /// Creates a new [`SoftDeleteQuery`]. 
+ /// + /// `deleted_doc_ids` may be supplied in any order and may contain + /// duplicates; this constructor sorts and deduplicates the input. + pub(crate) fn new(inner: Box, mut deleted_doc_ids: Vec) -> Self { + deleted_doc_ids.sort_unstable(); + deleted_doc_ids.dedup(); + SoftDeleteQuery { + inner, + deleted_doc_ids: Arc::new(deleted_doc_ids), + } + } +} + +impl Clone for SoftDeleteQuery { + fn clone(&self) -> Self { + SoftDeleteQuery { + inner: self.inner.box_clone(), + deleted_doc_ids: Arc::clone(&self.deleted_doc_ids), + } + } +} + +impl fmt::Debug for SoftDeleteQuery { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("SoftDeleteQuery") + .field("inner", &self.inner) + .field("num_deleted", &self.deleted_doc_ids.len()) + .finish() + } +} + +impl Query for SoftDeleteQuery { + fn weight(&self, enable_scoring: EnableScoring<'_>) -> tantivy::Result> { + let inner_weight = self.inner.weight(enable_scoring)?; + Ok(Box::new(SoftDeleteWeight { + inner: inner_weight, + deleted_doc_ids: Arc::clone(&self.deleted_doc_ids), + })) + } + + fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a tantivy::Term, bool)) { + self.inner.query_terms(visitor); + } +} + +#[cfg(test)] +mod tests { + use tantivy::collector::Count; + use tantivy::query::AllQuery; + use tantivy::schema::{Schema, TEXT}; + use tantivy::{Index, IndexWriter}; + + use super::*; + + /// Creates a single-segment, in-RAM index containing `num_docs` documents. + /// + /// Returns `(index, reader)`. The tantivy doc IDs are 0-based and + /// contiguous inside the single segment, so doc ID `k` corresponds to the + /// (k+1)-th inserted document. 
+ fn make_index(num_docs: usize) -> tantivy::Result<(Index, tantivy::IndexReader)> { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut writer: IndexWriter = index.writer(15_000_000)?; + for i in 0..num_docs { + writer.add_document(tantivy::doc!(text_field => format!("doc {i}")))?; + } + writer.commit()?; + let reader = index.reader()?; + Ok((index, reader)) + } + + // ── SortedDocIdSet unit tests ───────────────────────────────────────────── + + #[test] + fn test_sorted_doc_id_set_advance_through_all() { + let ids = Arc::new(vec![2u32, 5, 8, 11]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.doc(), 2); + assert_eq!(ds.advance(), 5); + assert_eq!(ds.advance(), 8); + assert_eq!(ds.advance(), 11); + // Advancing past the last element returns TERMINATED via unwrap_or. + assert_eq!(ds.advance(), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + // Subsequent advances keep returning TERMINATED: cursor increments past + // doc_ids.len(), get() returns None, unwrap_or yields TERMINATED. + assert_eq!(ds.advance(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_empty() { + let mut ds = SortedDocIdSet::new(Arc::new(vec![])); + assert_eq!(ds.doc(), TERMINATED); + assert_eq!(ds.advance(), TERMINATED); + assert_eq!(ds.seek(0), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_exact_hit() { + let ids = Arc::new(vec![1u32, 3, 7, 10, 15]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(7), 7); + assert_eq!(ds.doc(), 7); + + // Seeking to the same target is idempotent. + assert_eq!(ds.seek(7), 7); + + assert_eq!(ds.seek(10), 10); + assert_eq!(ds.doc(), 10); + } + + #[test] + fn test_sorted_doc_id_set_seek_between_entries() { + let ids = Arc::new(vec![1u32, 3, 7, 10, 15]); + let mut ds = SortedDocIdSet::new(ids); + + // Target falls between 3 and 7 → should return 7. 
+ assert_eq!(ds.seek(4), 7); + assert_eq!(ds.doc(), 7); + + // Target falls between 10 and 15 → should return 15. + assert_eq!(ds.seek(11), 15); + assert_eq!(ds.doc(), 15); + } + + #[test] + fn test_sorted_doc_id_set_seek_past_last_entry() { + let ids = Arc::new(vec![1u32, 3, 7]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(100), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_terminated_sentinel() { + let ids = Arc::new(vec![1u32, 3, 7]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.seek(TERMINATED), TERMINATED); + assert_eq!(ds.doc(), TERMINATED); + } + + #[test] + fn test_sorted_doc_id_set_seek_before_current_position() { + // After advancing past the start, seeking to the current doc must not + // go backwards. + let ids = Arc::new(vec![1u32, 5, 9]); + let mut ds = SortedDocIdSet::new(ids); + + ds.advance(); // cursor → 5 + // Seeking to 5 (= current) must keep returning 5. + assert_eq!(ds.seek(5), 5); + assert_eq!(ds.doc(), 5); + } + + #[test] + fn test_sorted_doc_id_set_size_hint_decrements() { + let ids = Arc::new(vec![1u32, 3, 7, 10]); + let mut ds = SortedDocIdSet::new(ids); + + assert_eq!(ds.size_hint(), 4); + ds.advance(); + assert_eq!(ds.size_hint(), 3); + ds.advance(); + ds.advance(); + ds.advance(); // now TERMINATED + assert_eq!(ds.size_hint(), 0); + } + + #[test] + fn test_soft_delete_query_no_deleted_docs() -> tantivy::Result<()> { + let (_index, reader) = make_index(5)?; + let searcher = reader.searcher(); + + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![]); + assert_eq!(searcher.search(&query, &Count)?, 5); + Ok(()) + } + + #[test] + fn test_soft_delete_query_excludes_subset() -> tantivy::Result<()> { + let (_index, reader) = make_index(5)?; + let searcher = reader.searcher(); + + // Delete doc IDs 1 and 3; 0, 2, 4 should remain. 
+ let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![1, 3]); + assert_eq!(searcher.search(&query, &Count)?, 3); + Ok(()) + } + + #[test] + fn test_soft_delete_query_excludes_all_docs() -> tantivy::Result<()> { + let (_index, reader) = make_index(3)?; + let searcher = reader.searcher(); + + let query = SoftDeleteQuery::new(Box::new(AllQuery), vec![0, 1, 2]); + assert_eq!(searcher.search(&query, &Count)?, 0); + Ok(()) + } + + #[test] + fn test_soft_delete_query_count_method_matches_search() -> tantivy::Result<()> { + let (_index, reader) = make_index(10)?; + let searcher = reader.searcher(); + + // Delete every even doc ID. + let deleted: Vec = (0..10).filter(|x| x % 2 == 0).collect(); + let query = SoftDeleteQuery::new(Box::new(AllQuery), deleted); + + let count_via_search = searcher.search(&query, &Count)?; + let count_via_method = query.count(&searcher)?; + + assert_eq!(count_via_search, 5); + assert_eq!(count_via_method, 5); + Ok(()) + } +} diff --git a/quickwit/quickwit-search/src/sort_repr.rs b/quickwit/quickwit-search/src/sort_repr.rs new file mode 100644 index 00000000000..940e97366c9 --- /dev/null +++ b/quickwit/quickwit-search/src/sort_repr.rs @@ -0,0 +1,409 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::fmt::Debug; +use std::ops::Not; + +use quickwit_proto::search::SortOrder; +use tantivy::DocId; + +use crate::top_k_computer::MinValue; + +/// A u64 that can be elided to unit type to save memory. +pub(crate) trait ElidableU64: Ord + Copy + Debug + MinValue { + fn value(self) -> u64; + fn from_u64(value: u64) -> Self; + fn is_elided() -> bool; +} + +impl MinValue for u64 { + fn min_value() -> Self { + 0 + } +} + +impl MinValue for () { + fn min_value() -> Self {} +} + +impl ElidableU64 for u64 { + fn from_u64(value: u64) -> Self { + value + } + fn value(self) -> u64 { + self + } + fn is_elided() -> bool { + false + } +} + +impl ElidableU64 for () { + fn from_u64(_value: u64) -> Self {} + fn value(self) -> u64 { + 0 + } + fn is_elided() -> bool { + true + } +} + +/// Encoded representation of the value, the index of its accessor in the list +/// of fast field columns and the sort order. +/// +/// The first u8 encodes the index of the accessor and a sentinel value for +/// missing and search after values: +/// - 0 is a sentinel for skip all +/// - 1 is a sentinel for missing (always last in the sort order) +/// - other odd values encode the index of the accessor in the list of fast field columns (3 for +/// index 0, 5 for index 1, etc.) +/// - even values are sentinels for search after values that keep/skip all documents for a given +/// column (2 to skip all columns but keep missing, 4 only keeps column 0, 6 keeps column 0 and 1, +/// etc.) +/// +/// The following u64 encodes the value itself or its bitwise negation to +/// reverse the sort order when building an ascending sort (keeping in mind that +/// this is fed to a top-k calculator). +#[derive(Clone, Copy)] +pub(crate) struct InternalValueRepr(u8, V); + +/// Inverts the sort order by reversing the bits. +/// +/// Using the bitwise negation is a cheap way to reverse the order while +/// maintaining the type (and memory footprint). 
It is also reversible
+/// (`not(not(value)) == value`) which makes it simply decodable.
+///
+/// This wrapper is just an alias to make the code more readable. Using `!value`
+/// or `value.not()` inline yields the same result.
+#[inline]
+fn reverse<T: Not<Output = T>>(value: T) -> T {
+    value.not()
+}
+
+impl<V: ElidableU64> InternalValueRepr<V> {
+    #[inline]
+    pub fn new(value: u64, accessor_idx: u8, order: SortOrder) -> Self {
+        // For Asc, smaller values should win: invert so smaller maps to larger repr
+        match order {
+            SortOrder::Asc => Self(reverse(accessor_idx * 2 + 3), V::from_u64(reverse(value))),
+            SortOrder::Desc => Self(accessor_idx * 2 + 3, V::from_u64(value)),
+        }
+    }
+    /// A sentinel value that can be instantiated as search after boundary to indicate
+    /// that all documents should be kept.
+    pub fn new_keep_column(accessor_idx: u8, order: SortOrder) -> Self {
+        match order {
+            SortOrder::Asc => Self(reverse(accessor_idx * 2 + 2), V::from_u64(0)),
+            SortOrder::Desc => Self(accessor_idx * 2 + 4, V::from_u64(0)),
+        }
+    }
+    #[inline]
+    pub fn new_missing() -> Self {
+        // Missing always last in topk, so use the smallest possible value
+        // (besides the skip_all value)
+        Self(1, V::from_u64(0))
+    }
+    /// A sentinel value that can be instantiated as search after boundary to indicate
+    /// that all documents should be skipped for the given column.
+    pub fn new_skip_column(accessor_idx: u8, order: SortOrder) -> Self {
+        match order {
+            SortOrder::Asc => Self(reverse(accessor_idx * 2 + 4), V::from_u64(0)),
+            SortOrder::Desc => Self(accessor_idx * 2 + 2, V::from_u64(0)),
+        }
+    }
+    /// A sentinel value that can be instantiated as search after boundary to indicate
+    /// that all documents should be skipped.
+    pub fn new_skip_all_but_missing() -> Self {
+        Self(2, V::from_u64(0))
+    }
+    #[inline]
+    pub fn decode(self, order: SortOrder) -> Option<(u8, u64)> {
+        if self.0 == 1 {
+            return None;
+        }
+        debug_assert_eq!(
+            match order {
+                SortOrder::Asc => reverse(self.0),
+                SortOrder::Desc => self.0,
+            } % 2,
+            1,
+            "sentinel indexes are not meant to be decoded"
+        );
+        match order {
+            SortOrder::Asc => Some(((reverse(self.0) - 3) / 2, reverse(V::value(self.1)))),
+            SortOrder::Desc => Some(((self.0 - 3) / 2, V::value(self.1))),
+        }
+    }
+}
+
+/// Ordered representation of the sort values. It is the concatenation of:
+/// - the first two (u8, u64) pairs contain the internal representation of the sort values
+/// - the second sort value's internal representation
+/// - the doc id, preceded by a sentinel indicating how it should be used for tie-breaking
+///
+/// ElidableU64 is used instead of u64 for sort values to reduce the size of the
+/// representation when they are not used. The associated sentinels could also
+/// be elided, but in practice they don't have an impact on the tuple's size
+/// because the doc id and its sentinel (u8, u32) gets padded anyway.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Default, Hash)]
+pub(crate) struct InternalSortValueRepr<V1, V2>(u8, V1, u8, V2, u8, u32);
+
+impl<V1: ElidableU64, V2: ElidableU64> InternalSortValueRepr<V1, V2> {
+    #[inline]
+    pub fn new(
+        sort_1: InternalValueRepr<V1>,
+        sort_2: InternalValueRepr<V2>,
+        doc_id: DocId,
+        doc_id_sort: SortOrder,
+    ) -> Self {
+        // For Asc, smaller values should win: invert so smaller maps to larger repr
+        match doc_id_sort {
+            SortOrder::Asc => Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 1, reverse(doc_id)),
+            SortOrder::Desc => Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 1, doc_id),
+        }
+    }
+    pub fn new_keep_doc_ids(sort_1: InternalValueRepr<V1>, sort_2: InternalValueRepr<V2>) -> Self {
+        Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 2, 0)
+    }
+    pub fn new_skip_doc_ids(sort_1: InternalValueRepr<V1>, sort_2: InternalValueRepr<V2>) -> Self {
+        Self(sort_1.0, sort_1.1, sort_2.0, sort_2.1, 0, 0)
+    }
+    #[inline]
+    pub fn sort_1(self) -> InternalValueRepr<V1> {
+        InternalValueRepr(self.0, self.1)
+    }
+    #[inline]
+    pub fn sort_2(self) -> InternalValueRepr<V2> {
+        InternalValueRepr(self.2, self.3)
+    }
+    #[inline]
+    pub fn doc_id(self, order: SortOrder) -> DocId {
+        debug_assert_eq!(self.4, 1, "doc id sentinel is not meant to be decoded");
+        match order {
+            SortOrder::Asc => reverse(self.5),
+            SortOrder::Desc => self.5,
+        }
+    }
+    pub fn is_skip_all(&self) -> bool {
+        *self <= Self(1, V1::min_value(), 1, V2::min_value(), 1, 0)
+    }
+}
+
+impl<V1: ElidableU64, V2: ElidableU64> MinValue for InternalSortValueRepr<V1, V2> {
+    fn min_value() -> Self {
+        Self(0, V1::min_value(), 0, V2::min_value(), 1, 0)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_internal_sort_value_repr_ordering_values() {
+        // Primary sort (Desc v1=10) dominates over secondary (Desc v2=100) and doc_id.
+ let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Desc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs > rhs, "primary sort must dominate, desc"); + + // Same values but Asc, the order is reversed + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Asc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs < rhs, "primary sort must dominate, asc"); + + // Secondary sort (Desc v2) breaks a tie on the primary field. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(5, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs > rhs, "secondary sort must break primary tie, desc"); + + // Same values but Asc, the order is reversed. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(10, 0, SortOrder::Asc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new(5, 0, SortOrder::Asc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "secondary sort must break primary tie, asc"); + + // Doc-id Desc tiebreaker: higher doc_id wins. 
+ let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 5, + SortOrder::Desc, + ); + assert!(lhs > rhs, "Desc: higher doc_id must win tiebreaker"); + + // Doc-id Asc tiebreaker: lower doc_id wins. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 5, + SortOrder::Asc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(10, 0, SortOrder::Desc), + InternalValueRepr::::new_missing(), + 10, + SortOrder::Asc, + ); + assert!(lhs > rhs, "Asc: lower doc_id must win tiebreaker"); + + // Missing values are always smaller + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new_missing(), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "missing values are always smaller, desc"); + + // Same but Asc, missing is still smaller. + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new_missing(), + InternalValueRepr::::new(10, 0, SortOrder::Desc), + 10, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 0, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + assert!(lhs < rhs, "missing values are always smaller, asc"); + } + + #[test] + fn test_internal_sort_value_repr_ordering_sentinels() { + // Doc-id sentinel ordering: skip_doc_ids < normal_doc_id < keep_doc_ids. 
+ let s1 = InternalValueRepr::::new(10, 0, SortOrder::Desc); + let s2 = InternalValueRepr::::new_missing(); + let skip_docs = InternalSortValueRepr::new_skip_doc_ids(s1, s2); + let keep_docs = InternalSortValueRepr::new_keep_doc_ids(s1, s2); + let normal_doc_desc = InternalSortValueRepr::new(s1, s2, 0, SortOrder::Desc); + let normal_doc_asc = InternalSortValueRepr::new(s1, s2, 0, SortOrder::Asc); + assert!( + skip_docs < normal_doc_desc, + "skip_doc_ids must be below normal" + ); + assert!( + normal_doc_desc < keep_docs, + "normal must be below keep_doc_ids" + ); + assert!( + skip_docs < normal_doc_asc, + "skip_doc_ids must be below normal" + ); + assert!( + normal_doc_asc < keep_docs, + "normal must be below keep_doc_ids" + ); + } + + #[test] + fn test_internal_sort_value_repr_ordering_types() { + // Primary accessor ordering dominates all the rest + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 1, SortOrder::Desc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(15, 0, SortOrder::Desc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs > rhs, "primary type sort must dominate, desc"); + + // Same values but Asc, the order is reversed + let lhs = InternalSortValueRepr::new( + InternalValueRepr::::new(5, 1, SortOrder::Asc), + InternalValueRepr::::new(0, 0, SortOrder::Desc), + 0, + SortOrder::Desc, + ); + let rhs = InternalSortValueRepr::new( + InternalValueRepr::::new(15, 0, SortOrder::Asc), + InternalValueRepr::::new(100, 0, SortOrder::Desc), + 999, + SortOrder::Desc, + ); + assert!(lhs < rhs, "primary type sort must dominate, asc"); + } + + #[test] + fn test_memory_footprint() { + // Make sure that the memory representation is efficiently packed. 
For + // instance refactoring to: + // ``` + // struct InternalSortValueRepr(InternalValueRepr,InternalValueRepr,u64) + // ``` + // would cause InternalSortValueRepr to jump to 40 bytes. + + assert_eq!(std::mem::size_of::>(), 24); + assert_eq!(std::mem::size_of::>(), 16); + assert_eq!(std::mem::size_of::>(), 8); + } +} diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index dc6dfe9f9cd..d14f3e9ee10 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -14,6 +14,7 @@ use std::cmp::Ordering; use std::collections::{BTreeMap, BTreeSet}; +use std::vec; use assert_json_diff::{assert_json_eq, assert_json_include}; use quickwit_config::SearcherConfig; @@ -22,8 +23,8 @@ use quickwit_doc_mapper::tag_pruning::extract_tags_from_query; use quickwit_indexing::TestSandbox; use quickwit_opentelemetry::otlp::TraceId; use quickwit_proto::search::{ - LeafListTermsResponse, ListTermsRequest, SearchRequest, SortByValue, SortField, SortOrder, - SortValue, + LeafListTermsResponse, ListTermsRequest, PartialHit, SearchRequest, SortByValue, + SortDatetimeFormat, SortField, SortOrder, SortValue, }; use quickwit_query::query_ast::{ QueryAst, qast_helper, qast_json_helper, query_ast_from_user_text, @@ -371,7 +372,8 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await?; + .await + .unwrap(); assert_eq!(single_node_response.num_hits, 10); assert_eq!(single_node_response.hits.len(), 10); assert!(&single_node_response.hits[0].json.contains("t:19")); @@ -395,7 +397,8 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await?; + .await + .unwrap(); assert_eq!(single_node_response.num_hits, 19); assert_eq!(single_node_response.hits.len(), 19); assert!(&single_node_response.hits[0].json.contains("t:19")); @@ -890,7 +893,7 @@ async fn test_sort_by_2_field() { } 
#[tokio::test] -async fn test_single_node_invalid_sorting_with_query() { +async fn test_sort_by_text() { let index_id = "single-node-invalid-sorting"; let doc_mapping_yaml = r#" field_mappings: @@ -906,7 +909,7 @@ async fn test_single_node_invalid_sorting_with_query() { let mut docs = Vec::new(); for i in 0..30 { - let description = format!("city info-{}", i + 1); + let description = format!("city info-{:02}", i + 1); docs.push(json!({"description": description, "ts": i+1, "temperature": i+32})); } test_sandbox.add_documents(docs).await.unwrap(); @@ -927,13 +930,19 @@ async fn test_single_node_invalid_sorting_with_query() { test_sandbox.metastore(), test_sandbox.storage_resolver(), ) - .await; - assert!(single_node_response.is_err()); - let error_msg = single_node_response.unwrap_err().to_string(); - assert_eq!( - error_msg, - "Invalid argument: sort by field on type text is currently not supported `description`" - ); + .await + .unwrap(); + + assert_eq!(single_node_response.num_hits, 30); + assert_eq!(single_node_response.hits.len(), 15); + assert!(single_node_response.hits.windows(2).all(|hits| { + let hit0: JsonValue = serde_json::from_str(&hits[0].json).unwrap(); + let hit1: JsonValue = serde_json::from_str(&hits[1].json).unwrap(); + hit0["description"].as_str().unwrap() >= hit1["description"].as_str().unwrap() + })); + assert!(single_node_response.hits[0].json.contains("city info-30")); + assert!(single_node_response.hits[14].json.contains("city info-16")); + test_sandbox.assert_quit().await; } @@ -1887,3 +1896,630 @@ fn test_global_doc_address_ser_deser() { let doc_address_deser: GlobalDocAddress = doc_address_string.parse().unwrap(); assert_eq!(doc_address_deser, doc_address); } + +#[tokio::test] +async fn test_single_node_soft_delete_excludes_from_search() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + 
let index_id = "test-soft-delete-search"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Search all — should find 3 + let search_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["title"]), + max_hits: 10, + ..Default::default() + }; + let result = single_node_search( + search_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + + // Search for "alpha" specifically to find its doc_id and split_id + let alpha_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }; + let alpha_result = single_node_search( + alpha_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let alpha_hit = &alpha_result.hits[0]; + let partial_hit = alpha_hit.partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete that document via the metastore + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? + .deserialize_index_metadata()? 
+ .index_uid; + + let metastore = test_sandbox.metastore(); + metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id: split_id.clone(), + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Search all again — should find only 2 + let result = single_node_search( + search_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + + // Verify that the soft-deleted document ("alpha") is not in the results + for hit in &result.hits { + let hit_json: JsonValue = serde_json::from_str(&hit.json)?; + assert_ne!(hit_json["title"], "alpha"); + } + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_single_node_soft_delete_count_only() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + let index_id = "test-soft-delete-count-only"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Count-only search (max_hits: 0) — should find 3 + let count_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["title"]), + max_hits: 0, + ..Default::default() + }; + let result = single_node_search( + count_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + assert!(result.hits.is_empty()); + + // Find the doc_id for "alpha" so we can soft-delete it + let alpha_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: 
qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }; + let alpha_result = single_node_search( + alpha_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let partial_hit = alpha_result.hits[0].partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete that document via the metastore + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? + .deserialize_index_metadata()? + .index_uid; + + let metastore = test_sandbox.metastore(); + metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id, + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Count-only search again — should find only 2 + let result = single_node_search( + count_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + assert!(result.hits.is_empty()); + + test_sandbox.assert_quit().await; + Ok(()) +} + +/// Regression test: the `is_count_only` path (non-MatchAll query with max_hits=0) was calling +/// `query.count(&searcher)` which bypasses Quickwit's soft-delete filter entirely. +/// MatchAll + max_hits=0 goes through `is_metadata_count_request_with_ast` (already correct); +/// this test specifically exercises the `is_count_only` branch with a real term query. 
+#[tokio::test] +async fn test_single_node_soft_delete_count_only_term_query() -> anyhow::Result<()> { + use quickwit_metastore::IndexMetadataResponseExt; + use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, SoftDeleteDocumentsRequest, SplitDocIds, + }; + + let index_id = "test-soft-delete-count-only-term-query"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["title"]).await?; + let docs = vec![ + json!({"title": "alpha"}), + json!({"title": "beta"}), + json!({"title": "gamma"}), + ]; + test_sandbox.add_documents(docs).await?; + + // Use a non-MatchAll query so that the `is_count_only` branch is taken instead of + // `is_metadata_count_request_with_ast`. "alpha OR beta OR gamma" matches all 3 docs + // but is not `QueryAst::MatchAll`. + let count_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha OR beta OR gamma", &["title"]), + max_hits: 0, + ..Default::default() + }; + let result = single_node_search( + count_request.clone(), + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 3); + assert!(result.hits.is_empty()); + + // Locate the doc_id for "alpha" so we can soft-delete it. + let alpha_result = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("alpha", &["title"]), + max_hits: 10, + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(alpha_result.num_hits, 1); + let partial_hit = alpha_result.hits[0].partial_hit.as_ref().unwrap(); + let split_id = partial_hit.split_id.clone(); + let doc_id = partial_hit.doc_id; + + // Soft-delete the "alpha" document. + let index_uid = test_sandbox + .metastore() + .index_metadata(IndexMetadataRequest::for_index_id(index_id.to_string())) + .await? 
+ .deserialize_index_metadata()? + .index_uid; + test_sandbox + .metastore() + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids: vec![SplitDocIds { + split_id, + doc_ids: vec![doc_id], + }], + }) + .await?; + + // Count-only term query: before the fix this returned 3 (soft-deleted doc was counted); + // after the fix it must return 2. + let result = single_node_search( + count_request, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + assert_eq!(result.num_hits, 2); + assert!(result.hits.is_empty()); + + test_sandbox.assert_quit().await; + Ok(()) +} + +/// Tests that when sorting by a datetime field with `sort_datetime_format` set to millis: +/// 1. The sort values returned in `partial_hit` are in milliseconds (not nanoseconds). +/// 2. Those values can be fed back as `search_after` to retrieve the next page correctly. +#[tokio::test] +async fn test_sort_by_datetime_format_millis_and_search_after() -> anyhow::Result<()> { + let index_id = "sort-datetime-millis-search-after"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + - name: body + type: text + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + // Index 10 documents with timestamps 100_000_000_000 .. 100_000_009_000 ms since epoch. 
+ let base_secs: i64 = 100_000_000; + let docs: Vec<_> = (0..10) + .map(|i| json!({"ts": base_secs + i, "body": format!("doc {i}")})) + .collect(); + test_sandbox.add_documents(docs).await?; + + let sort_field = SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Desc as i32, + sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampMillis as i32), + }; + + // Page 1: top 5 hits sorted by ts desc with millis output + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field.clone()], + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 10); + assert_eq!(page1.hits.len(), 5); + + // Verify sort values are in milliseconds (not nanoseconds) + let expected_millis: Vec = (5..10).rev().map(|i| (base_secs + i) * 1_000).collect(); + let actual_millis: Vec = page1 + .hits + .iter() + .map(|hit| { + let partial_hit = hit.partial_hit.as_ref().unwrap(); + match &partial_hit.sort_value.as_ref().unwrap().sort_value { + Some(SortValue::I64(ms)) => *ms, + other => panic!("expected I64 sort value in millis, got {other:?}"), + } + }) + .collect(); + assert_eq!(actual_millis, expected_millis); + + // Page 2: use the last hit's sort value as search_after + let last_hit = page1.hits.last().unwrap().partial_hit.as_ref().unwrap(); + let search_after = PartialHit { + sort_value: last_hit.sort_value.clone(), + sort_value2: None, + split_id: String::new(), + segment_ord: 0, + doc_id: 0, + }; + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field], + search_after: Some(search_after), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + 
assert_eq!(page2.hits.len(), 5); + // Page 2 should contain docs with timestamps base_secs+4 down to base_secs+0 in millis + let expected_millis_page2: Vec = (0..5).rev().map(|i| (base_secs + i) * 1_000).collect(); + let actual_millis_page2: Vec = page2 + .hits + .iter() + .map(|hit| { + let partial_hit = hit.partial_hit.as_ref().unwrap(); + match &partial_hit.sort_value.as_ref().unwrap().sort_value { + Some(SortValue::I64(ms)) => *ms, + other => panic!("expected I64 sort value in millis, got {other:?}"), + } + }) + .collect(); + assert_eq!(actual_millis_page2, expected_millis_page2); + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_sort_by_dynamic_with_datetime_page_fails() -> anyhow::Result<()> { + let index_id = "sort-dynamic-datetime-page-fails"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + mode: dynamic + dynamic_mapping: + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + let docs = [ + json!({"ts": 100_000_001, "my_dynamic_field": 2024}), + json!({"ts": 100_000_002, "my_dynamic_field": "2024-03-30T00:00:00Z"}), + json!({"ts": 100_000_001, "my_dynamic_field": 2025}), + json!({"ts": 100_000_002, "my_dynamic_field": "2025-03-30T00:00:00Z"}), + json!({"ts": 100_000_001, "my_dynamic_field": 2026}), + json!({"ts": 100_000_002, "my_dynamic_field": "2026-03-30T00:00:00Z"}), + ]; + test_sandbox.add_documents(docs).await?; + + let sort_field = SortField { + field_name: "my_dynamic_field".to_string(), + sort_order: SortOrder::Desc as i32, + ..Default::default() + }; + + // Page 1: sort should work even on a dynamic field with a datetime column + // values for the first page + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field.clone()], + ..Default::default() + }, + 
test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 6); + assert_eq!(page1.hits.len(), 5); + + // Verify sort values are in milliseconds (not nanoseconds) + let page_1_sort_values: Vec<_> = page1 + .hits + .iter() + .map(|hit| { + &hit.partial_hit + .as_ref() + .unwrap() + .sort_value + .as_ref() + .unwrap() + .sort_value + }) + .collect(); + assert_eq!( + page_1_sort_values, + vec![ + &Some(SortValue::Datetime(1774828800000000000)), + &Some(SortValue::Datetime(1743292800000000000)), + &Some(SortValue::Datetime(1711756800000000000)), + &Some(SortValue::I64(2026)), + &Some(SortValue::I64(2025)), + ] + ); + + // Page 2: search after not yet supported + let last_hit = page1.hits.last().unwrap().partial_hit.as_ref().unwrap(); + let search_after = PartialHit { + sort_value: last_hit.sort_value.clone(), + sort_value2: None, + split_id: String::new(), + segment_ord: 0, + doc_id: 0, + }; + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: vec![sort_field], + search_after: Some(search_after), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await + .unwrap(); + + assert_eq!(page2.failed_splits.len(), 1); + assert_eq!(page2.hits.len(), 0); + + test_sandbox.assert_quit().await; + Ok(()) +} + +#[tokio::test] +async fn test_sort_by_two_fields_with_null() -> anyhow::Result<()> { + let index_id = "sort-datetime-millis-search-after"; + let doc_mapping_yaml = r#" + field_mappings: + - name: ts + type: datetime + fast: true + - name: body + type: text + fast: true + timestamp_field: ts + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "{}", &["body"]).await?; + + // timestamps with 10 digits should be interpreted as secs + let docs: Vec<_> = vec![ + json!({"ts": 1_000_000_001i64, "body": format!("doc 9")}), + json!({"ts": 
1_000_000_002i64, "body": format!("doc 8")}), + json!({"ts": 1_000_000_003i64, "body": format!("doc 7")}), + json!({"ts": 1_000_000_004i64}), + json!({"ts": 1_000_000_005i64}), + json!({"ts": 1_000_000_006i64}), + ]; + test_sandbox.add_documents(docs).await?; + + let sort_fields = vec![ + SortField { + field_name: "body".to_string(), + sort_order: SortOrder::Asc as i32, + ..Default::default() + }, + SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Asc as i32, + sort_datetime_format: Some(SortDatetimeFormat::UnixTimestampMillis as i32), + }, + ]; + + let page1 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: sort_fields.clone(), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await?; + + assert_eq!(page1.num_hits, 6); + assert_eq!(page1.hits.len(), 5); + let page_1_hits = page1 + .hits + .iter() + .map(|hit| hit.partial_hit.clone().unwrap()) + .collect::>(); + let split_id = page_1_hits[0].split_id.clone(); + // for the timestamp field we convert to sort_datetime_format repr as I64 + assert_eq!( + page_1_hits, + vec![ + PartialHit { + sort_value: Some(SortValue::Str("doc 7".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_003_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 2, + }, + PartialHit { + sort_value: Some(SortValue::Str("doc 8".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_002_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 1, + }, + PartialHit { + sort_value: Some(SortValue::Str("doc 9".to_string()).into()), + sort_value2: Some(SortValue::I64(1_000_000_001_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 0, + }, + PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_004_000).into()), + split_id: 
split_id.clone(), + segment_ord: 0, + doc_id: 3, + }, + PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_005_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 4, + }, + ] + ); + + let page2 = single_node_search( + SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: qast_json_helper("*", &["body"]), + max_hits: 5, + sort_fields: sort_fields.clone(), + search_after: Some(page_1_hits[4].clone()), + ..Default::default() + }, + test_sandbox.metastore(), + test_sandbox.storage_resolver(), + ) + .await + .unwrap(); + + assert_eq!(page2.num_hits, 6); + assert_eq!(page2.hits.len(), 1); + let page_2_hits = page2 + .hits + .iter() + .map(|hit| hit.partial_hit.clone().unwrap()) + .collect::>(); + let split_id = page_2_hits[0].split_id.clone(); + // for the timestamp field we convert to sort_datetime_format repr as I64 + assert_eq!( + page_2_hits, + vec![PartialHit { + sort_value: Some(SortByValue { sort_value: None }), + sort_value2: Some(SortValue::I64(1_000_000_006_000).into()), + split_id: split_id.clone(), + segment_ord: 0, + doc_id: 5, + },] + ); + + test_sandbox.assert_quit().await; + Ok(()) +} diff --git a/quickwit/quickwit-search/src/top_k_collector.rs b/quickwit/quickwit-search/src/top_k_collector.rs index f36eb6370e2..3dc9f2bd6f2 100644 --- a/quickwit/quickwit-search/src/top_k_collector.rs +++ b/quickwit/quickwit-search/src/top_k_collector.rs @@ -12,862 +12,179 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::cmp::{Ordering, Reverse}; -use std::fmt::Debug; -use std::marker::PhantomData; +use std::cmp::Ordering; -use quickwit_common::binary_heap::TopK; -use quickwit_proto::search::{PartialHit, SortOrder}; +use quickwit_proto::search::PartialHit; use quickwit_proto::types::SplitId; -use tantivy::{DocId, Score}; +use tantivy::{DocId, Score, SegmentOrdinal}; -use crate::collector::{ - HitSortingMapper, SegmentPartialHit, SegmentPartialHitSortingKey, - SortingFieldExtractorComponent, SortingFieldExtractorPair, -}; +use crate::collector::SortingFieldExtractorPair; +use crate::sort_repr::{ElidableU64, InternalSortValueRepr}; +use crate::top_k_computer::TopKComputer; -pub trait QuickwitSegmentTopKCollector { - fn collect_top_k_block(&mut self, docs: &[DocId]); - fn collect_top_k(&mut self, doc_id: DocId, score: Score); - fn get_top_k(&self) -> Vec; -} - -trait IntoOptionU64 { - #[inline] - fn is_unit_type() -> bool { - false - } - fn into_option_u64(self) -> Option; - fn from_option_u64(value: Option) -> Self; -} -trait MinValue { - fn min_value() -> Self; -} - -impl IntoOptionU64 for Option { - #[inline] - fn into_option_u64(self) -> Option { - self - } - #[inline] - fn from_option_u64(value: Option) -> Self { - value - } -} - -impl MinValue for Option { - #[inline] - fn min_value() -> Self { - None - } -} - -impl IntoOptionU64 for Option> { - #[inline] - fn into_option_u64(self) -> Option { - self.map(|el| el.0) - } - #[inline] - fn from_option_u64(value: Option) -> Self { - value.map(Reverse) - } -} -impl MinValue for Option> { - #[inline] - fn min_value() -> Self { - None - } -} - -impl IntoOptionU64 for () { - #[inline] - fn is_unit_type() -> bool { - true - } - #[inline] - fn into_option_u64(self) -> Option { - None - } - #[inline] - fn from_option_u64(_: Option) -> Self {} -} -impl MinValue for () { - #[inline] - fn min_value() -> Self {} -} - -/// Generic hit struct for top k collector. -/// V1 and V2 are the types of the two values to sort by. 
-/// They are either Option or _statically_ disabled via unit type. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -struct Hit { - doc_id: DocId, - value1: V1, - value2: V2, -} - -impl MinValue for Hit -where - V1: MinValue, - V2: MinValue, -{ - #[inline] - fn min_value() -> Self { - let doc_id = if REVERSE_DOCID { - DocId::MAX - } else { - DocId::MIN - }; - Hit { - doc_id, - value1: V1::min_value(), - value2: V2::min_value(), - } - } -} - -impl std::fmt::Display for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Hit(doc_id: {}, value1: {:?}, value2: {:?})", - self.doc_id, self.value1, self.value2 - ) - } -} - -impl Ord for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, -{ - #[inline] - fn cmp(&self, other: &Self) -> Ordering { - let order = self.value1.cmp(&other.value1); - order - .then_with(|| self.value2.cmp(&other.value2)) - .then_with(|| { - if REVERSE_DOCID { - other.doc_id.cmp(&self.doc_id) - } else { - self.doc_id.cmp(&other.doc_id) - } - }) - } -} - -impl PartialOrd for Hit -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + Debug + MinValue, -{ - #[inline] - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> Hit -{ - #[inline] - fn into_segment_partial_hit(self) -> SegmentPartialHit { - SegmentPartialHit { - sort_value: self.value1.into_option_u64(), - sort_value2: self.value2.into_option_u64(), - doc_id: self.doc_id, - } - } -} - -pub fn specialized_top_k_segment_collector( +pub struct 
QuickwitSegmentTopKCollectorTemplate { split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - search_after_option: Option, - order1: SortOrder, - order2: SortOrder, -) -> Box { - // TODO: Add support for search_after to the specialized collector. - // Eventually we may want to remove the generic collector to reduce complexity. - if search_after_option.is_some() || score_extractor.is_score() { - return Box::new(GenericQuickwitSegmentTopKCollector::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - search_after_option, - order1, - order2, - )); - } - - let sort_first_by_ff = score_extractor.first.is_fast_field(); - let sort_second_by_ff = score_extractor - .second - .as_ref() - .map(|extr| extr.is_fast_field()) - .unwrap_or(false); - - #[derive(Debug)] - enum SortType { - DocId, - OneFFSort, - TwoFFSorts, - } - let sort_type = match (sort_first_by_ff, sort_second_by_ff) { - (false, false) => SortType::DocId, - (true, false) => SortType::OneFFSort, - (true, true) => SortType::TwoFFSorts, - (false, true) => panic!("Internal error: Got second sort, but no first sort"), - }; - // only check order1 for OneFFSort and DocId, as it's the only sort - // - // REVERSE_DOCID is only used for SortType::DocId and SortType::OneFFSort - match (sort_type, order1, order2) { - (SortType::DocId, SortOrder::Desc, _) => { - Box::new(SpecializedSegmentTopKCollector::<(), (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - )) - } - (SortType::DocId, SortOrder::Asc, _) => { - Box::new(SpecializedSegmentTopKCollector::<(), (), true>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - )) - } - (SortType::OneFFSort, SortOrder::Asc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - (), - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::OneFFSort, SortOrder::Desc, SortOrder::Asc) => Box::new( - 
SpecializedSegmentTopKCollector::, (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - ), - ), - (SortType::OneFFSort, SortOrder::Asc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - (), - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::OneFFSort, SortOrder::Desc, SortOrder::Desc) => Box::new( - SpecializedSegmentTopKCollector::, (), false>::new( - split_id, - score_extractor, - leaf_max_hits, - segment_ord, - ), - ), - (SortType::TwoFFSorts, SortOrder::Asc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - Option>, - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Asc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option>, - Option, - true, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Desc, SortOrder::Asc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option, - Option>, - false, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - (SortType::TwoFFSorts, SortOrder::Desc, SortOrder::Desc) => { - Box::new(SpecializedSegmentTopKCollector::< - Option, - Option, - false, - >::new( - split_id, score_extractor, leaf_max_hits, segment_ord - )) - } - } -} - -/// Fast Top K Computation -/// -/// The buffer is truncated to the top_n elements when it reaches the capacity of the Vec. -/// That means capacity has special meaning and should be carried over when cloning or serializing. -/// -/// For TopK == 0, it will be relative expensive. 
-struct TopKComputer { - /// Reverses sort order to get top-semantics instead of bottom-semantics - buffer: Vec>, - top_n: usize, - pub(crate) threshold: D, -} - -// Custom clone to keep capacity -impl Clone for TopKComputer { - fn clone(&self) -> Self { - let mut buffer_clone = Vec::with_capacity(self.buffer.capacity()); - buffer_clone.extend(self.buffer.iter().cloned()); - - TopKComputer { - buffer: buffer_clone, - top_n: self.top_n, - threshold: self.threshold.clone(), - } - } -} - -impl TopKComputer -where D: Ord + Copy + Debug + MinValue -{ - /// Create a new `TopKComputer`. - pub fn new(top_n: usize) -> Self { - // Vec cap can't be 0, since it would panic in push - let vec_cap = top_n.max(1) * 10; - TopKComputer { - buffer: Vec::with_capacity(vec_cap), - top_n, - threshold: D::min_value(), - } + // We track the segment ordinal here, but splits only have 1 segment so this + // should always be 0. + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k_hits: TopKComputer>, + search_after_opt: Option>, +} + +impl QuickwitSegmentTopKCollectorTemplate { + pub(crate) fn collect_top_k_block(&mut self, docs: &[DocId]) { + let search_after_opt = self.search_after_opt; + let top_k_hits = &mut self.top_k_hits; + self.hit_fetcher + .project_to_internal_sort_value_block(docs, |repr| { + if let Some(search_after) = search_after_opt + && repr.cmp(&search_after) != Ordering::Less + { + return; + } + top_k_hits.push(repr); + }); } - /// Push a new document to the top n. - /// If the document is below the current threshold, it will be ignored. 
- #[inline] - pub fn push(&mut self, doc: D) { - if doc < self.threshold { + pub(crate) fn collect_top_k(&mut self, doc_id: DocId, score: Score) { + let internal_repr = self + .hit_fetcher + .project_to_internal_sort_value(doc_id, score); + if let Some(search_after) = self.search_after_opt + && internal_repr.cmp(&search_after) != Ordering::Less + { return; } - if self.buffer.len() == self.buffer.capacity() { - let median = self.truncate_top_n(); - self.threshold = median; - } - - // This is faster since it avoids the buffer resizing to be inlined from vec.push() - // (this is in the hot path) - // TODO: Replace with `push_within_capacity` when it's stabilized - let uninit = self.buffer.spare_capacity_mut(); - // This cannot panic, because we truncate_median will at least remove one element, since - // the min capacity is larger than 2. - uninit[0].write(Reverse(doc)); - // This is safe because it would panic in the line above - unsafe { - self.buffer.set_len(self.buffer.len() + 1); - } + self.top_k_hits.push(internal_repr); } - #[inline(never)] - fn truncate_top_n(&mut self) -> D { - // Use select_nth_unstable to find the top nth score - let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n); - - let median_score = *median_el; - // Remove all elements below the top_n - self.buffer.truncate(self.top_n); - - median_score.0 - } - - /// Returns the top n elements in sorted order. - pub fn into_sorted_vec(mut self) -> Vec { - if self.buffer.len() > self.top_n { - self.truncate_top_n(); - } - self.buffer.sort_unstable(); - self.buffer.into_iter().map(|el| el.0).collect() - } - - /// Returns the top n elements in stored order. - /// Useful if you do not need the elements in sorted order, - /// for example when merging the results of multiple segments. 
- #[allow(dead_code)] - pub fn into_vec(mut self) -> Vec { - if self.buffer.len() > self.top_n { - self.truncate_top_n(); - } - self.buffer.into_iter().map(|el| el.0).collect() - } -} - -pub use tantivy::COLLECT_BLOCK_BUFFER_LEN; -struct SpecSortingFieldExtractor { - _phantom: std::marker::PhantomData<(V1, V2)>, - sort_values1: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - sort_values2: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - - pub first: SortingFieldExtractorComponent, - pub second: Option, -} - -impl< - V1: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64 + Debug, - V2: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64 + Debug, -> SpecSortingFieldExtractor -{ - fn new( - first: SortingFieldExtractorComponent, - second: Option, - ) -> Self { - Self { - _phantom: PhantomData, - sort_values1: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - sort_values2: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - first, - second, - } - } - /// Fetches the sort values for the given docs. - /// Does noting when sorting by docid. 
- fn fetch_data(&mut self, docs: &[DocId]) { - self.first - .extract_typed_sort_values_block(docs, &mut self.sort_values1[..docs.len()]); - if let Some(second) = self.second.as_ref() { - second.extract_typed_sort_values_block(docs, &mut self.sort_values2[..docs.len()]); - } - } - #[inline] - fn iter_hits<'a, const REVERSE_DOCID: bool>( - &'a self, - docs: &'a [DocId], - ) -> impl Iterator> + 'a { - SpecSortingFieldIter::::new( - docs, - &self.sort_values1, - &self.sort_values2, - ) - } -} - -struct SpecSortingFieldIter<'a, V1, V2, const REVERSE_DOCID: bool> { - docs: std::slice::Iter<'a, DocId>, - sort_values1: std::slice::Iter<'a, Option>, - sort_values2: std::slice::Iter<'a, Option>, - _phantom: PhantomData<(V1, V2)>, -} - -impl<'a, V1, V2, const REVERSE_DOCID: bool> SpecSortingFieldIter<'a, V1, V2, REVERSE_DOCID> -where - V1: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64, - V2: Copy + PartialEq + PartialOrd + Ord + IntoOptionU64, -{ - #[inline] - pub fn new( - docs: &'a [DocId], - sort_values1: &'a [Option; COLLECT_BLOCK_BUFFER_LEN], - sort_values2: &'a [Option; COLLECT_BLOCK_BUFFER_LEN], - ) -> Self { - Self { - docs: docs.iter(), - sort_values1: sort_values1.iter(), - sort_values2: sort_values2.iter(), - _phantom: PhantomData, - } - } -} - -impl Iterator for SpecSortingFieldIter<'_, V1, V2, REVERSE_DOCID> -where - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug, -{ - type Item = Hit; - - #[inline] - fn next(&mut self) -> Option { - let doc_id = *self.docs.next()?; - - let value1 = if !V1::is_unit_type() { - V1::from_option_u64(*self.sort_values1.next()?) - } else { - V1::from_option_u64(None) - }; - - let value2 = if !V2::is_unit_type() { - V2::from_option_u64(*self.sort_values2.next()?) 
- } else { - V2::from_option_u64(None) - }; - - Some(Hit { - doc_id, - value1, - value2, - }) - } -} - -/// No search after handling -/// Quickwit collector working at the scale of the segment. -struct SpecializedSegmentTopKCollector< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> { - split_id: SplitId, - hit_fetcher: SpecSortingFieldExtractor, - top_k_hits: TopKComputer>, - segment_ord: u32, -} - -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue + 'static, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue + 'static, - const REVERSE_DOCID: bool, -> SpecializedSegmentTopKCollector -{ - pub fn new( - split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - ) -> Self { - let hit_fetcher = - SpecSortingFieldExtractor::new(score_extractor.first, score_extractor.second); - let top_k_hits = TopKComputer::new(leaf_max_hits); - Self { - split_id, - hit_fetcher, - top_k_hits, - segment_ord, - } - } -} -impl< - V1: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - V2: Copy + PartialEq + Eq + PartialOrd + Ord + IntoOptionU64 + Debug + MinValue, - const REVERSE_DOCID: bool, -> QuickwitSegmentTopKCollector for SpecializedSegmentTopKCollector -{ - fn collect_top_k_block(&mut self, docs: &[DocId]) { - self.hit_fetcher.fetch_data(docs); - let iter = self.hit_fetcher.iter_hits::(docs); - for doc_id in iter { - self.top_k_hits.push(doc_id); - } - } - - #[inline] - fn collect_top_k(&mut self, _doc_id: DocId, _score: Score) { - panic!("Internal Error: This collector does not support collect_top_k"); - } - - fn get_top_k(&self) -> Vec { + pub(crate) fn get_top_k(&self) -> tantivy::Result> { self.top_k_hits .clone() .into_sorted_vec() .into_iter() - .map(|el| 
el.into_segment_partial_hit()) - .map(|segment_partial_hit: SegmentPartialHit| { - segment_partial_hit.into_partial_hit( - self.split_id.clone(), + .map(|internal_repr| { + self.hit_fetcher.internal_to_partial_hit( + &self.split_id, self.segment_ord, - &self.hit_fetcher.first, - &self.hit_fetcher.second, + internal_repr, ) }) .collect() } } -/// Quickwit collector working at the scale of the segment. -pub(crate) struct GenericQuickwitSegmentTopKCollector { - split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - // PartialHits in this heap don't contain a split_id yet. - top_k_hits: TopK, - segment_ord: u32, - search_after: Option, - // Precomputed order for search_after for split_id and segment_ord - precomp_search_after_order: Ordering, - sort_values1: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, - sort_values2: Box<[Option; COLLECT_BLOCK_BUFFER_LEN]>, +pub enum QuickwitSegmentTopKCollector { + DocIdSort(QuickwitSegmentTopKCollectorTemplate<(), ()>), + OneDimSort(QuickwitSegmentTopKCollectorTemplate), + TwoDimSort(QuickwitSegmentTopKCollectorTemplate), + Noop, } -impl GenericQuickwitSegmentTopKCollector { - pub fn new( +impl QuickwitSegmentTopKCollector { + pub fn new_with_doc_id_sort( split_id: SplitId, - score_extractor: SortingFieldExtractorPair, - leaf_max_hits: usize, - segment_ord: u32, - search_after_option: Option, - order1: SortOrder, - order2: SortOrder, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair<(), ()>, + top_k: usize, + search_after_opt: Option>, ) -> Self { - let sort_key_mapper = HitSortingMapper { order1, order2 }; - let precomp_search_after_order = match &search_after_option { - Some(search_after) if !search_after.split_id.is_empty() => order1 - .compare(&split_id, &search_after.split_id) - .then_with(|| order1.compare(&segment_ord, &search_after.segment_ord)), - // This value isn't actually used. 
- _ => Ordering::Equal, - }; - let search_after = - SearchAfterSegment::new(search_after_option, order1, order2, &score_extractor); - - GenericQuickwitSegmentTopKCollector { - split_id, - score_extractor, - top_k_hits: TopK::new(leaf_max_hits, sort_key_mapper), // Adjusted for context - segment_ord, - search_after, - precomp_search_after_order, - sort_values1: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), - sort_values2: vec![None; COLLECT_BLOCK_BUFFER_LEN] - .into_boxed_slice() - .try_into() - .unwrap(), + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop + } else { + QuickwitSegmentTopKCollector::DocIdSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) } } - #[inline] - /// Generic top k collection, that includes search_after handling - /// - /// Outside of the collector to circumvent lifetime issues. - fn collect_top_k_vals( - doc_id: DocId, - sort_value: Option, - sort_value2: Option, - search_after: &Option, - precomp_search_after_order: Ordering, - top_k_hits: &mut TopK, - ) { - if let Some(search_after) = &search_after { - let search_after_value1 = search_after.sort_value; - let search_after_value2 = search_after.sort_value2; - let orders = &top_k_hits.sort_key_mapper; - let mut cmp_result = orders - .order1 - .compare_opt(&sort_value, &search_after_value1) - .then_with(|| { - orders - .order2 - .compare_opt(&sort_value2, &search_after_value2) - }); - if search_after.compare_on_equal { - // TODO actually it's not first, it should be what's in _shard_doc then first then - // default - let order = orders.order1; - cmp_result = cmp_result - .then(precomp_search_after_order) - // We compare doc_id only if sort_value1, sort_value2, split_id and segment_ord - // are equal. 
- .then_with(|| order.compare(&doc_id, &search_after.doc_id)) - } - if cmp_result != Ordering::Less { - return; - } + pub fn new_with_one_dim_sort( + split_id: SplitId, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k: usize, + search_after_opt: Option>, + ) -> Self { + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop + } else { + QuickwitSegmentTopKCollector::OneDimSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) } - - let hit = SegmentPartialHit { - sort_value, - sort_value2, - doc_id, - }; - top_k_hits.add_entry(hit); } -} -impl QuickwitSegmentTopKCollector for GenericQuickwitSegmentTopKCollector { - fn collect_top_k_block(&mut self, docs: &[DocId]) { - self.score_extractor.extract_typed_sort_values( - docs, - &mut self.sort_values1[..], - &mut self.sort_values2[..], - ); - if self.search_after.is_some() { - // Search after not optimized for block collection yet - for ((doc_id, sort_value), sort_value2) in docs - .iter() - .cloned() - .zip(self.sort_values1.iter().cloned()) - .zip(self.sort_values2.iter().cloned()) - { - Self::collect_top_k_vals( - doc_id, - sort_value, - sort_value2, - &self.search_after, - self.precomp_search_after_order, - &mut self.top_k_hits, - ); - } + + pub fn new_with_two_dim_sort( + split_id: SplitId, + segment_ord: SegmentOrdinal, + hit_fetcher: SortingFieldExtractorPair, + top_k: usize, + search_after_opt: Option>, + ) -> Self { + if let Some(search_after) = &search_after_opt + && search_after.is_skip_all() + { + QuickwitSegmentTopKCollector::Noop } else { - // Probably would make sense to check the fence against e.g. sort_values1 earlier, - // before creating the SegmentPartialHit. - // - // Below are different versions to avoid iterating the caches if they are unused. - // - // No sort values loaded. Sort only by doc_id. 
- if !self.score_extractor.first.is_fast_field() { - for doc_id in docs.iter().cloned() { - let hit = SegmentPartialHit { - sort_value: None, - sort_value2: None, - doc_id, - }; - self.top_k_hits.add_entry(hit); - } - return; + QuickwitSegmentTopKCollector::TwoDimSort(QuickwitSegmentTopKCollectorTemplate { + split_id, + segment_ord, + top_k_hits: TopKComputer::new(top_k), + hit_fetcher, + search_after_opt, + }) + } + } + + pub(crate) fn collect_top_k_block(&mut self, docs: &[DocId]) { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => { + collector.collect_top_k_block(docs) } - let has_no_second_sort = !self - .score_extractor - .second - .as_ref() - .map(|extr| extr.is_fast_field()) - .unwrap_or(false); - // No second sort values => We can skip iterating the second sort values cache. - if has_no_second_sort { - for (doc_id, sort_value) in - docs.iter().cloned().zip(self.sort_values1.iter().cloned()) - { - let hit = SegmentPartialHit { - sort_value, - sort_value2: None, - doc_id, - }; - self.top_k_hits.add_entry(hit); - } - return; + QuickwitSegmentTopKCollector::OneDimSort(collector) => { + collector.collect_top_k_block(docs) } - - for ((doc_id, sort_value), sort_value2) in docs - .iter() - .cloned() - .zip(self.sort_values1.iter().cloned()) - .zip(self.sort_values2.iter().cloned()) - { - let hit = SegmentPartialHit { - sort_value, - sort_value2, - doc_id, - }; - self.top_k_hits.add_entry(hit); + QuickwitSegmentTopKCollector::TwoDimSort(collector) => { + collector.collect_top_k_block(docs) } + QuickwitSegmentTopKCollector::Noop => {} } } - #[inline] - fn collect_top_k(&mut self, doc_id: DocId, score: Score) { - let (sort_value, sort_value2): (Option, Option) = - self.score_extractor.extract_typed_sort_value(doc_id, score); - Self::collect_top_k_vals( - doc_id, - sort_value, - sort_value2, - &self.search_after, - self.precomp_search_after_order, - &mut self.top_k_hits, - ); - } - - fn get_top_k(&self) -> Vec { - self.top_k_hits - .clone() - 
.finalize() - .into_iter() - .map(|segment_partial_hit: SegmentPartialHit| { - segment_partial_hit.into_partial_hit( - self.split_id.clone(), - self.segment_ord, - &self.score_extractor.first, - &self.score_extractor.second, - ) - }) - .collect() - } -} - -/// Search After, but the sort values are converted to the u64 fast field representation. -pub(crate) struct SearchAfterSegment { - sort_value: Option, - sort_value2: Option, - compare_on_equal: bool, - doc_id: DocId, -} -impl SearchAfterSegment { - pub fn new( - search_after_opt: Option, - sort_order1: SortOrder, - sort_order2: SortOrder, - score_extractor: &SortingFieldExtractorPair, - ) -> Option { - let search_after = search_after_opt?; - let mut sort_value = None; - if let Some(search_after_sort_value) = search_after - .sort_value - .and_then(|sort_value| sort_value.sort_value) - { - if let Some(new_value) = score_extractor - .first - .convert_to_u64_ff_val(search_after_sort_value, sort_order1) - { - sort_value = Some(new_value); - } else { - // Value is out of bounds, we ignore sort_value2 and disable the whole - // search_after - return None; + pub(crate) fn collect_top_k(&mut self, doc_id: DocId, score: Score) { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => { + collector.collect_top_k(doc_id, score) } - } - let mut sort_value2 = None; - if let Some(search_after_sort_value) = search_after - .sort_value2 - .and_then(|sort_value2| sort_value2.sort_value) - { - let extractor = score_extractor - .second - .as_ref() - .expect("Internal error: Got sort_value2, but no sort extractor"); - if let Some(new_value) = - extractor.convert_to_u64_ff_val(search_after_sort_value, sort_order2) - { - sort_value2 = Some(new_value); + QuickwitSegmentTopKCollector::OneDimSort(collector) => { + collector.collect_top_k(doc_id, score) + } + QuickwitSegmentTopKCollector::TwoDimSort(collector) => { + collector.collect_top_k(doc_id, score) } + QuickwitSegmentTopKCollector::Noop => {} + } + } + + pub(crate) fn 
get_top_k(&self) -> tantivy::Result> { + match self { + QuickwitSegmentTopKCollector::DocIdSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::OneDimSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::TwoDimSort(collector) => collector.get_top_k(), + QuickwitSegmentTopKCollector::Noop => Ok(vec![]), } - Some(Self { - sort_value, - sort_value2, - compare_on_equal: !search_after.split_id.is_empty(), - doc_id: search_after.doc_id, - }) } } diff --git a/quickwit/quickwit-search/src/top_k_computer.rs b/quickwit/quickwit-search/src/top_k_computer.rs new file mode 100644 index 00000000000..8f6ff7c8d07 --- /dev/null +++ b/quickwit/quickwit-search/src/top_k_computer.rs @@ -0,0 +1,111 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::cmp::Reverse; +use std::fmt::Debug; + +pub(crate) trait MinValue { + fn min_value() -> Self; +} + +/// Fast Top K Computation +/// +/// The buffer is truncated to the top_n elements when it reaches the capacity of the Vec. +/// That means capacity has special meaning and should be carried over when cloning or serializing. +/// +/// For TopK == 0, it will be relative expensive. 
+pub(crate) struct TopKComputer { + /// Reverses sort order to get top-semantics instead of bottom-semantics + buffer: Vec>, + top_n: usize, + pub(crate) threshold: D, +} + +// Custom clone to keep capacity +impl Clone for TopKComputer { + fn clone(&self) -> Self { + let mut buffer_clone = Vec::with_capacity(self.buffer.capacity()); + buffer_clone.extend(self.buffer.iter().cloned()); + + TopKComputer { + buffer: buffer_clone, + top_n: self.top_n, + threshold: self.threshold.clone(), + } + } +} + +impl TopKComputer +where D: Ord + Copy + Debug + MinValue +{ + /// Create a new `TopKComputer`. + pub fn new(top_n: usize) -> Self { + let vec_cap = top_n.max(1) * 10; + TopKComputer { + buffer: Vec::with_capacity(vec_cap), + top_n, + threshold: D::min_value(), + } + } +} + +impl TopKComputer +where D: Ord + Copy + Debug +{ + /// Push a new document to the top n. + /// If the document is below the current threshold, it will be ignored. + #[inline] + pub fn push(&mut self, doc: D) { + if doc < self.threshold { + return; + } + if self.buffer.len() == self.buffer.capacity() { + let median = self.truncate_top_n(); + self.threshold = median; + } + + // This is faster since it avoids the buffer resizing to be inlined from vec.push() + // (this is in the hot path) + // TODO: Replace with `push_within_capacity` when it's stabilized + let uninit = self.buffer.spare_capacity_mut(); + // This cannot panic, because truncate_top_n will at least remove one element, since + // the min capacity is larger than 2. 
+ uninit[0].write(Reverse(doc)); + // This is safe because it would panic in the line above + unsafe { + self.buffer.set_len(self.buffer.len() + 1); + } + } + + #[inline(never)] + fn truncate_top_n(&mut self) -> D { + // Use select_nth_unstable to find the top nth score + let (_, median_el, _) = self.buffer.select_nth_unstable(self.top_n); + + let median_score = *median_el; + // Remove all elements below the top_n + self.buffer.truncate(self.top_n); + + median_score.0 + } + + /// Returns the top n elements in sorted order. + pub fn into_sorted_vec(mut self) -> Vec { + if self.buffer.len() > self.top_n { + self.truncate_top_n(); + } + self.buffer.sort_unstable(); + self.buffer.into_iter().map(|el| el.0).collect() + } +} diff --git a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs index a9649200e5b..0fc234ebd0c 100644 --- a/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/elasticsearch_api/rest_handler.rs @@ -778,16 +778,16 @@ fn convert_hit( .unwrap_or_else(|_| Source::from_string("{}".to_string()).unwrap()); let mut sort = Vec::new(); - if let Some(partial_hit) = hit.partial_hit { - if let Some(sort_value) = partial_hit.sort_value { - sort.push(sort_value.into_json()); + if let Some(partial_hit) = &hit.partial_hit { + if let Some(sort_value) = &partial_hit.sort_value { + sort.push(sort_value.clone().into_json()); } - if let Some(sort_value2) = partial_hit.sort_value2 { - sort.push(sort_value2.into_json()); + if let Some(sort_value2) = &partial_hit.sort_value2 { + sort.push(sort_value2.clone().into_json()); } if append_shard_doc { sort.push(serde_json::Value::String( - quickwit_search::GlobalDocAddress::from_partial_hit(&partial_hit).to_string(), + quickwit_search::GlobalDocAddress::from_partial_hit(partial_hit).to_string(), )); } } diff --git a/quickwit/quickwit-serve/src/grpc.rs b/quickwit/quickwit-serve/src/grpc.rs index 
27d370c38aa..351341af895 100644 --- a/quickwit/quickwit-serve/src/grpc.rs +++ b/quickwit/quickwit-serve/src/grpc.rs @@ -188,10 +188,11 @@ pub(crate) async fn start_grpc_server( let search_service = services.search_service.clone(); let grpc_search_service = GrpcSearchAdapter::from(search_service); + let max_message_size_bytes = grpc_config.max_search_message_size.0 as usize; Some( SearchServiceServer::new(grpc_search_service) - .max_decoding_message_size(grpc_config.max_message_size.0 as usize) - .max_encoding_message_size(grpc_config.max_message_size.0 as usize), + .max_decoding_message_size(max_message_size_bytes) + .max_encoding_message_size(max_message_size_bytes), ) } else { None diff --git a/quickwit/quickwit-serve/src/lib.rs b/quickwit/quickwit-serve/src/lib.rs index 9c7543e2e04..71fc69b0ad9 100644 --- a/quickwit/quickwit-serve/src/lib.rs +++ b/quickwit/quickwit-serve/src/lib.rs @@ -38,6 +38,7 @@ mod rest; mod rest_api_response; mod search_api; pub(crate) mod simple_list; +mod soft_delete_api; pub mod tcp_listener; mod template_api; mod ui_handler; @@ -1025,7 +1026,7 @@ async fn setup_searcher( ) .await?; let search_service_clone = search_service.clone(); - let max_message_size = node_config.grpc_config.max_message_size; + let max_message_size = node_config.grpc_config.max_search_message_size; let searcher_change_stream = cluster_change_stream.filter_map(move |cluster_change| { let search_service_clone = search_service_clone.clone(); Box::pin(async move { diff --git a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs index 1654a840dad..4ec47c15847 100644 --- a/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/otlp_api/rest_handler.rs @@ -25,7 +25,6 @@ use quickwit_proto::opentelemetry::proto::collector::trace::v1::{ use quickwit_proto::types::IndexId; use quickwit_proto::{ServiceError, ServiceErrorCode, tonic}; use serde::{self, Serialize}; -use tracing::error; use 
warp::{Filter, Rejection}; use crate::decompression::get_body_bytes; diff --git a/quickwit/quickwit-serve/src/rest.rs b/quickwit/quickwit-serve/src/rest.rs index 3f193783b04..ae33bb50a08 100644 --- a/quickwit/quickwit-serve/src/rest.rs +++ b/quickwit/quickwit-serve/src/rest.rs @@ -53,6 +53,7 @@ use crate::rest_api_response::{RestApiError, RestApiResponse}; use crate::search_api::{ search_get_handler, search_plan_get_handler, search_plan_post_handler, search_post_handler, }; +use crate::soft_delete_api::soft_delete_api_handlers; use crate::template_api::index_template_api_handlers; use crate::ui_handler::ui_handler; use crate::{BodyFormat, BuildInfo, QuickwitServices, RuntimeInfo}; @@ -339,6 +340,11 @@ fn api_v1_routes( quickwit_services.metastore_client.clone(), )) .boxed() + .or(soft_delete_api_handlers( + quickwit_services.search_service.clone(), + quickwit_services.metastore_client.clone(), + )) + .boxed() .or(jaeger_api_handlers( quickwit_services.jaeger_service_opt.clone(), )) diff --git a/quickwit/quickwit-serve/src/soft_delete_api/handler.rs b/quickwit/quickwit-serve/src/soft_delete_api/handler.rs new file mode 100644 index 00000000000..b7000237573 --- /dev/null +++ b/quickwit/quickwit-serve/src/soft_delete_api/handler.rs @@ -0,0 +1,373 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::sync::Arc; + +use itertools::Itertools; +use quickwit_metastore::IndexMetadataResponseExt; +use quickwit_proto::metastore::{ + IndexMetadataRequest, MetastoreService, MetastoreServiceClient, SoftDeleteDocumentsRequest, + SoftDeleteDocumentsResponse, SplitDocIds, +}; +use quickwit_proto::search::SearchRequest; +use quickwit_proto::types::IndexId; +use quickwit_query::query_ast::query_ast_from_user_text; +use quickwit_search::{SearchError, SearchService}; +use serde::{Deserialize, Serialize}; +use warp::{Filter, Rejection}; + +use crate::format::extract_format_from_qs; +use crate::rest::recover_fn; +use crate::rest_api_response::into_rest_api_response; +use crate::with_arg; + +const MAX_SOFT_DELETED_HITS: u64 = 100; + +#[allow(dead_code)] +#[derive(utoipa::OpenApi)] +#[openapi( + paths(post_soft_delete), + components(schemas(SoftDeleteRequest, SoftDeleteResponse)) +)] +pub struct SoftDeleteApi; + +/// Request body for the soft-delete endpoint. +#[derive(Deserialize, Debug, PartialEq, Eq, Default, utoipa::ToSchema)] +#[serde(deny_unknown_fields)] +pub struct SoftDeleteRequest { + /// Query text in Tantivy query language to match events to soft-delete. + pub query: String, + /// Maximum number of events to soft-delete in a single call (default: 100). + #[serde(default = "default_max_soft_deletes")] + pub max_hits: u64, + /// If set, restrict soft-delete to documents with a `timestamp >= start_timestamp`. + pub start_timestamp: Option, + /// If set, restrict soft-delete to documents with a `timestamp < end_timestamp`. + pub end_timestamp: Option, +} + +fn default_max_soft_deletes() -> u64 { + MAX_SOFT_DELETED_HITS +} + +/// Response from the soft-delete endpoint. +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, utoipa::ToSchema)] +pub struct SoftDeleteResponse { + /// Total number of doc_ids that were newly soft-deleted across all splits. + pub num_soft_deleted_doc_ids: u64, +} + +/// Top-level filter combining all soft-delete API handlers. 
+pub fn soft_delete_api_handlers( + search_service: Arc, + metastore: MetastoreServiceClient, +) -> impl Filter + Clone { + post_soft_delete_handler(search_service, metastore.clone()) + .recover(recover_fn) + .boxed() +} + +fn post_soft_delete_handler( + search_service: Arc, + metastore: MetastoreServiceClient, +) -> impl Filter + Clone { + warp::path!(String / "soft-delete") + .and(warp::body::json()) + .and(warp::post()) + .and(with_arg(search_service)) + .and(with_arg(metastore)) + .then(post_soft_delete) + .and(extract_format_from_qs()) + .map(into_rest_api_response) +} + +#[utoipa::path( + post, + tag = "Soft Delete", + path = "/{index_id}/soft-delete", + request_body = SoftDeleteRequest, + responses( + (status = 200, description = "Successfully soft-deleted documents.", body = SoftDeleteResponse) + ), + params( + ("index_id" = String, Path, description = "The index ID to soft-delete documents from."), + ) +)] +/// Soft Delete Documents +/// +/// Runs a search query to identify matching documents, then records their internal +/// doc IDs in the metastore so they are excluded from future search results. +pub async fn post_soft_delete( + index_id: IndexId, + request: SoftDeleteRequest, + search_service: Arc, + metastore: MetastoreServiceClient, +) -> Result { + // 1. Build a SearchRequest from the soft-delete query. 
+ // Validate the query and make sure it doesn't require default search fields + let query_ast = query_ast_from_user_text(&request.query, None); + query_ast.clone().parse_user_query(&[])?; + let query_ast_json = serde_json::to_string(&query_ast) + .map_err(|err| SearchError::Internal(format!("failed to serialize query AST: {err}")))?; + + // Enforce a hits limit that guarantees we won't delete + // more than MAX_SOFT_DELETED_HITS per split + let max_hits = if request.max_hits > MAX_SOFT_DELETED_HITS { + MAX_SOFT_DELETED_HITS + } else { + request.max_hits + }; + + let search_request = SearchRequest { + index_id_patterns: vec![index_id.to_string()], + query_ast: query_ast_json, + max_hits, + start_timestamp: request.start_timestamp, + end_timestamp: request.end_timestamp, + ..Default::default() + }; + + // 2. Execute root_search to get PartialHits (split_id, doc_id). + let search_response = search_service.root_search(search_request).await?; + + // 3. Group hits by split_id. + let split_doc_ids: Vec<SplitDocIds> = search_response + .hits + .iter() + .filter_map(|hit| hit.partial_hit.as_ref()) + .into_group_map_by(|ph| ph.split_id.clone()) + .into_iter() + .map(|(split_id, hits)| SplitDocIds { + split_id, + doc_ids: hits.into_iter().map(|h| h.doc_id).collect(), + }) + .collect(); + + if split_doc_ids.is_empty() { + return Ok(SoftDeleteResponse { + num_soft_deleted_doc_ids: 0, + }); + } + + // 4. Resolve index_uid. + let index_metadata_request = IndexMetadataRequest::for_index_id(index_id.to_string()); + let index_uid = metastore + .index_metadata(index_metadata_request) + .await + .map_err(|err| SearchError::Internal(format!("failed to fetch index metadata: {err}")))? + .deserialize_index_metadata() + .map_err(|err| { + SearchError::Internal(format!("failed to deserialize index metadata: {err}")) + })? + .index_uid; + + // 5. Store in metastore. 
+ let SoftDeleteDocumentsResponse { + num_soft_deleted_doc_ids, + } = metastore + .soft_delete_documents(SoftDeleteDocumentsRequest { + index_uid: Some(index_uid), + split_doc_ids, + }) + .await + .map_err(|err| SearchError::Internal(format!("failed to soft-delete documents: {err}")))?; + + Ok(SoftDeleteResponse { + num_soft_deleted_doc_ids, + }) +} + +#[cfg(test)] +mod tests { + use std::net::{Ipv4Addr, SocketAddr}; + + use quickwit_config::SearcherConfig; + use quickwit_indexing::TestSandbox; + use quickwit_search::{ClusterClient, SearchJobPlacer, SearchServiceImpl, SearcherPool}; + use warp::Filter; + + use super::*; + use crate::rest::recover_fn; + + /// Build a real `Arc` wired to the given `TestSandbox`. + async fn build_search_service(sandbox: &TestSandbox) -> Arc { + let socket_addr = SocketAddr::new(Ipv4Addr::new(127, 0, 0, 1).into(), 7280u16); + let searcher_pool = SearcherPool::default(); + let search_job_placer = SearchJobPlacer::new(searcher_pool.clone()); + let cluster_client = ClusterClient::new(search_job_placer); + let searcher_config = SearcherConfig::default(); + let searcher_context = + Arc::new(quickwit_search::SearcherContext::new(searcher_config, None)); + let search_service: Arc = Arc::new(SearchServiceImpl::new( + sandbox.metastore(), + sandbox.storage_resolver(), + cluster_client, + searcher_context, + )); + let search_service_client = + quickwit_search::SearchServiceClient::from_service(search_service.clone(), socket_addr); + searcher_pool.insert(socket_addr, search_service_client); + search_service + } + + #[tokio::test] + async fn test_soft_delete_api_post_no_matching_docs() { + let index_id = "test-soft-delete-rest"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + - name: body + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + let metastore = test_sandbox.metastore(); + let search_service = 
build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // POST a soft-delete query matching no docs → should get 0 + let resp = warp::test::request() + .path("/test-soft-delete-rest/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:nonexistent_term_xyz"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 0); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_with_matching_docs() { + let index_id = "test-soft-delete-match"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + + // Ingest some documents. + let docs = vec![ + serde_json::json!({"title": "apple"}), + serde_json::json!({"title": "banana"}), + serde_json::json!({"title": "cherry"}), + ]; + test_sandbox.add_documents(docs).await.unwrap(); + + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // Soft-delete documents matching "apple". 
+ let resp = warp::test::request() + .path("/test-soft-delete-match/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 1); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_idempotent() { + let index_id = "test-soft-delete-idempotent"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + + let docs = vec![serde_json::json!({"title": "apple"})]; + test_sandbox.add_documents(docs).await.unwrap(); + + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // First soft-delete. + let resp = warp::test::request() + .path("/test-soft-delete-idempotent/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 1); + + // Second soft-delete of same doc — the doc is already excluded from search + // results, so the search won't find it again, yielding 0 new deletions. 
+ let resp = warp::test::request() + .path("/test-soft-delete-idempotent/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple"}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 200); + let response: SoftDeleteResponse = serde_json::from_slice(resp.body()).unwrap(); + assert_eq!(response.num_soft_deleted_doc_ids, 0); + + test_sandbox.assert_quit().await; + } + + #[tokio::test] + async fn test_soft_delete_api_post_deny_unknown_fields() { + let index_id = "test-soft-delete-unknown"; + let doc_mapping_yaml = r#" + field_mappings: + - name: title + type: text + mode: lenient + "#; + let test_sandbox = TestSandbox::create(index_id, doc_mapping_yaml, "", &["title"]) + .await + .unwrap(); + let metastore = test_sandbox.metastore(); + let search_service = build_search_service(&test_sandbox).await; + let handler = soft_delete_api_handlers(search_service, metastore).recover(recover_fn); + + // POST with unknown field should fail. + let resp = warp::test::request() + .path("/test-soft-delete-unknown/soft-delete") + .method("POST") + .json(&true) + .body(r#"{"query": "title:apple", "unknown_field": true}"#) + .reply(&handler) + .await; + assert_eq!(resp.status(), 400); + + test_sandbox.assert_quit().await; + } +} diff --git a/quickwit/quickwit-serve/src/soft_delete_api/mod.rs b/quickwit/quickwit-serve/src/soft_delete_api/mod.rs new file mode 100644 index 00000000000..d72811748f5 --- /dev/null +++ b/quickwit/quickwit-serve/src/soft_delete_api/mod.rs @@ -0,0 +1,17 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +mod handler; + +pub use handler::soft_delete_api_handlers; diff --git a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs index ecce3c795da..77679c7fa39 100644 --- a/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs +++ b/quickwit/quickwit-storage/src/object_storage/s3_compatible_storage.rs @@ -17,10 +17,12 @@ use std::ops::Range; use std::path::{Path, PathBuf}; use std::pin::Pin; use std::task::{Context, Poll}; +use std::time::Duration; use std::{fmt, io}; use anyhow::{Context as AnyhhowContext, anyhow}; use async_trait::async_trait; +use aws_config::timeout::TimeoutConfig; use aws_credential_types::provider::SharedCredentialsProvider; use aws_sdk_s3::Client as S3Client; use aws_sdk_s3::config::{Credentials, Region}; @@ -145,7 +147,14 @@ pub async fn create_s3_client(s3_storage_config: &S3StorageConfig) -> S3Client { s3_config.set_retry_config(aws_config.retry_config().cloned()); s3_config.set_sleep_impl(aws_config.sleep_impl()); s3_config.set_stalled_stream_protection(aws_config.stalled_stream_protection()); - s3_config.set_timeout_config(aws_config.timeout_config().cloned()); + s3_config.set_timeout_config(Some( + TimeoutConfig::builder() + .connect_timeout(Duration::from_secs(5)) + .read_timeout(Duration::from_secs(10)) // Time to first byte + .operation_attempt_timeout(Duration::from_secs(900)) // Single attempt timeout + .operation_timeout(Duration::from_secs(1800)) // Total timeout + .build(), + )); if let Some(endpoint) = 
s3_storage_config.endpoint() { info!(endpoint=%endpoint, "using S3 endpoint defined in storage config or environment variable"); diff --git a/quickwit/quickwit-ui/src/components/IndexSummary.tsx b/quickwit/quickwit-ui/src/components/IndexSummary.tsx index c3eca2da261..7be3b8b01ee 100644 --- a/quickwit/quickwit-ui/src/components/IndexSummary.tsx +++ b/quickwit/quickwit-ui/src/components/IndexSummary.tsx @@ -13,7 +13,7 @@ // limitations under the License. import styled from "@emotion/styled"; -import { Paper } from "@mui/material"; +import { Alert, Paper } from "@mui/material"; import dayjs from "dayjs"; import utc from "dayjs/plugin/utc"; import { FC, ReactNode } from "react"; @@ -75,6 +75,12 @@ export function IndexSummary({ index }: { index: Index }) { return ( + {index.split_limit_reached && ( + + Split limit reached. Only the first 10,000 splits were retrieved. + The actual total may be higher. Statistics shown are incomplete. + + )} {dayjs .unix(index.metadata.create_timestamp) diff --git a/quickwit/quickwit-ui/src/services/client.ts b/quickwit/quickwit-ui/src/services/client.ts index cc7643b6687..95baaceed99 100644 --- a/quickwit/quickwit-ui/src/services/client.ts +++ b/quickwit/quickwit-ui/src/services/client.ts @@ -81,7 +81,8 @@ export class Client { ]); return { metadata: metadata, - splits: splits, + splits: splits[0], + split_limit_reached: splits[1], }; } @@ -89,14 +90,16 @@ export class Client { return this.fetch(`${this.apiRoot()}indexes/${indexId}`, {}); } - async getAllSplits(indexId: string): Promise> { + async getAllSplits( + indexId: string, + ): Promise<[Array, boolean]> { // TODO: restrieve all the splits. 
const results: { splits: Array } = await this.fetch( `${this.apiRoot()}indexes/${indexId}/splits?limit=10000`, {}, ); - return results["splits"]; + return [results["splits"], results["splits"].length === 10000]; } async listIndexes(): Promise> { diff --git a/quickwit/quickwit-ui/src/utils/models.ts b/quickwit/quickwit-ui/src/utils/models.ts index 67e77add3de..8abe8acc6e1 100644 --- a/quickwit/quickwit-ui/src/utils/models.ts +++ b/quickwit/quickwit-ui/src/utils/models.ts @@ -282,6 +282,7 @@ export type Range = { export type Index = { metadata: IndexMetadata; splits: SplitMetadata[]; + split_limit_reached: boolean; }; export type Cluster = { diff --git a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml index 40413bbfcec..755e8ae1db1 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml @@ -284,14 +284,16 @@ expected: response: values: - key: 85.0 - value: 100.49456770856702 + value: + $expect: 'abs(val - 100.4945) < 0.1' - doc_count: 2 key: 1422662400000.0 key_as_string: '2015-01-31T00:00:00Z' response: values: - key: 85.0 - value: 30.26717133872237 + value: + $expect: 'abs(val - 30.2617) < 0.1' --- # Test histogram method: [GET] @@ -353,12 +355,16 @@ json: field: "date" expected: aggregations: + # cardinality queries are currently being improved upstream unique_names: - value: 8.0 + value: + $expect: 'abs(val - 8) <= 2' unique_response: - value: 5.0 # TODO: Check. 
The correct number is 6 + value: + $expect: 'abs(val - 6) <= 2' unique_dates: - value: 6.0 + value: + $expect: 'abs(val - 6) <= 3' --- # Test extended stats aggregation method: [GET] diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml index bd24f4fb718..c34cd43d64a 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0018-search_after.yaml @@ -45,24 +45,6 @@ expected: hits: - sort: [9018] --- -# Test with a search after value as string -# Quickwit should convert it to the correct type -json: - size: 1 - query: - match_all: {} - sort: - - actor.id: - order: asc - search_after: ["5688"] -expected: - hits: - total: - value: 100 - relation: eq - hits: - - sort: [9018] ---- json: size: 1 query: @@ -93,21 +75,6 @@ expected: hits: $expect: "len(val) == 4" --- -# Quickwit should accept timestamp as string. 
-json: - size: 100 - track_total_hits: true - query: - match_all: {} - sort: - - created_at: - order: asc - search_after: ["1422748815000"] -expected: - hits: - hits: - $expect: "len(val) == 4" ---- json: size: 100 track_total_hits: true @@ -116,7 +83,7 @@ json: sort: - created_at: order: desc - search_after: ["1422748800001"] + search_after: [1422748800001] expected: hits: hits: diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml index dc9765b634e..7dae4d645da 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/0001_ts_range.yaml @@ -40,3 +40,9 @@ params: query: "auto_date:>=2023-05-25T00:00:00Z AND auto_date:<2023-05-26T00:00:00Z" expected: num_hits: 2 +--- +endpoint: millisec/search +params: + query: "ts:>=2022-12-16T10:00:57.000Z AND ts:<=2022-12-16T10:00:57.000Z" +expected: + num_hits: 1 \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml index b333ed3c86a..e410ecd96c0 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_setup.quickwit.yaml @@ -98,3 +98,31 @@ ndjson: - {"text_raw": "indexed with raw tokenizer dashes"} - {"text_fast": "fast-text-value-dashes"} - {"text_fast": "fast text value whitespaces"} +--- +method: DELETE +endpoint: indexes/millisec +status_code: null +--- +method: POST +endpoint: indexes/ +json: + version: "0.7" + index_id: millisec + doc_mapping: + timestamp_field: ts + mode: strict + field_mappings: + - name: ts + type: datetime + fast: true + input_formats: ["rfc3339"] + fast_precision: milliseconds +--- +method: POST +endpoint: millisec/ingest +params: + commit: force +ndjson: + - {"ts": "2022-12-16T10:00:56.297Z"} + - {"ts": "2022-12-16T10:00:57.000Z"} + - 
{"ts": "2022-12-16T10:00:57.297Z"} \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml index 56cd2bda8a9..ebfa1c4931b 100644 --- a/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/qw_search_api/_teardown.quickwit.yaml @@ -3,3 +3,6 @@ endpoint: indexes/simple --- method: DELETE endpoint: indexes/nested +--- +method: DELETE +endpoint: indexes/millisec \ No newline at end of file diff --git a/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml b/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml index a1e958e0e50..85f6aa999f6 100644 --- a/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml +++ b/quickwit/rest-api-tests/scenarii/search_after/0001-search_after_edge_case.yaml @@ -227,9 +227,9 @@ expected: relation: eq hits: - sort: [0] - - sort: [True] - sort: [10.5] - sort: [18000000000000000000] + - sort: [True] --- desc: "search after on mixed column desc match nothing" json: @@ -263,8 +263,8 @@ expected: value: 5 relation: eq hits: - - sort: [True] - sort: [0] - sort: [-10] + diff --git a/quickwit/rust-toolchain.toml b/quickwit/rust-toolchain.toml index e54a09951e9..2a30998f14b 100644 --- a/quickwit/rust-toolchain.toml +++ b/quickwit/rust-toolchain.toml @@ -1,4 +1,4 @@ [toolchain] -channel = "1.91" +channel = "1.93" components = ["cargo", "clippy", "rustfmt", "rust-docs"]