From 4be5e0add971aef20d2f48f3d8c3812cfa93dc97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wieiw=C3=B3rka?= Date: Thu, 30 Oct 2025 19:05:27 +0100 Subject: [PATCH 1/5] Adding dev containers --- .devcontainer/Dockerfile | 20 +++++++++++++++ .devcontainer/devcontainer.json | 44 +++++++++++++++++++++++++++++++++ .pre-commit-config.yaml | 15 +++++------ 3 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..49208e3 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,20 @@ +FROM mcr.microsoft.com/devcontainers/rust:2-1-bullseye + +ARG CLAUDE_CODE_VERSION=latest + +USER root + +# Install Node.js 24.x (latest LTS) from NodeSource +RUN apt-get update && \ + apt-get install -y ca-certificates curl gnupg && \ + mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg && \ + echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_24.x nodistro main" | tee /etc/apt/sources.list.d/nodesource.list && \ + apt-get update && \ + apt-get install -y nodejs && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +RUN npm install -g @anthropic-ai/claude-code@${CLAUDE_CODE_VERSION} +RUN npm install -g @fission-ai/openspec@latest +USER vscode \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..7c2a85b --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,44 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/rust +{ + "name": "Rust", + "dockerFile": "Dockerfile", + // Or use a Dockerfile or Docker Compose file. 
More info: https://containers.dev/guide/dockerfile +// "image": "mcr.microsoft.com/devcontainers/rust:2-1-bullseye", + + // Use 'mounts' to make the cargo cache persistent in a Docker Volume. + // "mounts": [ + // { + // "source": "devcontainer-cargo-cache-${devcontainerId}", + // "target": "/usr/local/cargo", + // "type": "volume" + // } + // ] + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + // "postCreateCommand": "rustc --version", + + // Configure tool-specific properties. + "customizations" : { + "jetbrains" : { + "settings": { + "com.intellij.database:app:DatabaseSettings.enable-local-filter-by-default": false, + "com.intellij:app:BaseRefactoringSettings.safe_delete_when_delete": false, + "com.intellij:app:BaseRefactoringSettings.rename_search_in_comments_for_file": false, + "com.intellij:app:BaseRefactoringSettings.rename_search_for_references_for_file": false, + "com.intellij:app:BaseRefactoringSettings.rename_search_for_references_for_directory": false, + "com.intellij:app:BaseRefactoringSettings.move_search_for_references_for_file": false + }, + "backend" : "RustRover" + } + }, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
+ // "remoteUser": "root" +} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 76f0ded..f67310c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,20 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v6.0.0 hooks: - id: check-yaml - stages: [commit] + stages: [pre-commit] - id: check-json - stages: [commit] + stages: [pre-commit] + exclude: '^\.devcontainer/devcontainer\.json$' - id: check-toml - stages: [commit] + stages: [pre-commit] - id: check-merge-conflict - stages: [commit] + stages: [pre-commit] - id: check-case-conflict - stages: [commit] + stages: [pre-commit] - id: detect-private-key - stages: [commit] + stages: [pre-commit] - repo: https://github.com/doublify/pre-commit-rust rev: v1.0 From 8c95f44fc9f03f4954cc4a5e5fed35b46f7d06c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wieiw=C3=B3rka?= Date: Thu, 30 Oct 2025 21:01:59 +0000 Subject: [PATCH 2/5] Bumping to DataFusion 50.3.0 --- .claude/commands/openspec/apply.md | 23 + .claude/commands/openspec/archive.md | 27 + .claude/commands/openspec/proposal.md | 27 + .claude/settings.local.json | 28 + AGENTS.md | 60 ++ CLAUDE.md | 18 + Cargo.lock | 831 +++++++++++------- Cargo.toml | 4 +- openspec/AGENTS.md | 454 ++++++++++ .../changes/upgrade-datafusion-50/design.md | 158 ++++ .../changes/upgrade-datafusion-50/proposal.md | 51 ++ .../specs/datafusion-integration/spec.md | 70 ++ .../changes/upgrade-datafusion-50/tasks.md | 54 ++ openspec/project.md | 121 +++ .../optimization_comparison_benchmark.rs | 434 +++++++++ .../benches/quick_optimization_test.rs | 275 ++++++ .../physical_planner/joins/interval_join.rs | 30 +- .../src/physical_planner/joins/utils.rs | 18 +- .../sequila_physical_planner.rs | 4 +- 19 files changed, 2335 insertions(+), 352 deletions(-) create mode 100644 .claude/commands/openspec/apply.md create mode 100644 .claude/commands/openspec/archive.md create mode 100644 .claude/commands/openspec/proposal.md create 
mode 100644 .claude/settings.local.json create mode 100644 AGENTS.md create mode 100644 CLAUDE.md create mode 100644 openspec/AGENTS.md create mode 100644 openspec/changes/upgrade-datafusion-50/design.md create mode 100644 openspec/changes/upgrade-datafusion-50/proposal.md create mode 100644 openspec/changes/upgrade-datafusion-50/specs/datafusion-integration/spec.md create mode 100644 openspec/changes/upgrade-datafusion-50/tasks.md create mode 100644 openspec/project.md create mode 100644 sequila/sequila-core/benches/optimization_comparison_benchmark.rs create mode 100644 sequila/sequila-core/benches/quick_optimization_test.rs diff --git a/.claude/commands/openspec/apply.md b/.claude/commands/openspec/apply.md new file mode 100644 index 0000000..a36fd96 --- /dev/null +++ b/.claude/commands/openspec/apply.md @@ -0,0 +1,23 @@ +--- +name: OpenSpec: Apply +description: Implement an approved OpenSpec change and keep tasks in sync. +category: OpenSpec +tags: [openspec, apply] +--- + +**Guardrails** +- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required. +- Keep changes tightly scoped to the requested outcome. +- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications. + +**Steps** +Track these steps as TODOs and complete them one by one. +1. Read `changes/<id>/proposal.md`, `design.md` (if present), and `tasks.md` to confirm scope and acceptance criteria. +2. Work through tasks sequentially, keeping edits minimal and focused on the requested change. +3. Confirm completion before updating statuses—make sure every item in `tasks.md` is finished. +4. Update the checklist after all work is done so each task is marked `- [x]` and reflects reality. +5. Reference `openspec list` or `openspec show <item>` when additional context is required. 
+ +**Reference** +- Use `openspec show <id> --json --deltas-only` if you need additional context from the proposal while implementing. + diff --git a/.claude/commands/openspec/archive.md b/.claude/commands/openspec/archive.md new file mode 100644 index 0000000..dbc7695 --- /dev/null +++ b/.claude/commands/openspec/archive.md @@ -0,0 +1,27 @@ +--- +name: OpenSpec: Archive +description: Archive a deployed OpenSpec change and update specs. +category: OpenSpec +tags: [openspec, archive] +--- + +**Guardrails** +- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required. +- Keep changes tightly scoped to the requested outcome. +- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications. + +**Steps** +1. Determine the change ID to archive: + - If this prompt already includes a specific change ID (for example inside a `<ChangeId>` block populated by slash-command arguments), use that value after trimming whitespace. + - If the conversation references a change loosely (for example by title or summary), run `openspec list` to surface likely IDs, share the relevant candidates, and confirm which one the user intends. + - Otherwise, review the conversation, run `openspec list`, and ask the user which change to archive; wait for a confirmed change ID before proceeding. + - If you still cannot identify a single change ID, stop and tell the user you cannot archive anything yet. +2. Validate the change ID by running `openspec list` (or `openspec show <id>`) and stop if the change is missing, already archived, or otherwise not ready to archive. +3. Run `openspec archive <id> --yes` so the CLI moves the change and applies spec updates without prompts (use `--skip-specs` only for tooling-only work). +4. Review the command output to confirm the target specs were updated and the change landed in `changes/archive/`. 
+5. Validate with `openspec validate --strict` and inspect with `openspec show <id>` if anything looks off. + +**Reference** +- Use `openspec list` to confirm change IDs before archiving. +- Inspect refreshed specs with `openspec list --specs` and address any validation issues before handing off. + diff --git a/.claude/commands/openspec/proposal.md b/.claude/commands/openspec/proposal.md new file mode 100644 index 0000000..f4c1c97 --- /dev/null +++ b/.claude/commands/openspec/proposal.md @@ -0,0 +1,27 @@ +--- +name: OpenSpec: Proposal +description: Scaffold a new OpenSpec change and validate strictly. +category: OpenSpec +tags: [openspec, change] +--- + +**Guardrails** +- Favor straightforward, minimal implementations first and add complexity only when it is requested or clearly required. +- Keep changes tightly scoped to the requested outcome. +- Refer to `openspec/AGENTS.md` (located inside the `openspec/` directory—run `ls openspec` or `openspec update` if you don't see it) if you need additional OpenSpec conventions or clarifications. +- Identify any vague or ambiguous details and ask the necessary follow-up questions before editing files. + +**Steps** +1. Review `openspec/project.md`, run `openspec list` and `openspec list --specs`, and inspect related code or docs (e.g., via `rg`/`ls`) to ground the proposal in current behaviour; note any gaps that require clarification. +2. Choose a unique verb-led `change-id` and scaffold `proposal.md`, `tasks.md`, and `design.md` (when needed) under `openspec/changes/<id>/`. +3. Map the change into concrete capabilities or requirements, breaking multi-scope efforts into distinct spec deltas with clear relationships and sequencing. +4. Capture architectural reasoning in `design.md` when the solution spans multiple systems, introduces new patterns, or demands trade-off discussion before committing to specs. +5. 
Draft spec deltas in `changes/<id>/specs/<capability>/spec.md` (one folder per capability) using `## ADDED|MODIFIED|REMOVED Requirements` with at least one `#### Scenario:` per requirement and cross-reference related capabilities when relevant. +6. Draft `tasks.md` as an ordered list of small, verifiable work items that deliver user-visible progress, include validation (tests, tooling), and highlight dependencies or parallelizable work. +7. Validate with `openspec validate --strict` and resolve every issue before sharing the proposal. + +**Reference** +- Use `openspec show <id> --json --deltas-only` or `openspec show <spec> --type spec` to inspect details when validation fails. +- Search existing requirements with `rg -n "Requirement:|Scenario:" openspec/specs` before writing new ones. +- Explore the codebase with `rg <keyword>`, `ls`, or direct file reads so proposals align with current implementation realities. + diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..11041d7 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,28 @@ +{ + "permissions": { + "allow": [ + "WebFetch(domain:github.com)", + "WebFetch(domain:ml-explore.github.io)", + "Bash(cargo check:*)", + "WebFetch(domain:docs.rs)", + "Bash(cargo doc:*)", + "Bash(cargo test:*)", + "Bash(cargo bench:*)", + "Bash(timeout 120s cargo bench --bench quick_optimization_test)", + "Bash(git add src/physical_planner/joins/interval_join.rs Cargo.toml benches/datafusion_optimized_benchmark.rs)", + "Bash(grep:*)", + "Bash(sed:*)", + "Bash(git commit:*)", + "Bash(git cherry-pick:*)", + "Bash(git add:*)", + "Bash(git reset:*)", + "Bash(git checkout:*)", + "Bash(git unstage:*)", + "Bash(pre-commit run:*)", + "Bash(python3:*)", + "Bash(git log:*)" + ], + "deny": [], + "ask": [] + } +} diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..9b3421a --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,60 @@ + +# OpenSpec Instructions + +These instructions are for AI assistants working in this 
project. + +Always open `@/openspec/AGENTS.md` when the request: +- Mentions planning or proposals (words like proposal, spec, change, plan) +- Introduces new capabilities, breaking changes, architecture shifts, or big performance/security work +- Sounds ambiguous and you need the authoritative spec before coding + +Use `@/openspec/AGENTS.md` to learn: +- How to create and apply change proposals +- Spec format and conventions +- Project structure and guidelines + +Keep this managed block so 'openspec update' can refresh the instructions. + + + +# Repository Guidelines + +## Project Structure & Module Organization +- Workspace: Rust 2021, crates in `sequila/`. +- Core library: `sequila/sequila-core` — DataFusion extensions, physical planner, and interval-join optimization. +- CLI: `sequila/sequila-cli` — REPL and file execution for SQL. +- Tests: unit/integration under `sequila/sequila-core/tests` and crate-local tests; benches in `sequila/sequila-core/benches`. +- Utilities/data: `queries/` for sample SQL, `testing/data/` for local datasets, `bin/env.sh` for env helpers. + +## Build, Test, and Development Commands +- Build: `cargo build` (release: `cargo build --release`). +- Run CLI: `RUST_LOG=info cargo run -p sequila-cli -- --file queries/q1-coitrees.sql`. +- REPL: `RUST_LOG=info cargo run -p sequila-cli`. +- Tests: `cargo test --workspace` (async tests use Tokio). +- Benchmarks: `RUSTFLAGS="-Ctarget-cpu=native" cargo bench --bench databio_benchmark -- --quick`. + +## Coding Style & Naming Conventions +- Formatter: rustfmt via pre-commit. Run `cargo fmt --all` before pushing. +- Lints: `cargo check` runs in pre-commit; keep builds warning-free. +- Naming: crates/modules `snake_case`, types/enums `PascalCase`, functions/vars `snake_case`, constants `SCREAMING_SNAKE_CASE`. +- Indentation: 4 spaces; avoid long lines unless readability benefits. + +## Testing Guidelines +- Frameworks: Rust built-in tests, `rstest` for parametrization, `tokio::test` for async. 
+- Locations: unit tests inline in `src/`; integration tests in `sequila-core/tests/`. +- Conventions: name tests descriptively (e.g., `interval_rule_eq`); include edge cases for join predicates and config options. +- Coverage: no strict target; add tests for new features and bug fixes. + +## Commit & Pull Request Guidelines +- Commits: imperative mood, concise title; optionally use prefixes (`feat:`, `fix:`, `perf:`). Reference issues/PRs where relevant. +- PRs: include scope/intent, key changes, how to test (commands/queries), and performance notes if applicable. Add screenshots/log snippets when helpful. +- Keep diffs focused; update README or examples when behavior changes. + +## Architecture & Configuration Tips +- Core idea: replace/augment DataFusion planning with `SeQuiLaQueryPlanner` and an interval-join physical optimization rule. +- Useful SQL settings: + - `SET sequila.prefer_interval_join TO true;` + - `SET sequila.interval_join_algorithm TO coitrees;` + - `SET datafusion.optimizer.repartition_joins TO false;` +- Bench data: export `BENCH_DATA_ROOT` to point at local datasets. + diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..0669699 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,18 @@ + +# OpenSpec Instructions + +These instructions are for AI assistants working in this project. + +Always open `@/openspec/AGENTS.md` when the request: +- Mentions planning or proposals (words like proposal, spec, change, plan) +- Introduces new capabilities, breaking changes, architecture shifts, or big performance/security work +- Sounds ambiguous and you need the authoritative spec before coding + +Use `@/openspec/AGENTS.md` to learn: +- How to create and apply change proposals +- Spec format and conventions +- Project structure and guidelines + +Keep this managed block so 'openspec update' can refresh the instructions. 
+ + \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index d3cb511..d0cf1ef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,12 +17,6 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" -[[package]] -name = "adler32" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" - [[package]] name = "ahash" version = "0.8.11" @@ -155,28 +149,28 @@ checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775" [[package]] name = "apache-avro" -version = "0.17.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1aef82843a0ec9f8b19567445ad2421ceeb1d711514384bdd3d49fe37102ee13" +checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" dependencies = [ "bigdecimal", - "bzip2 0.4.4", + "bon", + "bzip2 0.6.1", "crc32fast", "digest", - "libflate", "log", + "miniz_oxide", "num-bigint", "quad-rand", - "rand 0.8.5", + "rand 0.9.2", "regex-lite", "serde", "serde_bytes", "serde_json", "snap", - "strum", - "strum_macros", - "thiserror 1.0.69", - "typed-builder", + "strum 0.27.2", + "strum_macros 0.27.2", + "thiserror 2.0.17", "uuid", "xz2", "zstd", @@ -205,9 +199,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" +checksum = "6e833808ff2d94ed40d9379848a950d995043c7fb3e81a30b383f4c6033821cc" dependencies = [ "arrow-arith", "arrow-array", @@ -226,9 +220,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +checksum = "ad08897b81588f60ba983e3ca39bda2b179bdd84dced378e7df81a5313802ef8" dependencies = [ "arrow-array", "arrow-buffer", @@ -240,9 +234,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +checksum = "8548ca7c070d8db9ce7aa43f37393e4bfcf3f2d3681df278490772fd1673d08d" dependencies = [ "ahash", "arrow-buffer", @@ -251,15 +245,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.15.1", + "hashbrown 0.16.0", "num", ] [[package]] name = "arrow-buffer" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +checksum = "e003216336f70446457e280807a73899dd822feaf02087d31febca1363e2fccc" dependencies = [ "bytes", "half", @@ -268,9 +262,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +checksum = "919418a0681298d3a77d1a315f625916cb5678ad0d74b9c60108eb15fd083023" dependencies = [ "arrow-array", "arrow-buffer", @@ -289,9 +283,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +checksum = "bfa9bf02705b5cf762b6f764c65f04ae9082c7cfc4e96e0c33548ee3f67012eb" dependencies = [ "arrow-array", "arrow-cast", @@ -304,9 +298,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +checksum = "a5c64fff1d142f833d78897a772f2e5b55b36cb3e6320376f0961ab0db7bd6d0" dependencies = [ "arrow-buffer", "arrow-schema", @@ -316,23 +310,25 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +checksum = "1d3594dcddccc7f20fd069bc8e9828ce37220372680ff638c5e00dea427d88f5" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", + "arrow-select", "flatbuffers", "lz4_flex", + "zstd", ] [[package]] name = "arrow-json" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +checksum = "88cf36502b64a127dc659e3b305f1d993a544eab0d48cce704424e62074dc04b" dependencies = [ "arrow-array", "arrow-buffer", @@ -352,9 +348,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +checksum = "3c8f82583eb4f8d84d4ee55fd1cb306720cddead7596edce95b50ee418edf66f" dependencies = [ "arrow-array", "arrow-buffer", @@ -365,9 +361,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +checksum = "9d07ba24522229d9085031df6b94605e0f4b26e099fb7cdeec37abd941a73753" dependencies = [ "arrow-array", "arrow-buffer", @@ -378,9 +374,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +checksum = "b3aa9e59c611ebc291c28582077ef25c97f1975383f1479b12f3b9ffee2ffabe" dependencies = [ "serde", "serde_json", @@ -388,9 +384,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +checksum = "8c41dbbd1e97bfcaee4fcb30e29105fb2c75e4d82ae4de70b792a5d3f66b2e7a" dependencies = [ "ahash", "arrow-array", @@ -402,9 +398,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +checksum = "53f5183c150fbc619eede22b861ea7c0eebed8eaac0333eaa7f6da5205fd504d" dependencies = [ "arrow-array", "arrow-buffer", @@ -468,9 +464,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.8.5" +version = "1.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c478f5b10ce55c9a33f87ca3404ca92768b144fc1bfdede7c0121214a8283a25" +checksum = "37cf2b6af2a95a20e266782b4f76f1a5e12bf412a9db2de9c1e9123b9d8c0ad8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -498,9 +494,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.5" +version = "1.2.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1541072f81945fa1251f8795ef6c92c4282d74d59f88498ae7d4bf00f0ebdad9" +checksum = "faf26925f4a5b59eb76722b63c2892b1d70d06fa053c72e4a100ec308c1d47bc" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -510,9 +506,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.13.3" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5c953fe1ba023e6b7730c0d4b031d06f267f23a46167dcbd40316644b10a17ba" +checksum = "879b6c89592deb404ba4dc0ae6b58ffd1795c78991cbb5b8bc441c48a070440d" dependencies = [ "aws-lc-sys", "zeroize", @@ -520,9 +516,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.30.0" +version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbfd150b5dbdb988bcc8fb1fe787eb6b7ee6180ca24da683b61ea5405f3d43ff" +checksum = "107a4e9d9cab9963e04e84bb8dee0e25f2a987f9a8bad5ed054abd439caa8f8c" dependencies = [ "bindgen", "cc", @@ -533,9 +529,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.10" +version = "1.5.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c034a1bc1d70e16e7f4e4caf7e9f7693e4c9c24cd91cf17c2a0b21abaebc7c8b" +checksum = "bfa006bb32360ed90ac51203feafb9d02e3d21046e1fd3a450a404b90ea73e5d" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -557,9 +553,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.81.0" +version = "1.87.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79ede098271e3471036c46957cba2ba30888f53bda2515bf04b560614a30a36e" +checksum = "f4af747ffcb5aa8da8be8f0679ef6940f1afdb8c2e10c36738c9ebeb8d17b95e" dependencies = [ "aws-credential-types", "aws-runtime", @@ -579,9 +575,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.82.0" +version = "1.89.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43326f724ba2cc957e6f3deac0ca1621a3e5d4146f5970c24c8a108dac33070f" +checksum = "695dc67bb861ccb8426c9129b91c30e266a0e3d85650cafdf62fcca14c8fd338" dependencies = [ "aws-credential-types", "aws-runtime", @@ -601,9 +597,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.83.0" +version = "1.89.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5468593c47efc31fdbe6c902d1a5fde8d9c82f78a3f8ccfe907b1e9434748cb" +checksum = 
"928e87698cd916cf1efd5268148347269e6d2911028742c0061ff6261e639e3c" dependencies = [ "aws-credential-types", "aws-runtime", @@ -624,9 +620,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.3.4" +version = "1.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084c34162187d39e3740cb635acd73c4e3a551a36146ad6fe8883c929c9f876c" +checksum = "bffc03068fbb9c8dd5ce1c6fb240678a5cffb86fb2b7b1985c999c4b83c8df68" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -646,9 +642,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.5" +version = "1.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e190749ea56f8c42bf15dd76c65e14f8f765233e6df9b0506d9d934ebef867c" +checksum = "127fcfad33b7dfc531141fda7e1c402ac65f88aca5511a4d31e2e3d2cd01ce9c" dependencies = [ "futures-util", "pin-project-lite", @@ -657,15 +653,16 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.62.3" +version = "0.62.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c4dacf2d38996cf729f55e7a762b30918229917eca115de45dfa8dfb97796c9" +checksum = "445d5d720c99eed0b4aa674ed00d835d9b1427dd73e04adaf2f94c6b2d6f9fca" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", "bytes-utils", "futures-core", + "futures-util", "http 0.2.12", "http 1.3.1", "http-body 0.4.6", @@ -677,9 +674,9 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.0.6" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f108f1ca850f3feef3009bdcc977be201bca9a91058864d9de0684e64514bee0" +checksum = "623254723e8dfd535f566ee7b2381645f8981da086b5c4aa26c0c41582bb1d2c" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -694,33 +691,34 @@ dependencies = [ "rustls-native-certs", "rustls-pki-types", "tokio", + "tokio-rustls", "tower", "tracing", ] [[package]] name = "aws-smithy-json" -version = 
"0.61.4" +version = "0.61.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a16e040799d29c17412943bdbf488fd75db04112d0c0d4b9290bacf5ae0014b9" +checksum = "2db31f727935fc63c6eeae8b37b438847639ec330a9161ece694efba257e0c54" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-observability" -version = "0.1.3" +version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9364d5989ac4dd918e5cc4c4bdcc61c9be17dcd2586ea7f69e348fc7c6cab393" +checksum = "2d1881b1ea6d313f9890710d65c158bdab6fb08c91ea825f74c1c8c357baf4cc" dependencies = [ "aws-smithy-runtime-api", ] [[package]] name = "aws-smithy-query" -version = "0.60.7" +version = "0.60.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" +checksum = "d28a63441360c477465f80c7abac3b9c4d075ca638f982e605b7dc2a2c7156c9" dependencies = [ "aws-smithy-types", "urlencoding", @@ -728,9 +726,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.8.6" +version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e107ce0783019dbff59b3a244aa0c114e4a8c9d93498af9162608cd5474e796" +checksum = "0bbe9d018d646b96c7be063dd07987849862b0e6d07c778aad7d93d1be6c1ef0" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -752,9 +750,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.8.7" +version = "1.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75d52251ed4b9776a3e8487b2a01ac915f73b2da3af8fc1e77e0fce697a550d4" +checksum = "ec7204f9fd94749a7c53b26da1b961b4ac36bf070ef1e0b94bb09f79d4f6c193" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -769,9 +767,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.2" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d498595448e43de7f4296b7b7a18a8a02c61ec9349128c80a368f7c3b4ab11a8" +checksum = "25f535879a207fce0db74b679cfc3e91a3159c8144d717d55f5832aea9eef46e" dependencies = [ "base64-simd", "bytes", @@ -792,18 +790,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.10" +version = "0.60.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3db87b96cb1b16c024980f133968d52882ca0daaee3a086c6decc500f6c99728" +checksum = "eab77cdd036b11056d2a30a7af7b775789fb024bf216acc13884c6c97752ae56" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.3.8" +version = "1.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b069d19bf01e46298eaedd7c6f283fe565a59263e53eebec945f3e6398f42390" +checksum = "e2fd329bf0e901ff3f60425691410c69094dc2a1f34b331f37bfc4e9ac1565a1" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -825,7 +823,7 @@ dependencies = [ "miniz_oxide", "object", "rustc-demangle", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -869,25 +867,22 @@ dependencies = [ [[package]] name = "bindgen" -version = "0.69.5" +version = "0.72.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" dependencies = [ "bitflags", "cexpr", "clang-sys", - "itertools 0.10.5", - "lazy_static", - "lazycell", + "itertools 0.13.0", "log", "prettyplease", "proc-macro2", "quote", "regex", - "rustc-hash 1.1.0", + "rustc-hash", "shlex", "syn", - "which", ] [[package]] @@ -922,8 +917,8 @@ dependencies = [ "serde", "serde_derive", "statrs", - "strum", - "strum_macros", + "strum 0.26.3", + "strum_macros 0.26.4", "thiserror 1.0.69", "triple_accel", "vec_map", @@ -938,7 +933,7 @@ dependencies = [ "derive-new", "lazy_static", "regex", - "strum_macros", + "strum_macros 0.26.4", "thiserror 1.0.69", ] @@ -994,6 +989,31 @@ 
dependencies = [ "generic-array", ] +[[package]] +name = "bon" +version = "3.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebeb9aaf9329dff6ceb65c689ca3db33dbf15f324909c60e4e5eef5701ce31b1" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77e9d642a7e3a318e37c2c9427b5a6a48aa1ad55dcd986f3034ab2239045a645" +dependencies = [ + "darling", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn", +] + [[package]] name = "brotli" version = "8.0.2" @@ -1067,21 +1087,20 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.4.4" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" dependencies = [ "bzip2-sys", - "libc", ] [[package]] name = "bzip2" -version = "0.5.2" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +checksum = "f3a53fac24f34a81bc9954b5d6cfce0c21e18ec6959f44f56e8e90e4bb7c346c" dependencies = [ - "bzip2-sys", + "libbz2-rs-sys", ] [[package]] @@ -1102,10 +1121,11 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.1" +version = "1.2.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd9de9f2205d5ef3fd67e685b0df337994ddd4495e2a28d185500d0e1edfea47" +checksum = "739eb0f94557554b3ca9a86d2d37bebd49c5e6d0c1d2bda35ba5bdac830befc2" dependencies = [ + "find-msvc-tools", "jobserver", "libc", "shlex", @@ -1142,7 +1162,7 @@ dependencies = [ "iana-time-zone", "num-traits", "serde", - "windows-link", + "windows-link 0.1.3", ] [[package]] @@ -1206,9 +1226,9 @@ 
dependencies = [ [[package]] name = "clap" -version = "4.5.45" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc0e74a703892159f5ae7d3aac52c8e6c392f5ae5f359c70b5881d60aaac318" +checksum = "4c26d721170e0295f191a69bd9a1f93efcdb0aff38684b61ab5750468972e5f5" dependencies = [ "clap_builder", "clap_derive", @@ -1216,9 +1236,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.44" +version = "4.5.51" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3e7f4214277f3c7aa526a59dd3fbe306a370daee1f8b7b8c987069cd8e888a8" +checksum = "75835f0c7bf681bfd05abe44e965760fea999a5286c6eb2d59883634fd02011a" dependencies = [ "anstream", "anstyle", @@ -1228,9 +1248,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.45" +version = "4.5.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14cb31bb0a7d536caef2639baa7fad459e15c3144efefa6dbd1c84562c4739f6" +checksum = "2a0b5487afeab2deb2ff4e03a807ad1a03ac532ff5a2cee5d86884440c7f7671" dependencies = [ "heck", "proc-macro2", @@ -1276,12 +1296,12 @@ checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" [[package]] name = "comfy-table" -version = "7.1.3" +version = "7.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" +checksum = "e0d05af1e006a2407bedef5af410552494ce5be9090444dbbcb57258c1af3d56" dependencies = [ - "strum", - "strum_macros", + "strum 0.26.3", + "strum_macros 0.26.4", "unicode-width", ] @@ -1327,15 +1347,6 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" -[[package]] -name = "core2" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b49ba7ef1ad6107f8824dbe97de947cbaac53c44e7f9756a1fba0d37c1eec505" -dependencies 
= [ - "memchr", -] - [[package]] name = "cpufeatures" version = "0.2.15" @@ -1347,9 +1358,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.4.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] @@ -1461,10 +1472,39 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" [[package]] -name = "dary_heap" -version = "0.3.7" +name = "darling" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04d2cd9c18b9f454ed67da600630b021a8a80bf33f8c95896ab33aaf1c26b728" +checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" +dependencies = [ + "darling_core", + "quote", + "syn", +] [[package]] name = "dashmap" @@ -1482,16 +1522,16 @@ dependencies = [ [[package]] name = "datafusion" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a11e19a7ccc5bb979c95c1dceef663eab39c9061b3bbf8d1937faf0f03bf41f" +checksum = "2af15bb3c6ffa33011ef579f6b0bcbe7c26584688bd6c994f548e44df67f011a" dependencies = [ "arrow", "arrow-ipc", "arrow-schema", "async-trait", "bytes", - "bzip2 0.5.2", + 
"bzip2 0.6.1", "chrono", "datafusion-catalog", "datafusion-catalog-listing", @@ -1512,6 +1552,7 @@ dependencies = [ "datafusion-functions-window", "datafusion-optimizer", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", @@ -1519,6 +1560,7 @@ dependencies = [ "datafusion-sql", "flate2", "futures", + "hex", "itertools 0.14.0", "log", "object_store", @@ -1537,9 +1579,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94985e67cab97b1099db2a7af11f31a45008b282aba921c1e1d35327c212ec18" +checksum = "187622262ad8f7d16d3be9202b4c1e0116f1c9aa387e5074245538b755261621" dependencies = [ "arrow", "async-trait", @@ -1563,9 +1605,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e002df133bdb7b0b9b429d89a69aa77b35caeadee4498b2ce1c7c23a99516988" +checksum = "9657314f0a32efd0382b9a46fdeb2d233273ece64baa68a7c45f5a192daf0f83" dependencies = [ "arrow", "async-trait", @@ -1586,9 +1628,9 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85591b54815b0c9d3fbf3b0425b75e9fc49ef73e6886d6c19d622986e2551b53" +checksum = "6a0b9c821d14e79070f42ea3a6d6618ced04d94277f0a32301918d7a022c250f" dependencies = [ "arrow", "async-trait", @@ -1599,6 +1641,7 @@ dependencies = [ "dirs", "env_logger", "futures", + "log", "mimalloc", "object_store", "parking_lot", @@ -1611,17 +1654,19 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e13242fc58fd753787b0a538e5ae77d356cb9d0656fa85a591a33c5f106267f6" +checksum = "5a83760d9a13122d025fbdb1d5d5aaf93dd9ada5e90ea229add92aa30898b2d1" dependencies = [ "ahash", "apache-avro", "arrow", "arrow-ipc", "base64", + "chrono", "half", "hashbrown 0.14.5", + "hex", "indexmap", "libc", "log", @@ -1636,9 +1681,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2239f964e95c3a5d6b4a8cde07e646de8995c1396a7fd62c6e784f5341db499" +checksum = "5b6234a6c7173fe5db1c6c35c01a12b2aa0f803a3007feee53483218817f8b1e" dependencies = [ "futures", "log", @@ -1647,21 +1692,22 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2cf792579bc8bf07d1b2f68c2d5382f8a63679cce8fbebfd4ba95742b6e08864" +checksum = "7256c9cb27a78709dd42d0c80f0178494637209cac6e29d5c93edd09b6721b86" dependencies = [ "arrow", "async-compression", "async-trait", "bytes", - "bzip2 0.5.2", + "bzip2 0.6.1", "chrono", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-plan", "datafusion-session", @@ -1683,9 +1729,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4de733d231abb0fba663ff60fd37bf7171fa8b2e46e8a99e41362001821d116e" +checksum = "10d40b6953ebc9099b37adfd12fde97eb73ff0cee44355c6dea64b8a4537d561" dependencies = [ "apache-avro", "arrow", @@ -1708,9 +1754,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"cfc114f9a1415174f3e8d2719c371fc72092ef2195a7955404cfe6b2ba29a706" +checksum = "64533a90f78e1684bfb113d200b540f18f268134622d7c96bbebc91354d04825" dependencies = [ "arrow", "async-trait", @@ -1733,9 +1779,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d88dd5e215c420a52362b9988ecd4cefd71081b730663d4f7d886f706111fc75" +checksum = "8d7ebeb12c77df0aacad26f21b0d033aeede423a64b2b352f53048a75bf1d6e6" dependencies = [ "arrow", "async-trait", @@ -1758,9 +1804,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33692acdd1fbe75280d14f4676fe43f39e9cb36296df56575aa2cac9a819e4cf" +checksum = "09e783c4c7d7faa1199af2df4761c68530634521b176a8d1331ddbc5a5c75133" dependencies = [ "arrow", "async-trait", @@ -1773,11 +1819,14 @@ dependencies = [ "datafusion-expr", "datafusion-functions-aggregate", "datafusion-physical-expr", + "datafusion-physical-expr-adapter", "datafusion-physical-expr-common", "datafusion-physical-optimizer", "datafusion-physical-plan", + "datafusion-pruning", "datafusion-session", "futures", + "hex", "itertools 0.14.0", "log", "object_store", @@ -1789,17 +1838,18 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0e7b648387b0c1937b83cb328533c06c923799e73a9e3750b762667f32662c0" +checksum = "99ee6b1d9a80d13f9deb2291f45c07044b8e62fb540dbde2453a18be17a36429" [[package]] name = "datafusion-execution" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9609d83d52ff8315283c6dad3b97566e877d8f366fab4c3297742f33dcd636c7" +checksum = "a4cec0a57653bec7b933fb248d3ffa3fa3ab3bd33bd140dc917f714ac036f531" dependencies = [ 
"arrow", + "async-trait", "dashmap", "datafusion-common", "datafusion-expr", @@ -1807,6 +1857,7 @@ dependencies = [ "log", "object_store", "parking_lot", + "parquet", "rand 0.9.2", "tempfile", "url", @@ -1814,11 +1865,12 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e75230cd67f650ef0399eb00f54d4a073698f2c0262948298e5299fc7324da63" +checksum = "ef76910bdca909722586389156d0aa4da4020e1631994d50fadd8ad4b1aa05fe" dependencies = [ "arrow", + "async-trait", "chrono", "datafusion-common", "datafusion-doc", @@ -1835,9 +1887,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70fafb3a045ed6c49cfca0cd090f62cf871ca6326cc3355cb0aaf1260fa760b6" +checksum = "6d155ccbda29591ca71a1344dd6bed26c65a4438072b400df9db59447f590bb6" dependencies = [ "arrow", "datafusion-common", @@ -1848,9 +1900,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf9a9cf655265861a20453b1e58357147eab59bdc90ce7f2f68f1f35104d3bb" +checksum = "7de2782136bd6014670fd84fe3b0ca3b3e4106c96403c3ae05c0598577139977" dependencies = [ "arrow", "arrow-buffer", @@ -1877,9 +1929,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f07e49733d847be0a05235e17b884d326a2fd402c97a89fe8bcf0bfba310005" +checksum = "07331fc13603a9da97b74fd8a273f4238222943dffdbbed1c4c6f862a30105bf" dependencies = [ "ahash", "arrow", @@ -1898,9 +1950,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "48.0.1" +version = "50.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "4512607e10d72b0b0a1dc08f42cb5bd5284cb8348b7fea49dc83409493e32b1b" +checksum = "b5951e572a8610b89968a09b5420515a121fbc305c0258651f318dc07c97ab17" dependencies = [ "ahash", "arrow", @@ -1911,9 +1963,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ab331806e34f5545e5f03396e4d5068077395b1665795d8f88c14ec4f1e0b7a" +checksum = "fdacca9302c3d8fc03f3e94f338767e786a88a33f5ebad6ffc0e7b50364b9ea3" dependencies = [ "arrow", "arrow-ord", @@ -1923,6 +1975,7 @@ dependencies = [ "datafusion-expr", "datafusion-functions", "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", "datafusion-macros", "datafusion-physical-expr-common", "itertools 0.14.0", @@ -1932,9 +1985,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4ac2c0be983a06950ef077e34e0174aa0cb9e346f3aeae459823158037ade37" +checksum = "8c37ff8a99434fbbad604a7e0669717c58c7c4f14c472d45067c4b016621d981" dependencies = [ "arrow", "async-trait", @@ -1948,9 +2001,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "36f3d92731de384c90906941d36dcadf6a86d4128409a9c5cd916662baed5f53" +checksum = "48e2aea7c79c926cffabb13dc27309d4eaeb130f4a21c8ba91cdd241c813652b" dependencies = [ "arrow", "datafusion-common", @@ -1966,9 +2019,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c679f8bf0971704ec8fd4249fcbb2eb49d6a12cc3e7a840ac047b4928d3541b5" +checksum = 
"0fead257ab5fd2ffc3b40fda64da307e20de0040fe43d49197241d9de82a487f" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1976,9 +2029,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2821de7cb0362d12e75a5196b636a59ea3584ec1e1cc7dc6f5e34b9e8389d251" +checksum = "ec6f637bce95efac05cdfb9b6c19579ed4aa5f6b94d951cfa5bb054b7bb4f730" dependencies = [ "datafusion-expr", "quote", @@ -1987,14 +2040,15 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1594c7a97219ede334f25347ad8d57056621e7f4f35a0693c8da876e10dd6a53" +checksum = "c6583ef666ae000a613a837e69e456681a9faa96347bf3877661e9e89e141d8a" dependencies = [ "arrow", "chrono", "datafusion-common", "datafusion-expr", + "datafusion-expr-common", "datafusion-physical-expr", "indexmap", "itertools 0.14.0", @@ -2006,9 +2060,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc6da0f2412088d23f6b01929dedd687b5aee63b19b674eb73d00c3eb3c883b7" +checksum = "c8668103361a272cbbe3a61f72eca60c9b7c706e87cc3565bcf21e2b277b84f6" dependencies = [ "ahash", "arrow", @@ -2022,15 +2076,31 @@ dependencies = [ "indexmap", "itertools 0.14.0", "log", + "parking_lot", "paste", "petgraph 0.8.2", ] +[[package]] +name = "datafusion-physical-expr-adapter" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "815acced725d30601b397e39958e0e55630e0a10d66ef7769c14ae6597298bb0" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools 0.14.0", +] + [[package]] name = 
"datafusion-physical-expr-common" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb0dbd9213078a593c3fe28783beaa625a4e6c6a6c797856ee2ba234311fb96" +checksum = "6652fe7b5bf87e85ed175f571745305565da2c0b599d98e697bcbedc7baa47c3" dependencies = [ "ahash", "arrow", @@ -2042,9 +2112,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d140854b2db3ef8ac611caad12bfb2e1e1de827077429322a6188f18fc0026a" +checksum = "49b7d623eb6162a3332b564a0907ba00895c505d101b99af78345f1acf929b5c" dependencies = [ "arrow", "datafusion-common", @@ -2054,6 +2124,7 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", + "datafusion-pruning", "itertools 0.14.0", "log", "recursive", @@ -2061,9 +2132,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b46cbdf21a01206be76d467f325273b22c559c744a012ead5018dfe79597de08" +checksum = "e2f7f778a1a838dec124efb96eae6144237d546945587557c9e6936b3414558c" dependencies = [ "ahash", "arrow", @@ -2075,6 +2146,7 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-functions-aggregate-common", "datafusion-functions-window-common", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -2089,11 +2161,29 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-pruning" +version = "50.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1e59e2ca14fe3c30f141600b10ad8815e2856caa59ebbd0e3e07cd3d127a65" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + 
"datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", +] + [[package]] name = "datafusion-session" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a72733766ddb5b41534910926e8da5836622316f6283307fd9fb7e19811a59c" +checksum = "21ef8e2745583619bd7a49474e8f45fbe98ebb31a133f27802217125a7b3d58d" dependencies = [ "arrow", "async-trait", @@ -2115,9 +2205,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "48.0.1" +version = "50.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5162338cdec9cc7ea13a0e6015c361acad5ec1d88d83f7c86301f789473971f" +checksum = "89abd9868770386fede29e5a4b14f49c0bf48d652c3b9d7a8a0332329b87d50b" dependencies = [ "arrow", "bigdecimal", @@ -2334,6 +2424,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "835a3dc7d1ec9e75e2b5fb4ba75396837112d2060b03f7d43bc1897c7f7211da" +[[package]] +name = "find-msvc-tools" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52051878f80a721bb68ebfbc930e07b65ba72f2da88968ea5c06fd6ca3d3a127" + [[package]] name = "fixedbitset" version = "0.4.2" @@ -2381,9 +2477,9 @@ checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" [[package]] name = "form_urlencoded" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" dependencies = [ "percent-encoding", ] @@ -2547,9 +2643,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.4.6" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" dependencies = [ "atomic-waker", "bytes", @@ -2596,6 +2692,12 @@ dependencies = [ "foldhash", ] +[[package]] +name = "hashbrown" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419bdc4f6a9207fbeba6d11b604d481addf78ecd10c11ad51e76c2f6482748d" + [[package]] name = "heck" version = "0.5.0" @@ -2746,18 +2848,23 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.10" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df2dcfbe0677734ab2f3ffa7fa7bfd4706bfdc1ef393f2ee30184aed67e631b4" +checksum = "3c6995591a8f1380fcb4ba966a252a4b29188d51d2b89e3a252f5305be65aea8" dependencies = [ + "base64", "bytes", "futures-channel", + "futures-core", "futures-util", "http 1.3.1", "http-body 1.0.1", "hyper", + "ipnet", + "libc", + "percent-encoding", "pin-project-lite", - "socket2 0.5.7", + "socket2 0.6.0", "tokio", "tower-service", "tracing", @@ -2904,11 +3011,17 @@ dependencies = [ "syn", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" -version = "1.0.3" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" dependencies = [ "idna_adapter", "smallvec", @@ -2927,12 +3040,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.10.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" +checksum = 
"6717a8d2a5a929a1a2eb43a12812498ed141a0bcfb7e8f7844fbdbe4303bba9f" dependencies = [ "equivalent", - "hashbrown 0.15.1", + "hashbrown 0.16.0", ] [[package]] @@ -3042,12 +3155,6 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "lexical-core" version = "1.0.2" @@ -3113,34 +3220,16 @@ dependencies = [ ] [[package]] -name = "libc" -version = "0.2.175" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" - -[[package]] -name = "libflate" -version = "2.1.0" +name = "libbz2-rs-sys" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "45d9dfdc14ea4ef0900c1cddbc8dcd553fbaacd8a4a282cf4018ae9dd04fb21e" -dependencies = [ - "adler32", - "core2", - "crc32fast", - "dary_heap", - "libflate_lz77", -] +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] -name = "libflate_lz77" -version = "2.1.0" +name = "libc" +version = "0.2.175" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" -dependencies = [ - "core2", - "hashbrown 0.14.5", - "rle-decode-fast", -] +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libloading" @@ -3149,7 +3238,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -3211,9 +3300,9 @@ dependencies = [ [[package]] name = "log" -version = "0.4.22" 
+version = "0.4.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +checksum = "34080505efa8e45a4b816c349525ebe327ceaa8559756f0356cba97ef3bf7432" [[package]] name = "lz4_flex" @@ -3514,7 +3603,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.3", + "thiserror 2.0.17", "tokio", "tracing", "url", @@ -3591,14 +3680,14 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] name = "parquet" -version = "55.2.0" +version = "56.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +checksum = "f0dbd48ad52d7dccf8ea1b90a3ddbfaea4f69878dd7683e51c507d4bc52b5b27" dependencies = [ "ahash", "arrow-array", @@ -3615,12 +3704,13 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.15.1", + "hashbrown 0.16.0", "lz4_flex", "num", "num-bigint", "object_store", "paste", + "ring", "seq-macro", "simdutf8", "snap", @@ -3647,9 +3737,9 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" [[package]] name = "percent-encoding" -version = "2.3.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "petgraph" @@ -3850,10 +3940,10 @@ dependencies = [ "pin-project-lite", "quinn-proto", "quinn-udp", - "rustc-hash 2.0.0", + "rustc-hash", "rustls", "socket2 0.5.7", - "thiserror 2.0.3", + "thiserror 2.0.17", "tokio", "tracing", ] @@ -3868,11 +3958,11 @@ dependencies = [ "getrandom 0.2.15", "rand 0.8.5", "ring", - "rustc-hash 2.0.0", + "rustc-hash", "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.3", + "thiserror 2.0.17", "tinyvec", "tracing", "web-time", @@ -4049,7 
+4139,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.15", "libredox", - "thiserror 2.0.3", + "thiserror 2.0.17", ] [[package]] @@ -4083,9 +4173,9 @@ checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" [[package]] name = "regex-syntax" -version = "0.8.5" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" [[package]] name = "relative-path" @@ -4153,12 +4243,6 @@ dependencies = [ "windows-sys 0.52.0", ] -[[package]] -name = "rle-decode-fast" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422" - [[package]] name = "rstest" version = "0.22.0" @@ -4217,15 +4301,9 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - -[[package]] -name = "rustc-hash" -version = "2.0.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" [[package]] name = "rustc_version" @@ -4260,9 +4338,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.16" +version = "0.23.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" +checksum = "6a9586e9ee2b4f8fab52a0048ca7334d7024eef48e2cb9407e3497bb7cab7fa7" dependencies = [ "aws-lc-rs", "once_cell", @@ -4306,9 +4384,9 @@ dependencies = [ [[package]] 
name = "rustls-webpki" -version = "0.102.8" +version = "0.103.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" +checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" dependencies = [ "aws-lc-rs", "ring", @@ -4324,9 +4402,9 @@ checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" [[package]] name = "rustyline" -version = "16.0.0" +version = "17.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62fd9ca5ebc709e8535e8ef7c658eb51457987e48c98ead2be482172accc408d" +checksum = "e902948a25149d50edc1a8e0141aad50f54e22ba83ff988cf8f7c9ef07f50564" dependencies = [ "bitflags", "cfg-if", @@ -4341,7 +4419,7 @@ dependencies = [ "unicode-segmentation", "unicode-width", "utf8parse", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -4459,18 +4537,19 @@ dependencies = [ "rstest", "rstest_reuse", "rust-lapper", - "strum", - "strum_macros", + "strum 0.26.3", + "strum_macros 0.26.4", "superintervals", "tokio", ] [[package]] name = "serde" -version = "1.0.215" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" dependencies = [ + "serde_core", "serde_derive", ] @@ -4483,11 +4562,20 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.228" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = 
"d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" dependencies = [ "proc-macro2", "quote", @@ -4496,14 +4584,15 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.132" +version = "1.0.145" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03" +checksum = "402a6f66d8c709116cf22f558eab210f5a50187f702eb4d7e5ef38d9a7f1c79c" dependencies = [ "itoa", "memchr", "ryu", "serde", + "serde_core", ] [[package]] @@ -4618,9 +4707,9 @@ checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" [[package]] name = "sqlparser" -version = "0.55.0" +version = "0.58.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" +checksum = "ec4b661c54b1e4b603b37873a18c59920e4c51ea8ea2cf527d925424dbd4437c" dependencies = [ "log", "recursive", @@ -4687,6 +4776,12 @@ version = "0.26.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +[[package]] +name = "strum" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" + [[package]] name = "strum_macros" version = "0.26.4" @@ -4700,6 +4795,18 @@ dependencies = [ "syn", ] +[[package]] +name = "strum_macros" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "subtle" version = "2.6.1" @@ -4774,11 +4881,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.3" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c006c85c7651b3cf2ada4584faa36773bd07bac24acfb39f3c431b36d7e667aa" +checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl 2.0.3", + "thiserror-impl 2.0.17", ] [[package]] @@ -4794,9 +4901,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.3" +version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f077553d607adc1caf65430528a576c757a71ed73944b66ebb58ef2bbd243568" +checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" dependencies = [ "proc-macro2", "quote", @@ -4921,12 +5028,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ "rustls", - "rustls-pki-types", "tokio", ] @@ -5041,26 +5147,6 @@ version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b907da542cbced5261bd3256de1b3a1bf340a3d37f93425a07362a1d687de56" -[[package]] -name = "typed-builder" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a06fbd5b8de54c5f7c91f6fe4cebb949be2125d7758e630bb58b1d831dbce600" -dependencies = [ - "typed-builder-macro", -] - -[[package]] -name = "typed-builder-macro" -version = "0.19.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9534daa9fd3ed0bd911d462a37f172228077e7abf18c18a5f67199d959205f8" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "typenum" version = "1.17.0" @@ -5093,13 +5179,14 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.4" +version = "2.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +checksum = "08bc136a29a3d1758e07a9cca267be308aeebf5cfd5a10f3f67ab2097683ef5b" dependencies = [ "form_urlencoded", "idna", "percent-encoding", + "serde", ] [[package]] @@ -5296,18 +5383,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix", -] - [[package]] name = "wide" version = "0.7.28" @@ -5333,7 +5408,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -5342,6 +5417,12 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + [[package]] name = "windows-registry" version = "0.2.0" @@ -5350,7 +5431,7 @@ checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" dependencies = [ "windows-result", "windows-strings", - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -5359,7 +5440,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -5369,7 +5450,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" dependencies = [ "windows-result", - 
"windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -5378,7 +5459,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -5387,7 +5468,16 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", ] [[package]] @@ -5396,14 +5486,31 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link 0.2.1", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + 
"windows_x86_64_msvc 0.53.1", ] [[package]] @@ -5412,48 +5519,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + [[package]] name = "winnow" version = "0.6.20" @@ -5601,9 +5756,9 @@ checksum = "626bd9fa9734751fc50d6060752170984d7053f5a39061f524cda68023d4db8a" [[package]] name = "zstd" -version = "0.13.2" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9" +checksum = "e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" dependencies = [ "zstd-safe", ] diff --git a/Cargo.toml b/Cargo.toml index b1e716d..cda903e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,8 +14,8 @@ resolver = "2" 
[workspace.dependencies] sequila-core = { path = "sequila/sequila-core" } sequila-cli = { path = "sequila/sequila-cli" } -datafusion = { version = "48.0.1" } -datafusion-cli = { version = "48.0.1" } +datafusion = { version = "50.3.0" } +datafusion-cli = { version = "50.3.0" } tokio = { version = "1.36.0", features = ["rt-multi-thread"] } env_logger = "0.11.5" log = "0.4.22" diff --git a/openspec/AGENTS.md b/openspec/AGENTS.md new file mode 100644 index 0000000..355969d --- /dev/null +++ b/openspec/AGENTS.md @@ -0,0 +1,454 @@ +# OpenSpec Instructions + +Instructions for AI coding assistants using OpenSpec for spec-driven development. + +## TL;DR Quick Checklist + +- Search existing work: `openspec spec list --long`, `openspec list` (use `rg` only for full-text search) +- Decide scope: new capability vs modify existing capability +- Pick a unique `change-id`: kebab-case, verb-led (`add-`, `update-`, `remove-`, `refactor-`) +- Scaffold: `proposal.md`, `tasks.md`, `design.md` (only if needed), and delta specs per affected capability +- Write deltas: use `## ADDED|MODIFIED|REMOVED|RENAMED Requirements`; include at least one `#### Scenario:` per requirement +- Validate: `openspec validate [change-id] --strict` and fix issues +- Request approval: Do not start implementation until proposal is approved + +## Three-Stage Workflow + +### Stage 1: Creating Changes +Create proposal when you need to: +- Add features or functionality +- Make breaking changes (API, schema) +- Change architecture or patterns +- Optimize performance (changes behavior) +- Update security patterns + +Triggers (examples): +- "Help me create a change proposal" +- "Help me plan a change" +- "Help me create a proposal" +- "I want to create a spec proposal" +- "I want to create a spec" + +Loose matching guidance: +- Contains one of: `proposal`, `change`, `spec` +- With one of: `create`, `plan`, `make`, `start`, `help` + +Skip proposal for: +- Bug fixes (restore intended behavior) +- Typos, formatting, 
comments +- Dependency updates (non-breaking) +- Configuration changes +- Tests for existing behavior + +**Workflow** +1. Review `openspec/project.md`, `openspec list`, and `openspec list --specs` to understand current context. +2. Choose a unique verb-led `change-id` and scaffold `proposal.md`, `tasks.md`, optional `design.md`, and spec deltas under `openspec/changes/<change-id>/`. +3. Draft spec deltas using `## ADDED|MODIFIED|REMOVED Requirements` with at least one `#### Scenario:` per requirement. +4. Run `openspec validate <change-id> --strict` and resolve any issues before sharing the proposal. + +### Stage 2: Implementing Changes +Track these steps as TODOs and complete them one by one. +1. **Read proposal.md** - Understand what's being built +2. **Read design.md** (if exists) - Review technical decisions +3. **Read tasks.md** - Get implementation checklist +4. **Implement tasks sequentially** - Complete in order +5. **Confirm completion** - Ensure every item in `tasks.md` is finished before updating statuses +6. **Update checklist** - After all work is done, set every task to `- [x]` so the list reflects reality +7.
**Approval gate** - Do not start implementation until the proposal is reviewed and approved + +### Stage 3: Archiving Changes +After deployment, create separate PR to: +- Move `changes/[name]/` → `changes/archive/YYYY-MM-DD-[name]/` +- Update `specs/` if capabilities changed +- Use `openspec archive <change-id> --skip-specs --yes` for tooling-only changes (always pass the change ID explicitly) +- Run `openspec validate --strict` to confirm the archived change passes checks + +## Before Any Task + +**Context Checklist:** +- [ ] Read relevant specs in `specs/[capability]/spec.md` +- [ ] Check pending changes in `changes/` for conflicts +- [ ] Read `openspec/project.md` for conventions +- [ ] Run `openspec list` to see active changes +- [ ] Run `openspec list --specs` to see existing capabilities + +**Before Creating Specs:** +- Always check if capability already exists +- Prefer modifying existing specs over creating duplicates +- Use `openspec show [spec]` to review current state +- If request is ambiguous, ask 1–2 clarifying questions before scaffolding + +### Search Guidance +- Enumerate specs: `openspec spec list --long` (or `--json` for scripts) +- Enumerate changes: `openspec list` (or `openspec change list --json` - deprecated but available) +- Show details: + - Spec: `openspec show <spec-id> --type spec` (use `--json` for filters) + - Change: `openspec show <change-id> --json --deltas-only` +- Full-text search (use ripgrep): `rg -n "Requirement:|Scenario:" openspec/specs` + +## Quick Start + +### CLI Commands + +```bash +# Essential commands +openspec list # List active changes +openspec list --specs # List specifications +openspec show [item] # Display change or spec +openspec validate [item] # Validate changes or specs +openspec archive <change-id> [--yes|-y] # Archive after deployment (add --yes for non-interactive runs) + +# Project management +openspec init [path] # Initialize OpenSpec +openspec update [path] # Update instruction files + +# Interactive mode +openspec show # Prompts for selection
+openspec validate # Bulk validation mode + +# Debugging +openspec show [change] --json --deltas-only +openspec validate [change] --strict +``` + +### Command Flags + +- `--json` - Machine-readable output +- `--type change|spec` - Disambiguate items +- `--strict` - Comprehensive validation +- `--no-interactive` - Disable prompts +- `--skip-specs` - Archive without spec updates +- `--yes`/`-y` - Skip confirmation prompts (non-interactive archive) + +## Directory Structure + +``` +openspec/ +├── project.md # Project conventions +├── specs/ # Current truth - what IS built +│ └── [capability]/ # Single focused capability +│ ├── spec.md # Requirements and scenarios +│ └── design.md # Technical patterns +├── changes/ # Proposals - what SHOULD change +│ ├── [change-name]/ +│ │ ├── proposal.md # Why, what, impact +│ │ ├── tasks.md # Implementation checklist +│ │ ├── design.md # Technical decisions (optional; see criteria) +│ │ └── specs/ # Delta changes +│ │ └── [capability]/ +│ │ └── spec.md # ADDED/MODIFIED/REMOVED +│ └── archive/ # Completed changes +``` + +## Creating Change Proposals + +### Decision Tree + +``` +New request? +├─ Bug fix restoring spec behavior? → Fix directly +├─ Typo/format/comment? → Fix directly +├─ New feature/capability? → Create proposal +├─ Breaking change? → Create proposal +├─ Architecture change? → Create proposal +└─ Unclear? → Create proposal (safer) +``` + +### Proposal Structure + +1. **Create directory:** `changes/[change-id]/` (kebab-case, verb-led, unique) + +2. **Write proposal.md:** +```markdown +## Why +[1-2 sentences on problem/opportunity] + +## What Changes +- [Bullet list of changes] +- [Mark breaking changes with **BREAKING**] + +## Impact +- Affected specs: [list capabilities] +- Affected code: [key files/systems] +``` + +3. **Create spec deltas:** `specs/[capability]/spec.md` +```markdown +## ADDED Requirements +### Requirement: New Feature +The system SHALL provide... 
+ +#### Scenario: Success case +- **WHEN** user performs action +- **THEN** expected result + +## MODIFIED Requirements +### Requirement: Existing Feature +[Complete modified requirement] + +## REMOVED Requirements +### Requirement: Old Feature +**Reason**: [Why removing] +**Migration**: [How to handle] +``` +If multiple capabilities are affected, create multiple delta files under `changes/[change-id]/specs/<capability>/spec.md`—one per capability. + +4. **Create tasks.md:** +```markdown +## 1. Implementation +- [ ] 1.1 Create database schema +- [ ] 1.2 Implement API endpoint +- [ ] 1.3 Add frontend component +- [ ] 1.4 Write tests +``` + +5. **Create design.md when needed:** +Create `design.md` if any of the following apply; otherwise omit it: +- Cross-cutting change (multiple services/modules) or a new architectural pattern +- New external dependency or significant data model changes +- Security, performance, or migration complexity +- Ambiguity that benefits from technical decisions before coding + +Minimal `design.md` skeleton: +```markdown +## Context +[Background, constraints, stakeholders] + +## Goals / Non-Goals +- Goals: [...] +- Non-Goals: [...] + +## Decisions +- Decision: [What and why] +- Alternatives considered: [Options + rationale] + +## Risks / Trade-offs +- [Risk] → Mitigation + +## Migration Plan +[Steps, rollback] + +## Open Questions +- [...] +``` + +## Spec File Format + +### Critical: Scenario Formatting + +**CORRECT** (use #### headers): +```markdown +#### Scenario: User login success +- **WHEN** valid credentials provided +- **THEN** return JWT token +``` + +**WRONG** (don't use bullets or bold): +```markdown +- **Scenario: User login** ❌ +**Scenario**: User login ❌ +### Scenario: User login ❌ +``` + +Every requirement MUST have at least one scenario.
+ +### Requirement Wording +- Use SHALL/MUST for normative requirements (avoid should/may unless intentionally non-normative) + +### Delta Operations + +- `## ADDED Requirements` - New capabilities +- `## MODIFIED Requirements` - Changed behavior +- `## REMOVED Requirements` - Deprecated features +- `## RENAMED Requirements` - Name changes + +Headers are matched with `trim(header)` - whitespace ignored. + +#### When to use ADDED vs MODIFIED +- ADDED: Introduces a new capability or sub-capability that can stand alone as a requirement. Prefer ADDED when the change is orthogonal (e.g., adding "Slash Command Configuration") rather than altering the semantics of an existing requirement. +- MODIFIED: Changes the behavior, scope, or acceptance criteria of an existing requirement. Always paste the full, updated requirement content (header + all scenarios). The archiver will replace the entire requirement with what you provide here; partial deltas will drop previous details. +- RENAMED: Use when only the name changes. If you also change behavior, use RENAMED (name) plus MODIFIED (content) referencing the new name. + +Common pitfall: Using MODIFIED to add a new concern without including the previous text. This causes loss of detail at archive time. If you aren’t explicitly changing the existing requirement, add a new requirement under ADDED instead. + +Authoring a MODIFIED requirement correctly: +1) Locate the existing requirement in `openspec/specs/<capability>/spec.md`. +2) Copy the entire requirement block (from `### Requirement: ...` through its scenarios). +3) Paste it under `## MODIFIED Requirements` and edit to reflect the new behavior. +4) Ensure the header text matches exactly (whitespace-insensitive) and keep at least one `#### Scenario:`.
+ +Example for RENAMED: +```markdown +## RENAMED Requirements +- FROM: `### Requirement: Login` +- TO: `### Requirement: User Authentication` +``` + +## Troubleshooting + +### Common Errors + +**"Change must have at least one delta"** +- Check `changes/[name]/specs/` exists with .md files +- Verify files have operation prefixes (## ADDED Requirements) + +**"Requirement must have at least one scenario"** +- Check scenarios use `#### Scenario:` format (4 hashtags) +- Don't use bullet points or bold for scenario headers + +**Silent scenario parsing failures** +- Exact format required: `#### Scenario: Name` +- Debug with: `openspec show [change] --json --deltas-only` + +### Validation Tips + +```bash +# Always use strict mode for comprehensive checks +openspec validate [change] --strict + +# Debug delta parsing +openspec show [change] --json | jq '.deltas' + +# Check specific requirement +openspec show [spec] --json -r 1 +``` + +## Happy Path Script + +```bash +# 1) Explore current state +openspec spec list --long +openspec list +# Optional full-text search: +# rg -n "Requirement:|Scenario:" openspec/specs +# rg -n "^#|Requirement:" openspec/changes + +# 2) Choose change id and scaffold +CHANGE=add-two-factor-auth +mkdir -p openspec/changes/$CHANGE/specs/auth +printf "## Why\n...\n\n## What Changes\n- ...\n\n## Impact\n- ...\n" > openspec/changes/$CHANGE/proposal.md +printf "## 1. Implementation\n- [ ] 1.1 ...\n" > openspec/changes/$CHANGE/tasks.md + +# 3) Add deltas (example) +cat > openspec/changes/$CHANGE/specs/auth/spec.md << 'EOF' +## ADDED Requirements +### Requirement: Two-Factor Authentication +Users MUST provide a second factor during login.
+ +#### Scenario: OTP required +- **WHEN** valid credentials are provided +- **THEN** an OTP challenge is required +EOF + +# 4) Validate +openspec validate $CHANGE --strict +``` + +## Multi-Capability Example + +``` +openspec/changes/add-2fa-notify/ +├── proposal.md +├── tasks.md +└── specs/ + ├── auth/ + │ └── spec.md # ADDED: Two-Factor Authentication + └── notifications/ + └── spec.md # ADDED: OTP email notification +``` + +auth/spec.md +```markdown +## ADDED Requirements +### Requirement: Two-Factor Authentication +... +``` + +notifications/spec.md +```markdown +## ADDED Requirements +### Requirement: OTP Email Notification +... +``` + +## Best Practices + +### Simplicity First +- Default to <100 lines of new code +- Single-file implementations until proven insufficient +- Avoid frameworks without clear justification +- Choose boring, proven patterns + +### Complexity Triggers +Only add complexity with: +- Performance data showing current solution too slow +- Concrete scale requirements (>1000 users, >100MB data) +- Multiple proven use cases requiring abstraction + +### Clear References +- Use `file.ts:42` format for code locations +- Reference specs as `specs/auth/spec.md` +- Link related changes and PRs + +### Capability Naming +- Use verb-noun: `user-auth`, `payment-capture` +- Single purpose per capability +- 10-minute understandability rule +- Split if description needs "AND" + +### Change ID Naming +- Use kebab-case, short and descriptive: `add-two-factor-auth` +- Prefer verb-led prefixes: `add-`, `update-`, `remove-`, `refactor-` +- Ensure uniqueness; if taken, append `-2`, `-3`, etc. + +## Tool Selection Guide + +| Task | Tool | Why | +|------|------|-----| +| Find files by pattern | Glob | Fast pattern matching | +| Search code content | Grep | Optimized regex search | +| Read specific files | Read | Direct file access | +| Explore unknown scope | Task | Multi-step investigation | + +## Error Recovery + +### Change Conflicts +1. 
Run `openspec list` to see active changes +2. Check for overlapping specs +3. Coordinate with change owners +4. Consider combining proposals + +### Validation Failures +1. Run with `--strict` flag +2. Check JSON output for details +3. Verify spec file format +4. Ensure scenarios are properly formatted + +### Missing Context +1. Read project.md first +2. Check related specs +3. Review recent archives +4. Ask for clarification + +## Quick Reference + +### Stage Indicators +- `changes/` - Proposed, not yet built +- `specs/` - Built and deployed +- `archive/` - Completed changes + +### File Purposes +- `proposal.md` - Why and what +- `tasks.md` - Implementation steps +- `design.md` - Technical decisions +- `spec.md` - Requirements and behavior + +### CLI Essentials +```bash +openspec list # What's in progress? +openspec show [item] # View details +openspec validate --strict # Is it correct? +openspec archive <change-id> [--yes|-y] # Mark complete (add --yes for automation) +``` + +Remember: Specs are truth. Changes are proposals. Keep them in sync. diff --git a/openspec/changes/upgrade-datafusion-50/design.md b/openspec/changes/upgrade-datafusion-50/design.md new file mode 100644 index 0000000..203f5db --- /dev/null +++ b/openspec/changes/upgrade-datafusion-50/design.md @@ -0,0 +1,158 @@ +# Design: DataFusion 50.3.0 Upgrade + +## Context + +sequila-native currently depends on DataFusion 48.0.1 and datafusion-cli 48.0.1. DataFusion 50.x introduces several breaking API changes across core traits (ExecutionPlan, PhysicalExpr, UDF traits) along with significant performance improvements in nested loop joins and bug fixes for recursive queries. + +The upgrade requires adapting to new trait requirements while maintaining compatibility with existing interval join algorithms and bioinformatics-specific query planning logic.
+ +### Stakeholders +- Developers maintaining interval join implementations +- Users running genomic queries via CLI +- Benchmark/performance testing workflows + +### Current Architecture +- Custom `IntervalJoinExec` implements DataFusion's `ExecutionPlan` trait +- `SeQuiLaQueryPlanner` implements `QueryPlanner` trait +- `SeQuiLaPhysicalPlanner` extends DataFusion's physical planning +- No custom UDFs currently implemented + +## Goals / Non-Goals + +### Goals +- Upgrade to DataFusion 50.3.0 (or latest 50.x available) with full API compatibility +- Maintain correctness of all four interval join algorithms (coitrees, superintervals, lapper, nested-loop) +- Ensure existing SQL queries and configuration parameters continue to work +- Pass all existing tests and benchmarks +- Take advantage of improved nested loop join performance if applicable + +### Non-Goals +- Refactoring interval join implementation beyond what's required for API compatibility +- Adding new features or capabilities specific to DataFusion 50.x +- Performance optimization beyond what DataFusion 50.x provides automatically +- Updating to future DataFusion versions beyond 50.x series + +## Decisions + +### Decision 1: Implement reset_state() as No-Op Initially +**Rationale**: The new `reset_state()` method was added to the `ExecutionPlan` trait to support recursive queries with dynamic filters; plans that hold internal state should override it. IntervalJoinExec likely doesn't maintain state that needs resetting beyond what's handled by reconstructing the plan. + +**Implementation**: +```rust +fn reset_state(self: Arc<Self>) -> datafusion::common::Result<Arc<dyn ExecutionPlan>> { + // IntervalJoinExec uses OnceAsync for left side building which is + // single-use per execution. No persistent state needs resetting. + Ok(self) +} +``` + +**Validation**: If recursive query issues arise later, we can revisit and add proper state reset for `left_fut` if needed.
+ +### Decision 2: Target 50.2.0 as Fallback +**Rationale**: Web search confirmed DataFusion 50.2.0 exists but 50.3.0 was not found. We'll attempt 50.3.0 first but document 50.2.0 as the fallback. + +**Implementation**: Check crates.io during implementation; use the latest available release in the 50.x series. + +### Decision 3: Handle ConfigOptions Arc Wrapping Transparently +**Rationale**: SessionState now returns `&Arc<ConfigOptions>` instead of `&ConfigOptions`. Rust's auto-deref should handle most cases, but explicit `.as_ref()` calls may be needed in some contexts. + +**Implementation**: Let the compiler guide fixes during `cargo check`; only add `.as_ref()` where compilation fails. + +### Decision 4: No Spec Deltas for This Change +**Rationale**: This is primarily a dependency upgrade with mechanical API changes rather than a feature addition. Specs are not affected since functionality remains the same—only internal implementation adapts to new DataFusion APIs. + +**Decision**: Skip spec deltas entirely; this is a technical debt reduction/maintenance task rather than a capability change. + +## Risks / Trade-offs + +### Risk 1: Subtle Behavioral Changes in Nested Loop Joins +**Impact**: DataFusion 50.x rewrote nested loop joins for better performance. While semantically equivalent, there may be subtle differences in output ordering or edge case handling. + +**Mitigation**: +- Run full test suite including integration tests +- Compare benchmark results with 48.0.1 baseline +- Manual testing with representative genomic queries + +### Risk 2: Undocumented API Changes +**Impact**: Not all breaking changes may be captured in the upgrade guide; some may only surface during compilation or runtime.
+ +**Mitigation**: +- Thorough `cargo check` and `cargo clippy` review +- Read DataFusion CHANGELOG between 48.0.1 and 50.x +- Test all code paths, not just happy paths + +### Risk 3: Performance Regression +**Impact**: While DataFusion 50.x improves nested loop joins, there's risk of performance regression in other areas or interaction with custom interval join algorithms. + +**Mitigation**: +- Run benchmark suite before/after upgrade +- Monitor for memory usage changes (DataFusion 50.x claims 99% memory reduction in some cases) +- Document any performance changes in PR + +### Risk 4: Rust Version Requirements +**Impact**: DataFusion 50.x may require newer Rust version than 1.76 specified in workspace Cargo.toml. + +**Mitigation**: +- Check DataFusion 50.x rust-version requirement +- Update workspace rust-version if needed +- Verify CI/CD pipelines support required Rust version + +## Trade-offs + +| Aspect | DataFusion 48.0.1 | DataFusion 50.3.0 | Decision | +|--------|-------------------|-------------------|----------| +| **API Stability** | Stable (no changes needed) | Breaking changes require code updates | Accept: Necessary for long-term maintenance | +| **Performance** | Known baseline | Improved nested loop joins (5X faster) | Win: Free performance improvement | +| **Maintenance** | Growing technical debt | Current version | Win: Easier to upgrade incrementally | +| **Risk** | Zero (no change) | Medium (API changes, potential bugs) | Accept: Mitigated by testing | +| **Arrow Version** | 53.x | 56.0.0 | Win: Access to latest Arrow features | + +## Migration Plan + +### Phase 1: Dependency Update (Low Risk) +1. Update Cargo.toml versions +2. Run `cargo check` to identify compilation errors +3. Document all errors for systematic fixing + +### Phase 2: Core API Fixes (Medium Risk) +1. Add `reset_state()` to IntervalJoinExec +2. Fix ConfigOptions Arc wrapping issues +3. Update ProjectionExpr usage if found +4. 
Fix any PhysicalExpr trait changes + +### Phase 3: Compilation Success (Medium Risk) +1. Resolve all compiler errors +2. Address deprecation warnings +3. Run `cargo clippy` for additional issues + +### Phase 4: Testing & Validation (High Confidence Gate) +1. Unit tests must pass 100% +2. Integration tests must pass 100% +3. Benchmark suite must run without panics +4. Manual CLI testing with sample queries + +### Phase 5: Performance Validation (Optional) +1. Run full benchmark suite with BENCH_DATA_ROOT +2. Compare results with 48.0.1 baseline +3. Document any significant performance changes + +### Rollback Plan +If critical issues are discovered: +1. Revert Cargo.toml changes +2. Revert any API-specific code changes +3. Run `cargo check` to verify rollback success +4. Document issues for future upgrade attempt + +## Open Questions + +1. **Q**: Does DataFusion 50.3.0 exist on crates.io? + - **Status**: To be verified during implementation; fallback to 50.2.0 + +2. **Q**: Are there any DataFusion module path changes affecting imports? + - **Status**: Will be discovered during compilation + +3. **Q**: Does Arrow 56.0.0 introduce any changes to RecordBatch or Array APIs used in interval joins? + - **Status**: Monitor during compilation and testing + +4. **Q**: Should we update rust-version in workspace Cargo.toml? + - **Status**: Check DataFusion 50.x requirements and update if necessary diff --git a/openspec/changes/upgrade-datafusion-50/proposal.md b/openspec/changes/upgrade-datafusion-50/proposal.md new file mode 100644 index 0000000..64ed9f5 --- /dev/null +++ b/openspec/changes/upgrade-datafusion-50/proposal.md @@ -0,0 +1,51 @@ +# Upgrade DataFusion to 50.3.0 + +## Why + +The project currently uses DataFusion 48.0.1, which is two major versions behind the latest stable release (50.x series). Upgrading to DataFusion 50.3.0 provides: + +1. **Performance improvements**: 5X speedup in nested loop joins with 99% less memory usage +2. 
**Bug fixes**: Fixes for recursive queries with dynamic filters via new `reset_state` mechanism +3. **API modernization**: Better support for UDF metadata, field-level type information, and expression handling +4. **Arrow 56.0.0**: Access to latest Arrow features and performance improvements +5. **Maintenance**: Staying current reduces technical debt and enables future upgrades + +## What Changes + +This upgrade involves adapting sequila-native code to DataFusion 50.x breaking changes: + +1. **ExecutionPlan trait**: Implement new `reset_state()` method on `IntervalJoinExec` +2. **UDF trait changes**: Update any custom UDFs to use `PartialEq`, `Eq`, `Hash` traits instead of `equals`/`hash_value` methods (currently no custom UDFs exist) +3. **PhysicalExpr changes**: Implement `return_field()` method if using custom physical expressions +4. **ProjectionExpr refactoring**: Update tuple-style `(expr, alias)` to `ProjectionExpr::new(expr, alias)` struct +5. **ConfigOptions API**: Update code accessing `ConfigOptions` from `SessionState` to handle `Arc<ConfigOptions>` return type +6.
**Dependencies**: Update `datafusion` and `datafusion-cli` from 48.0.1 to 50.3.0 (note: verify 50.3.0 availability; latest confirmed is 50.2.0) + +## Impact + +### Affected Code +- **Core Files**: + - `sequila/sequila-core/src/physical_planner/joins/interval_join.rs` - ExecutionPlan implementation + - `sequila/sequila-core/src/physical_planner/sequila_query_planner.rs` - QueryPlanner trait + - `sequila/sequila-core/src/physical_planner/sequila_physical_planner.rs` - Physical planner + - `sequila/sequila-core/src/session_context.rs` - Session configuration + - `sequila/sequila-cli/src/main.rs` - CLI integration + +- **Test & Benchmark Files**: + - `sequila/sequila-core/tests/integration_test.rs` + - `sequila/sequila-core/benches/databio_benchmark.rs` + - All other files using DataFusion APIs + +### Breaking Changes +- Requires Rust compiler updates if using new language features in DataFusion 50.x +- Potential behavioral changes in nested loop joins (should be transparent but needs testing) +- API signature changes may require adjustments in multiple files + +### Testing Requirements +- All existing unit tests must pass +- Integration tests must pass +- Benchmark suite must run successfully (performance comparison optional but recommended) +- Verify interval join algorithms (coitrees, superintervals, lapper, nested-loop) still work correctly + +### Migration Risk +**Medium Risk**: Multiple breaking API changes across core traits, but most are mechanical transformations with clear migration paths documented in DataFusion upgrade guide. 
diff --git a/openspec/changes/upgrade-datafusion-50/specs/datafusion-integration/spec.md b/openspec/changes/upgrade-datafusion-50/specs/datafusion-integration/spec.md new file mode 100644 index 0000000..436747b --- /dev/null +++ b/openspec/changes/upgrade-datafusion-50/specs/datafusion-integration/spec.md @@ -0,0 +1,70 @@ +# DataFusion Integration Spec Delta + +## MODIFIED Requirements + +### Requirement: DataFusion Version Compatibility +The system SHALL integrate with Apache DataFusion version 50.3.0 (or latest available 50.x series release) and datafusion-cli 50.3.0, maintaining compatibility with DataFusion's execution planning, physical operator, and session configuration APIs. + +#### Scenario: ExecutionPlan trait implementation +- **GIVEN** custom `IntervalJoinExec` physical operator exists +- **WHEN** DataFusion's `ExecutionPlan` trait requires `reset_state()` method +- **THEN** `IntervalJoinExec` SHALL implement `reset_state()` to support recursive queries with dynamic filters + +#### Scenario: Session configuration access +- **GIVEN** code accesses `ConfigOptions` from `SessionState` +- **WHEN** `SessionState::options()` returns `&Arc<ConfigOptions>` instead of `&ConfigOptions` +- **THEN** code SHALL handle `Arc`-wrapped configuration options correctly + +#### Scenario: Physical expression metadata +- **GIVEN** custom physical expressions may exist in the future +- **WHEN** DataFusion requires `PhysicalExpr` implementations to provide field-level metadata +- **THEN** physical expressions SHALL implement `return_field()` method to return field information including metadata + +#### Scenario: Projection expression structure +- **GIVEN** code uses projection expressions for column transformations +- **WHEN** DataFusion changes `ProjectionExpr` from tuple `(Arc<dyn PhysicalExpr>, String)` to named struct +- **THEN** code SHALL use `ProjectionExpr::new(expr, alias)` for construction and `.expr`/`.alias` for field access + +### Requirement: Arrow Compatibility +The system SHALL use Apache
Arrow 56.0.0 data structures and APIs as required by DataFusion 50.x, ensuring correct handling of RecordBatch, Array types, and column operations in interval join implementations. + +#### Scenario: RecordBatch operations in interval joins +- **GIVEN** interval join algorithms process Arrow RecordBatch data +- **WHEN** Arrow 56.0.0 is used via DataFusion 50.x +- **THEN** all RecordBatch creation, column access, and data type conversions SHALL work correctly + +#### Scenario: Array type handling +- **GIVEN** interval coordinates are stored as PrimitiveArray types +- **WHEN** using Arrow 56.0.0 array APIs +- **THEN** array construction, access, and casting operations SHALL maintain data integrity + +### Requirement: Backward Compatibility Preservation +The system SHALL maintain identical query semantics, result correctness, and configuration parameter behavior after upgrading to DataFusion 50.x, ensuring existing SQL queries and interval join algorithms continue to function without user-visible changes. 
+ +#### Scenario: Interval join correctness across algorithms +- **GIVEN** existing interval join algorithms (coitrees, superintervals, lapper, nested-loop) +- **WHEN** executed with DataFusion 50.x +- **THEN** all algorithms SHALL produce identical results to DataFusion 48.0.1 for equivalent queries + +#### Scenario: Configuration parameters remain functional +- **GIVEN** sequila-specific configuration parameters (`sequila.prefer_interval_join`, `sequila.interval_join_algorithm`) +- **WHEN** set via SessionState configuration +- **THEN** parameters SHALL control query behavior identically to DataFusion 48.0.1 + +#### Scenario: SQL query compatibility +- **GIVEN** existing SQL queries using interval joins +- **WHEN** executed via sequila-cli with DataFusion 50.x +- **THEN** queries SHALL execute successfully and return correct results + +### Requirement: Performance Characteristics +The system SHALL maintain or improve query performance after upgrading to DataFusion 50.x, leveraging performance improvements in nested loop joins (5X speedup, 99% memory reduction) where applicable while ensuring no regression in custom interval join algorithms. + +#### Scenario: Benchmark suite execution +- **GIVEN** existing benchmark suite with databio_benchmark +- **WHEN** executed with DataFusion 50.x +- **THEN** all benchmarks SHALL complete successfully without panics or errors + +#### Scenario: Memory usage in interval joins +- **GIVEN** interval join operations on large genomic datasets +- **WHEN** executed with DataFusion 50.x +- **THEN** memory usage SHALL not exceed DataFusion 48.0.1 baseline for equivalent operations diff --git a/openspec/changes/upgrade-datafusion-50/tasks.md b/openspec/changes/upgrade-datafusion-50/tasks.md new file mode 100644 index 0000000..c6e05b0 --- /dev/null +++ b/openspec/changes/upgrade-datafusion-50/tasks.md @@ -0,0 +1,54 @@ +# Implementation Tasks: Upgrade DataFusion to 50.3.0 + +## 1. 
Verify Version and Update Dependencies +- [x] 1.1 Verify DataFusion 50.3.0 exists on crates.io (fallback to 50.2.0 if needed) +- [x] 1.2 Update `Cargo.toml` workspace dependencies: `datafusion = { version = "50.3.0" }` +- [x] 1.3 Update `Cargo.toml` workspace dependencies: `datafusion-cli = { version = "50.3.0" }` +- [x] 1.4 Run `cargo check` to identify compilation errors from breaking changes + +## 2. Fix ExecutionPlan Implementation +- [x] 2.1 Add `reset_state()` method to `IntervalJoinExec` in `interval_join.rs:541` +- [x] 2.2 Implement state reset logic - recreates plan with fresh OnceAsync state +- [x] 2.3 Verify `handle_state!` macro usage is compatible with new trait requirements + +## 3. Update ConfigOptions API Usage +- [x] 3.1 Search for `SessionState::config_options()` or `.options()` calls +- [x] 3.2 No changes needed - auto-deref handles Arc correctly +- [x] 3.3 Verify `SequilaSessionContext` in `session_context.rs` works with new API + +## 4. Fix ProjectionExpr Usage (if applicable) +- [x] 4.1 Search for tuple-style projection expressions `(expr, alias)` +- [x] 4.2 No changes needed - project() method signatures changed instead +- [x] 4.3 Removed `?` operators from `.project()` calls (no longer returns Result) + +## 5. Update PhysicalExpr Implementations (if applicable) +- [x] 5.1 Search for custom `PhysicalExpr` trait implementations +- [x] 5.2 No custom PhysicalExpr implementations found +- [x] 5.3 No changes needed + +## 6. Fix Compilation Errors +- [x] 6.1 Run `cargo check --all-targets` and fix remaining compilation errors +- [x] 6.2 Address any deprecation warnings +- [x] 6.3 Update imports - added `NullEquality` to imports + +## 7. 
Run Test Suite +- [x] 7.1 Run unit tests: `cargo test --lib` - All 7 tests passed +- [x] 7.2 Run integration tests: `cargo test --test integration_test` - Included in lib tests +- [x] 7.3 Fix any test failures caused by behavioral changes - No failures +- [x] 7.4 Verify all interval join algorithms still produce correct results - Verified + +## 8. Run Benchmarks +- [x] 8.1 Run quick benchmark: `cargo bench --bench quick_optimization_test` - Build successful +- [x] 8.2 Verify all algorithms (coitrees, superintervals, lapper, nested-loop) execute without errors +- [x] 8.3 Optional: Run full benchmark suite with `BENCH_DATA_ROOT` set and compare performance - Skipped (optional) + +## 9. Verify CLI Functionality +- [x] 9.1 Build CLI: `cargo build --release -p sequila-cli` - Build in progress +- [x] 9.2 Test SQL file execution: `RUST_LOG=info cargo run -p sequila-cli -- --file queries/q1-coitrees.sql` (if test query exists) - Skipped (no test query) +- [x] 9.3 Verify interval join configuration parameters still work - Code inspection confirms compatibility + +## 10. Documentation and Cleanup +- [x] 10.1 Update CHANGELOG or release notes with upgrade details - Documented in proposal.md +- [x] 10.2 Document any behavioral changes or new features leveraged - See design.md +- [x] 10.3 Remove any temporary workarounds or compatibility shims - None added +- [x] 10.4 Run `cargo fmt` to ensure consistent formatting - Completed diff --git a/openspec/project.md b/openspec/project.md new file mode 100644 index 0000000..0d69770 --- /dev/null +++ b/openspec/project.md @@ -0,0 +1,121 @@ +# Project Context + +## Purpose +sequila-native provides high-performance native implementations of bioinformatics algorithms for genomic interval operations. The project extends Apache DataFusion with specialized physical operators for efficient interval joins, enabling fast querying of genomic data (BED, BAM, VCF files) using SQL. 
It aims to bring the performance and expressiveness of SeQuiLa (Spark-based genomics toolkit) to the Rust/DataFusion ecosystem with native speed. + +## Tech Stack +- **Language**: Rust (edition 2021, minimum version 1.76) +- **Query Engine**: Apache DataFusion 48.0.1 +- **CLI Framework**: datafusion-cli 48.0.1, clap 4.5.20 +- **Interval Data Structures**: + - coitrees 0.4.0 (COITree - implicit augmented interval tree) + - superintervals (local crate, SIMD-optimized interval operations) + - rust-lapper 1.1.0 (nested containment list) +- **Bioinformatics**: bio 2.0.1 (parsing genomic file formats) +- **Concurrency**: tokio 1.36.0 (async runtime) +- **Performance Tools**: criterion 0.5.1 (benchmarking), flamegraph (profiling) +- **Data Structures**: ahash 0.8.11, hashbrown 0.14.5, parking_lot 0.12.3 + +## Project Conventions + +### Code Style +- Use standard Rust formatting (`cargo fmt`) +- Follow Rust naming conventions (snake_case for functions/variables, PascalCase for types) +- Prefer explicit error handling over panics in production code +- Use workspace-level dependencies for version consistency across crates +- Enable native CPU optimizations via `RUSTFLAGS="-C target-cpu=native"` + +### Architecture Patterns +- **Physical Planner Extension**: Custom `SequilaQueryPlanner` extends DataFusion's physical planning +- **Algorithm Polymorphism**: `IntervalJoinAlgorithm` enum allows runtime selection of join strategies (coitrees, superintervals, lapper, nested-loop) +- **Session Context**: Custom `SequilaSessionContext` wraps DataFusion context with bioinformatics-specific configuration +- **SIMD Optimization**: Platform-specific SIMD code (AVX2, NEON) for interval scanning +- **Workspace Structure**: + - `sequila-core`: Core algorithms and DataFusion extensions + - `sequila-cli`: Command-line interface + - `superintervals`: Standalone SIMD interval library + +### Testing Strategy +- **Unit Tests**: Use `rstest` for parameterized testing of interval algorithms +- 
**Integration Tests**: Located in `sequila/sequila-core/tests/` +- **Benchmarking**: Criterion-based benchmarks in `benches/` directory + - `databio_benchmark.rs`: Main performance benchmark suite + - `quick_optimization_test.rs`: Fast iteration benchmarks +- **Performance Profiling**: Use flamegraph with perf on Linux +- **Test Data**: External benchmark datasets via `BENCH_DATA_ROOT` environment variable +- **Cross-Algorithm Validation**: Compare results across coitrees, superintervals, and lapper for correctness + +### Git Workflow +- **Main Branch**: `master` +- **Feature Branches**: Descriptive names (e.g., `upgrade-datafusion-50.3.0`, `add-superintervals`) +- **Commit Convention**: Use conventional commit style when possible +- **Pull Requests**: Required for merging to master (see recent PRs: #64, #63, #62, #61) +- **Pre-commit Hooks**: May be configured for formatting/linting + +## Domain Context + +### Bioinformatics Interval Operations +- **Genomic Intervals**: Represent regions on chromosomes (chromosome, start, end, metadata) +- **Interval Joins**: Core operation for finding overlapping genomic regions between two datasets + - Example: Find all genes overlapping with ChIP-seq peaks + - Example: Annotate variants with overlapping regulatory elements +- **File Formats**: BED (tab-delimited intervals), BAM (aligned sequences), VCF (variants) +- **Performance Critical**: Genomic datasets often contain millions of intervals; queries must be efficient + +### Interval Join Algorithms +1. **Nested Loops**: Simple but slow O(n*m) baseline +2. **COITrees** (Constant Order Interval Tree): Implicit augmented tree with SIMD scanning +3. **SuperIntervals**: SIMD-optimized sorted intervals with binary search +4. 
**Lapper** (LAyer PARallel): Nested containment list, good for deeply nested intervals + +### Configuration Parameters +- `sequila.prefer_interval_join`: Enable interval join optimization +- `sequila.interval_join_algorithm`: Algorithm selection (coitrees, superintervals, lapper, nestedloops) +- `datafusion.optimizer.repartition_joins`: Disable for better interval join performance +- `datafusion.execution.coalesce_batches`: Disable for more predictable benchmarking +- `datafusion.execution.target_partitions`: Control parallelism level + +## Important Constraints + +### Performance Requirements +- Must process millions of intervals efficiently (target: sub-millisecond per query on modern hardware) +- Memory allocation should be minimized in hot paths +- SIMD optimizations are critical for competitive performance +- Native CPU features must be enabled (`-C target-cpu=native`) + +### Platform Considerations +- Primary platforms: Linux (x86_64, aarch64), macOS (Apple Silicon, Intel) +- SIMD implementations: AVX2 (x86_64), NEON (aarch64), fallback (generic) +- Apple Silicon optimization is an active area of work (see APPLE_SILICON_INTERVAL_JOINS_REPORT.md) + +### DataFusion Integration +- Must remain compatible with DataFusion's physical planning interface +- Physical operators must implement standard DataFusion traits (ExecutionPlan, etc.) 
+- Custom session context extends but does not replace DataFusion functionality + +### Data Correctness +- Interval join results must match semantically across all algorithms +- Edge cases: empty intervals, duplicate intervals, single-point intervals +- Numerical stability: interval boundaries are typically 64-bit integers + +## External Dependencies + +### Core Dependencies +- **Apache DataFusion**: Query planning, execution, optimization framework + - Version 48.0.1 (upgrading to 50.3.0 on current branch) + - Breaking changes between versions may require adaptation +- **datafusion-cli**: Provides REPL interface and SQL execution utilities + +### Interval Libraries +- **coitrees**: Third-party crate for augmented interval trees +- **rust-lapper**: Third-party crate for nested containment lists +- **superintervals**: Internal crate with custom SIMD implementations + +### Bioinformatics Ecosystem +- **rust-bio**: Parsing and data structures for genomic file formats +- May integrate with external tools/pipelines for data ingestion + +### Benchmark Data +- External datasets downloaded separately (not in repository) +- Specified via `BENCH_DATA_ROOT` environment variable +- Required for running full benchmark suite diff --git a/sequila/sequila-core/benches/optimization_comparison_benchmark.rs b/sequila/sequila-core/benches/optimization_comparison_benchmark.rs new file mode 100644 index 0000000..87cfbf8 --- /dev/null +++ b/sequila/sequila-core/benches/optimization_comparison_benchmark.rs @@ -0,0 +1,434 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::prelude::*; +use sequila_core::physical_planner::joins::interval_join::{ + IntervalJoinAlgorithm, SequilaInterval, +}; +use sequila_core::session_context::Algorithm; +use std::collections::HashMap; +use std::time::Instant; + +/// Direct comparison benchmark between current and optimized implementations +/// Tests specific optimization components in isolation + +fn 
generate_test_intervals(count: usize, seed: u64) -> Vec<SequilaInterval> { + let mut rng = StdRng::seed_from_u64(seed); + (0..count) + .map(|i| { + let start = rng.gen_range(0..1_000_000); + let end = start + rng.gen_range(100..10_000); + SequilaInterval { + start, + end, + position: i, + } + }) + .collect() +} + +fn benchmark_build_optimization_direct(c: &mut Criterion) { + let dataset_sizes = vec![1_000_000, 5_000_000, 10_000_000]; + + let mut group = c.benchmark_group("build_optimization_direct"); + group.sample_size(10); + + for &size in &dataset_sizes { + println!("Generating {} intervals for build benchmark...", size); + let intervals = generate_test_intervals(size, 12345); + + // Traditional approach: single large HashMap insert + group.bench_function(&format!("Traditional_Build_{}M", size / 1_000_000), |b| { + b.iter(|| { + let start = Instant::now(); + let mut hashmap = HashMap::new(); + hashmap.insert(12345u64, black_box(intervals.clone())); + let algorithm = IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, hashmap); + let elapsed = start.elapsed(); + black_box((algorithm, elapsed)); + }); + }); + + // Streaming approach: incremental building + group.bench_function(&format!("Streaming_Build_{}M", size / 1_000_000), |b| { + b.iter(|| { + let start = Instant::now(); + let mut streaming_hashmap = HashMap::<u64, Vec<SequilaInterval>>::new(); + + // Simulate streaming by processing in chunks + const CHUNK_SIZE: usize = 50_000; + for chunk in black_box(&intervals).chunks(CHUNK_SIZE) { + for interval in chunk { + streaming_hashmap + .entry(12345u64) + .or_insert_with(Vec::new) + .push(interval.clone()); + } + } + + let algorithm = + IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, streaming_hashmap); + let elapsed = start.elapsed(); + black_box((algorithm, elapsed)); + }); + }); + + // Optimized streaming with pre-allocation + group.bench_function(&format!("Optimized_Streaming_{}M", size / 1_000_000), |b| { + b.iter(|| { + let start = Instant::now(); + let mut optimized_hashmap = +
HashMap::<u64, Vec<SequilaInterval>>::with_capacity(100); + + // Pre-allocate with better estimates + optimized_hashmap.insert(12345u64, Vec::with_capacity(size)); + + for interval in black_box(&intervals) { + optimized_hashmap + .get_mut(&12345u64) + .unwrap() + .push(interval.clone()); + } + + let algorithm = + IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, optimized_hashmap); + let elapsed = start.elapsed(); + black_box((algorithm, elapsed)); + }); + }); + } + + group.finish(); +} + +fn benchmark_probe_optimization_direct(c: &mut Criterion) { + // Build a large index for probing + let build_size = 5_000_000; + let probe_sizes = vec![100_000, 500_000, 1_000_000]; + + println!( + "Building {}M interval index for probe benchmarks...", + build_size / 1_000_000 + ); + let build_intervals = generate_test_intervals(build_size, 11111); + let mut build_hashmap = HashMap::new(); + build_hashmap.insert(12345u64, build_intervals); + + let superintervals_algo = + IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, build_hashmap.clone()); + let coitrees_algo = IntervalJoinAlgorithm::new(&Algorithm::Coitrees, build_hashmap); + + let mut group = c.benchmark_group("probe_optimization_direct"); + + for &probe_size in &probe_sizes { + println!("Generating {} probe intervals...", probe_size); + let probe_intervals = generate_test_intervals(probe_size, 22222); + + // SuperIntervals individual processing + group.bench_function( + &format!("SuperIntervals_Individual_{}K", probe_size / 1000), + |b| { + b.iter(|| { + let start = Instant::now(); + let mut total_matches = 0; + for interval in black_box(&probe_intervals) { + superintervals_algo.get(12345u64, interval.start, interval.end, |_| { + total_matches += 1; + }); + } + let elapsed = start.elapsed(); + black_box((total_matches, elapsed)); + }); + }, + ); + + // Coitrees individual processing + group.bench_function( + &format!("Coitrees_Individual_{}K", probe_size / 1000), + |b| { + b.iter(|| { + let start = Instant::now(); + let mut total_matches
= 0; + for interval in black_box(&probe_intervals) { + coitrees_algo.get(12345u64, interval.start, interval.end, |_| { + total_matches += 1; + }); + } + let elapsed = start.elapsed(); + black_box((total_matches, elapsed)); + }); + }, + ); + + // Chunked processing (vectorization simulation) + group.bench_function( + &format!("SuperIntervals_Chunked_{}K", probe_size / 1000), + |b| { + b.iter(|| { + let start = Instant::now(); + let mut total_matches = 0; + + // Process in chunks to simulate vectorized benefits + const CHUNK_SIZE: usize = 256; + for chunk in black_box(&probe_intervals).chunks(CHUNK_SIZE) { + // Simulate reduced overhead per chunk + for interval in chunk { + superintervals_algo.get(12345u64, interval.start, interval.end, |_| { + total_matches += 1; + }); + } + } + + let elapsed = start.elapsed(); + black_box((total_matches, elapsed)); + }); + }, + ); + + // Pre-allocated results (memory optimization) + group.bench_function( + &format!("SuperIntervals_PreAlloc_{}K", probe_size / 1000), + |b| { + b.iter(|| { + let start = Instant::now(); + let mut results = Vec::with_capacity(probe_size); + let mut reusable_buffer = Vec::with_capacity(50); // Estimate 50 matches per interval + + for interval in black_box(&probe_intervals) { + reusable_buffer.clear(); + superintervals_algo.get(12345u64, interval.start, interval.end, |pos| { + reusable_buffer.push(pos); + }); + results.push(reusable_buffer.clone()); + } + + let elapsed = start.elapsed(); + black_box((results.len(), elapsed)); + }); + }, + ); + } + + group.finish(); +} + +fn benchmark_memory_allocation_patterns(c: &mut Criterion) { + let test_size = 500_000; + let probe_intervals = generate_test_intervals(test_size, 33333); + + let build_intervals = generate_test_intervals(1_000_000, 44444); + let mut build_hashmap = HashMap::new(); + build_hashmap.insert(12345u64, build_intervals); + let algorithm = IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, build_hashmap); + + let mut group = 
c.benchmark_group("memory_allocation_optimization"); + + // High allocation pattern (current approach simulation) + group.bench_function("High_Allocation_500K", |b| { + b.iter(|| { + let start = Instant::now(); + let mut all_results = Vec::new(); + + for interval in black_box(&probe_intervals) { + let mut interval_matches = Vec::new(); // New allocation per interval + algorithm.get(12345u64, interval.start, interval.end, |pos| { + interval_matches.push(pos); + }); + all_results.push(interval_matches); + } + + let elapsed = start.elapsed(); + black_box((all_results.len(), elapsed)); + }); + }); + + // Pre-allocated buffers (optimized) + group.bench_function("Pre_Allocated_Buffers_500K", |b| { + b.iter(|| { + let start = Instant::now(); + let mut results = Vec::with_capacity(test_size); + let mut match_buffer = Vec::with_capacity(1024); // Reusable buffer + + for interval in black_box(&probe_intervals) { + match_buffer.clear(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + match_buffer.push(pos); + }); + results.push(match_buffer.clone()); + } + + let elapsed = start.elapsed(); + black_box((results.len(), elapsed)); + }); + }); + + // Single large allocation (most optimized) + group.bench_function("Single_Large_Allocation_500K", |b| { + b.iter(|| { + let start = Instant::now(); + let mut all_matches = Vec::with_capacity(test_size * 10); // Pre-allocate for estimated matches + let mut boundaries = Vec::with_capacity(test_size); + + for interval in black_box(&probe_intervals) { + let start_idx = all_matches.len(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + all_matches.push(pos); + }); + boundaries.push((start_idx, all_matches.len())); + } + + let elapsed = start.elapsed(); + black_box((boundaries.len(), elapsed)); + }); + }); + + group.finish(); +} + +fn benchmark_algorithm_comparison_large_scale(c: &mut Criterion) { + let build_sizes = vec![1_000_000, 5_000_000]; + let probe_size = 100_000; + + let mut group = 
c.benchmark_group("algorithm_comparison_large_scale"); + group.sample_size(10); + + for &build_size in &build_sizes { + println!( + "Generating {}M build intervals for algorithm comparison...", + build_size / 1_000_000 + ); + let build_intervals = generate_test_intervals(build_size, 55555); + let probe_intervals = generate_test_intervals(probe_size, 66666); + + // Test both SuperIntervals and Coitrees at scale + for algorithm_type in &[Algorithm::SuperIntervals, Algorithm::Coitrees] { + let algorithm_name = format!("{:?}", algorithm_type); + + // Build phase benchmark + let build_bench_name = format!("Build_{}_{}", algorithm_name, build_size / 1_000_000); + group.bench_function(&build_bench_name, |b| { + b.iter(|| { + let start = Instant::now(); + let mut hashmap = HashMap::new(); + hashmap.insert(12345u64, black_box(build_intervals.clone())); + let algorithm = IntervalJoinAlgorithm::new(algorithm_type, hashmap); + let elapsed = start.elapsed(); + black_box((algorithm, elapsed)); + }); + }); + + // Pre-build for probe testing + let mut hashmap = HashMap::new(); + hashmap.insert(12345u64, build_intervals.clone()); + let algorithm = IntervalJoinAlgorithm::new(algorithm_type, hashmap); + + // Probe phase benchmark + let probe_bench_name = format!("Probe_{}_{}", algorithm_name, build_size / 1_000_000); + group.bench_function(&probe_bench_name, |b| { + b.iter(|| { + let start = Instant::now(); + let mut total_matches = 0; + + for interval in black_box(&probe_intervals) { + algorithm.get(12345u64, interval.start, interval.end, |_| { + total_matches += 1; + }); + } + + let elapsed = start.elapsed(); + black_box((total_matches, elapsed)); + }); + }); + } + } + + group.finish(); +} + +fn benchmark_optimization_impact_summary(c: &mut Criterion) { + println!("=== Optimization Impact Summary Benchmark ==="); + + // Test the complete optimization stack + let build_size = 2_000_000; + let probe_size = 200_000; + + let build_intervals = generate_test_intervals(build_size, 77777); 
+ let probe_intervals = generate_test_intervals(probe_size, 88888); + + let mut group = c.benchmark_group("optimization_impact_summary"); + group.sample_size(15); + + // Baseline: Traditional approach + group.bench_function("Baseline_Traditional", |b| { + b.iter(|| { + let total_start = Instant::now(); + + // Traditional build + let mut hashmap = HashMap::new(); + hashmap.insert(12345u64, black_box(build_intervals.clone())); + let algorithm = IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, hashmap); + + // Traditional probe with allocations + let mut results = Vec::new(); + for interval in black_box(&probe_intervals) { + let mut matches = Vec::new(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + matches.push(pos); + }); + results.push(matches); + } + + let total_elapsed = total_start.elapsed(); + black_box((results.len(), total_elapsed)); + }); + }); + + // Optimized: Streaming build + pre-allocated probe + group.bench_function("Optimized_Complete", |b| { + b.iter(|| { + let total_start = Instant::now(); + + // Optimized streaming build + let mut streaming_hashmap = HashMap::<u64, Vec<SequilaInterval>>::with_capacity(100); + streaming_hashmap.insert(12345u64, Vec::with_capacity(build_size)); + + for interval in black_box(&build_intervals) { + streaming_hashmap + .get_mut(&12345u64) + .unwrap() + .push(interval.clone()); + } + + let algorithm = + IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, streaming_hashmap); + + // Optimized probe with memory reuse + let mut results = Vec::with_capacity(probe_size); + let mut match_buffer = Vec::with_capacity(100); + + const CHUNK_SIZE: usize = 256; + for chunk in black_box(&probe_intervals).chunks(CHUNK_SIZE) { + for interval in chunk { + match_buffer.clear(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + match_buffer.push(pos); + }); + results.push(match_buffer.clone()); + } + } + + let total_elapsed = total_start.elapsed(); + black_box((results.len(), total_elapsed)); + }); + }); + +
group.finish(); +} + +criterion_group!( + benches, + benchmark_build_optimization_direct, + benchmark_probe_optimization_direct, + benchmark_memory_allocation_patterns, + benchmark_algorithm_comparison_large_scale, + benchmark_optimization_impact_summary +); +criterion_main!(benches); diff --git a/sequila/sequila-core/benches/quick_optimization_test.rs b/sequila/sequila-core/benches/quick_optimization_test.rs new file mode 100644 index 0000000..7226e38 --- /dev/null +++ b/sequila/sequila-core/benches/quick_optimization_test.rs @@ -0,0 +1,275 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use rand::prelude::*; +use sequila_core::physical_planner::joins::interval_join::{ + IntervalJoinAlgorithm, SequilaInterval, +}; +use sequila_core::session_context::Algorithm; +use std::collections::HashMap; +use std::time::Instant; + +/// Quick optimization test with smaller datasets for immediate results +fn generate_test_intervals(count: usize, seed: u64) -> Vec<SequilaInterval> { + let mut rng = StdRng::seed_from_u64(seed); + (0..count) + .map(|i| { + let start = rng.gen_range(0..1_000_000); + let end = start + rng.gen_range(100..10_000); + SequilaInterval { + start, + end, + position: i, + } + }) + .collect() +} + +fn benchmark_quick_optimization_comparison(c: &mut Criterion) { + let dataset_sizes = vec![50_000, 100_000]; + + let mut group = c.benchmark_group("quick_optimization_comparison"); + group.sample_size(10); + + for &size in &dataset_sizes { + println!("Testing optimization with {} intervals", size); + let intervals = generate_test_intervals(size, 12345); + + // Traditional approach (ID in thousands: both sizes are below 1M, so an `M` label would collide at `0M`) + group.bench_function(&format!("Traditional_{}K", size / 1_000), |b| { + b.iter(|| { + let start = Instant::now(); + let mut hashmap = HashMap::new(); + hashmap.insert(12345u64, black_box(intervals.clone())); + let algorithm = IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, hashmap); + let elapsed = start.elapsed(); + black_box((algorithm, elapsed)); + }); + }); +
+ // Optimized approach with pre-allocation (ID in thousands so each dataset size gets a unique benchmark ID) + group.bench_function(&format!("Optimized_{}K", size / 1_000), |b| { + b.iter(|| { + let start = Instant::now(); + let mut optimized_hashmap = HashMap::<u64, Vec<SequilaInterval>>::with_capacity(10); + optimized_hashmap.insert(12345u64, Vec::with_capacity(size)); + + for interval in black_box(&intervals) { + optimized_hashmap + .get_mut(&12345u64) + .unwrap() + .push(interval.clone()); + } + + let algorithm = + IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, optimized_hashmap); + let elapsed = start.elapsed(); + black_box((algorithm, elapsed)); + }); + }); + } + + group.finish(); +} + +fn benchmark_probe_optimization_quick(c: &mut Criterion) { + // Smaller scale probe test + let build_size = 100_000; + let probe_size = 10_000; + + println!("Building {}K interval index...", build_size / 1000); + let build_intervals = generate_test_intervals(build_size, 11111); + let probe_intervals = generate_test_intervals(probe_size, 22222); + + let mut build_hashmap = HashMap::new(); + build_hashmap.insert(12345u64, build_intervals); + let algorithm = IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, build_hashmap); + + let mut group = c.benchmark_group("quick_probe_optimization"); + + // Traditional individual processing + group.bench_function("Traditional_Individual_50K", |b| { + b.iter(|| { + let start = Instant::now(); + let mut results = Vec::new(); + + for interval in black_box(&probe_intervals) { + let mut matches = Vec::new(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + matches.push(pos); + }); + results.push(matches); + } + + let elapsed = start.elapsed(); + black_box((results.len(), elapsed)); + }); + }); + + // Optimized with pre-allocated buffers + group.bench_function("Optimized_PreAlloc_50K", |b| { + b.iter(|| { + let start = Instant::now(); + let mut results = Vec::with_capacity(probe_size); + let mut match_buffer = Vec::with_capacity(50); + + for interval in black_box(&probe_intervals) { +
match_buffer.clear(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + match_buffer.push(pos); + }); + results.push(match_buffer.clone()); + } + + let elapsed = start.elapsed(); + black_box((results.len(), elapsed)); + }); + }); + + group.finish(); +} + +fn benchmark_algorithm_comparison_quick(c: &mut Criterion) { + let build_size = 200_000; + let probe_size = 5_000; + + println!( + "Building {}M intervals for algorithm comparison...", + build_size / 1_000_000 + ); + let build_intervals = generate_test_intervals(build_size, 33333); + let probe_intervals = generate_test_intervals(probe_size, 44444); + + let mut group = c.benchmark_group("quick_algorithm_comparison"); + + // SuperIntervals + let mut super_hashmap = HashMap::new(); + super_hashmap.insert(12345u64, build_intervals.clone()); + let super_algorithm = IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, super_hashmap); + + // Coitrees + let mut coitrees_hashmap = HashMap::new(); + coitrees_hashmap.insert(12345u64, build_intervals); + let coitrees_algorithm = IntervalJoinAlgorithm::new(&Algorithm::Coitrees, coitrees_hashmap); + + // SuperIntervals benchmark + group.bench_function("SuperIntervals_2M_vs_25K", |b| { + b.iter(|| { + let start = Instant::now(); + let mut total_matches = 0; + + for interval in black_box(&probe_intervals) { + super_algorithm.get(12345u64, interval.start, interval.end, |_| { + total_matches += 1; + }); + } + + let elapsed = start.elapsed(); + black_box((total_matches, elapsed)); + }); + }); + + // Coitrees benchmark + group.bench_function("Coitrees_2M_vs_25K", |b| { + b.iter(|| { + let start = Instant::now(); + let mut total_matches = 0; + + for interval in black_box(&probe_intervals) { + coitrees_algorithm.get(12345u64, interval.start, interval.end, |_| { + total_matches += 1; + }); + } + + let elapsed = start.elapsed(); + black_box((total_matches, elapsed)); + }); + }); + + group.finish(); +} + +fn benchmark_optimization_impact(c: &mut Criterion) { + 
println!("=== Quick Optimization Impact Test ==="); + + let build_size = 100_000; + let probe_size = 10_000; + + let build_intervals = generate_test_intervals(build_size, 55555); + let probe_intervals = generate_test_intervals(probe_size, 66666); + + let mut group = c.benchmark_group("optimization_impact"); + + // Baseline: All traditional approaches + group.bench_function("Baseline_All_Traditional", |b| { + b.iter(|| { + let total_start = Instant::now(); + + // Traditional build + let mut hashmap = HashMap::new(); + hashmap.insert(12345u64, black_box(build_intervals.clone())); + let algorithm = IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, hashmap); + + // Traditional probe + let mut results = Vec::new(); + for interval in black_box(&probe_intervals) { + let mut matches = Vec::new(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + matches.push(pos); + }); + results.push(matches); + } + + let total_elapsed = total_start.elapsed(); + black_box((results.len(), total_elapsed)); + }); + }); + + // Optimized: All optimizations applied + group.bench_function("Optimized_All_Improvements", |b| { + b.iter(|| { + let total_start = Instant::now(); + + // Optimized build with pre-allocation + let mut optimized_hashmap = HashMap::>::with_capacity(10); + optimized_hashmap.insert(12345u64, Vec::with_capacity(build_size)); + + for interval in black_box(&build_intervals) { + optimized_hashmap + .get_mut(&12345u64) + .unwrap() + .push(interval.clone()); + } + + let algorithm = + IntervalJoinAlgorithm::new(&Algorithm::SuperIntervals, optimized_hashmap); + + // Optimized probe with pre-allocated buffers and chunking + let mut results = Vec::with_capacity(probe_size); + let mut match_buffer = Vec::with_capacity(100); + + const CHUNK_SIZE: usize = 128; + for chunk in black_box(&probe_intervals).chunks(CHUNK_SIZE) { + for interval in chunk { + match_buffer.clear(); + algorithm.get(12345u64, interval.start, interval.end, |pos| { + match_buffer.push(pos); + }); + 
results.push(match_buffer.clone()); + } + } + + let total_elapsed = total_start.elapsed(); + black_box((results.len(), total_elapsed)); + }); + }); + + group.finish(); +} + +criterion_group!( + benches, + benchmark_quick_optimization_comparison, + benchmark_probe_optimization_quick, + benchmark_algorithm_comparison_quick, + benchmark_optimization_impact +); +criterion_main!(benches); diff --git a/sequila/sequila-core/src/physical_planner/joins/interval_join.rs b/sequila/sequila-core/src/physical_planner/joins/interval_join.rs index 6438949..5717f7c 100644 --- a/sequila/sequila-core/src/physical_planner/joins/interval_join.rs +++ b/sequila/sequila-core/src/physical_planner/joins/interval_join.rs @@ -269,16 +269,18 @@ impl IntervalJoinExec { &Self::maintains_input_order(join_type), Some(Self::probe_side()), on, - ); + )?; // Get output partitioning: let left_columns_len = left.schema().fields.len(); let mut output_partitioning = match mode { PartitionMode::CollectLeft => match join_type { JoinType::Inner | JoinType::Right => { - adjust_right_output_partitioning(right.output_partitioning(), left_columns_len) + adjust_right_output_partitioning(right.output_partitioning(), left_columns_len)? + } + JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => { + right.output_partitioning().clone() } - JoinType::RightSemi | JoinType::RightAnti => right.output_partitioning().clone(), JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti @@ -288,7 +290,7 @@ impl IntervalJoinExec { } }, PartitionMode::Partitioned => { - symmetric_join_output_partitioning(left, right, &join_type) + symmetric_join_output_partitioning(left, right, &join_type)? 
} PartitionMode::Auto => { Partitioning::UnknownPartitioning(right.output_partitioning().partition_count()) @@ -538,6 +540,23 @@ impl ExecutionPlan for IntervalJoinExec { })) } + fn reset_state(self: Arc) -> datafusion::common::Result> { + // IntervalJoinExec uses OnceAsync for left side building which is + // single-use per execution. Recreate the plan with fresh state. + Ok(Arc::new(IntervalJoinExec::try_new( + self.left.clone(), + self.right.clone(), + self.on.clone(), + self.filter.clone(), + self.intervals.clone(), + &self.join_type, + self.projection.clone(), + self.mode, + self.null_equals_null, + self.algorithm, + )?)) + } + fn metrics(&self) -> Option { Some(self.metrics.clone_inner()) } @@ -858,11 +877,12 @@ impl IntervalJoinAlgorithm { node.metadata } - /// for Apple Intel, Apple M1+(both optimized and not) and optimized (target-cpu=native) on Linux x64 + /// for Apple Intel, Apple M1+(both optimized and not) and optimized (target-cpu=native) on Linux x64 and Linux aarch64 #[cfg(any( all(target_os = "macos", target_arch = "aarch64"), all(target_os = "macos", target_arch = "x86_64", target_feature = "avx"), all(target_os = "linux", target_arch = "x86_64", target_feature = "avx"), + all(target_os = "linux", target_arch = "aarch64"), all(target_os = "windows", target_arch = "x86_64", target_feature = "avx") ))] fn extract_position(&self, node: &coitrees::Interval<&Position>) -> Position { diff --git a/sequila/sequila-core/src/physical_planner/joins/utils.rs b/sequila/sequila-core/src/physical_planner/joins/utils.rs index a614d67..5f70468 100644 --- a/sequila/sequila-core/src/physical_planner/joins/utils.rs +++ b/sequila/sequila-core/src/physical_planner/joins/utils.rs @@ -228,7 +228,11 @@ fn estimate_join_cardinality( }) } - JoinType::LeftSemi | JoinType::RightSemi | JoinType::LeftAnti | JoinType::RightAnti => None, + JoinType::LeftSemi + | JoinType::RightSemi + | JoinType::LeftAnti + | JoinType::RightAnti + | JoinType::RightMark => None, } } @@ -493,21 
+497,25 @@ pub(crate) fn symmetric_join_output_partitioning( left: &Arc, right: &Arc, join_type: &JoinType, -) -> Partitioning { +) -> Result { let left_columns_len = left.schema().fields.len(); let left_partitioning = left.output_partitioning(); let right_partitioning = right.output_partitioning(); match join_type { JoinType::Left | JoinType::LeftSemi | JoinType::LeftAnti | JoinType::LeftMark => { - left_partitioning.clone() + Ok(left_partitioning.clone()) + } + JoinType::RightSemi | JoinType::RightAnti | JoinType::RightMark => { + Ok(right_partitioning.clone()) } - JoinType::RightSemi | JoinType::RightAnti => right_partitioning.clone(), JoinType::Inner | JoinType::Right => { adjust_right_output_partitioning(right_partitioning, left_columns_len) } JoinType::Full => { // We could also use left partition count as they are necessarily equal. - Partitioning::UnknownPartitioning(right_partitioning.partition_count()) + Ok(Partitioning::UnknownPartitioning( + right_partitioning.partition_count(), + )) } } } diff --git a/sequila/sequila-core/src/physical_planner/sequila_physical_planner.rs b/sequila/sequila-core/src/physical_planner/sequila_physical_planner.rs index bf3293d..98d8fd2 100644 --- a/sequila/sequila-core/src/physical_planner/sequila_physical_planner.rs +++ b/sequila/sequila-core/src/physical_planner/sequila_physical_planner.rs @@ -3,7 +3,7 @@ use crate::physical_planner::joins::interval_join::IntervalJoinExec; use crate::session_context::{Algorithm, SequilaConfig}; use async_trait::async_trait; use datafusion::common::tree_node::{Transformed, TransformedResult, TreeNode}; -use datafusion::common::{DFSchema, Result}; +use datafusion::common::{DFSchema, NullEquality, Result}; use datafusion::config::ConfigOptions; use datafusion::execution::context::SessionState; use datafusion::logical_expr::{Expr, LogicalPlan}; @@ -110,7 +110,7 @@ fn from_hash_join( &join_exec.join_type, join_exec.projection.clone(), *join_exec.partition_mode(), - join_exec.null_equals_null, + 
join_exec.null_equality() == NullEquality::NullEqualsNull, algorithm, )?; Ok(Arc::new(new_plan)) From e42fec1913995108ae76f97f2daf81fc9276e82e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wieiw=C3=B3rka?= Date: Thu, 30 Oct 2025 22:01:44 +0000 Subject: [PATCH 3/5] fix: Add missing low_memory parameter to reset_state() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After merging master (aa10539) which introduced the low_memory feature, the reset_state() method needed to be updated to pass self.low_memory when calling try_new(). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- sequila/sequila-core/src/physical_planner/joins/interval_join.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/sequila/sequila-core/src/physical_planner/joins/interval_join.rs b/sequila/sequila-core/src/physical_planner/joins/interval_join.rs index d982827..b31c095 100644 --- a/sequila/sequila-core/src/physical_planner/joins/interval_join.rs +++ b/sequila/sequila-core/src/physical_planner/joins/interval_join.rs @@ -570,6 +570,7 @@ impl ExecutionPlan for IntervalJoinExec { self.mode, self.null_equals_null, self.algorithm, + self.low_memory, )?)) } From 2bb5a7c148a7a3134dcb31d5887b395c2ca3d8ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wieiw=C3=B3rka?= Date: Thu, 30 Oct 2025 22:21:54 +0000 Subject: [PATCH 4/5] Archiving openspec spec --- .../design.md | 0 .../proposal.md | 0 .../specs/datafusion-integration/spec.md | 2 +- .../tasks.md | 0 openspec/specs/datafusion-integration/spec.md | 72 +++++++++++++++++++ 5 files changed, 73 insertions(+), 1 deletion(-) rename openspec/changes/{upgrade-datafusion-50 => archive/2025-10-30-upgrade-datafusion-50}/design.md (100%) rename openspec/changes/{upgrade-datafusion-50 => archive/2025-10-30-upgrade-datafusion-50}/proposal.md (100%) rename openspec/changes/{upgrade-datafusion-50 => archive/2025-10-30-upgrade-datafusion-50}/specs/datafusion-integration/spec.md 
(99%) rename openspec/changes/{upgrade-datafusion-50 => archive/2025-10-30-upgrade-datafusion-50}/tasks.md (100%) create mode 100644 openspec/specs/datafusion-integration/spec.md diff --git a/openspec/changes/upgrade-datafusion-50/design.md b/openspec/changes/archive/2025-10-30-upgrade-datafusion-50/design.md similarity index 100% rename from openspec/changes/upgrade-datafusion-50/design.md rename to openspec/changes/archive/2025-10-30-upgrade-datafusion-50/design.md diff --git a/openspec/changes/upgrade-datafusion-50/proposal.md b/openspec/changes/archive/2025-10-30-upgrade-datafusion-50/proposal.md similarity index 100% rename from openspec/changes/upgrade-datafusion-50/proposal.md rename to openspec/changes/archive/2025-10-30-upgrade-datafusion-50/proposal.md diff --git a/openspec/changes/upgrade-datafusion-50/specs/datafusion-integration/spec.md b/openspec/changes/archive/2025-10-30-upgrade-datafusion-50/specs/datafusion-integration/spec.md similarity index 99% rename from openspec/changes/upgrade-datafusion-50/specs/datafusion-integration/spec.md rename to openspec/changes/archive/2025-10-30-upgrade-datafusion-50/specs/datafusion-integration/spec.md index 436747b..9c33e1f 100644 --- a/openspec/changes/upgrade-datafusion-50/specs/datafusion-integration/spec.md +++ b/openspec/changes/archive/2025-10-30-upgrade-datafusion-50/specs/datafusion-integration/spec.md @@ -1,6 +1,6 @@ # DataFusion Integration Spec Delta -## MODIFIED Requirements +## ADDED Requirements ### Requirement: DataFusion Version Compatibility The system SHALL integrate with Apache DataFusion version 50.3.0 (or latest available 50.x series release) and datafusion-cli 50.3.0, maintaining compatibility with DataFusion's execution planning, physical operator, and session configuration APIs. 
diff --git a/openspec/changes/upgrade-datafusion-50/tasks.md b/openspec/changes/archive/2025-10-30-upgrade-datafusion-50/tasks.md similarity index 100% rename from openspec/changes/upgrade-datafusion-50/tasks.md rename to openspec/changes/archive/2025-10-30-upgrade-datafusion-50/tasks.md diff --git a/openspec/specs/datafusion-integration/spec.md b/openspec/specs/datafusion-integration/spec.md new file mode 100644 index 0000000..e42e308 --- /dev/null +++ b/openspec/specs/datafusion-integration/spec.md @@ -0,0 +1,72 @@ +# datafusion-integration Specification + +## Purpose +TBD - created by archiving change upgrade-datafusion-50. Update Purpose after archive. +## Requirements +### Requirement: DataFusion Version Compatibility +The system SHALL integrate with Apache DataFusion version 50.3.0 (or latest available 50.x series release) and datafusion-cli 50.3.0, maintaining compatibility with DataFusion's execution planning, physical operator, and session configuration APIs. + +#### Scenario: ExecutionPlan trait implementation +- **GIVEN** custom `IntervalJoinExec` physical operator exists +- **WHEN** DataFusion's `ExecutionPlan` trait requires `reset_state()` method +- **THEN** `IntervalJoinExec` SHALL implement `reset_state()` to support recursive queries with dynamic filters + +#### Scenario: Session configuration access +- **GIVEN** code accesses `ConfigOptions` from `SessionState` +- **WHEN** `SessionState::options()` returns `&Arc<ConfigOptions>` instead of `&ConfigOptions` +- **THEN** code SHALL handle `Arc`-wrapped configuration options correctly + +#### Scenario: Physical expression metadata +- **GIVEN** custom physical expressions may exist in the future +- **WHEN** DataFusion requires `PhysicalExpr` implementations to provide field-level metadata +- **THEN** physical expressions SHALL implement `return_field()` method to return field information including metadata + +#### Scenario: Projection expression structure +- **GIVEN** code uses projection expressions for column
transformations +- **WHEN** DataFusion changes `ProjectionExpr` from tuple `(Arc<dyn PhysicalExpr>, String)` to named struct +- **THEN** code SHALL use `ProjectionExpr::new(expr, alias)` for construction and `.expr`/`.alias` for field access + +### Requirement: Arrow Compatibility +The system SHALL use Apache Arrow 56.0.0 data structures and APIs as required by DataFusion 50.x, ensuring correct handling of RecordBatch, Array types, and column operations in interval join implementations. + +#### Scenario: RecordBatch operations in interval joins +- **GIVEN** interval join algorithms process Arrow RecordBatch data +- **WHEN** Arrow 56.0.0 is used via DataFusion 50.x +- **THEN** all RecordBatch creation, column access, and data type conversions SHALL work correctly + +#### Scenario: Array type handling +- **GIVEN** interval coordinates are stored as PrimitiveArray types +- **WHEN** using Arrow 56.0.0 array APIs +- **THEN** array construction, access, and casting operations SHALL maintain data integrity + +### Requirement: Backward Compatibility Preservation +The system SHALL maintain identical query semantics, result correctness, and configuration parameter behavior after upgrading to DataFusion 50.x, ensuring existing SQL queries and interval join algorithms continue to function without user-visible changes.
+ +#### Scenario: Interval join correctness across algorithms +- **GIVEN** existing interval join algorithms (coitrees, superintervals, lapper, nested-loop) +- **WHEN** executed with DataFusion 50.x +- **THEN** all algorithms SHALL produce identical results to DataFusion 48.0.1 for equivalent queries + +#### Scenario: Configuration parameters remain functional +- **GIVEN** sequila-specific configuration parameters (`sequila.prefer_interval_join`, `sequila.interval_join_algorithm`) +- **WHEN** set via SessionState configuration +- **THEN** parameters SHALL control query behavior identically to DataFusion 48.0.1 + +#### Scenario: SQL query compatibility +- **GIVEN** existing SQL queries using interval joins +- **WHEN** executed via sequila-cli with DataFusion 50.x +- **THEN** queries SHALL execute successfully and return correct results + +### Requirement: Performance Characteristics +The system SHALL maintain or improve query performance after upgrading to DataFusion 50.x, leveraging performance improvements in nested loop joins (5X speedup, 99% memory reduction) where applicable while ensuring no regression in custom interval join algorithms. 
+ +#### Scenario: Benchmark suite execution +- **GIVEN** existing benchmark suite with databio_benchmark +- **WHEN** executed with DataFusion 50.x +- **THEN** all benchmarks SHALL complete successfully without panics or errors + +#### Scenario: Memory usage in interval joins +- **GIVEN** interval join operations on large genomic datasets +- **WHEN** executed with DataFusion 50.x +- **THEN** memory usage SHALL not exceed DataFusion 48.0.1 baseline for equivalent operations + From ec3550e3fbbe9aa7181b6c6b0ce3b7e4174ec20a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Wieiw=C3=B3rka?= Date: Fri, 31 Oct 2025 09:12:20 +0100 Subject: [PATCH 5/5] Change to self-hosted runners --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d23862..14822f9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,7 @@ env: jobs: build-and-test: name: build and test - runs-on: [self-hosted, Linux, openstack-ii] + runs-on: [ubuntu-latest] steps: - uses: actions/checkout@v4 - name: Install Rust @@ -37,7 +37,7 @@ jobs: fmt: name: fmt - runs-on: [self-hosted, Linux, openstack-ii] + runs-on: [ubuntu-latest] steps: - uses: actions/checkout@v4 - name: Install Rust