From 4900f9b32c018b1a8d65f5b47194e41caca92259 Mon Sep 17 00:00:00 2001 From: Davide Faconti Date: Fri, 29 May 2026 08:27:59 +0200 Subject: [PATCH 1/2] perf(datastore): binary-search RangeCursor start and latestAt ARCHITECTURE.md documents both as binary searches, but findFirstValid() and latestAt() linear-scanned the chunk deque and the rows within a chunk (O(C*R)). Committed chunks are never empty and are time-ordered (each chunk's t_min >= the previous chunk's t_max), so t_min/t_max are monotonic across the deque and timestamps are monotonic within a chunk -- enabling std::lower_bound / std::upper_bound for an O(log C + log R) start. Behaviour-preserving: lower_bound keeps the first-duplicate range start, and upper_bound keeps the last-duplicate / later-chunk-at-shared-boundary semantics of the previous reverse scans. Adds regression tests for duplicate timestamps, shared chunk boundaries, and single-point ranges. Makes the ARCHITECTURE.md "binary-searches" claim accurate. Internal optimization; no API/ABI change. Co-Authored-By: Claude Opus 4.8 (1M context) --- pj_datastore/src/query.cpp | 95 +++++++++++++++++-------------- pj_datastore/tests/query_test.cpp | 69 ++++++++++++++++++++++ 2 files changed, 120 insertions(+), 44 deletions(-) diff --git a/pj_datastore/src/query.cpp b/pj_datastore/src/query.cpp index 1fb0690..9fbbcb3 100644 --- a/pj_datastore/src/query.cpp +++ b/pj_datastore/src/query.cpp @@ -126,30 +126,37 @@ void RangeCursor::forEachChunk(std::function callbac } void RangeCursor::findFirstValid() { - // Linear scan to find the first chunk whose t_max >= t_min_ - // (i.e., that could contain data in our range) - for (chunk_index_ = 0; chunk_index_ < chunks_->size(); ++chunk_index_) { - const auto& chunk = (*chunks_)[chunk_index_]; - if (chunk.stats.t_max >= t_min_) { - // This chunk might contain data in range. - // Now find the first row where timestamp >= t_min_. - for (row_index_ = 0; row_index_ < chunk.stats.row_count; ++row_index_) { - Timestamp ts = chunk.readTimestamp(row_index_); - if (ts >= t_min_) { - // Check if this row is also within t_max - if (ts <= t_max_) { - return; // Found a valid starting position - } - // ts > t_max_ means no valid data in range at all - chunk_index_ = chunks_->size(); - return; - } - } - // All rows in this chunk are before t_min, try next chunk - continue; - } + const auto& chunks = *chunks_; + + // First chunk that could contain a row in range, i.e. whose t_max >= t_min_. + // Committed chunks are non-empty and time-ordered (each chunk's t_min >= the + // previous chunk's t_max), so t_max is non-decreasing across the deque and we + // can binary-search it. + const auto chunk_it = std::lower_bound( + chunks.begin(), chunks.end(), t_min_, + [](const TopicChunk& chunk, Timestamp value) { return chunk.stats.t_max < value; }); + if (chunk_it == chunks.end()) { + // All data is strictly before t_min_. + chunk_index_ = chunks.size(); + row_index_ = 0; + return; + } + chunk_index_ = static_cast(chunk_it - chunks.begin()); + + // First row with timestamp >= t_min_ within that chunk. Such a row exists + // because t_max (the chunk's last timestamp) >= t_min_. + const TopicChunk& chunk = *chunk_it; + const auto ts_begin = chunk.timestamps.begin(); + const auto ts_end = ts_begin + static_cast(chunk.stats.row_count); + const auto row_it = std::lower_bound(ts_begin, ts_end, t_min_); + row_index_ = static_cast(row_it - ts_begin); + + // If the first row at or after t_min_ is already past t_max_, nothing in the + // deque falls inside [t_min_, t_max_]. + if (row_it == ts_end || *row_it > t_max_) { + chunk_index_ = chunks.size(); + row_index_ = 0; } - // No valid chunk found: chunk_index_ == chunks_->size() (past-end) } void RangeCursor::skipToValid() { @@ -171,30 +178,30 @@ void RangeCursor::skipToValid() { // =========================================================================== std::optional latestAt(const std::deque& chunks, Timestamp t) { - if (chunks.empty()) { + // Last chunk that can contain a row at or before t, i.e. the latest chunk + // whose t_min <= t. Committed chunks are non-empty and have non-decreasing + // t_min, so upper_bound finds the first chunk strictly after t; the chunk + // before it is the answer. (At a shared boundary timestamp this selects the + // later chunk, matching the previous reverse-scan behaviour.) + const auto after = std::upper_bound(chunks.begin(), chunks.end(), t, [](Timestamp value, const TopicChunk& chunk) { + return value < chunk.stats.t_min; + }); + if (after == chunks.begin()) { + // Empty deque, or every chunk starts strictly after t. return std::nullopt; } - - // Reverse iterate chunks. For each chunk, if t_min <= t, search within it. - for (std::size_t ci = chunks.size(); ci > 0; --ci) { - const auto& chunk = chunks[ci - 1]; - if (chunk.stats.t_min > t) { - continue; // Entire chunk is after t - } - // chunk.stats.t_min <= t, so there might be a row <= t in this chunk. - // Reverse scan within the chunk to find the last row with timestamp <= t. - for (std::size_t ri = chunk.stats.row_count; ri > 0; --ri) { - Timestamp ts = chunk.readTimestamp(ri - 1); - if (ts <= t) { - return SampleRow{ts, &chunk, ri - 1}; - } - } - // All rows in this chunk are after t, but t_min <= t was true. - // This shouldn't happen with sorted data, but handle gracefully - // by continuing to the previous chunk. + const TopicChunk& chunk = *(after - 1); + + // Last row with timestamp <= t within that chunk. Such a row exists because + // the chunk's first timestamp (t_min) is <= t. + const auto ts_begin = chunk.timestamps.begin(); + const auto ts_end = ts_begin + static_cast(chunk.stats.row_count); + const auto row_after = std::upper_bound(ts_begin, ts_end, t); + if (row_after == ts_begin) { + return std::nullopt; // unreachable for committed chunks (row 0 ts == t_min <= t) } - - return std::nullopt; + const std::size_t row = static_cast((row_after - 1) - ts_begin); + return SampleRow{chunk.readTimestamp(row), &chunk, row}; } // =========================================================================== diff --git a/pj_datastore/tests/query_test.cpp b/pj_datastore/tests/query_test.cpp index 21298c8..62360f1 100644 --- a/pj_datastore/tests/query_test.cpp +++ b/pj_datastore/tests/query_test.cpp @@ -28,6 +28,20 @@ TopicChunk make_test_chunk(Timestamp t_start, uint32_t num_rows, Timestamp step) return builder.seal(); } +// Helper: build a chunk from an explicit (possibly non-uniform / duplicated) +// timestamp list. The column value equals the row index, so a returned +// row_index can be cross-checked against value. +TopicChunk make_chunk_from_timestamps(const std::vector& ts) { + std::vector cols = {{0, PrimitiveType::kFloat32, "value"}}; + TopicChunkBuilder builder(1, 1, cols, static_cast(ts.size())); + for (std::size_t i = 0; i < ts.size(); ++i) { + builder.beginRow(ts[i]); + builder.set(0, static_cast(i)); + builder.finishRow(); + } + return builder.seal(); +} + // Build the standard 5-chunk test fixture: // Chunk 0: t=[0, 90], step=10 // Chunk 1: t=[100, 190], step=10 @@ -160,6 +174,61 @@ TEST(QueryTest, LatestAtBetweenChunks) { EXPECT_EQ(result->timestamp, 90); } +// ========================================================================= +// Binary-search edge cases (duplicate timestamps, shared chunk boundaries) +// ========================================================================= + +TEST(QueryTest, LatestAtWithDuplicateTimestampsReturnsLastDuplicate) { + std::deque chunks; + // Rows: 0 1 2 3 4 + chunks.push_back(make_chunk_from_timestamps({10, 20, 20, 20, 30})); + + auto result = latestAt(chunks, 20); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(result->timestamp, 20); + // upper_bound semantics: the last row with ts <= 20 is row index 3. + EXPECT_EQ(result->row_index, 3u); +} + +TEST(QueryTest, RangeQueryWithDuplicateTimestampsStartsAtFirstDuplicate) { + std::deque chunks; + // Rows: 0 1 2 3 4 + chunks.push_back(make_chunk_from_timestamps({10, 20, 20, 20, 30})); + + auto cursor = rangeQuery(chunks, 20, 20); + std::vector rows; + cursor.forEach([&](const SampleRow& row) { rows.push_back(row.row_index); }); + + // lower_bound semantics: starts at the first ts >= 20 (row 1) and includes + // every row with ts <= 20 (rows 1, 2, 3). + ASSERT_EQ(rows.size(), 3u); + EXPECT_EQ(rows.front(), 1u); + EXPECT_EQ(rows.back(), 3u); +} + +TEST(QueryTest, LatestAtAtSharedChunkBoundarySelectsLaterChunk) { + std::deque chunks; + chunks.push_back(make_chunk_from_timestamps({70, 80, 90})); // chunk A, t_max=90 + chunks.push_back(make_chunk_from_timestamps({90, 100, 110})); // chunk B, t_min=90 + + auto result = latestAt(chunks, 90); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(result->timestamp, 90); + // The boundary value 90 exists in both chunks; the later chunk (B, row 0) wins. + EXPECT_EQ(result->chunk, &chunks[1]); + EXPECT_EQ(result->row_index, 0u); +} + +TEST(QueryTest, RangeQuerySingleTimestampPoint) { + auto chunks = make_standard_chunks(); + // Degenerate inclusive range [200, 200] hits exactly one row. + auto cursor = rangeQuery(chunks, 200, 200); + std::vector timestamps; + cursor.forEach([&](const SampleRow& row) { timestamps.push_back(row.timestamp); }); + ASSERT_EQ(timestamps.size(), 1u); + EXPECT_EQ(timestamps.front(), 200); +} + // ========================================================================= // Empty deque tests // ========================================================================= From 52127e12654955dc870120d7ef120f5a96b3889e Mon Sep 17 00:00:00 2001 From: Davide Faconti Date: Fri, 29 May 2026 08:28:09 +0200 Subject: [PATCH 2/2] chore: add missing SPDX headers; drop stale `development` CI trigger number_parse.{hpp,cpp} (pj_base, Apache-2.0) lacked their SPDX-License-Identifier headers -- the dual-license boundary relies on per-file headers, so add them. All four branch-triggered workflows still keyed on the deleted `development` branch; restrict them to `main`. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/linux-ci.yml | 4 ++-- .github/workflows/macos-ci.yml | 4 ++-- .github/workflows/pre-commit.yml | 4 ++-- .github/workflows/windows-ci.yml | 4 ++-- pj_base/include/pj_base/number_parse.hpp | 2 ++ pj_base/src/number_parse.cpp | 3 +++ 6 files changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/linux-ci.yml b/.github/workflows/linux-ci.yml index c290949..fad28a6 100644 --- a/.github/workflows/linux-ci.yml +++ b/.github/workflows/linux-ci.yml @@ -1,9 +1,9 @@ name: Linux CI on: push: - branches: [development, main] + branches: [main] pull_request: - branches: [development, main] + branches: [main] types: [opened, synchronize, reopened, ready_for_review] workflow_dispatch: {} diff --git a/.github/workflows/macos-ci.yml b/.github/workflows/macos-ci.yml index f5e2496..2499975 100644 --- a/.github/workflows/macos-ci.yml +++ b/.github/workflows/macos-ci.yml @@ -1,9 +1,9 @@ name: macOS CI on: push: - branches: [development, main] + branches: [main] pull_request: - branches: [development, main] + branches: [main] types: [opened, synchronize, reopened, ready_for_review] workflow_dispatch: {} diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index ac37e91..4772f49 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -1,9 +1,9 @@ name: pre-commit on: push: - branches: [development, main] + branches: [main] pull_request: - branches: [development, main] + branches: [main] types: [opened, synchronize, reopened, ready_for_review] workflow_dispatch: {} diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index bc202ff..175376c 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -1,9 +1,9 @@ name: Windows CI on: push: - branches: [development, main] + branches: [main] pull_request: - branches: [development, main] + branches: [main] types: [opened, synchronize, reopened, ready_for_review] workflow_dispatch: {} diff --git a/pj_base/include/pj_base/number_parse.hpp b/pj_base/include/pj_base/number_parse.hpp index 7c95698..912d6dc 100644 --- a/pj_base/include/pj_base/number_parse.hpp +++ b/pj_base/include/pj_base/number_parse.hpp @@ -1,4 +1,6 @@ #pragma once +// Copyright 2026 Davide Faconti +// SPDX-License-Identifier: Apache-2.0 #include #include diff --git a/pj_base/src/number_parse.cpp b/pj_base/src/number_parse.cpp index 1f5bfb7..bbd9224 100644 --- a/pj_base/src/number_parse.cpp +++ b/pj_base/src/number_parse.cpp @@ -1,3 +1,6 @@ +// Copyright 2026 Davide Faconti +// SPDX-License-Identifier: Apache-2.0 + #include "pj_base/number_parse.hpp" #include