Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion tdigest/include/tdigest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ template<typename T, typename A>
class tdigest<T, A>::const_iterator {
public:
using iterator_category = std::input_iterator_tag;
using value_type = std::pair<const T&, const W>;
using value_type = std::pair<T, W>;
using difference_type = void;
using pointer = const return_value_holder<value_type>;
using reference = const value_type;
Expand Down
1 change: 1 addition & 0 deletions tdigest/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ target_sources(tdigest_test
PRIVATE
tdigest_test.cpp
tdigest_custom_allocator_test.cpp
tdigest_iterator_test.cpp
)

if (SERDE_COMPAT)
Expand Down
274 changes: 274 additions & 0 deletions tdigest/test/tdigest_iterator_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#include <catch2/catch.hpp>
#include <memory>
#include <map>
#include <vector>
#include <set>

#include "tdigest.hpp"

namespace datasketches {

TEST_CASE("tdigest iterator: basic iteration", "[tdigest]") {
tdigest_double td(100);

// Insert 10 distinct values
for (int i = 0; i < 10; i++) {
td.update(static_cast<double>(i));
}

// Collect all centroids via iteration
std::map<double, uint64_t> centroids;
for (const auto&& centroid : td) {
centroids[centroid.first] = centroid.second;
}

// Should have collected all 10 distinct values
REQUIRE(centroids.size() == 10);

// Verify each value was captured correctly
for (int i = 0; i < 10; i++) {
REQUIRE(centroids.count(static_cast<double>(i)) == 1);
REQUIRE(centroids[static_cast<double>(i)] == 1);
}
}

TEST_CASE("tdigest iterator: explicit begin/end with unique_ptr", "[tdigest]") {
// This test reproduces the bug scenario found in ClickHouse
std::unique_ptr<tdigest_double> td(new tdigest_double(100));

// Insert distinct values
for (int i = 0; i < 10; i++) {
td->update(static_cast<double>(i));
}

// Use explicit begin/end iterators
auto it = td->begin();
auto end_it = td->end();

std::vector<double> means;
std::vector<uint64_t> weights;

while (it != end_it) {
// Before the fix, accessing it->first would return garbage or same value repeatedly
double mean = it->first;
uint64_t weight = it->second;
means.push_back(mean);
weights.push_back(weight);
++it;
}

// Should have collected 10 centroids
REQUIRE(means.size() == 10);
REQUIRE(weights.size() == 10);

// All means should be distinct (not all zeros or garbage)
std::set<double> unique_means(means.begin(), means.end());
REQUIRE(unique_means.size() == 10);

// Verify all expected values are present
for (int i = 0; i < 10; i++) {
REQUIRE(unique_means.count(static_cast<double>(i)) == 1);
}
}

TEST_CASE("tdigest iterator: structured bindings", "[tdigest]") {
tdigest_double td(100);

for (int i = 0; i < 5; i++) {
td.update(static_cast<double>(i * 10));
}

std::vector<std::pair<double, uint64_t>> collected;

// Test structured bindings
for (auto it = td.begin(); it != td.end(); ++it) {
const auto& centroid = *it;
collected.emplace_back(centroid.first, centroid.second);
}

REQUIRE(collected.size() == 5);

// Verify distinct values were collected
std::set<double> means;
for (const auto& pair : collected) {
means.insert(pair.first);
REQUIRE(pair.second == 1); // Each value inserted once
}

REQUIRE(means.size() == 5);
for (int i = 0; i < 5; i++) {
REQUIRE(means.count(static_cast<double>(i * 10)) == 1);
}
}

TEST_CASE("tdigest iterator: operator-> access", "[tdigest]") {
tdigest_double td(100);

// Insert values
for (int i = 1; i <= 10; i++) {
td.update(static_cast<double>(i * i)); // 1, 4, 9, 16, 25, 36, 49, 64, 81, 100
}

// Access via operator->
std::map<double, uint64_t> centroids;
auto end_it = td.end();
for (auto it = td.begin(); it != end_it; ++it) {
// operator-> should return valid values
centroids[it->first] = it->second;
}

REQUIRE(centroids.size() == 10);

// Verify the squared values
for (int i = 1; i <= 10; i++) {
double expected = static_cast<double>(i * i);
REQUIRE(centroids.count(expected) == 1);
}
}

TEST_CASE("tdigest iterator: range-based for with const auto&&", "[tdigest]") {
tdigest_double td(100);

// Insert values
for (double d = 0.0; d < 10.0; d += 1.0) {
td.update(d);
}

size_t count = 0;
std::set<double> seen_means;

// This pattern was working in simple tests but failing in optimized builds
for (const auto&& centroid : td) {
seen_means.insert(centroid.first);
count++;
}

REQUIRE(count == 10);
REQUIRE(seen_means.size() == 10);

// Verify all values from 0 to 9 are present
for (int i = 0; i < 10; i++) {
REQUIRE(seen_means.count(static_cast<double>(i)) == 1);
}
}

TEST_CASE("tdigest iterator: copy vs reference semantics", "[tdigest]") {
tdigest_double td(100);

td.update(1.0);
td.update(2.0);
td.update(3.0);

auto it = td.begin();

// Store the pair
auto pair1 = *it;
double mean1 = pair1.first;

++it;

// Store another pair
auto pair2 = *it;
double mean2 = pair2.first;

++it;

auto pair3 = *it;
double mean3 = pair3.first;

// All three means should be distinct
REQUIRE(mean1 != mean2);
REQUIRE(mean2 != mean3);
REQUIRE(mean1 != mean3);

// And they should match our input values
std::set<double> means = {mean1, mean2, mean3};
REQUIRE(means.count(1.0) == 1);
REQUIRE(means.count(2.0) == 1);
REQUIRE(means.count(3.0) == 1);
}

TEST_CASE("tdigest iterator: empty sketch", "[tdigest]") {
tdigest_double td(100);

// Empty sketch should have begin() == end()
REQUIRE(td.begin() == td.end());

// Range-based for should not execute
size_t count = 0;
for (const auto&& centroid : td) {
(void)centroid; // Silence unused warning
count++;
}
REQUIRE(count == 0);
}

TEST_CASE("tdigest iterator: single value", "[tdigest]") {
tdigest_double td(100);
td.update(42.0);

size_t count = 0;
double captured_mean = 0.0;
uint64_t captured_weight = 0;

for (const auto&& centroid : td) {
captured_mean = centroid.first;
captured_weight = centroid.second;
count++;
}

REQUIRE(count == 1);
REQUIRE(captured_mean == 42.0);
REQUIRE(captured_weight == 1);
}

TEST_CASE("tdigest iterator: large dataset", "[tdigest]") {
tdigest_double td(100);

// Insert 1000 distinct values
for (int i = 0; i < 1000; i++) {
td.update(static_cast<double>(i));
}

// Iterator should provide compressed centroids (not all 1000)
size_t centroid_count = 0;
std::set<double> unique_means;
uint64_t total_weight = 0;

for (const auto&& centroid : td) {
unique_means.insert(centroid.first);
total_weight += centroid.second;
centroid_count++;
}

// Should have fewer centroids than input values due to compression
REQUIRE(centroid_count < 1000);
REQUIRE(centroid_count > 0);

// Total weight should equal number of input values
REQUIRE(total_weight == 1000);

// All means should be unique (no duplicates)
REQUIRE(unique_means.size() == centroid_count);
}

} // namespace datasketches