diff --git a/tdigest/include/tdigest.hpp b/tdigest/include/tdigest.hpp index 2d3620b1..095752e9 100644 --- a/tdigest/include/tdigest.hpp +++ b/tdigest/include/tdigest.hpp @@ -316,7 +316,7 @@ template class tdigest::const_iterator { public: using iterator_category = std::input_iterator_tag; - using value_type = std::pair; + using value_type = std::pair; using difference_type = void; using pointer = const return_value_holder; using reference = const value_type; diff --git a/tdigest/test/CMakeLists.txt b/tdigest/test/CMakeLists.txt index 18bf3599..8dcfb4f0 100644 --- a/tdigest/test/CMakeLists.txt +++ b/tdigest/test/CMakeLists.txt @@ -39,6 +39,7 @@ target_sources(tdigest_test PRIVATE tdigest_test.cpp tdigest_custom_allocator_test.cpp + tdigest_iterator_test.cpp ) if (SERDE_COMPAT) diff --git a/tdigest/test/tdigest_iterator_test.cpp b/tdigest/test/tdigest_iterator_test.cpp new file mode 100644 index 00000000..e7c03205 --- /dev/null +++ b/tdigest/test/tdigest_iterator_test.cpp @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include + +#include "tdigest.hpp" + +namespace datasketches { + +TEST_CASE("tdigest iterator: basic iteration", "[tdigest]") { + tdigest_double td(100); + + // Insert 10 distinct values + for (int i = 0; i < 10; i++) { + td.update(static_cast(i)); + } + + // Collect all centroids via iteration + std::map centroids; + for (const auto&& centroid : td) { + centroids[centroid.first] = centroid.second; + } + + // Should have collected all 10 distinct values + REQUIRE(centroids.size() == 10); + + // Verify each value was captured correctly + for (int i = 0; i < 10; i++) { + REQUIRE(centroids.count(static_cast(i)) == 1); + REQUIRE(centroids[static_cast(i)] == 1); + } +} + +TEST_CASE("tdigest iterator: explicit begin/end with unique_ptr", "[tdigest]") { + // This test reproduces the bug scenario found in ClickHouse + std::unique_ptr td(new tdigest_double(100)); + + // Insert distinct values + for (int i = 0; i < 10; i++) { + td->update(static_cast(i)); + } + + // Use explicit begin/end iterators + auto it = td->begin(); + auto end_it = td->end(); + + std::vector means; + std::vector weights; + + while (it != end_it) { + // Before the fix, accessing it->first would return garbage or same value repeatedly + double mean = it->first; + uint64_t weight = it->second; + means.push_back(mean); + weights.push_back(weight); + ++it; + } + + // Should have collected 10 centroids + REQUIRE(means.size() == 10); + REQUIRE(weights.size() == 10); + + // All means should be distinct (not all zeros or garbage) + std::set unique_means(means.begin(), means.end()); + REQUIRE(unique_means.size() == 10); + + // Verify all expected values are present + for (int i = 0; i < 10; i++) { + REQUIRE(unique_means.count(static_cast(i)) == 1); + } +} + +TEST_CASE("tdigest iterator: structured bindings", "[tdigest]") { + tdigest_double td(100); + + for (int i = 0; i < 5; i++) { + td.update(static_cast(i * 10)); + } + + std::vector> collected; + + // Test structured bindings + for (auto it = td.begin(); it != td.end(); ++it) { + const auto& centroid = *it; + collected.emplace_back(centroid.first, centroid.second); + } + + REQUIRE(collected.size() == 5); + + // Verify distinct values were collected + std::set means; + for (const auto& pair : collected) { + means.insert(pair.first); + REQUIRE(pair.second == 1); // Each value inserted once + } + + REQUIRE(means.size() == 5); + for (int i = 0; i < 5; i++) { + REQUIRE(means.count(static_cast(i * 10)) == 1); + } +} + +TEST_CASE("tdigest iterator: operator-> access", "[tdigest]") { + tdigest_double td(100); + + // Insert values + for (int i = 1; i <= 10; i++) { + td.update(static_cast(i * i)); // 1, 4, 9, 16, 25, 36, 49, 64, 81, 100 + } + + // Access via operator-> + std::map centroids; + auto end_it = td.end(); + for (auto it = td.begin(); it != end_it; ++it) { + // operator-> should return valid values + centroids[it->first] = it->second; + } + + REQUIRE(centroids.size() == 10); + + // Verify the squared values + for (int i = 1; i <= 10; i++) { + double expected = static_cast(i * i); + REQUIRE(centroids.count(expected) == 1); + } +} + +TEST_CASE("tdigest iterator: range-based for with const auto&&", "[tdigest]") { + tdigest_double td(100); + + // Insert values + for (double d = 0.0; d < 10.0; d += 1.0) { + td.update(d); + } + + size_t count = 0; + std::set seen_means; + + // This pattern was working in simple tests but failing in optimized builds + for (const auto&& centroid : td) { + seen_means.insert(centroid.first); + count++; + } + + REQUIRE(count == 10); + REQUIRE(seen_means.size() == 10); + + // Verify all values from 0 to 9 are present + for (int i = 0; i < 10; i++) { + REQUIRE(seen_means.count(static_cast(i)) == 1); + } +} + +TEST_CASE("tdigest iterator: copy vs reference semantics", "[tdigest]") { + tdigest_double td(100); + + td.update(1.0); + td.update(2.0); + td.update(3.0); + + auto it = td.begin(); + + // Store the pair + auto pair1 = *it; + double mean1 = pair1.first; + + ++it; + + // Store another pair + auto pair2 = *it; + double mean2 = pair2.first; + + ++it; + + auto pair3 = *it; + double mean3 = pair3.first; + + // All three means should be distinct + REQUIRE(mean1 != mean2); + REQUIRE(mean2 != mean3); + REQUIRE(mean1 != mean3); + + // And they should match our input values + std::set means = {mean1, mean2, mean3}; + REQUIRE(means.count(1.0) == 1); + REQUIRE(means.count(2.0) == 1); + REQUIRE(means.count(3.0) == 1); +} + +TEST_CASE("tdigest iterator: empty sketch", "[tdigest]") { + tdigest_double td(100); + + // Empty sketch should have begin() == end() + REQUIRE(td.begin() == td.end()); + + // Range-based for should not execute + size_t count = 0; + for (const auto&& centroid : td) { + (void)centroid; // Silence unused warning + count++; + } + REQUIRE(count == 0); +} + +TEST_CASE("tdigest iterator: single value", "[tdigest]") { + tdigest_double td(100); + td.update(42.0); + + size_t count = 0; + double captured_mean = 0.0; + uint64_t captured_weight = 0; + + for (const auto&& centroid : td) { + captured_mean = centroid.first; + captured_weight = centroid.second; + count++; + } + + REQUIRE(count == 1); + REQUIRE(captured_mean == 42.0); + REQUIRE(captured_weight == 1); +} + +TEST_CASE("tdigest iterator: large dataset", "[tdigest]") { + tdigest_double td(100); + + // Insert 1000 distinct values + for (int i = 0; i < 1000; i++) { + td.update(static_cast(i)); + } + + // Iterator should provide compressed centroids (not all 1000) + size_t centroid_count = 0; + std::set unique_means; + uint64_t total_weight = 0; + + for (const auto&& centroid : td) { + unique_means.insert(centroid.first); + total_weight += centroid.second; + centroid_count++; + } + + // Should have fewer centroids than input values due to compression + REQUIRE(centroid_count < 1000); + REQUIRE(centroid_count > 0); + + // Total weight should equal number of input values + REQUIRE(total_weight == 1000); + + // All means should be unique (no duplicates) + REQUIRE(unique_means.size() == centroid_count); +} + +} // namespace datasketches