From 7a234df677d02f299e8c86d5d4cc69fd171f5be4 Mon Sep 17 00:00:00 2001 From: Gary Wolfman Date: Tue, 5 May 2026 23:54:48 -0400 Subject: [PATCH] refactor(phase2-3): correctness and static-analysis cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2.1 — student_t.cpp: - Replace std::log(1.0 + x²/ν) with std::log1p(x²/ν) in all 9 batch probability sites (6 parallel/GPU lambdas + 3 scalar fallback paths); avoids catastrophic cancellation when x²/ν ≈ 0 Phase 3.1 — stream operator prefix checks: - Replace find(...) != 0 / == 0 with std::string::starts_with() in operator>> for beta, chi_squared, discrete, exponential, gamma (×2), gaussian, poisson, student_t, uniform — C++20 idiomatic and intent-clearer Phase 3.2 — static-analysis sweep: - simd_policy.cpp: remove redundant = SIMDPolicy::Level::None initializer (every branch including the final unconditional else assigns it) - gaussian.cpp: remove redundant double l1 = ZERO_DOUBLE initializer (immediately overwritten by std::accumulate on the next line) - math_utils.cpp: rename local empirical_cdf → ecdf_i to avoid shadowing the free function empirical_cdf declared at line 400 - work_stealing_pool.cpp: const auto& in destructor and resetStatistics range-for loops (pointer not rebound, atomic stores through const& are fine) - system_capabilities.cpp: add volatile bench_sink read after benchmark loop to prevent dead-code elimination of results[] writes Co-Authored-By: Oz --- src/beta.cpp | 2 +- src/chi_squared.cpp | 2 +- src/discrete.cpp | 2 +- src/exponential.cpp | 2 +- src/gamma.cpp | 4 ++-- src/gaussian.cpp | 7 +++---- src/math_utils.cpp | 4 ++-- src/poisson.cpp | 2 +- src/simd_policy.cpp | 4 +++- src/student_t.cpp | 20 +++++++++++--------- src/system_capabilities.cpp | 3 +++ src/uniform.cpp | 2 +- src/work_stealing_pool.cpp | 4 ++-- 13 files changed, 32 insertions(+), 26 deletions(-) diff --git a/src/beta.cpp b/src/beta.cpp index 60c7bf9..05e9ffe 100644 --- a/src/beta.cpp +++ b/src/beta.cpp @@ -897,7 +897,7 @@ std::ostream& operator<<(std::ostream& os, const BetaDistribution& dist) { std::istream& operator>>(std::istream& is, BetaDistribution& dist) { std::string token; is >> token; - if (token.find("BetaDistribution(") != 0) { + if (!token.starts_with("BetaDistribution(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/chi_squared.cpp b/src/chi_squared.cpp index cee2985..c1161c8 100644 --- a/src/chi_squared.cpp +++ b/src/chi_squared.cpp @@ -209,7 +209,7 @@ std::istream& operator>>(std::istream& is, ChiSquaredDistribution& dist) { double k; is >> token; - if (token.find("ChiSquaredDistribution(") != 0) { + if (!token.starts_with("ChiSquaredDistribution(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/discrete.cpp b/src/discrete.cpp index f8df0eb..12db5f8 100644 --- a/src/discrete.cpp +++ b/src/discrete.cpp @@ -2785,7 +2785,7 @@ std::istream& operator>>(std::istream& is, DiscreteDistribution& distribution) { // Skip whitespace and read the first part is >> token; - if (token.find("DiscreteUniform(") != 0) { + if (!token.starts_with("DiscreteUniform(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/exponential.cpp b/src/exponential.cpp index 81b9405..7806524 100644 --- a/src/exponential.cpp +++ b/src/exponential.cpp @@ -2333,7 +2333,7 @@ std::istream& operator>>(std::istream& is, ExponentialDistribution& distribution // Skip whitespace and read the first part is >> token; - if (token.find("ExponentialDistribution(") != 0) { + if (!token.starts_with("ExponentialDistribution(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/gamma.cpp b/src/gamma.cpp index ec6b749..ff99dac 100644 --- a/src/gamma.cpp +++ b/src/gamma.cpp @@ -2450,7 +2450,7 @@ std::istream& operator>>(std::istream& is, GammaDistribution& dist) { // Read "GammaDistribution(alpha=" is >> temp; // "GammaDistribution(alpha=value," - if (temp.find("GammaDistribution(alpha=") == 0) { + if (temp.starts_with("GammaDistribution(alpha=")) { // Extract alpha value size_t equals_pos = temp.find('='); size_t comma_pos = temp.find(','); @@ -2461,7 +2461,7 @@ std::istream& operator>>(std::istream& is, GammaDistribution& dist) { // Read "beta=value)" is >> temp; - if (temp.find("beta=") == 0) { + if (temp.starts_with("beta=")) { size_t beta_equals_pos = temp.find('='); size_t close_paren_pos = temp.find(')'); if (beta_equals_pos != std::string::npos && close_paren_pos != std::string::npos) { diff --git a/src/gaussian.cpp b/src/gaussian.cpp index d38bac8..c1a714f 100644 --- a/src/gaussian.cpp +++ b/src/gaussian.cpp @@ -1222,12 +1222,11 @@ std::pair GaussianDistribution::lMomentsEstimation( const size_t n = sorted_data.size(); // Calculate L-moments - double l1 = detail::ZERO_DOUBLE; // L-mean double l2 = detail::ZERO_DOUBLE; // L-scale // L1 (L-mean) = mean of order statistics - l1 = std::accumulate(sorted_data.begin(), sorted_data.end(), detail::ZERO_DOUBLE) / - static_cast(n); + double l1 = std::accumulate(sorted_data.begin(), sorted_data.end(), detail::ZERO_DOUBLE) / + static_cast(n); // L2 (L-scale) = 0.5 * E[X_{2:2} - X_{1:2}] for (size_t i = 0; i < n; ++i) { @@ -2813,7 +2812,7 @@ std::istream& operator>>(std::istream& is, GaussianDistribution& distribution) { } line = line.substr(start); - if (line.find("GaussianDistribution(") != 0) { + if (!line.starts_with("GaussianDistribution(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/math_utils.cpp b/src/math_utils.cpp index 44e86af..591f82b 100644 --- a/src/math_utils.cpp +++ b/src/math_utils.cpp @@ -584,11 +584,11 @@ double calculate_ks_statistic(const std::vector& data, // Calculate KS statistic: max |F_n(x) - F(x)| for (std::size_t i = 0; i < sorted_data.size(); ++i) { - double empirical_cdf = static_cast(i + 1) / n; + double ecdf_i = static_cast(i + 1) / n; // empirical CDF at step i double theoretical_cdf = dist.getCumulativeProbability(sorted_data[i]); // Check both F_n(x) - F(x) and F(x) - F_{n-1}(x) - double diff1 = std::abs(empirical_cdf - theoretical_cdf); + double diff1 = std::abs(ecdf_i - theoretical_cdf); double diff2 = std::abs(theoretical_cdf - static_cast(i) / n); max_diff = std::max(max_diff, std::max(diff1, diff2)); diff --git a/src/poisson.cpp b/src/poisson.cpp index 490b284..2b01b49 100644 --- a/src/poisson.cpp +++ b/src/poisson.cpp @@ -2792,7 +2792,7 @@ std::istream& operator>>(std::istream& is, PoissonDistribution& distribution) { // Skip whitespace and read the first part is >> token; - if (token.find("Poisson(") != 0) { + if (!token.starts_with("Poisson(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/simd_policy.cpp b/src/simd_policy.cpp index 291f346..840d2d7 100644 --- a/src/simd_policy.cpp +++ b/src/simd_policy.cpp @@ -24,7 +24,9 @@ struct SIMDState { void initialize() { // Detect best level inline since detectBestLevel is private - SIMDPolicy::Level detected_level = SIMDPolicy::Level::None; + // (No initializer needed: every code path — including the final unconditional + // else — assigns detected_level before it is read.) + SIMDPolicy::Level detected_level; #if defined(LIBSTATS_HAS_AVX512) if (stats::arch::supports_avx512()) { diff --git a/src/student_t.cpp b/src/student_t.cpp index 28e3ce1..efb3fd7 100644 --- a/src/student_t.cpp +++ b/src/student_t.cpp @@ -466,13 +466,15 @@ void StudentTDistribution::getProbability(std::span values, std::s const double nhnpo = dist.negHalfNuPlusOne_; const double inv_nu = dist.invNu_; lock.unlock(); + // std::log1p(x²/ν) avoids catastrophic cancellation when x²/ν ≈ 0; + // log(1 + x²/ν) loses precision there. See : log1p(x) = log(1+x). if (arch::should_use_parallel(count)) { ParallelUtils::parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = std::exp(lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu)); + res[i] = std::exp(lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu)); }); } else { for (std::size_t i = 0; i < count; ++i) { - res[i] = std::exp(lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu)); + res[i] = std::exp(lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu)); } } }, @@ -490,7 +492,7 @@ void StudentTDistribution::getProbability(std::span values, std::s const double inv_nu = dist.invNu_; lock.unlock(); pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = std::exp(lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu)); + res[i] = std::exp(lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu)); }); }, [](const StudentTDistribution& dist, std::span vals, std::span res, @@ -507,7 +509,7 @@ void StudentTDistribution::getProbability(std::span values, std::s const double inv_nu = dist.invNu_; lock.unlock(); pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = std::exp(lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu)); + res[i] = std::exp(lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu)); }); }); } @@ -552,11 +554,11 @@ void StudentTDistribution::getLogProbability(std::span values, lock.unlock(); if (arch::should_use_parallel(count)) { ParallelUtils::parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu); + res[i] = lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu); }); } else { for (std::size_t i = 0; i < count; ++i) { - res[i] = lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu); + res[i] = lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu); } } }, @@ -574,7 +576,7 @@ void StudentTDistribution::getLogProbability(std::span values, const double inv_nu = dist.invNu_; lock.unlock(); pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu); + res[i] = lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu); }); }, [](const StudentTDistribution& dist, std::span vals, std::span res, @@ -591,7 +593,7 @@ void StudentTDistribution::getLogProbability(std::span values, const double inv_nu = dist.invNu_; lock.unlock(); pool.parallelFor(std::size_t{0}, count, [&](std::size_t i) { - res[i] = lnc + nhnpo * std::log(1.0 + vals[i] * vals[i] * inv_nu); + res[i] = lnc + nhnpo * std::log1p(vals[i] * vals[i] * inv_nu); }); }); } @@ -724,7 +726,7 @@ std::istream& operator>>(std::istream& is, StudentTDistribution& dist) { double nu; is >> token; - if (token.find("StudentTDistribution(") != 0) { + if (!token.starts_with("StudentTDistribution(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/system_capabilities.cpp b/src/system_capabilities.cpp index 73f446a..2b5cda0 100644 --- a/src/system_capabilities.cpp +++ b/src/system_capabilities.cpp @@ -29,6 +29,9 @@ double benchmarkSIMDEfficiency() { results[j] = data[j] * detail::TWO + detail::ONE; } } + // Prevent dead-code elimination of the benchmark loop. + volatile double bench_sink = results[BENCHMARK_ARRAY_SIZE / 2]; + (void)bench_sink; auto end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(end - start); diff --git a/src/uniform.cpp b/src/uniform.cpp index 51efa1a..1abb12e 100644 --- a/src/uniform.cpp +++ b/src/uniform.cpp @@ -2285,7 +2285,7 @@ std::istream& operator>>(std::istream& is, UniformDistribution& distribution) { // Skip whitespace and read the first part is >> token; - if (token.find("UniformDistribution(") != 0) { + if (!token.starts_with("UniformDistribution(")) { is.setstate(std::ios::failbit); return is; } diff --git a/src/work_stealing_pool.cpp b/src/work_stealing_pool.cpp index b5d15f8..5a3ac2a 100644 --- a/src/work_stealing_pool.cpp +++ b/src/work_stealing_pool.cpp @@ -101,7 +101,7 @@ WorkStealingPool::~WorkStealingPool() { } // Wait for all threads to finish - for (auto& worker : workers_) { + for (const auto& worker : workers_) { if (worker && worker->worker.joinable()) { worker->worker.join(); } @@ -166,7 +166,7 @@ WorkStealingPool::Statistics WorkStealingPool::getStatistics() const { } void WorkStealingPool::resetStatistics() { - for (auto& worker : workers_) { + for (const auto& worker : workers_) { worker->tasksExecuted.store(detail::ZERO_INT, std::memory_order_relaxed); worker->workSteals.store(detail::ZERO_INT, std::memory_order_relaxed); worker->failedSteals.store(detail::ZERO_INT, std::memory_order_relaxed);