From 1d786e31ba0f0ddfe806a55aec1b86e94231b8f0 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 11 Mar 2026 13:40:38 -0400 Subject: [PATCH 1/9] Add CUDA testing of functions --- test/cuda_jamfile | 15 ++++++ test/test_signed_add_sat.cu | 93 ++++++++++++++++++++++++++++++++ test/test_signed_div_sat.cu | 96 ++++++++++++++++++++++++++++++++++ test/test_signed_gcd.cu | 93 ++++++++++++++++++++++++++++++++ test/test_signed_lcm.cu | 94 +++++++++++++++++++++++++++++++++ test/test_signed_midpoint.cu | 93 ++++++++++++++++++++++++++++++++ test/test_signed_mul_sat.cu | 93 ++++++++++++++++++++++++++++++++ test/test_signed_sub_sat.cu | 93 ++++++++++++++++++++++++++++++++ test/test_unsigned_add_sat.cu | 93 ++++++++++++++++++++++++++++++++ test/test_unsigned_div_sat.cu | 93 ++++++++++++++++++++++++++++++++ test/test_unsigned_gcd.cu | 93 ++++++++++++++++++++++++++++++++ test/test_unsigned_lcm.cu | 94 +++++++++++++++++++++++++++++++++ test/test_unsigned_midpoint.cu | 93 ++++++++++++++++++++++++++++++++ test/test_unsigned_mul_sat.cu | 93 ++++++++++++++++++++++++++++++++ test/test_unsigned_sub_sat.cu | 93 ++++++++++++++++++++++++++++++++ 15 files changed, 1322 insertions(+) create mode 100644 test/test_signed_add_sat.cu create mode 100644 test/test_signed_div_sat.cu create mode 100644 test/test_signed_gcd.cu create mode 100644 test/test_signed_lcm.cu create mode 100644 test/test_signed_midpoint.cu create mode 100644 test/test_signed_mul_sat.cu create mode 100644 test/test_signed_sub_sat.cu create mode 100644 test/test_unsigned_add_sat.cu create mode 100644 test/test_unsigned_div_sat.cu create mode 100644 test/test_unsigned_gcd.cu create mode 100644 test/test_unsigned_lcm.cu create mode 100644 test/test_unsigned_midpoint.cu create mode 100644 test/test_unsigned_mul_sat.cu create mode 100644 test/test_unsigned_sub_sat.cu diff --git a/test/cuda_jamfile b/test/cuda_jamfile index bac4ab2b..82e2c8ee 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -64,3 +64,18 @@ run test_signed_cstdlib_div.cu ; run test_signed_to_unsigned_conversion.cu ; run test_unsigned_to_signed_conversion.cu ; + +run test_unsigned_add_sat.cu ; +run test_signed_add_sat.cu ; +run test_unsigned_sub_sat.cu ; +run test_signed_sub_sat.cu ; +run test_unsigned_mul_sat.cu ; +run test_signed_mul_sat.cu ; +run test_unsigned_div_sat.cu ; +run test_signed_div_sat.cu ; +run test_unsigned_gcd.cu ; +run test_signed_gcd.cu ; +run test_unsigned_lcm.cu ; +run test_signed_lcm.cu ; +run test_unsigned_midpoint.cu ; +run test_signed_midpoint.cu ; diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu new file mode 100644 index 00000000..a86457f9 --- /dev/null +++ b/test/test_signed_add_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::add_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_div_sat.cu b/test/test_signed_div_sat.cu new file mode 100644 index 00000000..804d4dc9 --- /dev/null +++ b/test/test_signed_div_sat.cu @@ -0,0 +1,96 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::div_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + do + { + input_vector2[i] = dist(rng); + } while (input_vector2[i] == 0); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_gcd.cu b/test/test_signed_gcd.cu new file mode 100644 index 00000000..7d5c8434 --- /dev/null +++ b/test/test_signed_gcd.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::gcd(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_lcm.cu b/test/test_signed_lcm.cu new file mode 100644 index 00000000..bafe559d --- /dev/null +++ b/test/test_signed_lcm.cu @@ -0,0 +1,94 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::lcm(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + // Use smaller values to avoid overflow in lcm computation + boost::random::uniform_int_distribution dist {test_type{-1, UINT64_MAX}, test_type{0, UINT64_MAX}}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::lcm(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_midpoint.cu b/test/test_signed_midpoint.cu new file mode 100644 index 00000000..803ba974 --- /dev/null +++ b/test/test_signed_midpoint.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::midpoint(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::midpoint(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_mul_sat.cu b/test/test_signed_mul_sat.cu new file mode 100644 index 00000000..569e583c --- /dev/null +++ b/test/test_signed_mul_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::mul_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_sub_sat.cu b/test/test_signed_sub_sat.cu new file mode 100644 index 00000000..7dd40f30 --- /dev/null +++ b/test/test_signed_sub_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::sub_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_add_sat.cu b/test/test_unsigned_add_sat.cu new file mode 100644 index 00000000..3cfc0317 --- /dev/null +++ b/test/test_unsigned_add_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::add_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_div_sat.cu b/test/test_unsigned_div_sat.cu new file mode 100644 index 00000000..9f76b869 --- /dev/null +++ b/test/test_unsigned_div_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::div_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{1U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_gcd.cu b/test/test_unsigned_gcd.cu new file mode 100644 index 00000000..f23abe48 --- /dev/null +++ b/test/test_unsigned_gcd.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::gcd(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_lcm.cu b/test/test_unsigned_lcm.cu new file mode 100644 index 00000000..d586d58b --- /dev/null +++ b/test/test_unsigned_lcm.cu @@ -0,0 +1,94 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::lcm(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + // Use smaller values to avoid overflow in lcm computation + boost::random::uniform_int_distribution dist {test_type{0U}, test_type{0U, UINT64_MAX}}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::lcm(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_midpoint.cu b/test/test_unsigned_midpoint.cu new file mode 100644 index 00000000..e695b5ff --- /dev/null +++ b/test/test_unsigned_midpoint.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::midpoint(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::midpoint(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_mul_sat.cu b/test/test_unsigned_mul_sat.cu new file mode 100644 index 00000000..228ef806 --- /dev/null +++ b/test/test_unsigned_mul_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::mul_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_sub_sat.cu b/test/test_unsigned_sub_sat.cu new file mode 100644 index 00000000..73bf36d7 --- /dev/null +++ b/test/test_unsigned_sub_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::sub_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} From 0796f36fdeb8eb026850bd2cf7ac3ecb51f4a8a6 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Wed, 11 Mar 2026 13:56:10 -0400 Subject: [PATCH 2/9] Improve diagnostics --- test/test_signed_add_sat.cu | 17 +++++++++++++++-- test/test_signed_midpoint.cu | 17 +++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu index a86457f9..45b45116 100644 --- a/test/test_signed_add_sat.cu +++ b/test/test_signed_add_sat.cu @@ -77,14 +77,27 @@ int main(void) } double t = w.elapsed(); + int fail_count = 0; for (int i = 0; i < numElements; ++i) { if (output_vector[i] != results[i]) { - std::cerr << "Result verification failed at element " << i << "!" << std::endl; - return EXIT_FAILURE; + if (fail_count < 5) + { + std::cerr << "Result verification failed at element " << i << std::endl; + std::cerr << " input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl; + std::cerr << " input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl; + std::cerr << " GPU high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl; + std::cerr << " CPU high: " << results[i].high << " low: " << results[i].low << std::endl; + } + ++fail_count; } } + if (fail_count > 0) + { + std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl; + return EXIT_FAILURE; + } std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; std::cout << "Done\n"; diff --git a/test/test_signed_midpoint.cu b/test/test_signed_midpoint.cu index 803ba974..5ee28d71 100644 --- a/test/test_signed_midpoint.cu +++ b/test/test_signed_midpoint.cu @@ -77,14 +77,27 @@ int main(void) } double t = w.elapsed(); + int fail_count = 0; for (int i = 0; i < numElements; ++i) { if (output_vector[i] != results[i]) { - std::cerr << "Result verification failed at element " << i << "!" << std::endl; - return EXIT_FAILURE; + if (fail_count < 5) + { + std::cerr << "Result verification failed at element " << i << std::endl; + std::cerr << " input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl; + std::cerr << " input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl; + std::cerr << " GPU high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl; + std::cerr << " CPU high: " << results[i].high << " low: " << results[i].low << std::endl; + } + ++fail_count; } } + if (fail_count > 0) + { + std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl; + return EXIT_FAILURE; + } std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; std::cout << "Done\n"; From a1e05752f605796a8541ea7807222ad983dc5698 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Thu, 12 Mar 2026 14:25:06 -0400 Subject: [PATCH 3/9] Implement conversion operators between types --- include/boost/int128/detail/conversions.hpp | 14 ++++++++++++++ include/boost/int128/detail/int128_imp.hpp | 1 + include/boost/int128/detail/uint128_imp.hpp | 1 + 3 files changed, 16 insertions(+) diff --git a/include/boost/int128/detail/conversions.hpp b/include/boost/int128/detail/conversions.hpp index a549db03..f471d570 100644 --- a/include/boost/int128/detail/conversions.hpp +++ b/include/boost/int128/detail/conversions.hpp @@ -38,6 +38,20 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t::uint128_t(const int128_t& v) noexc #endif // BOOST_INT128_ENDIAN_LITTLE_BYTE +//===================================== +// Conversion Operators +//===================================== + +BOOST_INT128_HOST_DEVICE constexpr int128_t::operator uint128_t() const noexcept +{ + return uint128_t{static_cast(this->high), static_cast(this->low)}; +} + +BOOST_INT128_HOST_DEVICE constexpr uint128_t::operator int128_t() const noexcept +{ + return int128_t{static_cast(this->high), static_cast(this->low)}; +} + //===================================== // Comparison Operators //===================================== diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp index d2bdfa62..4e5fcc11 100644 --- a/include/boost/int128/detail/int128_imp.hpp +++ b/include/boost/int128/detail/int128_imp.hpp @@ -57,6 +57,7 @@ int128_t // Requires a conversion file to be implemented BOOST_INT128_HOST_DEVICE explicit constexpr int128_t(const uint128_t& v) noexcept; + BOOST_INT128_HOST_DEVICE explicit constexpr operator uint128_t() const noexcept; // Construct from integral types #if BOOST_INT128_ENDIAN_LITTLE_BYTE diff --git a/include/boost/int128/detail/uint128_imp.hpp b/include/boost/int128/detail/uint128_imp.hpp index 26aa9370..fdd0b309 100644 --- a/include/boost/int128/detail/uint128_imp.hpp +++ b/include/boost/int128/detail/uint128_imp.hpp @@ -58,6 +58,7 @@ uint128_t // Requires a conversion file to be implemented BOOST_INT128_HOST_DEVICE explicit constexpr uint128_t(const int128_t& v) noexcept; + BOOST_INT128_HOST_DEVICE explicit constexpr operator int128_t() const noexcept; // Construct from integral types #if BOOST_INT128_ENDIAN_LITTLE_BYTE From 39e43140175ac8b9895d3ce749e34337ee5c634e Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 13 Mar 2026 08:36:25 -0500 Subject: [PATCH 4/9] Remove forward declarations --- include/boost/int128/numeric.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp index 2375154f..33111e46 100644 --- a/include/boost/int128/numeric.hpp +++ b/include/boost/int128/numeric.hpp @@ -76,9 +76,6 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u return z; } -BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(int128_t x, int128_t y) noexcept; -BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(int128_t x, int128_t y) noexcept; - #ifdef _MSC_VER # pragma warning(push) # pragma warning(disable : 4307) // Addition Overflow From f188a38b8df6dc256464d111d916edf83b1edb16 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 13 Mar 2026 08:38:54 -0500 Subject: [PATCH 5/9] Avoid undefined rollover --- include/boost/int128/numeric.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp index 33111e46..0665887f 100644 --- a/include/boost/int128/numeric.hpp +++ b/include/boost/int128/numeric.hpp @@ -115,7 +115,9 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int1 if (x <= 0 && y >= 0) { // Underflow case - const auto res {x - y}; + const auto big_x {static_cast(x)}; + const auto big_y {static_cast(y)}; + const auto res {static_cast(big_x - big_y)}; return res > x ? (std::numeric_limits::min)() : res; } else if (x > 0 && y < 0) From 2d1fd23d1ea9ce469a96fdde9759d54dbb2eabf9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 13 Mar 2026 11:15:45 -0500 Subject: [PATCH 6/9] Improve algorithms --- include/boost/int128/numeric.hpp | 68 ++++++++++++++------------------ 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp index 0665887f..23832a5c 100644 --- a/include/boost/int128/numeric.hpp +++ b/include/boost/int128/numeric.hpp @@ -84,56 +84,40 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept { - if (x >= 0 && y >= 0) - { - constexpr auto max_value {static_cast((std::numeric_limits::max)())}; - const auto big_x {static_cast(x)}; - const auto big_y {static_cast(y)}; - const auto big_res {big_x + big_y}; + const auto result {x + y}; - return big_res > max_value ? (std::numeric_limits::max)() : static_cast(big_res); - } - else if ((x < 0 && y > 0) || (x > 0 && y < 0)) + // Positive overflow: both non-negative but result wrapped to negative + if (x.high >= 0 && y.high >= 0 && result.high < 0) { - return x + y; + return (std::numeric_limits::max)(); } - else + + // Negative overflow: both negative but result wrapped to non-negative + if (x.high < 0 && y.high < 0 && result.high >= 0) { - // x < 0 and y < 0 - // Nearly the same technique as the positive values case - constexpr auto max_value {-static_cast((std::numeric_limits::min)())}; - const auto big_x {static_cast(abs(x))}; - const auto big_y {static_cast(abs(y))}; - const auto big_res {big_x + big_y}; - - return big_res > max_value ? (std::numeric_limits::min)() : -static_cast(big_res); + return (std::numeric_limits::min)(); } + + return result; } BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept { - if (x <= 0 && y >= 0) + const auto result {x - y}; + + // Positive overflow: positive minus negative but result wrapped to negative + if (x.high >= 0 && y.high < 0 && result.high < 0) { - // Underflow case - const auto big_x {static_cast(x)}; - const auto big_y {static_cast(y)}; - const auto res {static_cast(big_x - big_y)}; - return res > x ? (std::numeric_limits::min)() : res; + return (std::numeric_limits::max)(); } - else if (x > 0 && y < 0) - { - // Overflow Case - constexpr auto max_val {static_cast((std::numeric_limits::max)())}; - const auto big_x {static_cast(x)}; - const auto big_y {-static_cast(y)}; - const auto res {big_x + big_y}; - return (res > max_val || res < big_x) ? (std::numeric_limits::max)() : static_cast(res); - } - else + // Negative overflow: negative minus non-negative but result wrapped to non-negative + if (x.high < 0 && y.high >= 0 && result.high >= 0) { - return x - y; + return (std::numeric_limits::min)(); } + + return result; } #ifdef _MSC_VER @@ -403,11 +387,17 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t midpoint(const int128_t a, const int // For signed integers, we use a + (b - a) / 2 or a - (a - b) / 2 // The subtraction is done in unsigned arithmetic to handle overflow correctly // Integer division automatically rounds toward the first argument + // + // Use direct field access for both the uint128 construction and the + // comparison to avoid NVCC host compiler issues with operator<= and + // static_cast on int128_t for large-magnitude values + + const uint128_t ua {static_cast(a.high), a.low}; + const uint128_t ub {static_cast(b.high), b.low}; - const auto ua {static_cast(a)}; - const auto ub {static_cast(b)}; + const bool a_le_b {a.high == b.high ? a.low <= b.low : a.high < b.high}; - if (a <= b) + if (a_le_b) { // diff = b - a (computed in unsigned, handles wrap-around correctly) const auto diff {ub - ua}; From 3a8b5c36f8f0c24c0c2880292e4bb7b971b791cd Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 13 Mar 2026 11:18:23 -0500 Subject: [PATCH 7/9] Expand testing ranges --- test/test_signed_eq.cu | 2 +- test/test_signed_ge.cu | 2 +- test/test_signed_gt.cu | 2 +- test/test_signed_le.cu | 2 +- test/test_signed_lt.cu | 2 +- test/test_signed_ne.cu | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/test/test_signed_eq.cu b/test/test_signed_eq.cu index ac09ae44..4f7156c1 100644 --- a/test/test_signed_eq.cu +++ b/test/test_signed_eq.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_ge.cu b/test/test_signed_ge.cu index af5628fb..efe510ea 100644 --- a/test/test_signed_ge.cu +++ b/test/test_signed_ge.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_gt.cu b/test/test_signed_gt.cu index f37577b8..820b0797 100644 --- a/test/test_signed_gt.cu +++ b/test/test_signed_gt.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_le.cu b/test/test_signed_le.cu index 05ee992a..d2d67ce6 100644 --- a/test/test_signed_le.cu +++ b/test/test_signed_le.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_lt.cu b/test/test_signed_lt.cu index c356ae00..c4094c4d 100644 --- a/test/test_signed_lt.cu +++ b/test/test_signed_lt.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_ne.cu b/test/test_signed_ne.cu index 39f9d64a..6c34a111 100644 --- a/test/test_signed_ne.cu +++ b/test/test_signed_ne.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); From 41d230381d6fc2e9a04c4bda033b60633eeb160d Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 13 Mar 2026 11:23:33 -0500 Subject: [PATCH 8/9] Update deprecated actions --- .github/workflows/codecov.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index bd2f7c8d..c1486f38 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -80,13 +80,13 @@ jobs: fi git config --global pack.threads 0 - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # For coverage builds fetch the whole history, else only 1 commit using a 'fake ternary' fetch-depth: ${{ matrix.coverage && '0' || '1' }} - name: Cache ccache - uses: actions/cache@v3 + uses: actions/cache@v4 if: env.B2_USE_CCACHE with: path: ~/.ccache @@ -94,7 +94,7 @@ jobs: restore-keys: ${{matrix.os}}-${{matrix.container}}-${{matrix.compiler}}- - name: Fetch Boost.CI - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: boostorg/boost-ci ref: master From 705630c6f9745ecf121b8a6aa4945d34152312b9 Mon Sep 17 00:00:00 2001 From: Matt Borland Date: Fri, 13 Mar 2026 11:28:53 -0500 Subject: [PATCH 9/9] Further avoidance of UB --- include/boost/int128/numeric.hpp | 48 ++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp index 23832a5c..0699cd8b 100644 --- a/include/boost/int128/numeric.hpp +++ b/include/boost/int128/numeric.hpp @@ -84,40 +84,52 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept { - const auto result {x + y}; + // Detect overflow BEFORE the addition to avoid signed overflow UB. + // When both are non-negative: overflow iff x > max - y (subtraction safe: max - non_negative >= 0) + // When both are negative: overflow iff x < min - y (subtraction safe: min - negative > min) + // Mixed signs: overflow is impossible. - // Positive overflow: both non-negative but result wrapped to negative - if (x.high >= 0 && y.high >= 0 && result.high < 0) + if (x.high >= 0 && y.high >= 0) { - return (std::numeric_limits::max)(); + if (x > (std::numeric_limits::max)() - y) + { + return (std::numeric_limits::max)(); + } } - - // Negative overflow: both negative but result wrapped to non-negative - if (x.high < 0 && y.high < 0 && result.high >= 0) + else if (x.high < 0 && y.high < 0) { - return (std::numeric_limits::min)(); + if (x < (std::numeric_limits::min)() - y) + { + return (std::numeric_limits::min)(); + } } - return result; + return x + y; } BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept { - const auto result {x - y}; + // Detect overflow BEFORE the subtraction to avoid signed overflow UB. + // Positive overflow: x >= 0 and y < 0 and x > max + y (safe: max + negative < max) + // Negative overflow: x < 0 and y >= 0 and x < min + y (safe: min + non_negative > min) + // Same signs: overflow is impossible. - // Positive overflow: positive minus negative but result wrapped to negative - if (x.high >= 0 && y.high < 0 && result.high < 0) + if (x.high >= 0 && y.high < 0) { - return (std::numeric_limits::max)(); + if (x > (std::numeric_limits::max)() + y) + { + return (std::numeric_limits::max)(); + } } - - // Negative overflow: negative minus non-negative but result wrapped to non-negative - if (x.high < 0 && y.high >= 0 && result.high >= 0) + else if (x.high < 0 && y.high >= 0) { - return (std::numeric_limits::min)(); + if (x < (std::numeric_limits::min)() + y) + { + return (std::numeric_limits::min)(); + } } - return result; + return x - y; } #ifdef _MSC_VER