diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index bd2f7c8d..c1486f38 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -80,13 +80,13 @@ jobs: fi git config --global pack.threads 0 - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: # For coverage builds fetch the whole history, else only 1 commit using a 'fake ternary' fetch-depth: ${{ matrix.coverage && '0' || '1' }} - name: Cache ccache - uses: actions/cache@v3 + uses: actions/cache@v4 if: env.B2_USE_CCACHE with: path: ~/.ccache @@ -94,7 +94,7 @@ jobs: restore-keys: ${{matrix.os}}-${{matrix.container}}-${{matrix.compiler}}- - name: Fetch Boost.CI - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: repository: boostorg/boost-ci ref: master diff --git a/include/boost/int128/detail/conversions.hpp b/include/boost/int128/detail/conversions.hpp index a549db03..f471d570 100644 --- a/include/boost/int128/detail/conversions.hpp +++ b/include/boost/int128/detail/conversions.hpp @@ -38,6 +38,20 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t::uint128_t(const int128_t& v) noexc #endif // BOOST_INT128_ENDIAN_LITTLE_BYTE +//===================================== +// Conversion Operators +//===================================== + +BOOST_INT128_HOST_DEVICE constexpr int128_t::operator uint128_t() const noexcept +{ + return uint128_t{static_cast(this->high), static_cast(this->low)}; +} + +BOOST_INT128_HOST_DEVICE constexpr uint128_t::operator int128_t() const noexcept +{ + return int128_t{static_cast(this->high), static_cast(this->low)}; +} + //===================================== // Comparison Operators //===================================== diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp index d2bdfa62..4e5fcc11 100644 --- a/include/boost/int128/detail/int128_imp.hpp +++ b/include/boost/int128/detail/int128_imp.hpp @@ -57,6 +57,7 @@ int128_t // Requires a conversion file to be implemented BOOST_INT128_HOST_DEVICE explicit constexpr int128_t(const uint128_t& v) noexcept; + BOOST_INT128_HOST_DEVICE explicit constexpr operator uint128_t() const noexcept; // Construct from integral types #if BOOST_INT128_ENDIAN_LITTLE_BYTE diff --git a/include/boost/int128/detail/uint128_imp.hpp b/include/boost/int128/detail/uint128_imp.hpp index 26aa9370..fdd0b309 100644 --- a/include/boost/int128/detail/uint128_imp.hpp +++ b/include/boost/int128/detail/uint128_imp.hpp @@ -58,6 +58,7 @@ uint128_t // Requires a conversion file to be implemented BOOST_INT128_HOST_DEVICE explicit constexpr uint128_t(const int128_t& v) noexcept; + BOOST_INT128_HOST_DEVICE explicit constexpr operator int128_t() const noexcept; // Construct from integral types #if BOOST_INT128_ENDIAN_LITTLE_BYTE diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp index 2375154f..0699cd8b 100644 --- a/include/boost/int128/numeric.hpp +++ b/include/boost/int128/numeric.hpp @@ -76,9 +76,6 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u return z; } -BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(int128_t x, int128_t y) noexcept; -BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(int128_t x, int128_t y) noexcept; - #ifdef _MSC_VER # pragma warning(push) # pragma warning(disable : 4307) // Addition Overflow @@ -87,54 +84,52 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(int128_t BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept { - if (x >= 0 && y >= 0) - { - constexpr auto max_value {static_cast((std::numeric_limits::max)())}; - const auto big_x {static_cast(x)}; - const auto big_y {static_cast(y)}; - const auto big_res {big_x + big_y}; + // Detect overflow BEFORE the addition to avoid signed overflow UB. + // When both are non-negative: overflow iff x > max - y (subtraction safe: max - non_negative >= 0) + // When both are negative: overflow iff x < min - y (subtraction safe: min - negative > min) + // Mixed signs: overflow is impossible. - return big_res > max_value ? (std::numeric_limits::max)() : static_cast(big_res); - } - else if ((x < 0 && y > 0) || (x > 0 && y < 0)) + if (x.high >= 0 && y.high >= 0) { - return x + y; + if (x > (std::numeric_limits::max)() - y) + { + return (std::numeric_limits::max)(); + } } - else + else if (x.high < 0 && y.high < 0) { - // x < 0 and y < 0 - // Nearly the same technique as the positive values case - constexpr auto max_value {-static_cast((std::numeric_limits::min)())}; - const auto big_x {static_cast(abs(x))}; - const auto big_y {static_cast(abs(y))}; - const auto big_res {big_x + big_y}; - - return big_res > max_value ? (std::numeric_limits::min)() : -static_cast(big_res); + if (x < (std::numeric_limits::min)() - y) + { + return (std::numeric_limits::min)(); + } } + + return x + y; } BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept { - if (x <= 0 && y >= 0) - { - // Underflow case - const auto res {x - y}; - return res > x ? (std::numeric_limits::min)() : res; - } - else if (x > 0 && y < 0) - { - // Overflow Case - constexpr auto max_val {static_cast((std::numeric_limits::max)())}; - const auto big_x {static_cast(x)}; - const auto big_y {-static_cast(y)}; - const auto res {big_x + big_y}; + // Detect overflow BEFORE the subtraction to avoid signed overflow UB. + // Positive overflow: x >= 0 and y < 0 and x > max + y (safe: max + negative < max) + // Negative overflow: x < 0 and y >= 0 and x < min + y (safe: min + non_negative > min) + // Same signs: overflow is impossible. - return (res > max_val || res < big_x) ? (std::numeric_limits::max)() : static_cast(res); + if (x.high >= 0 && y.high < 0) + { + if (x > (std::numeric_limits::max)() + y) + { + return (std::numeric_limits::max)(); + } } - else + else if (x.high < 0 && y.high >= 0) { - return x - y; + if (x < (std::numeric_limits::min)() + y) + { + return (std::numeric_limits::min)(); + } } + + return x - y; } #ifdef _MSC_VER @@ -404,11 +399,17 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t midpoint(const int128_t a, const int // For signed integers, we use a + (b - a) / 2 or a - (a - b) / 2 // The subtraction is done in unsigned arithmetic to handle overflow correctly // Integer division automatically rounds toward the first argument + // + // Use direct field access for both the uint128 construction and the + // comparison to avoid NVCC host compiler issues with operator<= and + // static_cast on int128_t for large-magnitude values + + const uint128_t ua {static_cast(a.high), a.low}; + const uint128_t ub {static_cast(b.high), b.low}; - const auto ua {static_cast(a)}; - const auto ub {static_cast(b)}; + const bool a_le_b {a.high == b.high ? a.low <= b.low : a.high < b.high}; - if (a <= b) + if (a_le_b) { // diff = b - a (computed in unsigned, handles wrap-around correctly) const auto diff {ub - ua}; diff --git a/test/cuda_jamfile b/test/cuda_jamfile index bac4ab2b..82e2c8ee 100644 --- a/test/cuda_jamfile +++ b/test/cuda_jamfile @@ -64,3 +64,18 @@ run test_signed_cstdlib_div.cu ; run test_signed_to_unsigned_conversion.cu ; run test_unsigned_to_signed_conversion.cu ; + +run test_unsigned_add_sat.cu ; +run test_signed_add_sat.cu ; +run test_unsigned_sub_sat.cu ; +run test_signed_sub_sat.cu ; +run test_unsigned_mul_sat.cu ; +run test_signed_mul_sat.cu ; +run test_unsigned_div_sat.cu ; +run test_signed_div_sat.cu ; +run test_unsigned_gcd.cu ; +run test_signed_gcd.cu ; +run test_unsigned_lcm.cu ; +run test_signed_lcm.cu ; +run test_unsigned_midpoint.cu ; +run test_signed_midpoint.cu ; diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu new file mode 100644 index 00000000..45b45116 --- /dev/null +++ b/test/test_signed_add_sat.cu @@ -0,0 +1,106 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::add_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + int fail_count = 0; + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + if (fail_count < 5) + { + std::cerr << "Result verification failed at element " << i << std::endl; + std::cerr << " input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl; + std::cerr << " input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl; + std::cerr << " GPU high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl; + std::cerr << " CPU high: " << results[i].high << " low: " << results[i].low << std::endl; + } + ++fail_count; + } + } + if (fail_count > 0) + { + std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_div_sat.cu b/test/test_signed_div_sat.cu new file mode 100644 index 00000000..804d4dc9 --- /dev/null +++ b/test/test_signed_div_sat.cu @@ -0,0 +1,96 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::div_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + do + { + input_vector2[i] = dist(rng); + } while (input_vector2[i] == 0); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_eq.cu b/test/test_signed_eq.cu index ac09ae44..4f7156c1 100644 --- a/test/test_signed_eq.cu +++ b/test/test_signed_eq.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_gcd.cu b/test/test_signed_gcd.cu new file mode 100644 index 00000000..7d5c8434 --- /dev/null +++ b/test/test_signed_gcd.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::gcd(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_ge.cu b/test/test_signed_ge.cu index af5628fb..efe510ea 100644 --- a/test/test_signed_ge.cu +++ b/test/test_signed_ge.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_gt.cu b/test/test_signed_gt.cu index f37577b8..820b0797 100644 --- a/test/test_signed_gt.cu +++ b/test/test_signed_gt.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_lcm.cu b/test/test_signed_lcm.cu new file mode 100644 index 00000000..bafe559d --- /dev/null +++ b/test/test_signed_lcm.cu @@ -0,0 +1,94 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::lcm(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + // Use smaller values to avoid overflow in lcm computation + boost::random::uniform_int_distribution dist {test_type{-1, UINT64_MAX}, test_type{0, UINT64_MAX}}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::lcm(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_le.cu b/test/test_signed_le.cu index 05ee992a..d2d67ce6 100644 --- a/test/test_signed_le.cu +++ b/test/test_signed_le.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_lt.cu b/test/test_signed_lt.cu index c356ae00..c4094c4d 100644 --- a/test/test_signed_lt.cu +++ b/test/test_signed_lt.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_midpoint.cu b/test/test_signed_midpoint.cu new file mode 100644 index 00000000..5ee28d71 --- /dev/null +++ b/test/test_signed_midpoint.cu @@ -0,0 +1,106 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::midpoint(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::midpoint(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + int fail_count = 0; + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + if (fail_count < 5) + { + std::cerr << "Result verification failed at element " << i << std::endl; + std::cerr << " input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl; + std::cerr << " input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl; + std::cerr << " GPU high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl; + std::cerr << " CPU high: " << results[i].high << " low: " << results[i].low << std::endl; + } + ++fail_count; + } + } + if (fail_count > 0) + { + std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_mul_sat.cu b/test/test_signed_mul_sat.cu new file mode 100644 index 00000000..569e583c --- /dev/null +++ b/test/test_signed_mul_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::mul_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_signed_ne.cu b/test/test_signed_ne.cu index 39f9d64a..6c34a111 100644 --- a/test/test_signed_ne.cu +++ b/test/test_signed_ne.cu @@ -58,7 +58,7 @@ int main(void) cuda_managed_ptr output_vector(numElements); // Initialize the input vectors - boost::random::uniform_int_distribution dist {test_type{(std::numeric_limits::min)()} + 1, test_type{(std::numeric_limits::max)()} - 1}; + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)() + 1, (std::numeric_limits::max)() - 1}; for (std::size_t i = 0; i < numElements; ++i) { input_vector1[i] = dist(rng); diff --git a/test/test_signed_sub_sat.cu b/test/test_signed_sub_sat.cu new file mode 100644 index 00000000..7dd40f30 --- /dev/null +++ b/test/test_signed_sub_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::int128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::sub_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {(std::numeric_limits::min)(), (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_add_sat.cu b/test/test_unsigned_add_sat.cu new file mode 100644 index 00000000..3cfc0317 --- /dev/null +++ b/test/test_unsigned_add_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::add_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_div_sat.cu b/test/test_unsigned_div_sat.cu new file mode 100644 index 00000000..9f76b869 --- /dev/null +++ b/test/test_unsigned_div_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::div_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{1U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_gcd.cu b/test/test_unsigned_gcd.cu new file mode 100644 index 00000000..f23abe48 --- /dev/null +++ b/test/test_unsigned_gcd.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::gcd(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_lcm.cu b/test/test_unsigned_lcm.cu new file mode 100644 index 00000000..d586d58b --- /dev/null +++ b/test/test_unsigned_lcm.cu @@ -0,0 +1,94 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::lcm(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + // Use smaller values to avoid overflow in lcm computation + boost::random::uniform_int_distribution dist {test_type{0U}, test_type{0U, UINT64_MAX}}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::lcm(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_midpoint.cu b/test/test_unsigned_midpoint.cu new file mode 100644 index 00000000..e695b5ff --- /dev/null +++ b/test/test_unsigned_midpoint.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::midpoint(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::midpoint(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_mul_sat.cu b/test/test_unsigned_mul_sat.cu new file mode 100644 index 00000000..228ef806 --- /dev/null +++ b/test/test_unsigned_mul_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::mul_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +} diff --git a/test/test_unsigned_sub_sat.cu b/test/test_unsigned_sub_sat.cu new file mode 100644 index 00000000..73bf36d7 --- /dev/null +++ b/test/test_unsigned_sub_sat.cu @@ -0,0 +1,93 @@ +// Copyright Matt Borland 2026. +// Use, modification and distribution are subject to the +// Boost Software License, Version 1.0. (See accompanying file +// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#define BOOST_INT128_ALLOW_SIGN_CONVERSION + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cuda_managed_ptr.hpp" +#include "stopwatch.hpp" + +#include + +using test_type = boost::int128::uint128_t; + +__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < numElements) + { + out[i] = boost::int128::sub_sat(in[i], in2[i]); + } +} + +int main(void) +{ + std::mt19937_64 rng {42}; + + cudaError_t err = cudaSuccess; + + int numElements = 50000; + std::cout << "[Vector operation on " << numElements << " elements]" << std::endl; + + cuda_managed_ptr input_vector(numElements); + cuda_managed_ptr input_vector2(numElements); + cuda_managed_ptr output_vector(numElements); + + boost::random::uniform_int_distribution dist {test_type{0U}, (std::numeric_limits::max)()}; + for (std::size_t i = 0; i < numElements; ++i) + { + input_vector[i] = dist(rng); + input_vector2[i] = dist(rng); + } + + int threadsPerBlock = 256; + int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl; + + watch w; + + cuda_test<<>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements); + cudaDeviceSynchronize(); + + std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl; + + err = cudaGetLastError(); + if (err != cudaSuccess) + { + std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl; + return EXIT_FAILURE; + } + + std::vector results; + results.reserve(numElements); + w.reset(); + for (int i = 0; i < numElements; ++i) + { + results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i])); + } + double t = w.elapsed(); + + for (int i = 0; i < numElements; ++i) + { + if (output_vector[i] != results[i]) + { + std::cerr << "Result verification failed at element " << i << "!" << std::endl; + return EXIT_FAILURE; + } + } + + std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl; + std::cout << "Done\n"; + + return 0; +}