From 1d786e31ba0f0ddfe806a55aec1b86e94231b8f0 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 11 Mar 2026 13:40:38 -0400
Subject: [PATCH 1/9] Add CUDA testing of <numeric> functions

---
 test/cuda_jamfile              | 15 ++++++
 test/test_signed_add_sat.cu    | 93 ++++++++++++++++++++++++++++++++
 test/test_signed_div_sat.cu    | 96 ++++++++++++++++++++++++++++++++++
 test/test_signed_gcd.cu        | 93 ++++++++++++++++++++++++++++++++
 test/test_signed_lcm.cu        | 94 +++++++++++++++++++++++++++++++++
 test/test_signed_midpoint.cu   | 93 ++++++++++++++++++++++++++++++++
 test/test_signed_mul_sat.cu    | 93 ++++++++++++++++++++++++++++++++
 test/test_signed_sub_sat.cu    | 93 ++++++++++++++++++++++++++++++++
 test/test_unsigned_add_sat.cu  | 93 ++++++++++++++++++++++++++++++++
 test/test_unsigned_div_sat.cu  | 93 ++++++++++++++++++++++++++++++++
 test/test_unsigned_gcd.cu      | 93 ++++++++++++++++++++++++++++++++
 test/test_unsigned_lcm.cu      | 94 +++++++++++++++++++++++++++++++++
 test/test_unsigned_midpoint.cu | 93 ++++++++++++++++++++++++++++++++
 test/test_unsigned_mul_sat.cu  | 93 ++++++++++++++++++++++++++++++++
 test/test_unsigned_sub_sat.cu  | 93 ++++++++++++++++++++++++++++++++
 15 files changed, 1322 insertions(+)
 create mode 100644 test/test_signed_add_sat.cu
 create mode 100644 test/test_signed_div_sat.cu
 create mode 100644 test/test_signed_gcd.cu
 create mode 100644 test/test_signed_lcm.cu
 create mode 100644 test/test_signed_midpoint.cu
 create mode 100644 test/test_signed_mul_sat.cu
 create mode 100644 test/test_signed_sub_sat.cu
 create mode 100644 test/test_unsigned_add_sat.cu
 create mode 100644 test/test_unsigned_div_sat.cu
 create mode 100644 test/test_unsigned_gcd.cu
 create mode 100644 test/test_unsigned_lcm.cu
 create mode 100644 test/test_unsigned_midpoint.cu
 create mode 100644 test/test_unsigned_mul_sat.cu
 create mode 100644 test/test_unsigned_sub_sat.cu
diff --git a/test/cuda_jamfile b/test/cuda_jamfile
index bac4ab2b..82e2c8ee 100644
--- a/test/cuda_jamfile
+++ b/test/cuda_jamfile
@@ -64,3 +64,18 @@ run test_signed_cstdlib_div.cu ;
 
 run test_signed_to_unsigned_conversion.cu ;
 run test_unsigned_to_signed_conversion.cu ;
+
+run test_unsigned_add_sat.cu ;
+run test_signed_add_sat.cu ;
+run test_unsigned_sub_sat.cu ;
+run test_signed_sub_sat.cu ;
+run test_unsigned_mul_sat.cu ;
+run test_signed_mul_sat.cu ;
+run test_unsigned_div_sat.cu ;
+run test_signed_div_sat.cu ;
+run test_unsigned_gcd.cu ;
+run test_signed_gcd.cu ;
+run test_unsigned_lcm.cu ;
+run test_signed_lcm.cu ;
+run test_unsigned_midpoint.cu ;
+run test_signed_midpoint.cu ;
diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu
new file mode 100644
index 00000000..a86457f9
--- /dev/null
+++ b/test/test_signed_add_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::add_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_div_sat.cu b/test/test_signed_div_sat.cu
new file mode 100644
index 00000000..804d4dc9
--- /dev/null
+++ b/test/test_signed_div_sat.cu
@@ -0,0 +1,96 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::div_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        do
+        {
+            input_vector2[i] = dist(rng);
+        } while (input_vector2[i] == 0);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_gcd.cu b/test/test_signed_gcd.cu
new file mode 100644
index 00000000..7d5c8434
--- /dev/null
+++ b/test/test_signed_gcd.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::gcd(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_lcm.cu b/test/test_signed_lcm.cu
new file mode 100644
index 00000000..bafe559d
--- /dev/null
+++ b/test/test_signed_lcm.cu
@@ -0,0 +1,94 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::lcm(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Use smaller values to avoid overflow in lcm computation
+    boost::random::uniform_int_distribution<test_type> dist {test_type{-1, UINT64_MAX}, test_type{0, UINT64_MAX}};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::lcm(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_midpoint.cu b/test/test_signed_midpoint.cu
new file mode 100644
index 00000000..803ba974
--- /dev/null
+++ b/test/test_signed_midpoint.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::midpoint(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::midpoint(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_mul_sat.cu b/test/test_signed_mul_sat.cu
new file mode 100644
index 00000000..569e583c
--- /dev/null
+++ b/test/test_signed_mul_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::mul_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_signed_sub_sat.cu b/test/test_signed_sub_sat.cu
new file mode 100644
index 00000000..7dd40f30
--- /dev/null
+++ b/test/test_signed_sub_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::sub_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_add_sat.cu b/test/test_unsigned_add_sat.cu
new file mode 100644
index 00000000..3cfc0317
--- /dev/null
+++ b/test/test_unsigned_add_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::add_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_div_sat.cu b/test/test_unsigned_div_sat.cu
new file mode 100644
index 00000000..9f76b869
--- /dev/null
+++ b/test/test_unsigned_div_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::div_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{1U}, (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::div_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_gcd.cu b/test/test_unsigned_gcd.cu
new file mode 100644
index 00000000..f23abe48
--- /dev/null
+++ b/test/test_unsigned_gcd.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::gcd(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::gcd(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_lcm.cu b/test/test_unsigned_lcm.cu
new file mode 100644
index 00000000..d586d58b
--- /dev/null
+++ b/test/test_unsigned_lcm.cu
@@ -0,0 +1,94 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::lcm(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    // Use smaller values to avoid overflow in lcm computation
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, test_type{0U, UINT64_MAX}};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::lcm(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_midpoint.cu b/test/test_unsigned_midpoint.cu
new file mode 100644
index 00000000..e695b5ff
--- /dev/null
+++ b/test/test_unsigned_midpoint.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::midpoint(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::midpoint(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_mul_sat.cu b/test/test_unsigned_mul_sat.cu
new file mode 100644
index 00000000..228ef806
--- /dev/null
+++ b/test/test_unsigned_mul_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::mul_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::mul_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}
diff --git a/test/test_unsigned_sub_sat.cu b/test/test_unsigned_sub_sat.cu
new file mode 100644
index 00000000..73bf36d7
--- /dev/null
+++ b/test/test_unsigned_sub_sat.cu
@@ -0,0 +1,93 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::uint128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::sub_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {test_type{0U}, (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::sub_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
+            return EXIT_FAILURE;
+        }
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}

From 0796f36fdeb8eb026850bd2cf7ac3ecb51f4a8a6 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Wed, 11 Mar 2026 13:56:10 -0400
Subject: [PATCH 2/9] Improve diagnostics

---
 test/test_signed_add_sat.cu  | 17 +++++++++++++++--
 test/test_signed_midpoint.cu | 17 +++++++++++++++--
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu
index a86457f9..45b45116 100644
--- a/test/test_signed_add_sat.cu
+++ b/test/test_signed_add_sat.cu
@@ -77,14 +77,27 @@ int main(void)
     }
     double t = w.elapsed();
 
+    int fail_count = 0;
     for (int i = 0; i < numElements; ++i)
     {
         if (output_vector[i] != results[i])
         {
-            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
-            return EXIT_FAILURE;
+            if (fail_count < 5)
+            {
+                std::cerr << "Result verification failed at element " << i << std::endl;
+                std::cerr << "  input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl;
+                std::cerr << "  input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl;
+                std::cerr << "  GPU    high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl;
+                std::cerr << "  CPU    high: " << results[i].high << " low: " << results[i].low << std::endl;
+            }
+            ++fail_count;
         }
     }
+    if (fail_count > 0)
+    {
+        std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl;
+        return EXIT_FAILURE;
+    }
 
     std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
     std::cout << "Done\n";
diff --git a/test/test_signed_midpoint.cu b/test/test_signed_midpoint.cu
index 803ba974..5ee28d71 100644
--- a/test/test_signed_midpoint.cu
+++ b/test/test_signed_midpoint.cu
@@ -77,14 +77,27 @@ int main(void)
     }
     double t = w.elapsed();
 
+    int fail_count = 0;
     for (int i = 0; i < numElements; ++i)
     {
         if (output_vector[i] != results[i])
         {
-            std::cerr << "Result verification failed at element " << i << "!" << std::endl;
-            return EXIT_FAILURE;
+            if (fail_count < 5)
+            {
+                std::cerr << "Result verification failed at element " << i << std::endl;
+                std::cerr << "  input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl;
+                std::cerr << "  input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl;
+                std::cerr << "  GPU    high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl;
+                std::cerr << "  CPU    high: " << results[i].high << " low: " << results[i].low << std::endl;
+            }
+            ++fail_count;
         }
     }
+    if (fail_count > 0)
+    {
+        std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl;
+        return EXIT_FAILURE;
+    }
 
     std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
     std::cout << "Done\n";

From a1e05752f605796a8541ea7807222ad983dc5698 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Thu, 12 Mar 2026 14:25:06 -0400
Subject: [PATCH 3/9] Implement conversion operators between types

---
 include/boost/int128/detail/conversions.hpp | 14 ++++++++++++++
 include/boost/int128/detail/int128_imp.hpp  |  1 +
 include/boost/int128/detail/uint128_imp.hpp |  1 +
 3 files changed, 16 insertions(+)

diff --git a/include/boost/int128/detail/conversions.hpp b/include/boost/int128/detail/conversions.hpp
index a549db03..f471d570 100644
--- a/include/boost/int128/detail/conversions.hpp
+++ b/include/boost/int128/detail/conversions.hpp
@@ -38,6 +38,20 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t::uint128_t(const int128_t& v) noexc
 
 #endif // BOOST_INT128_ENDIAN_LITTLE_BYTE
 
+//=====================================
+// Conversion Operators
+//=====================================
+
+BOOST_INT128_HOST_DEVICE constexpr int128_t::operator uint128_t() const noexcept
+{
+    return uint128_t{static_cast<std::uint64_t>(this->high), static_cast<std::uint64_t>(this->low)};
+}
+
+BOOST_INT128_HOST_DEVICE constexpr uint128_t::operator int128_t() const noexcept
+{
+    return int128_t{static_cast<std::int64_t>(this->high), static_cast<std::uint64_t>(this->low)};
+}
+
 //=====================================
 // Comparison Operators
 //=====================================
diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp
index d2bdfa62..4e5fcc11 100644
--- a/include/boost/int128/detail/int128_imp.hpp
+++ b/include/boost/int128/detail/int128_imp.hpp
@@ -57,6 +57,7 @@ int128_t
 
     // Requires a conversion file to be implemented
     BOOST_INT128_HOST_DEVICE explicit constexpr int128_t(const uint128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator uint128_t() const noexcept;
 
     // Construct from integral types
     #if BOOST_INT128_ENDIAN_LITTLE_BYTE
diff --git a/include/boost/int128/detail/uint128_imp.hpp b/include/boost/int128/detail/uint128_imp.hpp
index 26aa9370..fdd0b309 100644
--- a/include/boost/int128/detail/uint128_imp.hpp
+++ b/include/boost/int128/detail/uint128_imp.hpp
@@ -58,6 +58,7 @@ uint128_t
 
     // Requires a conversion file to be implemented
     BOOST_INT128_HOST_DEVICE explicit constexpr uint128_t(const int128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator int128_t() const noexcept;
 
     // Construct from integral types
     #if BOOST_INT128_ENDIAN_LITTLE_BYTE

From 39e43140175ac8b9895d3ce749e34337ee5c634e Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 13 Mar 2026 08:36:25 -0500
Subject: [PATCH 4/9] Remove forward declarations

---
 include/boost/int128/numeric.hpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp
index 2375154f..33111e46 100644
--- a/include/boost/int128/numeric.hpp
+++ b/include/boost/int128/numeric.hpp
@@ -76,9 +76,6 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u
     return z;
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(int128_t x, int128_t y) noexcept;
-BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(int128_t x, int128_t y) noexcept;
-
 #ifdef _MSC_VER
 #  pragma warning(push)
 #  pragma warning(disable : 4307) // Addition Overflow

From f188a38b8df6dc256464d111d916edf83b1edb16 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 13 Mar 2026 08:38:54 -0500
Subject: [PATCH 5/9] Avoid undefined rollover

---
 include/boost/int128/numeric.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp
index 33111e46..0665887f 100644
--- a/include/boost/int128/numeric.hpp
+++ b/include/boost/int128/numeric.hpp
@@ -115,7 +115,9 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int1
     if (x <= 0 && y >= 0)
     {
         // Underflow case
-        const auto res {x - y};
+        const auto big_x {static_cast<uint128_t>(x)};
+        const auto big_y {static_cast<uint128_t>(y)};
+        const auto res {static_cast<int128_t>(big_x - big_y)};
         return res > x ? (std::numeric_limits<int128_t>::min)() : res;
     }
     else if (x > 0 && y < 0)

From 2d1fd23d1ea9ce469a96fdde9759d54dbb2eabf9 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 13 Mar 2026 11:15:45 -0500
Subject: [PATCH 6/9] Improve algorithms

---
 include/boost/int128/numeric.hpp | 68 ++++++++++++++------------------
 1 file changed, 29 insertions(+), 39 deletions(-)

diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp
index 0665887f..23832a5c 100644
--- a/include/boost/int128/numeric.hpp
+++ b/include/boost/int128/numeric.hpp
@@ -84,56 +84,40 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u
 
 BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept
 {
-    if (x >= 0 && y >= 0)
-    {
-        constexpr auto max_value {static_cast<uint128_t>((std::numeric_limits<int128_t>::max)())};
-        const auto big_x {static_cast<uint128_t>(x)};
-        const auto big_y {static_cast<uint128_t>(y)};
-        const auto big_res {big_x + big_y};
+    const auto result {x + y};
 
-        return big_res > max_value ? (std::numeric_limits<int128_t>::max)() : static_cast<int128_t>(big_res);
-    }
-    else if ((x < 0 && y > 0) || (x > 0 && y < 0))
+    // Positive overflow: both non-negative but result wrapped to negative
+    if (x.high >= 0 && y.high >= 0 && result.high < 0)
     {
-        return x + y;
+        return (std::numeric_limits<int128_t>::max)();
     }
-    else
+
+    // Negative overflow: both negative but result wrapped to non-negative
+    if (x.high < 0 && y.high < 0 && result.high >= 0)
     {
-        // x < 0 and y < 0
-        // Nearly the same technique as the positive values case
-        constexpr auto max_value {-static_cast<uint128_t>((std::numeric_limits<int128_t>::min)())};
-        const auto big_x {static_cast<uint128_t>(abs(x))};
-        const auto big_y {static_cast<uint128_t>(abs(y))};
-        const auto big_res {big_x + big_y};
-
-        return big_res > max_value ? (std::numeric_limits<int128_t>::min)() : -static_cast<int128_t>(big_res);
+        return (std::numeric_limits<int128_t>::min)();
     }
+
+    return result;
 }
 
 BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept
 {
-    if (x <= 0 && y >= 0)
+    const auto result {x - y};
+
+    // Positive overflow: positive minus negative but result wrapped to negative
+    if (x.high >= 0 && y.high < 0 && result.high < 0)
     {
-        // Underflow case
-        const auto big_x {static_cast<uint128_t>(x)};
-        const auto big_y {static_cast<uint128_t>(y)};
-        const auto res {static_cast<int128_t>(big_x - big_y)};
-        return res > x ? (std::numeric_limits<int128_t>::min)() : res;
+        return (std::numeric_limits<int128_t>::max)();
     }
-    else if (x > 0 && y < 0)
-    {
-        // Overflow Case
-        constexpr auto max_val {static_cast<uint128_t>((std::numeric_limits<int128_t>::max)())};
-        const auto big_x {static_cast<uint128_t>(x)};
-        const auto big_y {-static_cast<uint128_t>(y)};
-        const auto res {big_x + big_y};
 
-        return (res > max_val || res < big_x) ? (std::numeric_limits<int128_t>::max)() : static_cast<int128_t>(res);
-    }
-    else
+    // Negative overflow: negative minus non-negative but result wrapped to non-negative
+    if (x.high < 0 && y.high >= 0 && result.high >= 0)
     {
-        return x - y;
+        return (std::numeric_limits<int128_t>::min)();
     }
+
+    return result;
 }
 
 #ifdef _MSC_VER
@@ -403,11 +387,17 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t midpoint(const int128_t a, const int
     // For signed integers, we use a + (b - a) / 2 or a - (a - b) / 2
     // The subtraction is done in unsigned arithmetic to handle overflow correctly
     // Integer division automatically rounds toward the first argument
+    //
+    // Use direct field access for both the uint128 construction and the
+    // comparison to avoid NVCC host compiler issues with operator<= and
+    // static_cast on int128_t for large-magnitude values
+
+    const uint128_t ua {static_cast<std::uint64_t>(a.high), a.low};
+    const uint128_t ub {static_cast<std::uint64_t>(b.high), b.low};
 
-    const auto ua {static_cast<uint128_t>(a)};
-    const auto ub {static_cast<uint128_t>(b)};
+    const bool a_le_b {a.high == b.high ? a.low <= b.low : a.high < b.high};
 
-    if (a <= b)
+    if (a_le_b)
     {
         // diff = b - a (computed in unsigned, handles wrap-around correctly)
         const auto diff {ub - ua};

From 3a8b5c36f8f0c24c0c2880292e4bb7b971b791cd Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 13 Mar 2026 11:18:23 -0500
Subject: [PATCH 7/9] Expand testing ranges

---
 test/test_signed_eq.cu | 2 +-
 test/test_signed_ge.cu | 2 +-
 test/test_signed_gt.cu | 2 +-
 test/test_signed_le.cu | 2 +-
 test/test_signed_lt.cu | 2 +-
 test/test_signed_ne.cu | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/test_signed_eq.cu b/test/test_signed_eq.cu
index ac09ae44..4f7156c1 100644
--- a/test/test_signed_eq.cu
+++ b/test/test_signed_eq.cu
@@ -58,7 +58,7 @@ int main(void)
     cuda_managed_ptr<bool> output_vector(numElements);
 
     // Initialize the input vectors
-    boost::random::uniform_int_distribution<test_type> dist {test_type{(std::numeric_limits<std::int64_t>::min)()} + 1, test_type{(std::numeric_limits<test_type>::max)()} - 1};
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
     for (std::size_t i = 0; i < numElements; ++i)
     {
         input_vector1[i] = dist(rng);
diff --git a/test/test_signed_ge.cu b/test/test_signed_ge.cu
index af5628fb..efe510ea 100644
--- a/test/test_signed_ge.cu
+++ b/test/test_signed_ge.cu
@@ -58,7 +58,7 @@ int main(void)
     cuda_managed_ptr<bool> output_vector(numElements);
 
     // Initialize the input vectors
-    boost::random::uniform_int_distribution<test_type> dist {test_type{(std::numeric_limits<std::int64_t>::min)()} + 1, test_type{(std::numeric_limits<test_type>::max)()} - 1};
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
     for (std::size_t i = 0; i < numElements; ++i)
     {
         input_vector1[i] = dist(rng);
diff --git a/test/test_signed_gt.cu b/test/test_signed_gt.cu
index f37577b8..820b0797 100644
--- a/test/test_signed_gt.cu
+++ b/test/test_signed_gt.cu
@@ -58,7 +58,7 @@ int main(void)
     cuda_managed_ptr<bool> output_vector(numElements);
 
     // Initialize the input vectors
-    boost::random::uniform_int_distribution<test_type> dist {test_type{(std::numeric_limits<std::int64_t>::min)()} + 1, test_type{(std::numeric_limits<test_type>::max)()} - 1};
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
     for (std::size_t i = 0; i < numElements; ++i)
     {
         input_vector1[i] = dist(rng);
diff --git a/test/test_signed_le.cu b/test/test_signed_le.cu
index 05ee992a..d2d67ce6 100644
--- a/test/test_signed_le.cu
+++ b/test/test_signed_le.cu
@@ -58,7 +58,7 @@ int main(void)
     cuda_managed_ptr<bool> output_vector(numElements);
 
     // Initialize the input vectors
-    boost::random::uniform_int_distribution<test_type> dist {test_type{(std::numeric_limits<std::int64_t>::min)()} + 1, test_type{(std::numeric_limits<test_type>::max)()} - 1};
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
     for (std::size_t i = 0; i < numElements; ++i)
     {
         input_vector1[i] = dist(rng);
diff --git a/test/test_signed_lt.cu b/test/test_signed_lt.cu
index c356ae00..c4094c4d 100644
--- a/test/test_signed_lt.cu
+++ b/test/test_signed_lt.cu
@@ -58,7 +58,7 @@ int main(void)
     cuda_managed_ptr<bool> output_vector(numElements);
 
     // Initialize the input vectors
-    boost::random::uniform_int_distribution<test_type> dist {test_type{(std::numeric_limits<std::int64_t>::min)()} + 1, test_type{(std::numeric_limits<test_type>::max)()} - 1};
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
     for (std::size_t i = 0; i < numElements; ++i)
     {
         input_vector1[i] = dist(rng);
diff --git a/test/test_signed_ne.cu b/test/test_signed_ne.cu
index 39f9d64a..6c34a111 100644
--- a/test/test_signed_ne.cu
+++ b/test/test_signed_ne.cu
@@ -58,7 +58,7 @@ int main(void)
     cuda_managed_ptr<bool> output_vector(numElements);
 
     // Initialize the input vectors
-    boost::random::uniform_int_distribution<test_type> dist {test_type{(std::numeric_limits<std::int64_t>::min)()} + 1, test_type{(std::numeric_limits<test_type>::max)()} - 1};
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)() + 1, (std::numeric_limits<test_type>::max)() - 1};
     for (std::size_t i = 0; i < numElements; ++i)
     {
         input_vector1[i] = dist(rng);

From 41d230381d6fc2e9a04c4bda033b60633eeb160d Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 13 Mar 2026 11:23:33 -0500
Subject: [PATCH 8/9] Update deprecated actions

---
 .github/workflows/codecov.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
index bd2f7c8d..c1486f38 100644
--- a/.github/workflows/codecov.yml
+++ b/.github/workflows/codecov.yml
@@ -80,13 +80,13 @@ jobs:
           fi
           git config --global pack.threads 0
 
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           # For coverage builds fetch the whole history, else only 1 commit using a 'fake ternary'
           fetch-depth: ${{ matrix.coverage && '0' || '1' }}
 
       - name: Cache ccache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         if: env.B2_USE_CCACHE
         with:
           path: ~/.ccache
@@ -94,7 +94,7 @@ jobs:
           restore-keys: ${{matrix.os}}-${{matrix.container}}-${{matrix.compiler}}-
 
       - name: Fetch Boost.CI
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: boostorg/boost-ci
           ref: master

From 705630c6f9745ecf121b8a6aa4945d34152312b9 Mon Sep 17 00:00:00 2001
From: Matt Borland <matt@mattborland.com>
Date: Fri, 13 Mar 2026 11:28:53 -0500
Subject: [PATCH 9/9] Further avoidance of UB

---
 include/boost/int128/numeric.hpp | 48 ++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp
index 23832a5c..0699cd8b 100644
--- a/include/boost/int128/numeric.hpp
+++ b/include/boost/int128/numeric.hpp
@@ -84,40 +84,52 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u
 
 BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept
 {
-    const auto result {x + y};
+    // Detect overflow BEFORE the addition to avoid signed overflow UB.
+    // When both are non-negative: overflow iff x > max - y (subtraction safe: max - non_negative >= 0)
+    // When both are negative: overflow iff x < min - y (subtraction safe: min - negative > min)
+    // Mixed signs: overflow is impossible.
 
-    // Positive overflow: both non-negative but result wrapped to negative
-    if (x.high >= 0 && y.high >= 0 && result.high < 0)
+    if (x.high >= 0 && y.high >= 0)
     {
-        return (std::numeric_limits<int128_t>::max)();
+        if (x > (std::numeric_limits<int128_t>::max)() - y)
+        {
+            return (std::numeric_limits<int128_t>::max)();
+        }
     }
-
-    // Negative overflow: both negative but result wrapped to non-negative
-    if (x.high < 0 && y.high < 0 && result.high >= 0)
+    else if (x.high < 0 && y.high < 0)
     {
-        return (std::numeric_limits<int128_t>::min)();
+        if (x < (std::numeric_limits<int128_t>::min)() - y)
+        {
+            return (std::numeric_limits<int128_t>::min)();
+        }
     }
 
-    return result;
+    return x + y;
 }
 
 BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept
 {
-    const auto result {x - y};
+    // Detect overflow BEFORE the subtraction to avoid signed overflow UB.
+    // Positive overflow: x >= 0 and y < 0 and x > max + y (safe: max + negative < max)
+    // Negative overflow: x < 0 and y >= 0 and x < min + y (safe: min + non_negative > min)
+    // Same signs: overflow is impossible.
 
-    // Positive overflow: positive minus negative but result wrapped to negative
-    if (x.high >= 0 && y.high < 0 && result.high < 0)
+    if (x.high >= 0 && y.high < 0)
     {
-        return (std::numeric_limits<int128_t>::max)();
+        if (x > (std::numeric_limits<int128_t>::max)() + y)
+        {
+            return (std::numeric_limits<int128_t>::max)();
+        }
     }
-
-    // Negative overflow: negative minus non-negative but result wrapped to non-negative
-    if (x.high < 0 && y.high >= 0 && result.high >= 0)
+    else if (x.high < 0 && y.high >= 0)
     {
-        return (std::numeric_limits<int128_t>::min)();
+        if (x < (std::numeric_limits<int128_t>::min)() + y)
+        {
+            return (std::numeric_limits<int128_t>::min)();
+        }
     }
 
-    return result;
+    return x - y;
 }
 
 #ifdef _MSC_VER