cppalliance · mborland · Mar 13, 2026 · Mar 11, 2026 · Mar 11, 2026 · Mar 12, 2026
diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml
@@ -80,21 +80,21 @@ jobs:
           fi
           git config --global pack.threads 0
 
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
         with:
           # For coverage builds fetch the whole history, else only 1 commit using a 'fake ternary'
           fetch-depth: ${{ matrix.coverage && '0' || '1' }}
 
       - name: Cache ccache
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         if: env.B2_USE_CCACHE
         with:
           path: ~/.ccache
           key: ${{matrix.os}}-${{matrix.container}}-${{matrix.compiler}}-${{github.sha}}
           restore-keys: ${{matrix.os}}-${{matrix.container}}-${{matrix.compiler}}-
 
       - name: Fetch Boost.CI
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: boostorg/boost-ci
           ref: master

diff --git a/include/boost/int128/detail/conversions.hpp b/include/boost/int128/detail/conversions.hpp
@@ -38,6 +38,20 @@ BOOST_INT128_HOST_DEVICE constexpr uint128_t::uint128_t(const int128_t& v) noexc
 
 #endif // BOOST_INT128_ENDIAN_LITTLE_BYTE
 
+//=====================================
+// Conversion Operators
+//=====================================
+
+BOOST_INT128_HOST_DEVICE constexpr int128_t::operator uint128_t() const noexcept
+{
+    return uint128_t{static_cast<std::uint64_t>(this->high), static_cast<std::uint64_t>(this->low)};
+}
+
+BOOST_INT128_HOST_DEVICE constexpr uint128_t::operator int128_t() const noexcept
+{
+    return int128_t{static_cast<std::int64_t>(this->high), static_cast<std::uint64_t>(this->low)};
+}
+
 //=====================================
 // Comparison Operators
 //=====================================

diff --git a/include/boost/int128/detail/int128_imp.hpp b/include/boost/int128/detail/int128_imp.hpp
@@ -57,6 +57,7 @@ int128_t
 
     // Requires a conversion file to be implemented
     BOOST_INT128_HOST_DEVICE explicit constexpr int128_t(const uint128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator uint128_t() const noexcept;
 
     // Construct from integral types
     #if BOOST_INT128_ENDIAN_LITTLE_BYTE

diff --git a/include/boost/int128/detail/uint128_imp.hpp b/include/boost/int128/detail/uint128_imp.hpp
@@ -58,6 +58,7 @@ uint128_t
 
     // Requires a conversion file to be implemented
     BOOST_INT128_HOST_DEVICE explicit constexpr uint128_t(const int128_t& v) noexcept;
+    BOOST_INT128_HOST_DEVICE explicit constexpr operator int128_t() const noexcept;
 
     // Construct from integral types
     #if BOOST_INT128_ENDIAN_LITTLE_BYTE

diff --git a/include/boost/int128/numeric.hpp b/include/boost/int128/numeric.hpp
@@ -76,9 +76,6 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr uint128_t sub_sat(const u
     return z;
 }
 
-BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(int128_t x, int128_t y) noexcept;
-BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(int128_t x, int128_t y) noexcept;
-
 #ifdef _MSC_VER
 #  pragma warning(push)
 #  pragma warning(disable : 4307) // Addition Overflow
@@ -87,54 +84,52 @@ BOOST_INT128_EXPORT BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(int128_t
 
 BOOST_INT128_HOST_DEVICE constexpr int128_t add_sat(const int128_t x, const int128_t y) noexcept
 {
-    if (x >= 0 && y >= 0)
-    {
-        constexpr auto max_value {static_cast<uint128_t>((std::numeric_limits<int128_t>::max)())};
-        const auto big_x {static_cast<uint128_t>(x)};
-        const auto big_y {static_cast<uint128_t>(y)};
-        const auto big_res {big_x + big_y};
+    // Detect overflow BEFORE the addition to avoid signed overflow UB.
+    // When both are non-negative: overflow iff x > max - y (subtraction safe: max - non_negative >= 0)
+    // When both are negative: overflow iff x < min - y (subtraction safe: min - negative > min)
+    // Mixed signs: overflow is impossible.
 
-        return big_res > max_value ? (std::numeric_limits<int128_t>::max)() : static_cast<int128_t>(big_res);
-    }
-    else if ((x < 0 && y > 0) || (x > 0 && y < 0))
+    if (x.high >= 0 && y.high >= 0)
     {
-        return x + y;
+        if (x > (std::numeric_limits<int128_t>::max)() - y)
+        {
+            return (std::numeric_limits<int128_t>::max)();
+        }
     }
-    else
+    else if (x.high < 0 && y.high < 0)
     {
-        // x < 0 and y < 0
-        // Nearly the same technique as the positive values case
-        constexpr auto max_value {-static_cast<uint128_t>((std::numeric_limits<int128_t>::min)())};
-        const auto big_x {static_cast<uint128_t>(abs(x))};
-        const auto big_y {static_cast<uint128_t>(abs(y))};
-        const auto big_res {big_x + big_y};
-
-        return big_res > max_value ? (std::numeric_limits<int128_t>::min)() : -static_cast<int128_t>(big_res);
+        if (x < (std::numeric_limits<int128_t>::min)() - y)
+        {
+            return (std::numeric_limits<int128_t>::min)();
+        }
     }
+
+    return x + y;
 }
 
 BOOST_INT128_HOST_DEVICE constexpr int128_t sub_sat(const int128_t x, const int128_t y) noexcept
 {
-    if (x <= 0 && y >= 0)
-    {
-        // Underflow case
-        const auto res {x - y};
-        return res > x ? (std::numeric_limits<int128_t>::min)() : res;
-    }
-    else if (x > 0 && y < 0)
-    {
-        // Overflow Case
-        constexpr auto max_val {static_cast<uint128_t>((std::numeric_limits<int128_t>::max)())};
-        const auto big_x {static_cast<uint128_t>(x)};
-        const auto big_y {-static_cast<uint128_t>(y)};
-        const auto res {big_x + big_y};
+    // Detect overflow BEFORE the subtraction to avoid signed overflow UB.
+    // Positive overflow: x >= 0 and y < 0 and x > max + y (safe: max + negative < max)
+    // Negative overflow: x < 0 and y >= 0 and x < min + y (safe: min + non_negative > min)
+    // Same signs: overflow is impossible.
 
-        return (res > max_val || res < big_x) ? (std::numeric_limits<int128_t>::max)() : static_cast<int128_t>(res);
+    if (x.high >= 0 && y.high < 0)
+    {
+        if (x > (std::numeric_limits<int128_t>::max)() + y)
+        {
+            return (std::numeric_limits<int128_t>::max)();
+        }
     }
-    else
+    else if (x.high < 0 && y.high >= 0)
     {
-        return x - y;
+        if (x < (std::numeric_limits<int128_t>::min)() + y)
+        {
+            return (std::numeric_limits<int128_t>::min)();
+        }
     }
+
+    return x - y;
 }
 
 #ifdef _MSC_VER
@@ -404,11 +399,17 @@ BOOST_INT128_HOST_DEVICE constexpr int128_t midpoint(const int128_t a, const int
     // For signed integers, we use a + (b - a) / 2 or a - (a - b) / 2
     // The subtraction is done in unsigned arithmetic to handle overflow correctly
     // Integer division automatically rounds toward the first argument
+    //
+    // Use direct field access for both the uint128 construction and the
+    // comparison to avoid NVCC host compiler issues with operator<= and
+    // static_cast on int128_t for large-magnitude values
+
+    const uint128_t ua {static_cast<std::uint64_t>(a.high), a.low};
+    const uint128_t ub {static_cast<std::uint64_t>(b.high), b.low};
 
-    const auto ua {static_cast<uint128_t>(a)};
-    const auto ub {static_cast<uint128_t>(b)};
+    const bool a_le_b {a.high == b.high ? a.low <= b.low : a.high < b.high};
 
-    if (a <= b)
+    if (a_le_b)
     {
         // diff = b - a (computed in unsigned, handles wrap-around correctly)
         const auto diff {ub - ua};

diff --git a/test/cuda_jamfile b/test/cuda_jamfile
@@ -64,3 +64,18 @@ run test_signed_cstdlib_div.cu ;
 
 run test_signed_to_unsigned_conversion.cu ;
 run test_unsigned_to_signed_conversion.cu ;
+
+run test_unsigned_add_sat.cu ;
+run test_signed_add_sat.cu ;
+run test_unsigned_sub_sat.cu ;
+run test_signed_sub_sat.cu ;
+run test_unsigned_mul_sat.cu ;
+run test_signed_mul_sat.cu ;
+run test_unsigned_div_sat.cu ;
+run test_signed_div_sat.cu ;
+run test_unsigned_gcd.cu ;
+run test_signed_gcd.cu ;
+run test_unsigned_lcm.cu ;
+run test_signed_lcm.cu ;
+run test_unsigned_midpoint.cu ;
+run test_signed_midpoint.cu ;
diff --git a/test/test_signed_add_sat.cu b/test/test_signed_add_sat.cu
@@ -0,0 +1,106 @@
+//  Copyright Matt Borland 2026.
+//  Use, modification and distribution are subject to the
+//  Boost Software License, Version 1.0. (See accompanying file
+//  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#define BOOST_INT128_ALLOW_SIGN_CONVERSION
+
+#include <iostream>
+#include <vector>
+#include <random>
+#include <limits>
+#include <boost/int128.hpp>
+#include <boost/int128/numeric.hpp>
+#include <boost/int128/random.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+#include "cuda_managed_ptr.hpp"
+#include "stopwatch.hpp"
+
+#include <cuda_runtime.h>
+
+using test_type = boost::int128::int128_t;
+
+__global__ void cuda_test(const test_type *in, const test_type *in2, test_type *out, int numElements)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < numElements)
+    {
+        out[i] = boost::int128::add_sat(in[i], in2[i]);
+    }
+}
+
+int main(void)
+{
+    std::mt19937_64 rng {42};
+
+    cudaError_t err = cudaSuccess;
+
+    int numElements = 50000;
+    std::cout << "[Vector operation on " << numElements << " elements]" << std::endl;
+
+    cuda_managed_ptr<test_type> input_vector(numElements);
+    cuda_managed_ptr<test_type> input_vector2(numElements);
+    cuda_managed_ptr<test_type> output_vector(numElements);
+
+    boost::random::uniform_int_distribution<test_type> dist {(std::numeric_limits<test_type>::min)(), (std::numeric_limits<test_type>::max)()};
+    for (std::size_t i = 0; i < numElements; ++i)
+    {
+        input_vector[i] = dist(rng);
+        input_vector2[i] = dist(rng);
+    }
+
+    int threadsPerBlock = 256;
+    int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
+    std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" << std::endl;
+
+    watch w;
+
+    cuda_test<<<blocksPerGrid, threadsPerBlock>>>(input_vector.get(), input_vector2.get(), output_vector.get(), numElements);
+    cudaDeviceSynchronize();
+
+    std::cout << "CUDA kernal done in: " << w.elapsed() << "s" << std::endl;
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess)
+    {
+        std::cerr << "Failed to launch kernel (error code " << cudaGetErrorString(err) << ")!" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::vector<test_type> results;
+    results.reserve(numElements);
+    w.reset();
+    for (int i = 0; i < numElements; ++i)
+    {
+        results.push_back(boost::int128::add_sat(input_vector[i], input_vector2[i]));
+    }
+    double t = w.elapsed();
+
+    int fail_count = 0;
+    for (int i = 0; i < numElements; ++i)
+    {
+        if (output_vector[i] != results[i])
+        {
+            if (fail_count < 5)
+            {
+                std::cerr << "Result verification failed at element " << i << std::endl;
+                std::cerr << "  input1 high: " << input_vector[i].high << " low: " << input_vector[i].low << std::endl;
+                std::cerr << "  input2 high: " << input_vector2[i].high << " low: " << input_vector2[i].low << std::endl;
+                std::cerr << "  GPU    high: " << output_vector[i].high << " low: " << output_vector[i].low << std::endl;
+                std::cerr << "  CPU    high: " << results[i].high << " low: " << results[i].low << std::endl;
+            }
+            ++fail_count;
+        }
+    }
+    if (fail_count > 0)
+    {
+        std::cerr << "Total failures: " << fail_count << " out of " << numElements << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "Test PASSED, normal calculation time: " << t << "s" << std::endl;
+    std::cout << "Done\n";
+
+    return 0;
+}