diff --git a/src/DoubleDouble.jl b/src/DoubleDouble.jl index 5511147..51756f1 100644 --- a/src/DoubleDouble.jl +++ b/src/DoubleDouble.jl @@ -31,19 +31,16 @@ end Double{T<:AbstractFloat}(u::T, v::T) = Double{T}(u, v) Double{T<:AbstractFloat}(x::T) = Double(x, zero(T)) - -const half64 = 1.34217729e8 -const half32 = 4097f0 -const half16 = Float16(33.0) -const halfBig = 3.402823669209384634633746074317682114570000000000000000000000000000000000000000e+38 -# 6.805647338418769269267492148635364229120000000000000000000000000000000000000000e38 - -# round floats to half-precision -# TODO: fix overflow for large values -halfprec(x::Float64) = (p = x*half64; (x-p)+p) # signif(x,26,2) for 26 is 6.7108865e7, this seems like 27 -halfprec(x::Float32) = (p = x*half32; (x-p)+p) # float32(signif(x,12,2)) -halfprec(x::Float16) = (p = x*half16; (x-p)+p) # float16(signif(x,5,2)) -halfprec(x::BigFloat) = (p = x*halfBig; (x-p)+p) # BigFloat(signif(x,128,2)) +# use Veltkamp splitting to remove trailing digits +# see "Handbook of Floating-point Arithmetic", ยง4.4.1 +# TODO: +# - fix overflow for large values +# - use fma when available +function halfprec{T<:AbstractFloat}(x::T) + c = T((1 << cld(precision(T),2)) + 1) + p = x*c + (x-p)+p +end function splitprec(x::AbstractFloat) h = halfprec(x) @@ -139,10 +136,16 @@ end # Dekker mul12 function *{T}(x::Single{T},y::Single{T}) - hx,lx = splitprec(x.hi) - hy,ly = splitprec(y.hi) - z = x.hi*y.hi - Double(z, ((hx*hy-z) + hx*ly + lx*hy) + lx*ly) + xs, xe = frexp(x.hi) + ys, ye = frexp(y.hi) + z = xs*ys + zexp = ldexp(z, xe+ye) + if (iszero(zexp) | !isfinite(zexp)) + return Double(zexp, zexp) + end + hx,lx = splitprec(xs) + hy,ly = splitprec(ys) + Double(zexp, ldexp(((hx*hy-z) + hx*ly + lx*hy) + lx*ly, xe+ye)) end # Dekker mul2 diff --git a/test/extratests.jl b/test/extratests.jl new file mode 100644 index 0000000..f0f5272 --- /dev/null +++ b/test/extratests.jl @@ -0,0 +1,81 @@ +using DoubleDouble, ProgressMeter, Base.Test + +# Compare precision in a manner sensitive to subnormals, which lose +# precision compared to widening. +function cmp_sn(w, hi, lo, slopbits=0) + if !isfinite(hi) + if abs(w) > realmax(typeof(hi)) + return isinf(hi) && sign(w) == sign(hi) + end + if isnan(w) && isnan(hi) + return true + end + return w == hi + end + if abs(w) < subnormalmin(typeof(hi)) + return (hi == zero(hi) || abs(w - widen(hi)) < abs(w)) && lo == zero(hi) + end + # Compare w == hi + lo unless `lo` issubnormal + z = widen(hi) + widen(lo) + if !issubnormal(lo) && lo != 0 + if slopbits == 0 + return z == w + end + wr, zr = roundshift(w, slopbits), roundshift(z, slopbits) + return max(wr-1, zero(wr)) <= zr <= wr+1 + end + # round w to the same number of bits as z + zu = asbits(z) + wu = asbits(w) + lastbit = false + while zu > 0 && !isodd(zu) + lastbit = isodd(wu) + zu = zu >> 1 + wu = wu >> 1 + end + return wu <= zu <= wu + lastbit +end + +asbits(x) = reinterpret(Base.fpinttype(typeof(x)), x) + +function roundshift(x, n) + xu = asbits(x) + lastbit = false + for i = 1:n + lastbit = isodd(xu) + xu = xu >> 1 + end + xu + lastbit +end + +subnormalmin(::Type{T}) where T = reinterpret(T, Base.fpinttype(T)(1)) + +function Base.issubnormal(x::Float16) + y = reinterpret(UInt16, x) + (y & Base.exponent_mask(Float16) == 0) & (y & Base.significand_mask(Float16) != 0) +end + +Base.iszero(x::Float16) = reinterpret(UInt16, x) & ~Base.sign_mask(Float16) == 0x0000 + +# Troublesome values +x = Float16(6.45e4) +y = Float16(2112.0) +d = Single(x) * Single(y) +@test cmp_sn(widen(x)*widen(y), d.hi, d.lo) + +function pair16() + @showprogress 1 "testing" for yu in 0x0000:0xffff + for xu in 0x0000:0xffff + x, y = reinterpret(Float16, xu), reinterpret(Float16, yu) + try + d = Single(x) * Single(y) + @test cmp_sn(widen(x)*widen(y), d.hi, d.lo) + catch + @show x y + rethrow() + end + end + end +end + +pair16() diff --git a/test/runtests.jl b/test/runtests.jl index 5158aeb..a3cc38a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -64,3 +64,9 @@ a = Double(big"3.1") @test Double{Float32}(3) === Double{Float32}(3.0f0, 0.0f0) @test Single{Float32}(BigFloat(3)) === Single{Float32}(3.0f0) @test Double{Float32}(BigFloat(3)) === Double{Float32}(3.0f0, 0.0f0) + + +# issue #30 +x, y = Single(Float16(0.9775)), Single(Float16(0.5156)) +d = x*y +@test widen(d.hi) + widen(d.lo) == widen(x.hi) * widen(y.hi)