diff --git a/src/scalarstats.jl b/src/scalarstats.jl index bff0cec24..666d8b310 100644 --- a/src/scalarstats.jl +++ b/src/scalarstats.jl @@ -46,14 +46,67 @@ end # compute mode, given the range of integer values """ - mode(a, [r]) - mode(a::AbstractArray, wv::AbstractWeights) + mode(a; method=:frequency) + mode(a::AbstractArray, wv::AbstractWeights; method=:frequency) + mode(a::AbstractArray{T}, r::UnitRange{T}; method=:frequency) where T<:Integer + +Return the mode (most common value) of `a`, optionally +over a specified range `r` or weighted via a vector +`wv`. + +The `method` keyword argument selects the estimation method: + +- `:frequency`: Frequency-based mode. Counts occurrences and returns the most common + value. If several modes exist, the first one (in order of appearance) is returned. + This is appropriate for discrete data. + +- `:halfsample`: Half-sample mode (HSM). A robust estimator of the mode for + continuous data. Repeatedly finds the contiguous half-sample with the smallest + range until at most 2 points remain, then returns their midpoint. The return value + is always a floating-point number and may not be an element of `a`. + Throws an `ArgumentError` if `a` contains any non-finite values (`NaN`, `Inf`, + or `-Inf`). + This method currently does not support `r` and `wv` arguments. + + # References +- D.R. Bickel, R. Fruehwirth (2006). On a fast, robust estimator of the mode. + Computational Statistics & Data Analysis, 50(12), 3500-3530. +- T. Robertson, J.D. Cryer (1974). An iterative procedure for estimating the mode. + Journal of the American Statistical Association, 69(348), 1012-1016. -Return the mode (most common number) of an array, optionally -over a specified range `r` or weighted via a vector `wv`. -If several modes exist, the first one (in order of appearance) is returned. +# Examples +```julia +julia> mode([1, 2, 2, 3, 3, 3, 4]) +3 + +julia> mode([1, 2, 2, 3, 3, 3, 4], method=:frequency) +3 + +julia> mode([1.0, 1.1, 1.2, 5.0, 5.1], method=:halfsample) +1.1 +``` """ -function mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer +function mode(a; method::Symbol=:frequency) + if method === :halfsample + return _hsm_mode(a) + elseif method === :frequency + return _frequency_mode(a) + else + throw(ArgumentError(LazyString("`method` must be `:frequency` or `:halfsample`, got `:", method, "`"))) + end +end + +function mode(a::AbstractArray{T}, r::UnitRange{T}; method::Symbol=:frequency) where T<:Integer + if method === :halfsample + throw(ArgumentError("The `:halfsample` method does not support a range argument. Call `mode(a, method=:halfsample)` without a range.")) + elseif method === :frequency + return _frequency_mode(a, r) + else + throw(ArgumentError(LazyString("`method` must be `:frequency` or `:halfsample`, got `:", method, "`"))) + end +end + +function _frequency_mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) len = length(a) r0 = r[1] @@ -107,8 +160,7 @@ function modes(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer return ms end -# compute mode over arbitrary iterable -function mode(a) +function _frequency_mode(a) isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) cnts = Dict{eltype(a),Int}() # first element @@ -204,6 +256,41 @@ function modes(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real return [x for (x, w) in weights if w == mw] end +#Internal implementation of the Half-Sample Mode (HSM) estimator. +function _hsm_mode(a) + isempty(a) && throw(ArgumentError("mode is not defined for empty collections")) + + # Filter NaN values and sort + if !all(isfinite, a) + throw(ArgumentError("mode with `method=:halfsample` is not defined " * + "for collections containing non-finite values")) + end + filteredv = sort!(collect(a)) + len = length(filteredv) + + len == 1 && return middle(filteredv[1], filteredv[1]) + len == 2 && return middle(filteredv[1], filteredv[2]) + + # Iteratively find the half-sample with the smallest range + filteredv = @view filteredv[1:end] + while len > 2 + half = cld(len, 2) + best_i = 1 + best_width = filteredv[half] - filteredv[1] + for i in 2:(len - half + 1) + w = filteredv[i + half - 1] - filteredv[i] + if w < best_width + best_width = w + best_i = i + end + end + filteredv = @view filteredv[best_i:(best_i + half - 1)] + len = length(filteredv) + end + + return middle(filteredv[1], filteredv[len]) +end + ############################# # # quantile and friends diff --git a/test/scalarstats.jl b/test/scalarstats.jl index eec64ad74..2d98969ef 100644 --- a/test/scalarstats.jl +++ b/test/scalarstats.jl @@ -63,6 +63,53 @@ wv = weights([0.1:0.1:0.7; 0.1]) @test_throws ArgumentError mode([1, 2, 3], weights([0.1, 0.3])) @test_throws ArgumentError modes([1, 2, 3], weights([0.1, 0.3])) +## mode with method=:frequency (explicit, same as default) +@test mode([1, 1, 1, 2, 3, 4], method=:frequency) == 1 +@test mode((x for x in [1, 1, 1, 2, 3, 4]), method=:frequency) == 1 + +# Check no type inference regression on the default/frequency path +@test @inferred(mode([1]))::Int == 1 +my_frequency_mode(x) = mode(x, method=:frequency) +@test @inferred(my_frequency_mode([1]))::Int == 1 + +# Check type inference on halfsample path +my_hsm_mode(x) = mode(x, method=:halfsample) +@test @inferred(my_hsm_mode([1]))::Float64 == 1.0 + +## mode with method=:halfsample (half-sample mode) +@test mode([10], method=:halfsample)::Float64 == 10.0 +@test mode([1, 5], method=:halfsample)::Float64 == 3.0 # midpoint of two elements +@test mode([1, 2, 2, 3, 4, 4, 4, 5], method=:halfsample)::Float64 == 4.0 +@test mode([1.0, 1.1, 1.2, 5.0, 5.1], method=:halfsample) == 1.15 +@test mode([1.0, 2.0, 3.0, 4.0, 10.0, 11.0, 12.0, 13.0], method=:halfsample) == 1.5 +@test mode([1.0, 2.0, 10.0, 10.1, 10.2, 10.3], method=:halfsample) == 10.05 + +# Robustness to outliers +@test mode([1.0, 1.01, 1.011, 100.0, 200.0], method=:halfsample) == 1.0105 + +# Non-finite values throw ArgumentError +@test_throws ArgumentError mode([1.0, NaN, 2.0, 2.0, Inf], method=:halfsample) +@test_throws ArgumentError mode([1.0, NaN, Inf, -Inf], method=:halfsample) +@test_throws ArgumentError mode([NaN, Inf, -Inf], method=:halfsample) +@test_throws ArgumentError mode([Inf, -Inf], method=:halfsample) +@test_throws ArgumentError mode([NaN], method=:halfsample) +@test_throws ArgumentError mode([NaN, NaN, NaN], method=:halfsample) + +# Edge cases +@test_throws ArgumentError mode(Float64[], method=:halfsample) + + +# Invalid method throws +@test_throws ArgumentError mode([1, 2, 3], method=:invalid) + +## mode with range argument +@test mode([1, 2, 2, 3, 4, 4, 4, 5], 2:4, method=:frequency) == 4 +@test_throws ArgumentError mode([1, 2, 2, 3, 4, 4, 4, 5], 2:4, method=:halfsample) +@test_throws ArgumentError mode([1, 2, 2, 3, 4, 4, 4, 5], 2:4, method=:invalid) + + + +@test mode([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 5.0, 5.1, 5.2, 5.3], method=:halfsample) == 1.15 ## zscores @test zscore([-3:3;], 1.5, 0.5) == [-9.0:2.0:3.0;]