Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 95 additions & 8 deletions src/scalarstats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,67 @@ end

# compute mode, given the range of integer values
"""
mode(a, [r])
mode(a::AbstractArray, wv::AbstractWeights)
mode(a; method=:frequency)
mode(a::AbstractArray, wv::AbstractWeights; method=:frequency)
mode(a::AbstractArray{T}, r::UnitRange{T}; method=:frequency) where T<:Integer
Comment thread
anurag-mds marked this conversation as resolved.

Return the mode (most common value) of `a`, optionally
over a specified range `r` or weighted via a vector
`wv`.

The `method` keyword argument selects the estimation method:

- `:frequency`: Frequency-based mode. Counts occurrences and returns the most common
value. If several modes exist, the first one (in order of appearance) is returned.
This is appropriate for discrete data.

- `:halfsample`: Half-sample mode (HSM). A robust estimator of the mode for
continuous data. Repeatedly finds the contiguous half-sample with the smallest
range until at most 2 points remain, then returns their midpoint. The return value
is always a floating-point number and may not be an element of `a`.
Throws an `ArgumentError` if `a` contains any non-finite values (`NaN`, `Inf`,
or `-Inf`).
This method currently does not support `r` and `wv` arguments.

# References
- D.R. Bickel, R. Fruehwirth (2006). On a fast, robust estimator of the mode.
Computational Statistics & Data Analysis, 50(12), 3500-3530.
- T. Robertson, J.D. Cryer (1974). An iterative procedure for estimating the mode.
Journal of the American Statistical Association, 69(348), 1012-1016.

Return the mode (most common number) of an array, optionally
over a specified range `r` or weighted via a vector `wv`.
If several modes exist, the first one (in order of appearance) is returned.
# Examples
```julia
julia> mode([1, 2, 2, 3, 3, 3, 4])
3

julia> mode([1, 2, 2, 3, 3, 3, 4], method=:frequency)
3

julia> mode([1.0, 1.1, 1.2, 5.0, 5.1], method=:halfsample)
1.1
```
"""
function mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer
function mode(a; method::Symbol=:frequency)
if method === :halfsample
return _hsm_mode(a)
elseif method === :frequency
return _frequency_mode(a)
else
throw(ArgumentError(LazyString("`method` must be `:frequency` or `:halfsample`, got `:", method, "`")))
end
end

function mode(a::AbstractArray{T}, r::UnitRange{T}; method::Symbol=:frequency) where T<:Integer
if method === :halfsample
throw(ArgumentError("The `:halfsample` method does not support a range argument. Call `mode(a, method=:halfsample)` without a range."))
elseif method === :frequency
return _frequency_mode(a, r)
else
throw(ArgumentError(LazyString("`method` must be `:frequency` or `:halfsample`, got `:", method, "`")))
end
end

function _frequency_mode(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer
isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
len = length(a)
r0 = r[1]
Expand Down Expand Up @@ -107,8 +160,7 @@ function modes(a::AbstractArray{T}, r::UnitRange{T}) where T<:Integer
return ms
end

# compute mode over arbitrary iterable
function mode(a)
function _frequency_mode(a)
isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))
cnts = Dict{eltype(a),Int}()
# first element
Expand Down Expand Up @@ -204,6 +256,41 @@ function modes(a::AbstractVector, wv::AbstractWeights{T}) where T <: Real
return [x for (x, w) in weights if w == mw]
end

#Internal implementation of the Half-Sample Mode (HSM) estimator.
function _hsm_mode(a)
isempty(a) && throw(ArgumentError("mode is not defined for empty collections"))

# Filter NaN values and sort
if !all(isfinite, a)
throw(ArgumentError("mode with `method=:halfsample` is not defined " *
"for collections containing non-finite values"))
end
filteredv = sort!(collect(a))
len = length(filteredv)

len == 1 && return middle(filteredv[1], filteredv[1])
len == 2 && return middle(filteredv[1], filteredv[2])

# Iteratively find the half-sample with the smallest range
filteredv = @view filteredv[1:end]
while len > 2
half = cld(len, 2)
best_i = 1
best_width = filteredv[half] - filteredv[1]
for i in 2:(len - half + 1)
w = filteredv[i + half - 1] - filteredv[i]
if w < best_width
best_width = w
best_i = i
end
end
filteredv = @view filteredv[best_i:(best_i + half - 1)]
len = length(filteredv)
end

return middle(filteredv[1], filteredv[len])
end

#############################
#
# quantile and friends
Expand Down
47 changes: 47 additions & 0 deletions test/scalarstats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,53 @@ wv = weights([0.1:0.1:0.7; 0.1])
@test_throws ArgumentError mode([1, 2, 3], weights([0.1, 0.3]))
@test_throws ArgumentError modes([1, 2, 3], weights([0.1, 0.3]))

## mode with method=:frequency (explicit, same as default)
@test mode([1, 1, 1, 2, 3, 4], method=:frequency) == 1
@test mode((x for x in [1, 1, 1, 2, 3, 4]), method=:frequency) == 1

# Check no type inference regression on the default/frequency path
@test @inferred(mode([1]))::Int == 1
my_frequency_mode(x) = mode(x, method=:frequency)
@test @inferred(my_frequency_mode([1]))::Int == 1

# Check type inference on halfsample path
my_hsm_mode(x) = mode(x, method=:halfsample)
@test @inferred(my_hsm_mode([1]))::Float64 == 1.0

## mode with method=:halfsample (half-sample mode)
@test mode([10], method=:halfsample)::Float64 == 10.0
@test mode([1, 5], method=:halfsample)::Float64 == 3.0 # midpoint of two elements
@test mode([1, 2, 2, 3, 4, 4, 4, 5], method=:halfsample)::Float64 == 4.0
@test mode([1.0, 1.1, 1.2, 5.0, 5.1], method=:halfsample) == 1.15
@test mode([1.0, 2.0, 3.0, 4.0, 10.0, 11.0, 12.0, 13.0], method=:halfsample) == 1.5
@test mode([1.0, 2.0, 10.0, 10.1, 10.2, 10.3], method=:halfsample) == 10.05

# Robustness to outliers
@test mode([1.0, 1.01, 1.011, 100.0, 200.0], method=:halfsample) == 1.0105

# Non-finite values throw ArgumentError
@test_throws ArgumentError mode([1.0, NaN, 2.0, 2.0, Inf], method=:halfsample)
@test_throws ArgumentError mode([1.0, NaN, Inf, -Inf], method=:halfsample)
@test_throws ArgumentError mode([NaN, Inf, -Inf], method=:halfsample)
@test_throws ArgumentError mode([Inf, -Inf], method=:halfsample)
@test_throws ArgumentError mode([NaN], method=:halfsample)
@test_throws ArgumentError mode([NaN, NaN, NaN], method=:halfsample)

# Edge cases
@test_throws ArgumentError mode(Float64[], method=:halfsample)


# Invalid method throws
@test_throws ArgumentError mode([1, 2, 3], method=:invalid)

## mode with range argument
@test mode([1, 2, 2, 3, 4, 4, 4, 5], 2:4, method=:frequency) == 4
@test_throws ArgumentError mode([1, 2, 2, 3, 4, 4, 4, 5], 2:4, method=:halfsample)
@test_throws ArgumentError mode([1, 2, 2, 3, 4, 4, 4, 5], 2:4, method=:invalid)



@test mode([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 5.0, 5.1, 5.2, 5.3], method=:halfsample) == 1.15
Comment thread
anurag-mds marked this conversation as resolved.
## zscores

@test zscore([-3:3;], 1.5, 0.5) == [-9.0:2.0:3.0;]
Expand Down
Loading