From 98a1a407dd37f939df511fb3cad9c6ea3f9f1fa4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller-Widmann?= Date: Fri, 10 Oct 2025 21:49:27 +0200 Subject: [PATCH 1/3] Improve performance of unweighted `ecdf` --- Project.toml | 2 +- src/empirical.jl | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index e48fd5ce1..80431779a 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.34.6" +version = "0.34.7" [deps] AliasTables = "66dad0bd-aa9a-41b7-9441-69ab47430ed8" diff --git a/src/empirical.jl b/src/empirical.jl index 45f985468..fe3368bc0 100644 --- a/src/empirical.jl +++ b/src/empirical.jl @@ -42,7 +42,7 @@ function (ecdf::ECDF)(v::AbstractVector{<:Real}) end """ - ecdf(X; weights::AbstractWeights) + ecdf(X[; weights::AbstractVector{<:Real}]) Return an empirical cumulative distribution function (ECDF) based on a vector of samples given in `X`. Optionally providing `weights` returns a weighted ECDF. @@ -53,12 +53,17 @@ evaluate CDF values on other samples. `extrema`, `minimum`, and `maximum` are supported to for obtaining the range over which function is inside the interval ``(0,1)``; the function is defined for the whole real line. """ -function ecdf(X::AbstractVector{<:Real}; weights::AbstractVector{<:Real}=Weights(Float64[])) +function ecdf(X::AbstractVector{<:Real}; weights::AbstractVector{<:Real}=weights(Float64[])) any(isnan, X) && throw(ArgumentError("ecdf can not include NaN values")) - isempty(weights) || length(X) == length(weights) || throw(ArgumentError("data and weight vectors must be the same size," * - "got $(length(X)) and $(length(weights))")) - ord = sortperm(X) - ECDF(X[ord], isempty(weights) ? weights : Weights(weights[ord])) + _weights = weights isa AbstractWeights ? 
weights : StatsBase.weights(weights) + if isempty(_weights) + return ECDF(sort(X), _weights) + else + length(X) == length(_weights) || throw(ArgumentError("data and weight vectors must be the same size," * + "got $(length(X)) and $(length(_weights))")) + ord = sortperm(X) + ECDF(X[ord], _weights[ord]) + end end minimum(ecdf::ECDF) = first(ecdf.sorted_values) From 0d9bf7078d1142d4acdcd879ea22567d88182780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller-Widmann?= Date: Mon, 5 Jan 2026 23:25:53 +0100 Subject: [PATCH 2/3] Apply suggestions from code review --- src/empirical.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/empirical.jl b/src/empirical.jl index fe3368bc0..8efe04d1d 100644 --- a/src/empirical.jl +++ b/src/empirical.jl @@ -59,7 +59,14 @@ function ecdf(X::AbstractVector{<:Real}; weights::AbstractVector{<:Real}=weights if isempty(_weights) return ECDF(sort(X), _weights) else - length(X) == length(_weights) || throw(ArgumentError("data and weight vectors must be the same size," * + if length(X) != length(_weights) + throw(ArgumentError(LazyString( + "data and weight vectors must be the same size, got ", + length(X), + " and ", + length(_weights), + ))) + end "got $(length(X)) and $(length(_weights))")) ord = sortperm(X) ECDF(X[ord], _weights[ord]) From 7dd79eac3dcb41ebf40c3801f9a70b5b4ea2dc8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller-Widmann?= Date: Mon, 5 Jan 2026 23:26:27 +0100 Subject: [PATCH 3/3] Remove line Remove the leftover continuation line of the old error message (the string interpolating the lengths of X and weights) that the previous commit failed to delete. --- src/empirical.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/empirical.jl b/src/empirical.jl index 8efe04d1d..e791ec943 100644 --- a/src/empirical.jl +++ b/src/empirical.jl @@ -67,7 +67,6 @@ function ecdf(X::AbstractVector{<:Real}; weights::AbstractVector{<:Real}=weights length(_weights), ))) end - "got $(length(X)) and $(length(_weights))")) ord = sortperm(X) ECDF(X[ord], _weights[ord]) end