From c2fda942f133ce1b97707cbb27864411d054ccb3 Mon Sep 17 00:00:00 2001 From: Marcus Kruse Date: Tue, 4 Jul 2023 22:39:11 +0200 Subject: [PATCH 01/23] Outlier detection and outlier exclusion --- lib/statistex.ex | 197 ++++++++++++++++++++++++----- lib/statistex/percentile.ex | 3 +- test/statistex/percentile_test.exs | 34 ++--- test/statistex_test.exs | 98 ++++++++++++++ 4 files changed, 279 insertions(+), 53 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index 9fcf019..cc8ca7b 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -27,6 +27,8 @@ defmodule Statistex do :mode, :minimum, :maximum, + :outliers_bounds, + :outliers, sample_size: 0 ] @@ -47,6 +49,8 @@ defmodule Statistex do mode: mode, minimum: number, maximum: number, + outliers_bounds: {number, number}, + outliers: [number], sample_size: non_neg_integer } @@ -81,6 +85,8 @@ defmodule Statistex do @empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number." + @iqr_factor 1.5 + @doc """ Calculate all statistics Statistex offers for a given list of numbers. @@ -89,7 +95,15 @@ defmodule Statistex do `Argumenterror` is raised if the given list is empty. ## Options - In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can be given. The 50th percentile is always calculated as it is the median. + + In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can + be given. The percentiles 25th, 50th (median) and 75th are always calculated. + + The option `exclude_outliers` can be set to `:once`, `:repeatedly` or `nil`, + `nil` is the default. If this option set to `:once` the outliers are excluded + and the statistics are calculated with the rest of the samples. The value + `:repeatedly` repeats the outlier exclusion until the samples no longer + contains outliers. 
## Examples @@ -100,7 +114,7 @@ defmodule Statistex do standard_deviation: 200.0, standard_deviation_ratio: 0.4, median: 500.0, - percentiles: %{50 => 500.0}, + percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0}, frequency_distribution: %{ 200 => 1, 400 => 3, @@ -112,7 +126,9 @@ defmodule Statistex do minimum: 200, maximum: 900, sample_size: 9, - total: 4500 + total: 4500, + outliers: [], + outliers_bounds: {200, 900.0} } iex> Statistex.statistics([]) @@ -125,13 +141,15 @@ defmodule Statistex do standard_deviation: 0.0, standard_deviation_ratio: 0.0, median: 0.0, - percentiles: %{50 => 0.0}, + percentiles: %{25 => 0.0, 50 => 0.0, 75 => 0.0}, frequency_distribution: %{0 => 4}, mode: 0, minimum: 0, maximum: 0, sample_size: 4, - total: 0 + total: 0, + outliers: [], + outliers_bounds: {0.0, 0.0} } """ @@ -143,33 +161,65 @@ defmodule Statistex do end def statistics(samples, configuration) do - total = total(samples) - sample_size = length(samples) - average = average(samples, total: total, sample_size: sample_size) - variance = variance(samples, average: average, sample_size: sample_size) - standard_deviation = standard_deviation(samples, variance: variance) + samples = Enum.sort(samples) - standard_deviation_ratio = - standard_deviation_ratio(samples, standard_deviation: standard_deviation) + minimum = hd(samples) + maximum = List.last(samples) percentiles = calculate_percentiles(samples, configuration) - frequency_distribution = frequency_distribution(samples) - - %__MODULE__{ - total: total, - average: average, - variance: variance, - standard_deviation: standard_deviation, - standard_deviation_ratio: standard_deviation_ratio, - median: median(samples, percentiles: percentiles), - percentiles: percentiles, - frequency_distribution: frequency_distribution, - mode: mode(samples, frequency_distribution: frequency_distribution), - minimum: minimum(samples), - maximum: maximum(samples), - sample_size: sample_size - } + outliers_bounds = + do_outliers_bounds(samples, 
percentiles: percentiles, minimum: minimum, maximum: maximum) + + {outliers, rest} = do_outliers(samples, outliers_bounds: outliers_bounds) + + if exclude_outliers?(configuration) and not Enum.empty?(outliers) do + configuration = + configuration + |> Keyword.put(:outliers_excluded, true) + |> Keyword.update!(:exclude_outliers, fn + :once -> :stop + :repeatedly -> :repeatedly + end) + |> Keyword.update(:acc_outliers, outliers, fn list -> list ++ outliers end) + + statistics(rest, configuration) + else + outliers = outliers ++ Keyword.get(configuration, :acc_outliers, []) + + total = total(samples) + sample_size = length(samples) + average = average(samples, total: total, sample_size: sample_size) + variance = variance(samples, average: average, sample_size: sample_size) + + frequency_distribution = frequency_distribution(samples) + + standard_deviation = standard_deviation(samples, variance: variance) + + standard_deviation_ratio = + standard_deviation_ratio(samples, standard_deviation: standard_deviation) + + %__MODULE__{ + total: total, + average: average, + variance: variance, + standard_deviation: standard_deviation, + standard_deviation_ratio: standard_deviation_ratio, + median: median(samples, percentiles: percentiles), + percentiles: percentiles, + frequency_distribution: frequency_distribution, + mode: mode(samples, frequency_distribution: frequency_distribution), + minimum: minimum, + maximum: maximum, + outliers_bounds: outliers_bounds, + outliers: outliers, + sample_size: sample_size + } + end + end + + defp exclude_outliers?(configuration) do + Keyword.get(configuration, :exclude_outliers) in [:once, :repeatedly] end @doc """ @@ -396,8 +446,10 @@ defmodule Statistex do percentiles_configuration = Keyword.get(configuration, :percentiles, []) # median_percentile is manually added so that it can be used directly by median - percentiles_configuration = Enum.uniq([@median_percentile | percentiles_configuration]) - percentiles(samples, 
percentiles_configuration) + percentiles_configuration = + Enum.uniq([25, @median_percentile, 75 | percentiles_configuration]) + + Percentile.percentiles(samples, percentiles_configuration) end @doc """ @@ -447,7 +499,9 @@ defmodule Statistex do """ @spec percentiles(samples, number | [number(), ...]) :: percentiles() - defdelegate(percentiles(samples, percentiles), to: Percentile) + def percentiles(samples, percentiles) do + samples |> Enum.sort() |> Percentile.percentiles(percentiles) + end @doc """ A map showing which sample occurs how often in the samples. @@ -541,10 +595,85 @@ defmodule Statistex do def median(samples, options) do percentiles = - Keyword.get_lazy(options, :percentiles, fn -> percentiles(samples, @median_percentile) end) + Keyword.get_lazy(options, :percentiles, fn -> + Percentile.percentiles(samples, @median_percentile) + end) + + get_percentile(samples, @median_percentile, percentiles) + end + + @doc """ + Calculates the lower and upper bound for outliers. + + Any sample that is `<` as the lower bound and any sample `>` are outliers of + the given `samples`. 
+ + ## Examples + + iex> Statistex.outliers_bounds([3, 4, 5]) + {3, 5} + + iex> Statistex.outliers_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) + {22.5, 50} + + iex> Statistex.outliers_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) + {50, 80.625} + """ + @spec outliers_bounds(samples, keyword) :: {lower :: number, upper :: number} + def outliers_bounds(samples, options \\ []) + def outliers_bounds([], _), do: raise(ArgumentError, @empty_list_error_message) + def outliers_bounds(samples, options), do: samples |> Enum.sort() |> do_outliers_bounds(options) + + defp do_outliers_bounds(samples, options) do + percentiles = + Keyword.get_lazy(options, :percentiles, fn -> Percentile.percentiles(samples, [25, 75]) end) + + minimum = Keyword.get_lazy(options, :minimum, fn -> hd(samples) end) + maximum = Keyword.get_lazy(options, :maximum, fn -> List.last(samples) end) + + p25 = get_percentile(samples, 25, percentiles) + p75 = get_percentile(samples, 75, percentiles) + iqr = p75 - p25 + + {max(p25 - iqr * @iqr_factor, minimum), min(p75 + iqr * @iqr_factor, maximum)} + end + + @doc """ + Returns all outliers for the given `samples`. 
+ + ## Examples + + iex> Statistex.outliers([3, 4, 5]) + [] + + iex> Statistex.outliers([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) + [1, 2, 6] + + iex> Statistex.outliers([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) + [99, 99, 99] + """ + @spec outliers(samples, keyword) :: samples | [] + def outliers(samples, options \\ []) do + {outliers, _rest} = samples |> Enum.sort() |> do_outliers(options) + + outliers + end + + defp do_outliers(samples, options) do + {lower_bound, upper_bound} = + Keyword.get_lazy(options, :outliers_bounds, fn -> do_outliers_bounds(samples, options) end) + + {min, rest} = Enum.split_while(samples, fn sample -> sample < lower_bound end) + + {max, rest} = + rest |> Enum.reverse() |> Enum.split_while(fn sample -> sample > upper_bound end) + + {min ++ max, rest} + end - Map.get_lazy(percentiles, @median_percentile, fn -> - samples |> percentiles(@median_percentile) |> Map.fetch!(@median_percentile) + defp get_percentile(samples, percentile, percentiles) do + Map.get_lazy(percentiles, percentile, fn -> + samples |> Percentile.percentiles(percentile) |> Map.fetch!(percentile) end) end diff --git a/lib/statistex/percentile.ex b/lib/statistex/percentile.ex index dd6bd31..1a32f01 100644 --- a/lib/statistex/percentile.ex +++ b/lib/statistex/percentile.ex @@ -12,12 +12,11 @@ defmodule Statistex.Percentile do def percentiles(samples, percentile_ranks) do number_of_samples = length(samples) - sorted_samples = Enum.sort(samples) percentile_ranks |> List.wrap() |> Enum.reduce(%{}, fn percentile_rank, acc -> - perc = percentile(sorted_samples, number_of_samples, percentile_rank) + perc = percentile(samples, number_of_samples, percentile_rank) Map.put(acc, percentile_rank, perc) end) end diff --git a/test/statistex/percentile_test.exs b/test/statistex/percentile_test.exs index fbc03a5..020f523 100644 --- a/test/statistex/percentile_test.exs +++ b/test/statistex/percentile_test.exs @@ -4,20 +4,20 @@ defmodule Statistex.PercentileTest 
do doctest Statistex.Percentile - @nist_sample_data [ - 95.1772, - 95.1567, - 95.1937, - 95.1959, - 95.1442, - 95.0610, - 95.1591, - 95.1195, - 95.1065, - 95.0925, - 95.1990, - 95.1682 - ] + @nist_sample_data Enum.sort([ + 95.1772, + 95.1567, + 95.1937, + 95.1959, + 95.1442, + 95.0610, + 95.1591, + 95.1195, + 95.1065, + 95.0925, + 95.1990, + 95.1682 + ]) # Test data from: # http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm @@ -49,7 +49,7 @@ defmodule Statistex.PercentileTest do end describe "a list of two elements" do - @samples [300, 200] + @samples [200, 300] test "1st percentile (small sample size simply picks first element)" do %{1 => result} = percentiles(@samples, [1]) assert result == 200.0 @@ -67,7 +67,7 @@ defmodule Statistex.PercentileTest do end describe "seemingly problematic 2 element list [9, 1]" do - @samples [9, 1] + @samples [1, 9] percentiles = %{ 25 => 1, @@ -88,7 +88,7 @@ defmodule Statistex.PercentileTest do end describe "a list of three elements" do - @samples [100, 300, 200] + @samples [100, 200, 300] test "1st percentile (small sample size simply picks first element)" do %{1 => result} = percentiles(@samples, [1]) assert result == 100.0 diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 3c602a2..65ef5fd 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -12,6 +12,104 @@ defmodule Statistex.StatistexTest do end end + describe ".outliers_bounds/2" do + test "returns outlier bounds for samples without outliers" do + assert Statistex.outliers_bounds([200, 400, 400, 400, 500, 500, 500, 700, 900]) == + {200, 900.0} + end + + test "returns outlier bounds for samples with outliers" do + assert Statistex.outliers_bounds([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == + {87.5, 787.5} + end + end + + describe ".statistics/2" do + test "returns Statistex struct without outliers" do + assert Statistex.statistics([200, 400, 400, 400, 500, 500, 500, 700, 900]) == + %Statistex{ + total: 4500, + 
average: 500.0, + variance: 40000.0, + standard_deviation: 200.0, + standard_deviation_ratio: 0.4, + median: 500.0, + percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0}, + frequency_distribution: %{200 => 1, 400 => 3, 500 => 3, 700 => 1, 900 => 1}, + mode: [500, 400], + minimum: 200, + maximum: 900, + outliers_bounds: {200, 900.0}, + outliers: [], + sample_size: 9 + } + end + + test "returns Statistex struct with outliers" do + assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == + %Statistex{ + total: 4450, + average: 445.0, + variance: 61361.11111111111, + standard_deviation: 247.71175004652304, + standard_deviation_ratio: 0.5566556180820742, + median: 475.0, + percentiles: %{25 => 350.0, 50 => 475.0, 75 => 525.0}, + frequency_distribution: %{50 => 2, 450 => 3, 500 => 3, 600 => 1, 900 => 1}, + mode: [500, 450], + minimum: 50, + maximum: 900, + outliers_bounds: {87.5, 787.5}, + outliers: [50, 50, 900], + sample_size: 10 + } + end + + test "returns Statistex struct with excluded outliers once" do + assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], + exclude_outliers: :once + ) == + %Statistex{ + total: 3450, + average: 492.85714285714283, + variance: 2857.142857142857, + standard_deviation: 53.452248382484875, + standard_deviation_ratio: 0.1084538372977954, + median: 500.0, + percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0}, + frequency_distribution: %{450 => 3, 500 => 3, 600 => 1}, + mode: [500, 450], + minimum: 450, + maximum: 600, + outliers_bounds: {450, 575.0}, + outliers: [600, 50, 50, 900], + sample_size: 7 + } + end + + test "returns Statistex struct with excluded outliers repeatedly" do + assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], + exclude_outliers: :repeatedly + ) == + %Statistex{ + total: 2850, + average: 475.0, + variance: 750.0, + standard_deviation: 27.386127875258307, + standard_deviation_ratio: 0.05765500605317538, + median: 475.0, + percentiles: %{25 
=> 450.0, 50 => 475.0, 75 => 500.0}, + frequency_distribution: %{450 => 3, 500 => 3}, + mode: [500, 450], + minimum: 450, + maximum: 500, + outliers_bounds: {450, 500}, + outliers: [50, 50, 900, 600], + sample_size: 6 + } + end + end + describe "property testing as we might get loads of data" do property "doesn't blow up no matter what kind of nonempty list of floats it's given" do check all(samples <- list_of(float(), min_length: 1)) do From 2182dda7184bdf1f14f364c402c93dae1f29fbac Mon Sep 17 00:00:00 2001 From: Marcus Kruse Date: Wed, 15 Jan 2025 21:57:40 +0100 Subject: [PATCH 02/23] Replace not Enum.empty? by Enum.any? --- lib/statistex.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index cc8ca7b..27ec576 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -173,7 +173,7 @@ defmodule Statistex do {outliers, rest} = do_outliers(samples, outliers_bounds: outliers_bounds) - if exclude_outliers?(configuration) and not Enum.empty?(outliers) do + if exclude_outliers?(configuration) and Enum.any?(outliers) do configuration = configuration |> Keyword.put(:outliers_excluded, true) From 505bd571e17465247b52cd0c6cc94385b847d6df Mon Sep 17 00:00:00 2001 From: Marcus Kruse Date: Wed, 15 Jan 2025 22:04:36 +0100 Subject: [PATCH 03/23] Remove outliers_excluded value --- lib/statistex.ex | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index 27ec576..f2a782f 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -176,7 +176,6 @@ defmodule Statistex do if exclude_outliers?(configuration) and Enum.any?(outliers) do configuration = configuration - |> Keyword.put(:outliers_excluded, true) |> Keyword.update!(:exclude_outliers, fn :once -> :stop :repeatedly -> :repeatedly From 8a8497f043fe9ac7f678a53b595f49a4f8656cff Mon Sep 17 00:00:00 2001 From: Marcus Kruse Date: Wed, 15 Jan 2025 22:12:16 +0100 Subject: [PATCH 04/23] Add @first_quartile and @third_quartile --- lib/statistex.ex | 14 
+++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index f2a782f..8ca99fb 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -85,6 +85,8 @@ defmodule Statistex do @empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number." + @first_quartile 25 + @third_quartile 75 @iqr_factor 1.5 @doc """ @@ -625,16 +627,18 @@ defmodule Statistex do defp do_outliers_bounds(samples, options) do percentiles = - Keyword.get_lazy(options, :percentiles, fn -> Percentile.percentiles(samples, [25, 75]) end) + Keyword.get_lazy(options, :percentiles, fn -> + Percentile.percentiles(samples, [@first_quartile, @third_quartile]) + end) minimum = Keyword.get_lazy(options, :minimum, fn -> hd(samples) end) maximum = Keyword.get_lazy(options, :maximum, fn -> List.last(samples) end) - p25 = get_percentile(samples, 25, percentiles) - p75 = get_percentile(samples, 75, percentiles) - iqr = p75 - p25 + q1 = get_percentile(samples, @first_quartile, percentiles) + q3 = get_percentile(samples, @third_quartile, percentiles) + iqr = q1 - q3 - {max(p25 - iqr * @iqr_factor, minimum), min(p75 + iqr * @iqr_factor, maximum)} + {max(q1 - iqr * @iqr_factor, minimum), min(q3 + iqr * @iqr_factor, maximum)} end @doc """ From 44d062812d3efdf9ac8b3de5dd0066898a64b0ad Mon Sep 17 00:00:00 2001 From: Marcus Kruse Date: Wed, 15 Jan 2025 23:32:57 +0100 Subject: [PATCH 05/23] Fix calculation --- lib/statistex.ex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index 8ca99fb..ccef934 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -636,7 +636,7 @@ defmodule Statistex do q1 = get_percentile(samples, @first_quartile, percentiles) q3 = get_percentile(samples, @third_quartile, percentiles) - iqr = q1 - q3 + iqr = q3 - q1 {max(q1 - iqr * @iqr_factor, minimum), min(q3 + iqr * @iqr_factor, maximum)} end From 
947483c79ab7ddc4d01fcd4a449015958be34995 Mon Sep 17 00:00:00 2001 From: Marcus Kruse Date: Wed, 15 Jan 2025 23:34:40 +0100 Subject: [PATCH 06/23] Fix typo in outlier_bound --- lib/statistex.ex | 34 +++++++++++++++++----------------- test/statistex_test.exs | 14 +++++++------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index ccef934..9c66643 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -27,7 +27,7 @@ defmodule Statistex do :mode, :minimum, :maximum, - :outliers_bounds, + :outlier_bounds, :outliers, sample_size: 0 ] @@ -49,7 +49,7 @@ defmodule Statistex do mode: mode, minimum: number, maximum: number, - outliers_bounds: {number, number}, + outlier_bounds: {number, number}, outliers: [number], sample_size: non_neg_integer } @@ -130,7 +130,7 @@ defmodule Statistex do sample_size: 9, total: 4500, outliers: [], - outliers_bounds: {200, 900.0} + outlier_bounds: {200, 900.0} } iex> Statistex.statistics([]) @@ -151,7 +151,7 @@ defmodule Statistex do sample_size: 4, total: 0, outliers: [], - outliers_bounds: {0.0, 0.0} + outlier_bounds: {0.0, 0.0} } """ @@ -170,10 +170,10 @@ defmodule Statistex do percentiles = calculate_percentiles(samples, configuration) - outliers_bounds = - do_outliers_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum) + outlier_bounds = + do_outlier_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum) - {outliers, rest} = do_outliers(samples, outliers_bounds: outliers_bounds) + {outliers, rest} = do_outliers(samples, outlier_bounds: outlier_bounds) if exclude_outliers?(configuration) and Enum.any?(outliers) do configuration = @@ -212,7 +212,7 @@ defmodule Statistex do mode: mode(samples, frequency_distribution: frequency_distribution), minimum: minimum, maximum: maximum, - outliers_bounds: outliers_bounds, + outlier_bounds: outlier_bounds, outliers: outliers, sample_size: sample_size } @@ -611,21 +611,21 @@ defmodule Statistex do ## 
Examples - iex> Statistex.outliers_bounds([3, 4, 5]) + iex> Statistex.outlier_bounds([3, 4, 5]) {3, 5} - iex> Statistex.outliers_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) + iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) {22.5, 50} - iex> Statistex.outliers_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) + iex> Statistex.outlier_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) {50, 80.625} """ - @spec outliers_bounds(samples, keyword) :: {lower :: number, upper :: number} - def outliers_bounds(samples, options \\ []) - def outliers_bounds([], _), do: raise(ArgumentError, @empty_list_error_message) - def outliers_bounds(samples, options), do: samples |> Enum.sort() |> do_outliers_bounds(options) + @spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number} + def outlier_bounds(samples, options \\ []) + def outlier_bounds([], _), do: raise(ArgumentError, @empty_list_error_message) + def outlier_bounds(samples, options), do: samples |> Enum.sort() |> do_outlier_bounds(options) - defp do_outliers_bounds(samples, options) do + defp do_outlier_bounds(samples, options) do percentiles = Keyword.get_lazy(options, :percentiles, fn -> Percentile.percentiles(samples, [@first_quartile, @third_quartile]) @@ -664,7 +664,7 @@ defmodule Statistex do defp do_outliers(samples, options) do {lower_bound, upper_bound} = - Keyword.get_lazy(options, :outliers_bounds, fn -> do_outliers_bounds(samples, options) end) + Keyword.get_lazy(options, :outlier_bounds, fn -> do_outlier_bounds(samples, options) end) {min, rest} = Enum.split_while(samples, fn sample -> sample < lower_bound end) diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 65ef5fd..301ce2a 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -12,14 +12,14 @@ defmodule Statistex.StatistexTest do end end - describe ".outliers_bounds/2" do + describe ".outlier_bounds/2" do test "returns 
outlier bounds for samples without outliers" do - assert Statistex.outliers_bounds([200, 400, 400, 400, 500, 500, 500, 700, 900]) == + assert Statistex.outlier_bounds([200, 400, 400, 400, 500, 500, 500, 700, 900]) == {200, 900.0} end test "returns outlier bounds for samples with outliers" do - assert Statistex.outliers_bounds([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == + assert Statistex.outlier_bounds([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) == {87.5, 787.5} end end @@ -39,7 +39,7 @@ defmodule Statistex.StatistexTest do mode: [500, 400], minimum: 200, maximum: 900, - outliers_bounds: {200, 900.0}, + outlier_bounds: {200, 900.0}, outliers: [], sample_size: 9 } @@ -59,7 +59,7 @@ defmodule Statistex.StatistexTest do mode: [500, 450], minimum: 50, maximum: 900, - outliers_bounds: {87.5, 787.5}, + outlier_bounds: {87.5, 787.5}, outliers: [50, 50, 900], sample_size: 10 } @@ -81,7 +81,7 @@ defmodule Statistex.StatistexTest do mode: [500, 450], minimum: 450, maximum: 600, - outliers_bounds: {450, 575.0}, + outlier_bounds: {450, 575.0}, outliers: [600, 50, 50, 900], sample_size: 7 } @@ -103,7 +103,7 @@ defmodule Statistex.StatistexTest do mode: [500, 450], minimum: 450, maximum: 500, - outliers_bounds: {450, 500}, + outlier_bounds: {450, 500}, outliers: [50, 50, 900, 600], sample_size: 6 } From 5b5c329090e141727764be2f071d176d012af109 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Fri, 2 May 2025 13:33:26 +0200 Subject: [PATCH 07/23] Provide sources for the 1.5 iqr rule --- lib/statistex.ex | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/statistex.ex b/lib/statistex.ex index 9c66643..44c4d0d 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -87,6 +87,8 @@ defmodule Statistex do @first_quartile 25 @third_quartile 75 + # https://en.wikipedia.org/wiki/Interquartile_range#Outliers + # https://builtin.com/articles/1-5-iqr-rule @iqr_factor 1.5 @doc """ From ad7174b51d1fe36b54d001d32f1b583515b05587 Mon Sep 17 00:00:00 2001 From: Tobias 
Pfeiffer Date: Fri, 2 May 2025 14:00:27 +0200 Subject: [PATCH 08/23] WIP: (known failure) try to remove repeatedly identifying outliers From: https://github.com/bencheeorg/statistex/pull/5#discussion_r1917296624 Need to think through it again/and or check some more samples and test it against that. Getting different bounds/outliers right now although I think they're right. --- lib/statistex.ex | 110 +++++++++++++++++++++------------------- test/statistex_test.exs | 26 ++-------- 2 files changed, 62 insertions(+), 74 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index 44c4d0d..dc449ba 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -96,18 +96,16 @@ defmodule Statistex do The statistics themselves are described in the individual samples that can be used to calculate individual values. - `Argumenterror` is raised if the given list is empty. + `ArgumentError` is raised if the given list is empty. ## Options - In a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can + With a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can be given. The percentiles 25th, 50th (median) and 75th are always calculated. - The option `exclude_outliers` can be set to `:once`, `:repeatedly` or `nil`, - `nil` is the default. If this option set to `:once` the outliers are excluded - and the statistics are calculated with the rest of the samples. The value - `:repeatedly` repeats the outlier exclusion until the samples no longer - contains outliers. + The option `exclude_outliers` can be set to `true`, `false`. Defaults to `false`. + If this option is set to `true` the outliers are excluded + and the statistics are calculated with the rest of the samples. 
## Examples @@ -167,62 +165,72 @@ defmodule Statistex do def statistics(samples, configuration) do samples = Enum.sort(samples) - minimum = hd(samples) - maximum = List.last(samples) - - percentiles = calculate_percentiles(samples, configuration) + # these statistics are required to do the outlier calculations + %{minimum: minimum, maximum: maximum, percentiles: percentiles} = + base_statistics(samples, configuration) outlier_bounds = do_outlier_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum) + # make sure rest remains sorted and so can be used again to ok results {outliers, rest} = do_outliers(samples, outlier_bounds: outlier_bounds) if exclude_outliers?(configuration) and Enum.any?(outliers) do - configuration = - configuration - |> Keyword.update!(:exclude_outliers, fn - :once -> :stop - :repeatedly -> :repeatedly - end) - |> Keyword.update(:acc_outliers, outliers, fn list -> list ++ outliers end) - - statistics(rest, configuration) + # figure out to avoid double sorting + rest = Enum.sort(rest) + # need to recalculate with the outliers removed + %{minimum: minimum, maximum: maximum, percentiles: percentiles} = + base_statistics(rest, configuration) + + create_full_statistics(rest, minimum, maximum, percentiles, outliers, outlier_bounds) else - outliers = outliers ++ Keyword.get(configuration, :acc_outliers, []) - - total = total(samples) - sample_size = length(samples) - average = average(samples, total: total, sample_size: sample_size) - variance = variance(samples, average: average, sample_size: sample_size) - - frequency_distribution = frequency_distribution(samples) - - standard_deviation = standard_deviation(samples, variance: variance) - - standard_deviation_ratio = - standard_deviation_ratio(samples, standard_deviation: standard_deviation) - - %__MODULE__{ - total: total, - average: average, - variance: variance, - standard_deviation: standard_deviation, - standard_deviation_ratio: standard_deviation_ratio, - median: 
median(samples, percentiles: percentiles), - percentiles: percentiles, - frequency_distribution: frequency_distribution, - mode: mode(samples, frequency_distribution: frequency_distribution), - minimum: minimum, - maximum: maximum, - outlier_bounds: outlier_bounds, - outliers: outliers, - sample_size: sample_size - } + create_full_statistics(samples, minimum, maximum, percentiles, outliers, outlier_bounds) end end + defp base_statistics(samples, configuration) do + minimum = hd(samples) + maximum = List.last(samples) + + percentiles = calculate_percentiles(samples, configuration) + + %{minimum: minimum, maximum: maximum, percentiles: percentiles} + end + defp exclude_outliers?(configuration) do - Keyword.get(configuration, :exclude_outliers) in [:once, :repeatedly] + Access.get(configuration, :exclude_outliers) == true + end + + # maybe make argument a map + defp create_full_statistics(samples, minimum, maximum, percentiles, outliers, outlier_bounds) do + total = total(samples) + sample_size = length(samples) + average = average(samples, total: total, sample_size: sample_size) + variance = variance(samples, average: average, sample_size: sample_size) + + frequency_distribution = frequency_distribution(samples) + + standard_deviation = standard_deviation(samples, variance: variance) + + standard_deviation_ratio = + standard_deviation_ratio(samples, standard_deviation: standard_deviation) + + %__MODULE__{ + total: total, + average: average, + variance: variance, + standard_deviation: standard_deviation, + standard_deviation_ratio: standard_deviation_ratio, + median: median(samples, percentiles: percentiles), + percentiles: percentiles, + frequency_distribution: frequency_distribution, + mode: mode(samples, frequency_distribution: frequency_distribution), + minimum: minimum, + maximum: maximum, + outlier_bounds: outlier_bounds, + outliers: outliers, + sample_size: sample_size + } end @doc """ diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 
301ce2a..5c0b742 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -67,7 +67,7 @@ defmodule Statistex.StatistexTest do test "returns Statistex struct with excluded outliers once" do assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], - exclude_outliers: :once + exclude_outliers: true ) == %Statistex{ total: 3450, @@ -81,33 +81,13 @@ defmodule Statistex.StatistexTest do mode: [500, 450], minimum: 450, maximum: 600, + # check with other sources what is right and what isn't, I fear we may have calculated outliers twice before outlier_bounds: {450, 575.0}, + # Either sort them or make the test ignorant of order outliers: [600, 50, 50, 900], sample_size: 7 } end - - test "returns Statistex struct with excluded outliers repeatedly" do - assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], - exclude_outliers: :repeatedly - ) == - %Statistex{ - total: 2850, - average: 475.0, - variance: 750.0, - standard_deviation: 27.386127875258307, - standard_deviation_ratio: 0.05765500605317538, - median: 475.0, - percentiles: %{25 => 450.0, 50 => 475.0, 75 => 500.0}, - frequency_distribution: %{450 => 3, 500 => 3}, - mode: [500, 450], - minimum: 450, - maximum: 500, - outlier_bounds: {450, 500}, - outliers: [50, 50, 900, 600], - sample_size: 6 - } - end end describe "property testing as we might get loads of data" do From cdf1490ce1a35677e9623e1d8024557d6f7b3eae Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 13:44:42 +0200 Subject: [PATCH 09/23] Further deep dive into outliers, quantiles * Remove the limiting of bounds with min/max * consult and get some more samples * include R samples as some authorative examples --- lib/statistex.ex | 13 ++--- lib/statistex/percentile.ex | 11 +++- test/statistex_test.exs | 102 +++++++++++++++++++++++++++--------- 3 files changed, 91 insertions(+), 35 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index dc449ba..fd60259 100644 --- 
a/lib/statistex.ex +++ b/lib/statistex.ex @@ -130,7 +130,7 @@ defmodule Statistex do sample_size: 9, total: 4500, outliers: [], - outlier_bounds: {200, 900.0} + outlier_bounds: {100.0, 900.0} } iex> Statistex.statistics([]) @@ -622,13 +622,13 @@ defmodule Statistex do ## Examples iex> Statistex.outlier_bounds([3, 4, 5]) - {3, 5} + {0.0, 8.0} iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) - {22.5, 50} + {22.5, 66.5} iex> Statistex.outlier_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) - {50, 80.625} + {31.625, 80.625} """ @spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number} def outlier_bounds(samples, options \\ []) @@ -641,14 +641,11 @@ defmodule Statistex do Percentile.percentiles(samples, [@first_quartile, @third_quartile]) end) - minimum = Keyword.get_lazy(options, :minimum, fn -> hd(samples) end) - maximum = Keyword.get_lazy(options, :maximum, fn -> List.last(samples) end) - q1 = get_percentile(samples, @first_quartile, percentiles) q3 = get_percentile(samples, @third_quartile, percentiles) iqr = q3 - q1 - {max(q1 - iqr * @iqr_factor, minimum), min(q3 + iqr * @iqr_factor, maximum)} + {q1 - iqr * @iqr_factor, q3 + iqr * @iqr_factor} end @doc """ diff --git a/lib/statistex/percentile.ex b/lib/statistex/percentile.ex index 1a32f01..53887a9 100644 --- a/lib/statistex/percentile.ex +++ b/lib/statistex/percentile.ex @@ -62,11 +62,20 @@ defmodule Statistex.Percentile do # particular sample). Of the 9 main strategies, (types 1-9), types 6, 7, and 8 # are generally acceptable and give similar results. # + # R uses type 7, but you can change the strategies used in R with arguments. 
+ # + # > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 6) + # 25% 50% 75% + # 9.25 10.00 11.75 + # > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 7) + # 25% 50% 75% + # 9.75 10.00 11.25 + # # For more information on interpolation strategies, see: # - https://stat.ethz.ch/R-manual/R-devel/library/stats/html/quantile.html # - http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm defp interpolation_value(lower_bound, upper_bound, rank) do - # in our source rank is k, and interpolation_weitgh is d + # in our source rank is k, and interpolation_weight is d interpolation_weight = rank - trunc(rank) interpolation_weight * (upper_bound - lower_bound) end diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 5c0b742..fa646f2 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -13,9 +13,10 @@ defmodule Statistex.StatistexTest do end describe ".outlier_bounds/2" do + # examples doubled up, maybe get rid of them? 
test "returns outlier bounds for samples without outliers" do assert Statistex.outlier_bounds([200, 400, 400, 400, 500, 500, 500, 700, 900]) == - {200, 900.0} + {100.0, 900.0} end test "returns outlier bounds for samples with outliers" do @@ -30,7 +31,7 @@ defmodule Statistex.StatistexTest do %Statistex{ total: 4500, average: 500.0, - variance: 40000.0, + variance: 40_000.0, standard_deviation: 200.0, standard_deviation_ratio: 0.4, median: 500.0, @@ -39,7 +40,7 @@ defmodule Statistex.StatistexTest do mode: [500, 400], minimum: 200, maximum: 900, - outlier_bounds: {200, 900.0}, + outlier_bounds: {100.0, 900.0}, outliers: [], sample_size: 9 } @@ -50,7 +51,7 @@ defmodule Statistex.StatistexTest do %Statistex{ total: 4450, average: 445.0, - variance: 61361.11111111111, + variance: 61_361.11111111111, standard_deviation: 247.71175004652304, standard_deviation_ratio: 0.5566556180820742, median: 475.0, @@ -65,28 +66,77 @@ defmodule Statistex.StatistexTest do } end - test "returns Statistex struct with excluded outliers once" do - assert Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], - exclude_outliers: true - ) == - %Statistex{ - total: 3450, - average: 492.85714285714283, - variance: 2857.142857142857, - standard_deviation: 53.452248382484875, - standard_deviation_ratio: 0.1084538372977954, - median: 500.0, - percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0}, - frequency_distribution: %{450 => 3, 500 => 3, 600 => 1}, - mode: [500, 450], - minimum: 450, - maximum: 600, - # check with other sources what is right and what isn't, I fear we may have calculated outliers twice before - outlier_bounds: {450, 575.0}, - # Either sort them or make the test ignorant of order - outliers: [600, 50, 50, 900], - sample_size: 7 - } + # https://www.youtube.com/watch?v=rZJbj2I-_Ek + test "gets outliers from the sample right" do + # One could argue that this is controversial, R comes up with these results (by default): + # > summary(c(9, 9, 10, 10, 10, 11, 12, 
36)) + # Min. 1st Qu. Median Mean 3rd Qu. Max. + # 9.00 9.75 10.00 13.38 11.25 36.00 + # + # R by default uses type 7 interpolation, we implemented type 6 interpolation though. Which + # R can also use: + # > quantile(c(9, 9, 10, 10, 10, 11, 12, 36), probs = c(0.25, 0.5, 0.75), type = 6) + # 25% 50% 75% + # 9.25 10.00 11.75 + # Which is our result. + + assert %Statistex{ + median: 10.0, + percentiles: %{25 => 9.25, 50 => 10.0, 75 => 11.75}, + minimum: 9, + maximum: 36, + outlier_bounds: {5.5, 15.5}, + outliers: [36] + } = Statistex.statistics([9, 9, 10, 10, 10, 11, 12, 36], exclude_outliers: false) + end + + # https://en.wikipedia.org/wiki/Box_plot#Example_with_outliers + test "another example with outliers" do + data = [ + 52, + 57, + 57, + 58, + 63, + 66, + 66, + 67, + 67, + 68, + 69, + 70, + 70, + 70, + 70, + 72, + 73, + 75, + 75, + 76, + 76, + 78, + 79, + 89 + ] + + assert %Statistex{ + median: 70.0, + percentiles: %{25 => 66.0, 50 => 70.0, 75 => 75.0}, + # report interquantile range? 
+ outlier_bounds: {52.5, 88.5}, + outliers: [52, 89] + } = Statistex.statistics(data, exclude_outliers: false) + end + + # https://en.wikipedia.org/wiki/Interquartile_range#Data_set_in_a_table + test "quartile example" do + assert %Statistex{ + median: 87.0, + percentiles: %{25 => 31.0, 50 => 87.0, 75 => 119.0} + } = + Statistex.statistics([7, 7, 31, 31, 47, 75, 87, 115, 116, 119, 119, 155, 177], + exclude_outliers: false + ) end end From d769e4e8848b650bde638b1c6e2072f7b4d775dc Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 14:09:26 +0200 Subject: [PATCH 10/23] separate keys for outlier bounds --- lib/statistex.ex | 17 ++++++++++++----- test/statistex_test.exs | 12 ++++++++---- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index fd60259..3c812fc 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -27,7 +27,8 @@ defmodule Statistex do :mode, :minimum, :maximum, - :outlier_bounds, + :lower_outlier_bound, + :upper_outlier_bound, :outliers, sample_size: 0 ] @@ -49,7 +50,8 @@ defmodule Statistex do mode: mode, minimum: number, maximum: number, - outlier_bounds: {number, number}, + lower_outlier_bound: number, + upper_outlier_bound: number, outliers: [number], sample_size: non_neg_integer } @@ -130,7 +132,8 @@ defmodule Statistex do sample_size: 9, total: 4500, outliers: [], - outlier_bounds: {100.0, 900.0} + lower_outlier_bound: 100.0, + upper_outlier_bound: 900.0 } iex> Statistex.statistics([]) @@ -151,7 +154,8 @@ defmodule Statistex do sample_size: 4, total: 0, outliers: [], - outlier_bounds: {0.0, 0.0} + lower_outlier_bound: 0.0, + upper_outlier_bound: 0.0, } """ @@ -215,6 +219,8 @@ defmodule Statistex do standard_deviation_ratio = standard_deviation_ratio(samples, standard_deviation: standard_deviation) + {lower_outlier_bound, upper_outlier_bound} = outlier_bounds + %__MODULE__{ total: total, average: average, @@ -227,7 +233,8 @@ defmodule Statistex do mode: mode(samples,
frequency_distribution: frequency_distribution), minimum: minimum, maximum: maximum, - outlier_bounds: outlier_bounds, + lower_outlier_bound: lower_outlier_bound, + upper_outlier_bound: upper_outlier_bound, outliers: outliers, sample_size: sample_size } diff --git a/test/statistex_test.exs b/test/statistex_test.exs index fa646f2..59a30d5 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -40,7 +40,8 @@ defmodule Statistex.StatistexTest do mode: [500, 400], minimum: 200, maximum: 900, - outlier_bounds: {100.0, 900.0}, + lower_outlier_bound: 100.0, + upper_outlier_bound: 900.0, outliers: [], sample_size: 9 } @@ -60,7 +61,8 @@ defmodule Statistex.StatistexTest do mode: [500, 450], minimum: 50, maximum: 900, - outlier_bounds: {87.5, 787.5}, + lower_outlier_bound: 87.5, + upper_outlier_bound: 787.5, outliers: [50, 50, 900], sample_size: 10 } @@ -85,7 +87,8 @@ defmodule Statistex.StatistexTest do percentiles: %{25 => 9.25, 50 => 10.0, 75 => 11.75}, minimum: 9, maximum: 36, - outlier_bounds: {5.5, 15.5}, + lower_outlier_bound: 5.5, + upper_outlier_bound: 15.5, outliers: [36] } = Statistex.statistics([9, 9, 10, 10, 10, 11, 12, 36], exclude_outliers: false) end @@ -123,7 +126,8 @@ defmodule Statistex.StatistexTest do median: 70.0, percentiles: %{25 => 66.0, 50 => 70.0, 75 => 75.0}, # report interquantile range? - outlier_bounds: {52.5, 88.5}, + lower_outlier_bound: 52.5, + upper_outlier_bound: 88.5, outliers: [52, 89] } = Statistex.statistics(data, exclude_outliers: false) end From 3b709c01081f4ae38a4d2bf5cb6379385e2125df Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 15:06:29 +0200 Subject: [PATCH 11/23] Simplify the determination of outliers & pass on sorted?: There could be an argument made that if we have few outliers, reversing the lists twice could be faster than passing through the entire list once with 2 conditions. We can probably optimize & benchmark on this later. 
--- lib/statistex.ex | 79 +++++++++++++++++++++++-------------- lib/statistex/helper.ex | 18 +++++++++ lib/statistex/percentile.ex | 13 ++++-- 3 files changed, 77 insertions(+), 33 deletions(-) create mode 100644 lib/statistex/helper.ex diff --git a/lib/statistex.ex b/lib/statistex.ex index 3c812fc..d0661b9 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -15,6 +15,8 @@ defmodule Statistex do alias Statistex.{Mode, Percentile} require Integer + import Statistex.Helper, only: [maybe_sort: 2] + defstruct [ :total, :average, @@ -88,6 +90,7 @@ defmodule Statistex do @empty_list_error_message "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number." @first_quartile 25 + @median_percentile 50 @third_quartile 75 # https://en.wikipedia.org/wiki/Interquartile_range#Outliers # https://builtin.com/articles/1-5-iqr-rule @@ -167,17 +170,21 @@ defmodule Statistex do end def statistics(samples, configuration) do - samples = Enum.sort(samples) + sorted_samples = Enum.sort(samples) # these statistics are required to do the outlier calculations %{minimum: minimum, maximum: maximum, percentiles: percentiles} = - base_statistics(samples, configuration) + base_statistics(sorted_samples, configuration) outlier_bounds = - do_outlier_bounds(samples, percentiles: percentiles, minimum: minimum, maximum: maximum) + do_outlier_bounds(sorted_samples, + percentiles: percentiles, + minimum: minimum, + maximum: maximum + ) # make sure rest remains sorted and so can be used again to ok results - {outliers, rest} = do_outliers(samples, outlier_bounds: outlier_bounds) + {outliers, rest} = do_outliers(sorted_samples, outlier_bounds: outlier_bounds) if exclude_outliers?(configuration) and Enum.any?(outliers) do # figure out to avoid double sorting @@ -188,15 +195,22 @@ defmodule Statistex do create_full_statistics(rest, minimum, maximum, percentiles, outliers, outlier_bounds) else - create_full_statistics(samples, minimum, maximum, percentiles, 
outliers, outlier_bounds) + create_full_statistics( + sorted_samples, + minimum, + maximum, + percentiles, + outliers, + outlier_bounds + ) end end - defp base_statistics(samples, configuration) do - minimum = hd(samples) - maximum = List.last(samples) + defp base_statistics(sorted_samples, configuration) do + minimum = hd(sorted_samples) + maximum = List.last(sorted_samples) - percentiles = calculate_percentiles(samples, configuration) + percentiles = calculate_percentiles(sorted_samples, configuration) %{minimum: minimum, maximum: maximum, percentiles: percentiles} end @@ -459,15 +473,18 @@ defmodule Statistex do end end - @median_percentile 50 - defp calculate_percentiles(samples, configuration) do + defp calculate_percentiles(sorted_samples, configuration) do percentiles_configuration = Keyword.get(configuration, :percentiles, []) # median_percentile is manually added so that it can be used directly by median percentiles_configuration = - Enum.uniq([25, @median_percentile, 75 | percentiles_configuration]) + Enum.uniq([ + @first_quartile, + @median_percentile, + @third_quartile | percentiles_configuration + ]) - Percentile.percentiles(samples, percentiles_configuration) + Percentile.percentiles(sorted_samples, percentiles_configuration, sorted: true) end @doc """ @@ -475,7 +492,7 @@ defmodule Statistex do Think of this as the value below which `percentile_rank` percent of the samples lie. For example, - if `Statistex.percentile(samples, 99)` == 123.45, + if `Statistex.percentile(samples, 99) == 123.45`, 99% of samples are less than 123.45. Passing a number for `percentile_rank` calculates a single percentile. 
@@ -517,9 +534,8 @@ defmodule Statistex do """ @spec percentiles(samples, number | [number(), ...]) :: percentiles() - def percentiles(samples, percentiles) do - samples |> Enum.sort() |> Percentile.percentiles(percentiles) - end + defdelegate percentiles(samples, percentiles, options), to: Percentile + defdelegate percentiles(samples, percentiles), to: Percentile @doc """ A map showing which sample occurs how often in the samples. @@ -631,6 +647,9 @@ defmodule Statistex do iex> Statistex.outlier_bounds([3, 4, 5]) {0.0, 8.0} + iex> Statistex.outlier_bounds([4, 5, 3]) + {0.0, 8.0} + iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) {22.5, 66.5} @@ -640,19 +659,21 @@ defmodule Statistex do @spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number} def outlier_bounds(samples, options \\ []) def outlier_bounds([], _), do: raise(ArgumentError, @empty_list_error_message) - def outlier_bounds(samples, options), do: samples |> Enum.sort() |> do_outlier_bounds(options) + def outlier_bounds(samples, options), do: do_outlier_bounds(samples, options) defp do_outlier_bounds(samples, options) do + # double check do we need both get lazies here? 
percentiles = Keyword.get_lazy(options, :percentiles, fn -> - Percentile.percentiles(samples, [@first_quartile, @third_quartile]) + Percentile.percentiles(samples, [@first_quartile, @third_quartile], options) end) q1 = get_percentile(samples, @first_quartile, percentiles) q3 = get_percentile(samples, @third_quartile, percentiles) iqr = q3 - q1 + outlier_tolerance = iqr * @iqr_factor - {q1 - iqr * @iqr_factor, q3 + iqr * @iqr_factor} + {q1 - outlier_tolerance, q3 + outlier_tolerance} end @doc """ @@ -671,21 +692,21 @@ defmodule Statistex do """ @spec outliers(samples, keyword) :: samples | [] def outliers(samples, options \\ []) do - {outliers, _rest} = samples |> Enum.sort() |> do_outliers(options) + sorted_samples = maybe_sort(samples, options) + + # maybe allow folks to get the same + {outliers, _rest} = do_outliers(sorted_samples, options) outliers end - defp do_outliers(samples, options) do + defp do_outliers(sorted_samples, options) do {lower_bound, upper_bound} = - Keyword.get_lazy(options, :outlier_bounds, fn -> do_outlier_bounds(samples, options) end) - - {min, rest} = Enum.split_while(samples, fn sample -> sample < lower_bound end) - - {max, rest} = - rest |> Enum.reverse() |> Enum.split_while(fn sample -> sample > upper_bound end) + Keyword.get_lazy(options, :outlier_bounds, fn -> + do_outlier_bounds(sorted_samples, options) + end) - {min ++ max, rest} + Enum.split_with(sorted_samples, fn sample -> sample < lower_bound || sample > upper_bound end) end defp get_percentile(samples, percentile, percentiles) do diff --git a/lib/statistex/helper.ex b/lib/statistex/helper.ex new file mode 100644 index 0000000..958d034 --- /dev/null +++ b/lib/statistex/helper.ex @@ -0,0 +1,18 @@ +defmodule Statistex.Helper do + @moduledoc false + # Everyone loves helper modules... ok ok, no. But I needed/wanted this function, + # but didn't wanna put it on the main module. + + # With the design goal that we don't want to needlessly do operations, esp. 
big ones + # like sorting we need an optional `sorted?` arguments in a bunch of places. + # This unifies the handling of that. + def maybe_sort(samples, options) do + sorted? = Access.get(options, :sorted?, false) + + if sorted? do + samples + else + Enum.sort(samples) + end + end +end diff --git a/lib/statistex/percentile.ex b/lib/statistex/percentile.ex index 53887a9..4cab86b 100644 --- a/lib/statistex/percentile.ex +++ b/lib/statistex/percentile.ex @@ -1,22 +1,27 @@ defmodule Statistex.Percentile do @moduledoc false - @spec percentiles(Statistex.samples(), number | [number, ...]) :: + import Statistex.Helper, only: [maybe_sort: 2] + + @spec percentiles(Statistex.samples(), number | [number, ...], keyword()) :: Statistex.percentiles() - def percentiles([], _) do + def percentiles(samples, percentiles, options \\ []) + + def percentiles([], _, _) do raise( ArgumentError, "Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number." ) end - def percentiles(samples, percentile_ranks) do + def percentiles(samples, percentile_ranks, options) do number_of_samples = length(samples) + sorted_samples = maybe_sort(samples, options) percentile_ranks |> List.wrap() |> Enum.reduce(%{}, fn percentile_rank, acc -> - perc = percentile(samples, number_of_samples, percentile_rank) + perc = percentile(sorted_samples, number_of_samples, percentile_rank) Map.put(acc, percentile_rank, perc) end) end From 0d6bdbefea13d5a1e1aedbfa82fe9e1095f74b5c Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 15:32:25 +0200 Subject: [PATCH 12/23] Clean up the property based tests a little --- test/statistex_test.exs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 59a30d5..be590cd 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -162,6 +162,12 @@ defmodule Statistex.StatistexTest do defp assert_statistics_properties(samples) do 
stats = statistics(samples) + assert_basic_statistics(stats) + assert_mode_in_samples(stats, samples) + frequency_assertions(stats, samples) + end + + defp assert_basic_statistics(stats) do assert stats.sample_size >= 1 assert stats.minimum <= stats.maximum @@ -176,8 +182,9 @@ defmodule Statistex.StatistexTest do assert stats.variance >= 0 assert stats.standard_deviation >= 0 assert stats.standard_deviation_ratio >= 0 + end - # mode actually occurs in the samples + defp assert_mode_in_samples(stats, samples) do case stats.mode do [_ | _] -> Enum.each(stats.mode, fn mode -> @@ -191,7 +198,9 @@ defmodule Statistex.StatistexTest do mode -> assert mode in samples end + end + defp frequency_assertions(stats, samples) do frequency_distribution = stats.frequency_distribution frequency_entry_count = map_size(frequency_distribution) @@ -208,7 +217,7 @@ defmodule Statistex.StatistexTest do # all samples are in frequencies Enum.each(samples, fn sample -> assert Map.has_key?(frequency_distribution, sample) end) - # counts some up to sample_size + # counts of frequencies sum up to sample_size count_sum = frequency_distribution |> Map.values() From 20e720d2d70885195f81b03cef273d83766d1253 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 15:35:04 +0200 Subject: [PATCH 13/23] New property: shuffling the samples doesn't change the result --- test/statistex_test.exs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/statistex_test.exs b/test/statistex_test.exs index be590cd..cb27589 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -165,6 +165,10 @@ defmodule Statistex.StatistexTest do assert_basic_statistics(stats) assert_mode_in_samples(stats, samples) frequency_assertions(stats, samples) + + # shuffling values around shouldn't change the results + shuffled_stats = samples |> Enum.shuffle() |> statistics() + assert stats == shuffled_stats end defp assert_basic_statistics(stats) do From 29a2f30f1491e2bf721157772933ce0650537458 Mon Sep 
17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 15:42:54 +0200 Subject: [PATCH 14/23] More property based properties --- test/statistex_test.exs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/test/statistex_test.exs b/test/statistex_test.exs index cb27589..187d27b 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -164,7 +164,8 @@ defmodule Statistex.StatistexTest do assert_basic_statistics(stats) assert_mode_in_samples(stats, samples) - frequency_assertions(stats, samples) + assert_frequencies(stats, samples) + assert_bounds(stats, samples) # shuffling values around shouldn't change the results shuffled_stats = samples |> Enum.shuffle() |> statistics() @@ -183,6 +184,9 @@ defmodule Statistex.StatistexTest do assert stats.median == stats.percentiles[50] + assert stats.median >= stats.percentiles[25] + assert stats.percentiles[75] >= stats.median + assert stats.variance >= 0 assert stats.standard_deviation >= 0 assert stats.standard_deviation_ratio >= 0 @@ -204,7 +208,7 @@ defmodule Statistex.StatistexTest do end end - defp frequency_assertions(stats, samples) do + defp assert_frequencies(stats, samples) do frequency_distribution = stats.frequency_distribution frequency_entry_count = map_size(frequency_distribution) @@ -230,6 +234,16 @@ defmodule Statistex.StatistexTest do assert count_sum == stats.sample_size end + defp assert_bounds(stats, samples) do + Enum.each(stats.outliers, fn outlier -> + assert outlier in samples + assert outlier < stats.lower_outlier_bound || outlier > stats.upper_outlier_bound + end) + + assert stats.lower_outlier_bound <= stats.percentiles[25] + assert stats.upper_outlier_bound >= stats.percentiles[75] + end + defp big_list_big_floats do sized(fn size -> resize( From fee2065fa8293bc6b55d69dea4fa987a5fb568f3 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 16:11:43 +0200 Subject: [PATCH 15/23] exclude_statistics shown in docs, cleaned up docs and 
some properties to go along --- lib/statistex.ex | 87 +++++++++++++++++++---------------------- test/statistex_test.exs | 45 ++++++++++++++++++++- 2 files changed, 84 insertions(+), 48 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index d0661b9..b3402d5 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -105,62 +105,57 @@ defmodule Statistex do ## Options - With a `percentiles` options arguments for the calculation of percentiles (see `percentiles/2`) can - be given. The percentiles 25th, 50th (median) and 75th are always calculated. + * `percentiles`: percentiles to calculate (see `percentiles/2`). + The percentiles 25th, 50th (median) and 75th are always calculated. - The option `exclude_outliers` can be set to `true`, `false`. Defaults to `false`. - If this option is set to `true` the outliers are excluded - and the statistics are calculated with the rest of the samples. + * `exclude_outliers` can be set to `true` or `false`. Defaults to `false`. + If this option is set to `true` the outliers are excluded from the calculation + of the statistics. 
## Examples - iex> Statistex.statistics([200, 400, 400, 400, 500, 500, 500, 700, 900]) + iex> Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900]) %Statistex{ - average: 500.0, - variance: 40_000.0, - standard_deviation: 200.0, - standard_deviation_ratio: 0.4, - median: 500.0, - percentiles: %{25 => 400.0, 50 => 500.0, 75 => 600.0}, - frequency_distribution: %{ - 200 => 1, - 400 => 3, - 500 => 3, - 700 => 1, - 900 => 1 - }, - mode: [500, 400], - minimum: 200, - maximum: 900, - sample_size: 9, - total: 4500, - outliers: [], - lower_outlier_bound: 100.0, - upper_outlier_bound: 900.0 + total: 4450, + average: 445.0, + variance: 61_361.11111111111, + standard_deviation: 247.71175004652304, + standard_deviation_ratio: 0.5566556180820742, + median: 475.0, + percentiles: %{25 => 350.0, 50 => 475.0, 75 => 525.0}, + frequency_distribution: %{50 => 2, 450 => 3, 500 => 3, 600 => 1, 900 => 1}, + mode: [500, 450], + minimum: 50, + maximum: 900, + lower_outlier_bound: 87.5, + upper_outlier_bound: 787.5, + outliers: [50, 50, 900], + sample_size: 10 } - iex> Statistex.statistics([]) - ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number. 
- - iex> Statistex.statistics([0, 0, 0, 0]) + # excluding outliers changes the results + iex> Statistex.statistics([50, 50, 450, 450, 450, 500, 500, 500, 600, 900], exclude_outliers: true) %Statistex{ - average: 0.0, - variance: 0.0, - standard_deviation: 0.0, - standard_deviation_ratio: 0.0, - median: 0.0, - percentiles: %{25 => 0.0, 50 => 0.0, 75 => 0.0}, - frequency_distribution: %{0 => 4}, - mode: 0, - minimum: 0, - maximum: 0, - sample_size: 4, - total: 0, - outliers: [], - lower_outlier_bound: 0.0, - upper_outlier_bound: 0.0, + total: 3450, + average: 492.85714285714283, + variance: 2857.142857142857, + standard_deviation: 53.452248382484875, + standard_deviation_ratio: 0.1084538372977954, + median: 500.0, + percentiles: %{25 => 450.0, 50 => 500.0, 75 => 500.0}, + frequency_distribution: %{450 => 3, 500 => 3, 600 => 1}, + mode: [500, 450], + maximum: 600, + minimum: 450, + lower_outlier_bound: 87.5, + upper_outlier_bound: 787.5, + outliers: [50, 50, 900], + sample_size: 7 } + iex> Statistex.statistics([]) + ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number. 
+ """ @spec statistics(samples, configuration) :: t() def statistics(samples, configuration \\ []) diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 187d27b..e4a62d6 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -26,6 +26,26 @@ defmodule Statistex.StatistexTest do end describe ".statistics/2" do + test "all 0 values do what you think they would" do + assert Statistex.statistics([0, 0, 0, 0]) == %Statistex{ + average: 0.0, + variance: 0.0, + standard_deviation: 0.0, + standard_deviation_ratio: 0.0, + median: 0.0, + percentiles: %{25 => 0.0, 50 => 0.0, 75 => 0.0}, + frequency_distribution: %{0 => 4}, + mode: 0, + minimum: 0, + maximum: 0, + sample_size: 4, + total: 0, + outliers: [], + lower_outlier_bound: 0.0, + upper_outlier_bound: 0.0 + } + end + test "returns Statistex struct without outliers" do assert Statistex.statistics([200, 400, 400, 400, 500, 500, 500, 700, 900]) == %Statistex{ @@ -165,7 +185,7 @@ defmodule Statistex.StatistexTest do assert_basic_statistics(stats) assert_mode_in_samples(stats, samples) assert_frequencies(stats, samples) - assert_bounds(stats, samples) + assert_bounds_and_outliers(stats, samples) # shuffling values around shouldn't change the results shuffled_stats = samples |> Enum.shuffle() |> statistics() @@ -234,7 +254,7 @@ defmodule Statistex.StatistexTest do assert count_sum == stats.sample_size end - defp assert_bounds(stats, samples) do + defp assert_bounds_and_outliers(stats, samples) do Enum.each(stats.outliers, fn outlier -> assert outlier in samples assert outlier < stats.lower_outlier_bound || outlier > stats.upper_outlier_bound @@ -242,6 +262,27 @@ defmodule Statistex.StatistexTest do assert stats.lower_outlier_bound <= stats.percentiles[25] assert stats.upper_outlier_bound >= stats.percentiles[75] + + non_outlier_statistics = Statistex.statistics(samples, exclude_outliers: true) + # outlier or not, outliers or bounds aren't changed + assert non_outlier_statistics.outliers == 
stats.outliers + assert non_outlier_statistics.lower_outlier_bound == stats.lower_outlier_bound + assert non_outlier_statistics.upper_outlier_bound == stats.upper_outlier_bound + + if Enum.empty?(stats.outliers) do + # no outliers? Then excluding outliers shouldn't change anything! + assert non_outlier_statistics == stats + else + assert non_outlier_statistics.sample_size < stats.sample_size + assert non_outlier_statistics.standard_deviation < stats.standard_deviation + # property may not hold vor the std_dev ratio seemingly as values may be skewed too much + + frequency_occurrences = Map.keys(non_outlier_statistics.percentiles) + + # outliers don't make an appearances in the frequency occurrences + assert MapSet.intersection(MapSet.new(stats.outliers), MapSet.new(frequency_occurrences)) == + MapSet.new([]) + end end defp big_list_big_floats do From 690773f6250efd60e1864862c1560bfd64d07d9b Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 16:23:00 +0200 Subject: [PATCH 16/23] Simplify `statistics/2` as min/max aren't needed Since we changed the rules for outlier bounds we don't need them any more. Helps simplify the code quite a bit. --- lib/statistex.ex | 71 ++++++++++++++++-------------------------------- 1 file changed, 24 insertions(+), 47 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index b3402d5..6c1e83a 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -105,12 +105,13 @@ defmodule Statistex do ## Options - * `percentiles`: percentiles to calculate (see `percentiles/2`). + * `:percentiles`: percentiles to calculate (see `percentiles/2`). The percentiles 25th, 50th (median) and 75th are always calculated. - - * `exclude_outliers` can be set to `true` or `false`. Defaults to `false`. + * `:exclude_outliers` can be set to `true` or `false`. Defaults to `false`. If this option is set to `true` the outliers are excluded from the calculation of the statistics. 
+ * `:sorted?`: indicating the samples you're passing in are already sorted. Only set this, + if they are truly sorted - otherwise your results will be wrong. ## Examples @@ -165,68 +166,44 @@ defmodule Statistex do end def statistics(samples, configuration) do - sorted_samples = Enum.sort(samples) - - # these statistics are required to do the outlier calculations - %{minimum: minimum, maximum: maximum, percentiles: percentiles} = - base_statistics(sorted_samples, configuration) + sorted_samples = maybe_sort(samples, configuration) - outlier_bounds = - do_outlier_bounds(sorted_samples, - percentiles: percentiles, - minimum: minimum, - maximum: maximum - ) + percentiles = calculate_percentiles(sorted_samples, configuration) + outlier_bounds = do_outlier_bounds(sorted_samples, percentiles: percentiles) - # make sure rest remains sorted and so can be used again to ok results + # rest remains sorted here/it's an important property {outliers, rest} = do_outliers(sorted_samples, outlier_bounds: outlier_bounds) if exclude_outliers?(configuration) and Enum.any?(outliers) do - # figure out to avoid double sorting - rest = Enum.sort(rest) # need to recalculate with the outliers removed - %{minimum: minimum, maximum: maximum, percentiles: percentiles} = - base_statistics(rest, configuration) + percentiles = calculate_percentiles(rest, configuration) - create_full_statistics(rest, minimum, maximum, percentiles, outliers, outlier_bounds) + create_full_statistics(rest, percentiles, outliers, outlier_bounds) else - create_full_statistics( - sorted_samples, - minimum, - maximum, - percentiles, - outliers, - outlier_bounds - ) + create_full_statistics(sorted_samples, percentiles, outliers, outlier_bounds) end end - defp base_statistics(sorted_samples, configuration) do - minimum = hd(sorted_samples) - maximum = List.last(sorted_samples) - - percentiles = calculate_percentiles(sorted_samples, configuration) - - %{minimum: minimum, maximum: maximum, percentiles: percentiles} - end - 
defp exclude_outliers?(configuration) do Access.get(configuration, :exclude_outliers) == true end # maybe make argument a map - defp create_full_statistics(samples, minimum, maximum, percentiles, outliers, outlier_bounds) do - total = total(samples) - sample_size = length(samples) - average = average(samples, total: total, sample_size: sample_size) - variance = variance(samples, average: average, sample_size: sample_size) + defp create_full_statistics(sorted_samples, percentiles, outliers, outlier_bounds) do + total = total(sorted_samples) + sample_size = length(sorted_samples) + minimum = hd(sorted_samples) + maximum = List.last(sorted_samples) + + average = average(sorted_samples, total: total, sample_size: sample_size) + variance = variance(sorted_samples, average: average, sample_size: sample_size) - frequency_distribution = frequency_distribution(samples) + frequency_distribution = frequency_distribution(sorted_samples) - standard_deviation = standard_deviation(samples, variance: variance) + standard_deviation = standard_deviation(sorted_samples, variance: variance) standard_deviation_ratio = - standard_deviation_ratio(samples, standard_deviation: standard_deviation) + standard_deviation_ratio(sorted_samples, standard_deviation: standard_deviation) {lower_outlier_bound, upper_outlier_bound} = outlier_bounds @@ -236,10 +213,10 @@ defmodule Statistex do variance: variance, standard_deviation: standard_deviation, standard_deviation_ratio: standard_deviation_ratio, - median: median(samples, percentiles: percentiles), + median: median(sorted_samples, percentiles: percentiles), percentiles: percentiles, frequency_distribution: frequency_distribution, - mode: mode(samples, frequency_distribution: frequency_distribution), + mode: mode(sorted_samples, frequency_distribution: frequency_distribution), minimum: minimum, maximum: maximum, lower_outlier_bound: lower_outlier_bound, From 1da5611b2f5286a6da5c5b535fa2763274961ca0 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer 
Date: Sat, 3 May 2025 17:02:52 +0200 Subject: [PATCH 17/23] Redo docs to highlight optional arguments Also cracked down on some of the too flexible code surrounding `get_percentile` - there were layers of get lazies here because the API we promised is almost too forgiving. Hence, made it easier - if the percentiles we need are there take them, if not calculate them without some layers to it that I also found too hard to follow now :) --- lib/statistex.ex | 94 ++++++++++++++++++++++++++++++----------- test/statistex_test.exs | 5 +++ 2 files changed, 74 insertions(+), 25 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index 6c1e83a..f5890af 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -110,7 +110,7 @@ defmodule Statistex do * `:exclude_outliers` can be set to `true` or `false`. Defaults to `false`. If this option is set to `true` the outliers are excluded from the calculation of the statistics. - * `:sorted?`: indicating the samples you're passing in are already sorted. Only set this, + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, if they are truly sorted - otherwise your results will be wrong. ## Examples @@ -188,7 +188,6 @@ defmodule Statistex do Access.get(configuration, :exclude_outliers) == true end - # maybe make argument a map defp create_full_statistics(sorted_samples, percentiles, outliers, outlier_bounds) do total = total(sorted_samples) sample_size = length(sorted_samples) @@ -376,7 +375,7 @@ defmodule Statistex do iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12]) 4.0 - iex> Statistex.standard_deviation([4, 9, 11, 12, 17, 5, 8, 12, 12], variance: 16.0) + iex> Statistex.standard_deviation(:dontcare, variance: 16.0) 4.0 iex> Statistex.standard_deviation([42]) @@ -462,9 +461,8 @@ defmodule Statistex do @doc """ Calculates the value at the `percentile_rank`-th percentile.
- Think of this as the - value below which `percentile_rank` percent of the samples lie. For example, - if `Statistex.percentile(samples, 99) == 123.45`, + Think of this as the value below which `percentile_rank` percent of the samples lie. + For example, if `Statistex.percentile(samples, 99) == 123.45`, 99% of samples are less than 123.45. Passing a number for `percentile_rank` calculates a single percentile. @@ -478,11 +476,19 @@ defmodule Statistex do `Argumenterror` is raised if the given list is empty. + ## Options + + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, + if they are truly sorted - otherwise your results will be wrong. + ## Examples iex> Statistex.percentiles([5, 3, 4, 5, 1, 3, 1, 3], 12.5) %{12.5 => 1.0} + iex> Statistex.percentiles([1, 1, 3, 3, 3, 4, 5, 5], 12.5, sorted?: true) + %{12.5 => 1.0} + iex> Statistex.percentiles([5, 3, 4, 5, 1, 3, 1, 3], [50]) %{50 => 3.0} @@ -581,11 +587,26 @@ defmodule Statistex do `Argumenterror` is raised if the given list is empty. + ## Options + * `:percentiles` - you can pass it a map of calculated percentiles to fetch the median from (it is the 50th percentile). + If it doesn't include the median/50th percentile - it will still be computed. + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, + if they are truly sorted - otherwise your results will be wrong. Sorting only occurs when percentiles aren't provided. 
+ ## Examples iex> Statistex.median([1, 3, 4, 6, 7, 8, 9]) 6.0 + iex> Statistex.median([1, 3, 4, 6, 7, 8, 9], percentiles: %{50 => 6.0}) + 6.0 + + iex> Statistex.median([1, 3, 4, 6, 7, 8, 9], percentiles: %{25 => 3.0}) + 6.0 + + iex> Statistex.median([1, 3, 4, 6, 7, 8, 9], sorted?: true) + 6.0 + iex> Statistex.median([1, 2, 3, 4, 5, 6, 8, 9]) 4.5 @@ -600,12 +621,19 @@ defmodule Statistex do def median([], _), do: raise(ArgumentError, @empty_list_error_message) def median(samples, options) do + percentiles = Access.get(options, :percentiles, %{}) + percentiles = - Keyword.get_lazy(options, :percentiles, fn -> - Percentile.percentiles(samples, @median_percentile) - end) + case percentiles do + %{@median_percentile => _} -> + percentiles - get_percentile(samples, @median_percentile, percentiles) + # missing necessary keys + %{} -> + Percentile.percentiles(samples, @median_percentile, options) + end + + Map.fetch!(percentiles, @median_percentile) end @doc """ @@ -614,6 +642,12 @@ defmodule Statistex do Any sample that is `<` as the lower bound and any sample `>` are outliers of the given `samples`. + ## Options + * `:percentiles` - you can pass it a map of calculated percentiles (25th and 75th are needed). + If it doesn't include them - it will still be computed. + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, + if they are truly sorted - otherwise your results will be wrong. Sorting only occurs when percentiles aren't provided. 
+ ## Examples iex> Statistex.outlier_bounds([3, 4, 5]) @@ -622,6 +656,12 @@ defmodule Statistex do iex> Statistex.outlier_bounds([4, 5, 3]) {0.0, 8.0} + iex> Statistex.outlier_bounds([3, 4, 5], sorted?: true) + {0.0, 8.0} + + iex> Statistex.outlier_bounds([3, 4, 5], percentiles: %{25 => 3.0, 75 => 5.0}) + {0.0, 8.0} + iex> Statistex.outlier_bounds([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) {22.5, 66.5} @@ -634,14 +674,20 @@ defmodule Statistex do def outlier_bounds(samples, options), do: do_outlier_bounds(samples, options) defp do_outlier_bounds(samples, options) do - # double check do we need both get lazies here? + percentiles = Access.get(options, :percentiles, %{}) + percentiles = - Keyword.get_lazy(options, :percentiles, fn -> - Percentile.percentiles(samples, [@first_quartile, @third_quartile], options) - end) + case percentiles do + %{@first_quartile => _, @third_quartile => _} -> + percentiles + + # missing necessary keys + %{} -> + Percentile.percentiles(samples, [@first_quartile, @third_quartile], options) + end - q1 = get_percentile(samples, @first_quartile, percentiles) - q3 = get_percentile(samples, @third_quartile, percentiles) + q1 = Map.fetch!(percentiles, @first_quartile) + q3 = Map.fetch!(percentiles, @third_quartile) iqr = q3 - q1 outlier_tolerance = iqr * @iqr_factor @@ -651,6 +697,12 @@ defmodule Statistex do @doc """ Returns all outliers for the given `samples`. + ## Options + * `:percentiles` - you can pass it a map of calculated percentiles (25th and 75th are needed). + If it doesn't include them - it will still be computed. + * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, + if they are truly sorted - otherwise your results will be wrong. Sorting only occurs when percentiles aren't provided. 
+ ## Examples iex> Statistex.outliers([3, 4, 5]) @@ -664,10 +716,8 @@ defmodule Statistex do """ @spec outliers(samples, keyword) :: samples | [] def outliers(samples, options \\ []) do - sorted_samples = maybe_sort(samples, options) - # maybe allow folks to get the same - {outliers, _rest} = do_outliers(sorted_samples, options) + {outliers, _rest} = do_outliers(samples, options) outliers end @@ -681,12 +731,6 @@ defmodule Statistex do Enum.split_with(sorted_samples, fn sample -> sample < lower_bound || sample > upper_bound end) end - defp get_percentile(samples, percentile, percentiles) do - Map.get_lazy(percentiles, percentile, fn -> - samples |> Percentile.percentiles(percentile) |> Map.fetch!(percentile) - end) - end - @doc """ The biggest sample. diff --git a/test/statistex_test.exs b/test/statistex_test.exs index e4a62d6..43d77f2 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -10,6 +10,11 @@ defmodule Statistex.StatistexTest do test "if handed percentiles missing the median percentile still calculates it" do assert Statistex.median([1, 2, 3, 4, 5, 6, 8, 9], percentiles: %{}) == 4.5 end + + # what an odd test to write, huh? Well that way we can see we trust the `sorted?` value not resorting. 
+ test "if told that the list is sorted while it isn't the result will be wrong" do + assert Statistex.median([1, 6, 4, 3, 5, 9, 2, 8], sorted?: true) != 4.5 + end end describe ".outlier_bounds/2" do From 2b45471e672e0a5ef760262f74c0eb6369e9cb69 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 17:15:43 +0200 Subject: [PATCH 18/23] Make outliers return both the outliers and remaining values just as we use it --- lib/statistex.ex | 34 ++++++++++++++++------------------ test/statistex_test.exs | 3 ++- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/lib/statistex.ex b/lib/statistex.ex index f5890af..91b985c 100644 --- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -169,10 +169,10 @@ defmodule Statistex do sorted_samples = maybe_sort(samples, configuration) percentiles = calculate_percentiles(sorted_samples, configuration) - outlier_bounds = do_outlier_bounds(sorted_samples, percentiles: percentiles) + outlier_bounds = outlier_bounds(sorted_samples, percentiles: percentiles) # rest remains sorted here/it's an important property - {outliers, rest} = do_outliers(sorted_samples, outlier_bounds: outlier_bounds) + {outliers, rest} = outliers(sorted_samples, outlier_bounds: outlier_bounds) if exclude_outliers?(configuration) and Enum.any?(outliers) do # need to recalculate with the outliers removed @@ -671,9 +671,8 @@ defmodule Statistex do @spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number} def outlier_bounds(samples, options \\ []) def outlier_bounds([], _), do: raise(ArgumentError, @empty_list_error_message) - def outlier_bounds(samples, options), do: do_outlier_bounds(samples, options) - defp do_outlier_bounds(samples, options) do + def outlier_bounds(samples, options) do percentiles = Access.get(options, :percentiles, %{}) percentiles = @@ -695,9 +694,12 @@ defmodule Statistex do end @doc """ - Returns all outliers for the given `samples`. 
+ Returns all outliers for the given `samples`, along with the remaining values. + + Returns: `{outliers, remaining_samples}` where `remaining_samples` has the outliers removed. ## Options + * `:outlier_bounds` - if you already have calculated the outlier bounds. * `:percentiles` - you can pass it a map of calculated percentiles (25th and 75th are needed). If it doesn't include them - it will still be computed. * `:sorted?`: indicating the samples you're passing in are already sorted. Defaults to `false`. Only set this, @@ -706,29 +708,25 @@ defmodule Statistex do ## Examples iex> Statistex.outliers([3, 4, 5]) - [] + {[], [3, 4, 5]} iex> Statistex.outliers([1, 2, 6, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]) - [1, 2, 6] + {[1, 2, 6], [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]} + + iex> Statistex.outliers([50, 50, 1, 50, 50, 50, 50, 50, 2, 50, 50, 50, 50, 6]) + {[1, 2, 6], [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]} iex> Statistex.outliers([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) - [99, 99, 99] + {[99, 99, 99], [50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50]} """ - @spec outliers(samples, keyword) :: samples | [] + @spec outliers(samples, keyword) :: {samples | [], samples} def outliers(samples, options \\ []) do - # maybe allow folks to get the same - {outliers, _rest} = do_outliers(samples, options) - - outliers - end - - defp do_outliers(sorted_samples, options) do {lower_bound, upper_bound} = Keyword.get_lazy(options, :outlier_bounds, fn -> - do_outlier_bounds(sorted_samples, options) + outlier_bounds(samples, options) end) - Enum.split_with(sorted_samples, fn sample -> sample < lower_bound || sample > upper_bound end) + Enum.split_with(samples, fn sample -> sample < lower_bound || sample > upper_bound end) end @doc """ diff --git a/test/statistex_test.exs b/test/statistex_test.exs index 43d77f2..5292270 100644 --- a/test/statistex_test.exs +++ b/test/statistex_test.exs @@ -11,7 +11,8 @@ defmodule Statistex.StatistexTest do assert
Statistex.median([1, 2, 3, 4, 5, 6, 8, 9], percentiles: %{}) == 4.5 end - # what an odd test to write, huh? Well that way we can see we trust the `sorted?` value not resorting. + # what an odd test to write, huh? Well that way we can see we trust the `sorted?` + # value not resorting. test "if told that the list is sorted while it isn't the result will be wrong" do assert Statistex.median([1, 6, 4, 3, 5, 9, 2, 8], sorted?: true) != 4.5 end From c222eeadd9f02a430c5c37685839d4e922f4b96d Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 17:16:08 +0200 Subject: [PATCH 19/23] ignore dialyzer stuff --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index dbc9658..33b2fa2 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ docs # Don't feel like tracking that gives me what I want any more :) .tool-versions + +# dialyzer +/tools From b9a38f42c324cbf46e6458951c5c35798971ef58 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 17:18:19 +0200 Subject: [PATCH 20/23] Map.new >>>> Enum.reduce in this case --- lib/statistex/percentile.ex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/statistex/percentile.ex b/lib/statistex/percentile.ex index 4cab86b..91688b6 100644 --- a/lib/statistex/percentile.ex +++ b/lib/statistex/percentile.ex @@ -20,9 +20,9 @@ defmodule Statistex.Percentile do percentile_ranks |> List.wrap() - |> Enum.reduce(%{}, fn percentile_rank, acc -> + |> Map.new(fn percentile_rank -> perc = percentile(sorted_samples, number_of_samples, percentile_rank) - Map.put(acc, percentile_rank, perc) + {percentile_rank, perc} end) end From e5edbb62c095a0f7bebad83767a1c594f895dee5 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 17:20:57 +0200 Subject: [PATCH 21/23] Get test coverage back to 100% cos we can --- lib/statistex.ex | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/statistex.ex b/lib/statistex.ex index 91b985c..8b67d9c 100644 
--- a/lib/statistex.ex +++ b/lib/statistex.ex @@ -642,6 +642,8 @@ defmodule Statistex do Any sample that is `<` as the lower bound and any sample `>` are outliers of the given `samples`. + List passed needs to be non empty, otherwise an `ArgumentError` is raised. + ## Options * `:percentiles` - you can pass it a map of calculated percentiles (25th and 75th are needed). If it doesn't include them - it will still be computed. @@ -667,6 +669,9 @@ defmodule Statistex do iex> Statistex.outlier_bounds([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 99, 99, 99]) {31.625, 80.625} + + iex> Statistex.outlier_bounds([]) + ** (ArgumentError) Passed an empty list ([]) to calculate statistics from, please pass a list containing at least one number. """ @spec outlier_bounds(samples, keyword) :: {lower :: number, upper :: number} def outlier_bounds(samples, options \\ []) From 93620e7ce239e932987edcd2c7301c01f12a080f Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 17:24:31 +0200 Subject: [PATCH 22/23] Undo changes having the tests expect the input list is ordered I think that's a dangerous assumption and our code should be resilient to it :) --- test/statistex/percentile_test.exs | 34 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/test/statistex/percentile_test.exs b/test/statistex/percentile_test.exs index 020f523..fbc03a5 100644 --- a/test/statistex/percentile_test.exs +++ b/test/statistex/percentile_test.exs @@ -4,20 +4,20 @@ defmodule Statistex.PercentileTest do doctest Statistex.Percentile - @nist_sample_data Enum.sort([ - 95.1772, - 95.1567, - 95.1937, - 95.1959, - 95.1442, - 95.0610, - 95.1591, - 95.1195, - 95.1065, - 95.0925, - 95.1990, - 95.1682 - ]) + @nist_sample_data [ + 95.1772, + 95.1567, + 95.1937, + 95.1959, + 95.1442, + 95.0610, + 95.1591, + 95.1195, + 95.1065, + 95.0925, + 95.1990, + 95.1682 + ] # Test data from: # http://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm @@ -49,7 +49,7 @@ 
defmodule Statistex.PercentileTest do end describe "a list of two elements" do - @samples [200, 300] + @samples [300, 200] test "1st percentile (small sample size simply picks first element)" do %{1 => result} = percentiles(@samples, [1]) assert result == 200.0 @@ -67,7 +67,7 @@ defmodule Statistex.PercentileTest do end describe "seemingly problematic 2 element list [9, 1]" do - @samples [1, 9] + @samples [9, 1] percentiles = %{ 25 => 1, @@ -88,7 +88,7 @@ defmodule Statistex.PercentileTest do end describe "a list of three elements" do - @samples [100, 200, 300] + @samples [100, 300, 200] test "1st percentile (small sample size simply picks first element)" do %{1 => result} = percentiles(@samples, [1]) assert result == 100.0 From 5b7349835f93550f6f8f1f3e35e32495095f5c9c Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Sat, 3 May 2025 17:32:29 +0200 Subject: [PATCH 23/23] Preliminary changelog --- CHANGELOG.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 34d69b2..2f43b51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,17 @@ +## 1.1 (Unreleased) + +This release adds functionality around identifying outliers. + +* the Statistex struct comes with more keys: `:lower_outlier_bound`, `:upper_outlier_bound` & `:outliers`, +along with the new public functions `:outliers/2` and `:outlier_bounds/2`. +* `statistics/2` now also accepts `exclude_outliers: true` to exclude the outliers from the calculation +of statistics. +* some functions have also been updated to accept more optional arguments such as `:sorted?` to avoid unnecessary extra work. + +Huge thanks for these changes go to [@NickNeck](https://github.com/NickNeck)! + ## 1.0 2019-07-05 Import of the initial functionality from [benchee](github.com/bencheeorg/benchee). -Dubbed 1.0 because many people had already been running this code indirectly through benchee. 
\ No newline at end of file +Dubbed 1.0 because many people had already been running this code indirectly through benchee.