From 59c3ae5802354e99d23e3d514dcd853698b72608 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 15:15:01 +0530 Subject: [PATCH 01/11] Added DP based implementation of the MLE classifier --- mapclassify/classifiers.py | 88 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index 68b65c7..aa566bc 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -36,6 +36,7 @@ "gadf", "KClassifiers", "CLASSIFIERS", + "MaximumLikelihood" ] CLASSIFIERS = ( @@ -55,6 +56,7 @@ "PrettyBreaks", "StdMean", "UserDefined", + "MaximumLikelihood" ) K = 5 # default number of classes in any map scheme with this as an argument @@ -3109,3 +3111,89 @@ def __init__(self, y, pct=0.8): pct0 = pct1 self.results = results self.best = best[1] + +class MaximumLikelihood(MapClassifier): + """ + Maximumum Likelihood Estimation based Map Classification. + + Parameters + ---------- + + y : numpy.array + (n, 1), values to classify. + sigma : numpy.array + (n, 1), standard deviation corresponding to each value in y. + k : int (default 5) + The number of classes required. + + Attributes + ---------- + + yb = numpy.array + :math:`(n,1)`, bin IDs for observations. + bins : numpy.array + :math:`(k,1)`, the upper bounds of each class. + k : int + The number of classes. + counts : numpy.array + :math:`(k,1)`, the number of observations falling in each class. + """ + + def __init__(self, y, sigma, k=5): + self.sigma = np.asarray(sigma) + self.k = k + self.name = "MaximumLikelihood" + MapClassifier.__init__(self, y) + + def _set_bins(self): + y = self.y + sigma = self.sigma + k = self.k + n = len(y) + + # The algorithm requires sorted areal values + sort_idx = np.argsort(y) + y_sorted = y[sort_idx] + sigma_sorted = sigma[sort_idx] + + # Precompute the objective value matrix (omega) + omega = np.zeros((n, n)) + for i in range(n): + for j in range(i, n): + y_slice = y_sorted[i:j + 1] + sig_slice = sigma_sorted[i:j + 1] + + # Calculate the representative value v + variance_inv = 1.0 / (sig_slice ** 2) + v_val = np.sum(y_slice * variance_inv) / np.sum(variance_inv) + + # Calculate objective cost for the class + omega[i, j] = np.sum(((v_val - y_slice) ** 2) * variance_inv) + + # Dynamic Programming approach to find the optimal breaks + dp = np.full((n + 1, k + 1), np.inf) + dp[0, 0] = 0 + backtrack = np.zeros((n + 1, k + 1), dtype=int) + + for j in range(1, k + 1): + for i in range(j, n + 1): + for m in range(j - 1, i): + cost = dp[m, j - 1] + omega[m, i - 1] + if cost < dp[i, j]: + dp[i, j] = cost + backtrack[i, j] = m + + breaks_idx = [] + curr_i = n + for j in range(k, 0, -1): + curr_i = backtrack[curr_i, j] + if curr_i > 0: + breaks_idx.append(curr_i - 1) + breaks_idx.reverse() + bins = [y_sorted[idx] for idx in breaks_idx] + + # Ensure the maximum value is the final bin edge + if len(bins) > 0 and bins[-1] != y_sorted[-1]: + bins[-1] = y_sorted[-1] + + self.bins = np.array(bins) From c8a50270819f81a467cba953efc0bb73341af55f Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 15:24:05 +0530 Subject: [PATCH 02/11] Added tests for MLE based classifier --- mapclassify/tests/test_mapclassify.py | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/mapclassify/tests/test_mapclassify.py b/mapclassify/tests/test_mapclassify.py index a9a2012..4efa18f 100644 --- a/mapclassify/tests/test_mapclassify.py +++ b/mapclassify/tests/test_mapclassify.py @@ -15,6 +15,7 @@ JenksCaspallSampled, KClassifiers, MaximumBreaks, + MaximumLikelihood, MaxP, NaturalBreaks, Percentiles, @@ -790,3 +791,40 @@ def test_histogram_plot_linewidth(self): linewidth=3, linecolor="red", color="yellow" ) return ax.get_figure() + + +class TestMaximumLikelihood: + def setup_method(self): + # A deterministic dataset designed to clearly cluster into 3 groups + self.y = numpy.array([10.0, 20.0, 100.0, 110.0, 200.0, 210.0]) + self.sigma = numpy.array([1.0, 1.0, 2.0, 2.0, 1.0, 1.0]) + + def test_MaximumLikelihood(self): + ml = MaximumLikelihood(self.y, self.sigma, k=3) + assert ml.k == 3 + assert len(ml.counts) == 3 + + numpy.testing.assert_array_almost_equal( + ml.bins, numpy.array([20.0, 110.0, 210.0]) + ) + numpy.testing.assert_array_almost_equal(ml.counts, numpy.array([2, 2, 2])) + + def test_MaximumLikelihood_unordered(self): + # The classifier should be able to handle unsorted data + y_unordered = numpy.array([210.0, 10.0, 110.0, 20.0, 100.0, 200.0]) + sigma_unordered = numpy.array([1.0, 1.0, 2.0, 1.0, 2.0, 1.0]) + + ml = MaximumLikelihood(y_unordered, sigma_unordered, k=3) + + numpy.testing.assert_array_almost_equal( + ml.bins, numpy.array([20.0, 110.0, 210.0]) + ) + numpy.testing.assert_array_almost_equal(ml.counts, numpy.array([2, 2, 2])) + + def test_MaximumLikelihood_stability(self): + # Verify the dynamic programming approach consistently yields the same results + for _ in range(10): + ml = MaximumLikelihood(self.y, self.sigma, k=3) + assert ml.k == 3 + assert len(ml.counts) == 3 + numpy.testing.assert_array_almost_equal(ml.counts, numpy.array([2, 2, 2])) From 17ef611e6c4725b953f28e44e4c5e05ea17ce2ba Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 15:28:00 +0530 Subject: [PATCH 03/11] Added reference to the MLE based classifier article --- docs/_static/references.bib | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/docs/_static/references.bib b/docs/_static/references.bib index 8f7caca..a39cf87 100644 --- a/docs/_static/references.bib +++ b/docs/_static/references.bib @@ -38,6 +38,21 @@ @article{Rey_2016 Year = 2016, Bdsk-Url-1 = {http://dx.doi.org/10.1111/tgis.12236}} +@article{Mu_2019, + Author = {Mu, Wangshu and Tong, Daoqin}, + Doi = {10.1080/24694452.2018.1549971}, + Issn = {2469-4452}, + Journal = {Annals of the American Association of Geographers}, + Month = {Mar}, + Number = 5, + Pages = {1493--1510}, + Publisher = {Taylor \& Francis}, + Title = {Choropleth Mapping with Uncertainty: A Maximum Likelihood-Based Classification Scheme}, + Url = {https://doi.org/10.1080/24694452.2018.1549971}, + Volume = 109, + Year = 2019, + Bdsk-Url-1 = {https://doi.org/10.1080/24694452.2018.1549971}} + @book{Slocum_2009, Author = {Slocum, Terry A. and McMaster, Robert B. and Kessler, Fritz C. and Howard, Hugh H.}, Publisher = {Pearson Prentice Hall, Upper Saddle River}, From 55689229f4b3f8e3b5820c499f6a18b0edfb73d9 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 15:53:28 +0530 Subject: [PATCH 04/11] Added citation and examples --- mapclassify/classifiers.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index aa566bc..c522482 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -36,7 +36,7 @@ "gadf", "KClassifiers", "CLASSIFIERS", - "MaximumLikelihood" + "MaximumLikelihood", ] CLASSIFIERS = ( @@ -56,7 +56,7 @@ "PrettyBreaks", "StdMean", "UserDefined", - "MaximumLikelihood" + "MaximumLikelihood", ) K = 5 # default number of classes in any map scheme with this as an argument @@ -3112,6 +3112,7 @@ def __init__(self, y, pct=0.8): self.results = results self.best = best[1] + class MaximumLikelihood(MapClassifier): """ Maximumum Likelihood Estimation based Map Classification. @@ -3137,6 +3138,26 @@ class MaximumLikelihood(MapClassifier): The number of classes. counts : numpy.array :math:`(k,1)`, the number of observations falling in each class. + + Notes + ----- + This classification scheme incorporates data uncertainty (standard deviation) + into the creation of choropleth maps. It assumes the existence of a + representative value for each class and determines class breaks using + a dynamic programming approach to minimize the overall within-class + deviation while accounting for the uncertainty. :cite:`Mu_2019`. + + Examples + -------- + >>> import numpy as np + >>> import mapclassify + >>> y = np.array([32000, 45000, 46000, 50000, 61000, 62000, 85000, 90000]) + >>> sigma = np.array([1500, 2000, 2100, 1800, 3000, 3100, 4500, 5000]) + >>> ml = mapclassify.MaximumLikelihood(y, sigma, k=3) + >>> ml.bins + array([46000, 62000, 90000]) + >>> ml.counts.tolist() + [3, 3, 2] """ def __init__(self, y, sigma, k=5): @@ -3160,11 +3181,11 @@ def _set_bins(self): omega = np.zeros((n, n)) for i in range(n): for j in range(i, n): - y_slice = y_sorted[i:j + 1] - sig_slice = sigma_sorted[i:j + 1] + y_slice = y_sorted[i : j + 1] + sig_slice = sigma_sorted[i : j + 1] # Calculate the representative value v - variance_inv = 1.0 / (sig_slice ** 2) + variance_inv = 1.0 / (sig_slice**2) v_val = np.sum(y_slice * variance_inv) / np.sum(variance_inv) # Calculate objective cost for the class From f5b8c21d4f616d40e8c1efda7a71c6f3e563a808 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 16:02:08 +0530 Subject: [PATCH 05/11] fixed a bug in MLE classifier --- mapclassify/classifiers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index c522482..d609567 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -3207,9 +3207,8 @@ def _set_bins(self): breaks_idx = [] curr_i = n for j in range(k, 0, -1): + breaks_idx.append(curr_i - 1) curr_i = backtrack[curr_i, j] - if curr_i > 0: - breaks_idx.append(curr_i - 1) breaks_idx.reverse() bins = [y_sorted[idx] for idx in breaks_idx] From 99e59334390b51eb306afd8d35ab4d866372f023 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 20:55:18 +0530 Subject: [PATCH 06/11] Added MaximumLikelihood in import --- mapclassify/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mapclassify/__init__.py b/mapclassify/__init__.py index 9c7ba9d..bb81ebe 100644 --- a/mapclassify/__init__.py +++ b/mapclassify/__init__.py @@ -15,6 +15,7 @@ JenksCaspallSampled, KClassifiers, MaximumBreaks, + MaximumLikelihood, MaxP, NaturalBreaks, Percentiles, From cb7c5b868d5ee8afcd2884168eb609835c1938f0 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 20:55:45 +0530 Subject: [PATCH 07/11] Added extra comments --- mapclassify/classifiers.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index d609567..01236f7 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -3151,8 +3151,8 @@ class MaximumLikelihood(MapClassifier): -------- >>> import numpy as np >>> import mapclassify - >>> y = np.array([32000, 45000, 46000, 50000, 61000, 62000, 85000, 90000]) - >>> sigma = np.array([1500, 2000, 2100, 1800, 3000, 3100, 4500, 5000]) + >>> y = [32000, 45000, 46000, 50000, 61000, 62000, 85000, 90000] + >>> sigma = [1500, 2000, 2100, 1800, 3000, 3100, 4500, 5000] >>> ml = mapclassify.MaximumLikelihood(y, sigma, k=3) >>> ml.bins array([46000, 62000, 90000]) @@ -3177,33 +3177,39 @@ def _set_bins(self): y_sorted = y[sort_idx] sigma_sorted = sigma[sort_idx] - # Precompute the objective value matrix (omega) + # omega[m][i] -> cost of segment (m+1, i) omega = np.zeros((n, n)) for i in range(n): for j in range(i, n): y_slice = y_sorted[i : j + 1] sig_slice = sigma_sorted[i : j + 1] - # Calculate the representative value v + # Calculate the representative value for each class variance_inv = 1.0 / (sig_slice**2) v_val = np.sum(y_slice * variance_inv) / np.sum(variance_inv) - # Calculate objective cost for the class + # Calculate the cost for each class omega[i, j] = np.sum(((v_val - y_slice) ** 2) * variance_inv) - # Dynamic Programming approach to find the optimal breaks + # dp[i][j] stores minimum cost to split first i elements into j classes dp = np.full((n + 1, k + 1), np.inf) dp[0, 0] = 0 + + # backtrack[i][j] stores the index where the j-th class starts backtrack = np.zeros((n + 1, k + 1), dtype=int) + # j -> no.of classes, j = k is the required solution for j in range(1, k + 1): + # i -> no.of elements considered, i must be >= j for i in range(j, n + 1): + # try all possible breakpoints, m -> end of previous classs for m in range(j - 1, i): cost = dp[m, j - 1] + omega[m, i - 1] if cost < dp[i, j]: dp[i, j] = cost backtrack[i, j] = m + # find the breakpoints by looping backwards on dp. breaks_idx = [] curr_i = n for j in range(k, 0, -1): @@ -3212,8 +3218,4 @@ def _set_bins(self): breaks_idx.reverse() bins = [y_sorted[idx] for idx in breaks_idx] - # Ensure the maximum value is the final bin edge - if len(bins) > 0 and bins[-1] != y_sorted[-1]: - bins[-1] = y_sorted[-1] - self.bins = np.array(bins) From 2d2b8dd98c92ac28611564aa82a7c7aa253b4dcb Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 23:32:52 +0530 Subject: [PATCH 08/11] Added MaximumLikelihood class --- docs/api.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/api.rst b/docs/api.rst index 2e29a54..e0500fd 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -24,6 +24,7 @@ Classifiers mapclassify.JenksCaspallSampled mapclassify.MaxP mapclassify.MaximumBreaks + mapclassify.MaximumLikelihood mapclassify.NaturalBreaks mapclassify.Percentiles mapclassify.PrettyBreaks From bb6eadfbedc75ff4b2dbaacc4983ebf4bd765807 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 23:35:30 +0530 Subject: [PATCH 09/11] Minor changes --- mapclassify/classifiers.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index 01236f7..2077e89 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -27,6 +27,7 @@ "HeadTailBreaks", "MaxP", "MaximumBreaks", + "MaximumLikelihood", "NaturalBreaks", "Quantiles", "Percentiles", @@ -36,7 +37,6 @@ "gadf", "KClassifiers", "CLASSIFIERS", - "MaximumLikelihood", ] CLASSIFIERS = ( @@ -49,14 +49,13 @@ "JenksCaspallForced", "JenksCaspallSampled", "MaxP", - "MaximumBreaks", + "MaximumBreaksMaximumLikelihood", "NaturalBreaks", "Quantiles", "Percentiles", "PrettyBreaks", "StdMean", "UserDefined", - "MaximumLikelihood", ) K = 5 # default number of classes in any map scheme with this as an argument @@ -3141,23 +3140,27 @@ class MaximumLikelihood(MapClassifier): Notes ----- + This classification scheme incorporates data uncertainty (standard deviation) into the creation of choropleth maps. It assumes the existence of a representative value for each class and determines class breaks using a dynamic programming approach to minimize the overall within-class - deviation while accounting for the uncertainty. :cite:`Mu_2019`. + deviation while accounting for the uncertainty. :cite:`Mu_2019` Examples -------- + >>> import numpy as np >>> import mapclassify >>> y = [32000, 45000, 46000, 50000, 61000, 62000, 85000, 90000] >>> sigma = [1500, 2000, 2100, 1800, 3000, 3100, 4500, 5000] >>> ml = mapclassify.MaximumLikelihood(y, sigma, k=3) >>> ml.bins - array([46000, 62000, 90000]) + array([32000, 62000, 90000]) + For e.g. FisherJenks would return array([50000., 62000., 90000.]) >>> ml.counts.tolist() [3, 3, 2] + """ def __init__(self, y, sigma, k=5): From 19b65e5b0784ca83d60bd1f613b01737f8728c23 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sat, 7 Mar 2026 23:36:52 +0530 Subject: [PATCH 10/11] Added better test example --- mapclassify/tests/test_mapclassify.py | 57 +++++++++++++++------------ 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/mapclassify/tests/test_mapclassify.py b/mapclassify/tests/test_mapclassify.py index 4efa18f..6655cae 100644 --- a/mapclassify/tests/test_mapclassify.py +++ b/mapclassify/tests/test_mapclassify.py @@ -794,37 +794,44 @@ def test_histogram_plot_linewidth(self): class TestMaximumLikelihood: - def setup_method(self): - # A deterministic dataset designed to clearly cluster into 3 groups - self.y = numpy.array([10.0, 20.0, 100.0, 110.0, 200.0, 210.0]) - self.sigma = numpy.array([1.0, 1.0, 2.0, 2.0, 1.0, 1.0]) - def test_MaximumLikelihood(self): - ml = MaximumLikelihood(self.y, self.sigma, k=3) + # An ordered dataset that has 2 groups + # Normal classifier would group 90 as its own class + # but MLE will group it with 50 and instead split the lower + # valued elements with low uncertainity + y = [10, 12, 14, 50, 90] + sigma = [1, 1, 1, 1, 20] + ml = MaximumLikelihood(y, sigma, k=3) + + # not needed but just checking :) assert ml.k == 3 assert len(ml.counts) == 3 + # the last value in bin should be the maximum value (invariant) + assert ml.bins[-1] == max(y) + # total count should equal the length of input + assert ml.counts.sum() == len(y) + # each class must have atleast one element + assert numpy.all(ml.counts > 0) - numpy.testing.assert_array_almost_equal( - ml.bins, numpy.array([20.0, 110.0, 210.0]) - ) - numpy.testing.assert_array_almost_equal(ml.counts, numpy.array([2, 2, 2])) + numpy.testing.assert_array_equal(ml.bins, numpy.array([10, 14, 90])) + numpy.testing.assert_array_equal(ml.counts, numpy.array([1, 2, 2])) def test_MaximumLikelihood_unordered(self): - # The classifier should be able to handle unsorted data - y_unordered = numpy.array([210.0, 10.0, 110.0, 20.0, 100.0, 200.0]) - sigma_unordered = numpy.array([1.0, 1.0, 2.0, 1.0, 2.0, 1.0]) + # Ordered and unordered data should give same result + y_unordered = numpy.array([10, 90, 14, 50, 12]) + sigma_unordered = numpy.array([1, 20, 1, 1, 1]) ml = MaximumLikelihood(y_unordered, sigma_unordered, k=3) - numpy.testing.assert_array_almost_equal( - ml.bins, numpy.array([20.0, 110.0, 210.0]) - ) - numpy.testing.assert_array_almost_equal(ml.counts, numpy.array([2, 2, 2])) - - def test_MaximumLikelihood_stability(self): - # Verify the dynamic programming approach consistently yields the same results - for _ in range(10): - ml = MaximumLikelihood(self.y, self.sigma, k=3) - assert ml.k == 3 - assert len(ml.counts) == 3 - numpy.testing.assert_array_almost_equal(ml.counts, numpy.array([2, 2, 2])) + # not needed but just checking :) + assert ml.k == 3 + assert len(ml.counts) == 3 + # the last value in bin should be the maximum value (invariant) + assert ml.bins[-1] == max(y_unordered) + # total count should equal the length of input + assert ml.counts.sum() == len(y_unordered) + # each class must have atleast one element + assert numpy.all(ml.counts > 0) + + numpy.testing.assert_array_equal(ml.bins, numpy.array([10, 14, 90])) + numpy.testing.assert_array_equal(ml.counts, numpy.array([1, 2, 2])) From df62f838fde424e59a8beafa3df381a3a70192a1 Mon Sep 17 00:00:00 2001 From: R Virinchi Date: Sun, 8 Mar 2026 00:02:01 +0530 Subject: [PATCH 11/11] Fixed a typo --- mapclassify/classifiers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mapclassify/classifiers.py b/mapclassify/classifiers.py index 2077e89..88c617a 100644 --- a/mapclassify/classifiers.py +++ b/mapclassify/classifiers.py @@ -49,7 +49,8 @@ "JenksCaspallForced", "JenksCaspallSampled", "MaxP", - "MaximumBreaksMaximumLikelihood", + "MaximumBreaks", + "MaximumLikelihood", "NaturalBreaks", "Quantiles", "Percentiles",