15 changes: 15 additions & 0 deletions docs/_static/references.bib
@@ -38,6 +38,21 @@ @article{Rey_2016
Year = 2016,
Bdsk-Url-1 = {http://dx.doi.org/10.1111/tgis.12236}}

@article{Mu_2019,
Author = {Mu, Wangshu and Tong, Daoqin},
Doi = {10.1080/24694452.2018.1549971},
Issn = {2469-4452},
Journal = {Annals of the American Association of Geographers},
Month = {Mar},
Number = 5,
Pages = {1493--1510},
Publisher = {Taylor \& Francis},
Title = {Choropleth Mapping with Uncertainty: A Maximum Likelihood-Based Classification Scheme},
Url = {https://doi.org/10.1080/24694452.2018.1549971},
Volume = 109,
Year = 2019,
Bdsk-Url-1 = {https://doi.org/10.1080/24694452.2018.1549971}}

@book{Slocum_2009,
Author = {Slocum, Terry A. and McMaster, Robert B. and Kessler, Fritz C. and Howard, Hugh H.},
Publisher = {Pearson Prentice Hall, Upper Saddle River},
1 change: 1 addition & 0 deletions docs/api.rst
@@ -24,6 +24,7 @@ Classifiers
mapclassify.JenksCaspallSampled
mapclassify.MaxP
mapclassify.MaximumBreaks
mapclassify.MaximumLikelihood
mapclassify.NaturalBreaks
mapclassify.Percentiles
mapclassify.PrettyBreaks
1 change: 1 addition & 0 deletions mapclassify/__init__.py
@@ -15,6 +15,7 @@
JenksCaspallSampled,
KClassifiers,
MaximumBreaks,
MaximumLikelihood,
MaxP,
NaturalBreaks,
Percentiles,
114 changes: 114 additions & 0 deletions mapclassify/classifiers.py
@@ -27,6 +27,7 @@
"HeadTailBreaks",
"MaxP",
"MaximumBreaks",
"MaximumLikelihood",
"NaturalBreaks",
"Quantiles",
"Percentiles",
@@ -49,6 +50,7 @@
"JenksCaspallSampled",
"MaxP",
"MaximumBreaks",
"MaximumLikelihood",
"NaturalBreaks",
"Quantiles",
"Percentiles",
@@ -3109,3 +3111,115 @@ def __init__(self, y, pct=0.8):
pct0 = pct1
self.results = results
self.best = best[1]


class MaximumLikelihood(MapClassifier):
"""
Maximum Likelihood Estimation-based map classification.

Parameters
----------

y : numpy.array
(n, 1), values to classify.
sigma : numpy.array
(n, 1), standard deviation corresponding to each value in y.

Member:

Is there any guidance on where sigma values are to be obtained?

Author:

For example, the widely used American Community Survey (ACS) data include sampling uncertainty through published “margin of error” (MOE) estimates.

In the paper's case, the uncertainty values were included in the dataset the authors were using. If the data are generated by a statistical model, then I believe the standard deviation could be calculated directly from the model's own variance estimates.

Ultimately, uncertainty values have to come from either the people collecting the data or the people modeling it, so producing them falls outside our scope.
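
As a quick sketch (the estimate and MOE values below are made up for illustration; the 1.645 divisor reflects the ACS convention of publishing MOEs at the 90% confidence level):

import numpy as np
import mapclassify

# Hypothetical ACS-style inputs: point estimates plus published margins of error.
estimates = np.array([32000, 45000, 46000, 50000, 61000])
moe = np.array([2500, 3300, 3500, 3000, 4900])

# ACS MOEs correspond to 90% confidence intervals,
# so the implied standard deviation is MOE / 1.645.
sigma = moe / 1.645

ml = mapclassify.MaximumLikelihood(estimates, sigma, k=2)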
k : int (default 5)
The number of classes required.

Attributes
----------

yb : numpy.array
:math:`(n,1)`, bin IDs for observations.
bins : numpy.array
:math:`(k,1)`, the upper bounds of each class.
k : int
The number of classes.
counts : numpy.array
:math:`(k,1)`, the number of observations falling in each class.

Notes
-----

This classification scheme incorporates data uncertainty (standard deviation)
into the creation of choropleth maps. It assumes the existence of a
representative value for each class and determines class breaks using
a dynamic programming approach to minimize the overall within-class
deviation while accounting for the uncertainty. :cite:`Mu_2019`
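
Concretely (restating the cost implemented in ``_set_bins`` below), a
class containing values :math:`y_i` with standard deviations
:math:`\sigma_i` is assigned the inverse-variance weighted mean as its
representative value,

.. math::

   v = \frac{\sum_i y_i / \sigma_i^2}{\sum_i 1 / \sigma_i^2},

and the dynamic program minimizes the total cost
:math:`\sum_i (v - y_i)^2 / \sigma_i^2` summed over all classes.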

Examples
--------

>>> import numpy as np
>>> import mapclassify
>>> y = [32000, 45000, 46000, 50000, 61000, 62000, 85000, 90000]
>>> sigma = [1500, 2000, 2100, 1800, 3000, 3100, 4500, 5000]
>>> ml = mapclassify.MaximumLikelihood(y, sigma, k=3)
>>> ml.bins
array([32000, 62000, 90000])

By comparison, ``FisherJenks`` would return ``array([50000., 62000., 90000.])``.

>>> ml.counts.tolist()
[1, 5, 2]

"""

def __init__(self, y, sigma, k=5):
self.sigma = np.asarray(sigma)
self.k = k
self.name = "MaximumLikelihood"
MapClassifier.__init__(self, y)

def _set_bins(self):
y = self.y
sigma = self.sigma
k = self.k
n = len(y)

# The algorithm requires sorted areal values
sort_idx = np.argsort(y)
y_sorted = y[sort_idx]
sigma_sorted = sigma[sort_idx]

# omega[i][j] -> cost of a single class spanning sorted elements i..j (inclusive)
omega = np.zeros((n, n))
for i in range(n):
for j in range(i, n):
y_slice = y_sorted[i : j + 1]
sig_slice = sigma_sorted[i : j + 1]

# Calculate the representative value for each class
variance_inv = 1.0 / (sig_slice**2)
v_val = np.sum(y_slice * variance_inv) / np.sum(variance_inv)

# Calculate the cost for each class
omega[i, j] = np.sum(((v_val - y_slice) ** 2) * variance_inv)

# dp[i][j] stores minimum cost to split first i elements into j classes
dp = np.full((n + 1, k + 1), np.inf)
dp[0, 0] = 0

# backtrack[i][j] stores the index where the j-th class starts
backtrack = np.zeros((n + 1, k + 1), dtype=int)
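
# Recurrence (restating the loop below):
#   dp[i][j] = min over m in [j-1, i) of dp[m][j-1] + omega[m, i-1]
# i.e. the j-th class covers sorted elements m..i-1 (0-indexed), and the
# first m elements are optimally split into j-1 classes.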

# j -> no. of classes; j = k is the required solution
for j in range(1, k + 1):
# i -> no. of elements considered; i must be >= j
for i in range(j, n + 1):
# try all possible breakpoints, m -> end of previous class
for m in range(j - 1, i):
cost = dp[m, j - 1] + omega[m, i - 1]
if cost < dp[i, j]:
dp[i, j] = cost
backtrack[i, j] = m

# find the breakpoints by looping backwards on dp.
breaks_idx = []
curr_i = n
for j in range(k, 0, -1):
breaks_idx.append(curr_i - 1)
curr_i = backtrack[curr_i, j]
breaks_idx.reverse()
bins = [y_sorted[idx] for idx in breaks_idx]

self.bins = np.array(bins)
45 changes: 45 additions & 0 deletions mapclassify/tests/test_mapclassify.py
@@ -15,6 +15,7 @@
JenksCaspallSampled,
KClassifiers,
MaximumBreaks,
MaximumLikelihood,
MaxP,
NaturalBreaks,
Percentiles,
@@ -790,3 +791,47 @@ def test_histogram_plot_linewidth(self):
linewidth=3, linecolor="red", color="yellow"
)
return ax.get_figure()


class TestMaximumLikelihood:
def test_MaximumLikelihood(self):
# An ordered dataset with 2 groups.
# A conventional classifier would put 90 in its own class,
# but MLE groups it with 50 and instead splits the lower
# valued elements, which have low uncertainty
y = [10, 12, 14, 50, 90]
sigma = [1, 1, 1, 1, 20]
ml = MaximumLikelihood(y, sigma, k=3)

# basic sanity checks
assert ml.k == 3
assert len(ml.counts) == 3
# the last bin edge should be the maximum value (invariant)
assert ml.bins[-1] == max(y)
# total count should equal the length of the input
assert ml.counts.sum() == len(y)
# each class must have at least one element
assert numpy.all(ml.counts > 0)

numpy.testing.assert_array_equal(ml.bins, numpy.array([10, 14, 90]))
numpy.testing.assert_array_equal(ml.counts, numpy.array([1, 2, 2]))

def test_MaximumLikelihood_unordered(self):
# Ordered and unordered data should give the same result
y_unordered = numpy.array([10, 90, 14, 50, 12])
sigma_unordered = numpy.array([1, 20, 1, 1, 1])

ml = MaximumLikelihood(y_unordered, sigma_unordered, k=3)

# basic sanity checks
assert ml.k == 3
assert len(ml.counts) == 3
# the last bin edge should be the maximum value (invariant)
assert ml.bins[-1] == max(y_unordered)
# total count should equal the length of the input
assert ml.counts.sum() == len(y_unordered)
# each class must have at least one element
assert numpy.all(ml.counts > 0)

numpy.testing.assert_array_equal(ml.bins, numpy.array([10, 14, 90]))
numpy.testing.assert_array_equal(ml.counts, numpy.array([1, 2, 2]))