From 878c6cf3f1a2f62899dbcd6857fa5644f3811e30 Mon Sep 17 00:00:00 2001 From: Tom Reitz Date: Tue, 2 Jan 2024 11:54:53 -0600 Subject: [PATCH 1/2] add ability to benfordize distributions (see readme) --- README.md | 14 +++++++++++ macros/distributions/benfordize.sql | 38 +++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 macros/distributions/benfordize.sql diff --git a/README.md b/README.md index 5fab849..3b515bb 100644 --- a/README.md +++ b/README.md @@ -353,6 +353,20 @@ For all but the last option, you may optionally specify a `label_precision`, whi +## Benford's Law +Real-world numeric distributions (such as bank account balances) often follow [Benford's law](https://en.wikipedia.org/wiki/Benford%27s_law), where the leading digit follows a specific non-uniform distribution. To facilitate synthesis of such data, `dbt_synth_data` provides a convenience macro to "`benfordize()`" any distribution: + +```sql + {{synth_column_distribution(name="account_balance", + distribution=synth_distribution_benfordize( + distribution=synth_distribution_continuous_uniform(min=0, max=200000) + ) + )}} +``` + +The macro works by casting values from the `distribution` to a text-minimal scientific notation string (`1.2345E2`), replacing the leading digit with one from the Benford distribution (`probabilities={"1":0.301, "2":0.176, "3":0.125, "4":0.097, "5":0.079, "6":0.067, "7":0.058, "8":0.051, "9":0.046}` by default), and casting back to a number (`type="double"` by default). Note that this casting may result in loss of precision. + + ## Constructing Complex Distributions This package provides the following mechanisms for composing several distributions: diff --git a/macros/distributions/benfordize.sql b/macros/distributions/benfordize.sql new file mode 100644 index 0000000..0c75bd4 --- /dev/null +++ b/macros/distributions/benfordize.sql @@ -0,0 +1,38 @@ +{% macro synth_distribution_benfordize(distribution, type="double", probabilities={ + "1":0.301, "2":0.176, "3":0.125, "4":0.097, "5":0.079, "6":0.067, "7":0.058, "8":0.051, "9":0.046 +}) %} + {{ return(adapter.dispatch('synth_distribution_benfordize')(distribution, type, probabilities)) }} +{% endmacro %} + +{% macro default__synth_distribution_discretize_floor(distribution, type, probabilities) -%} + {# NOT YET IMPLEMENTED #} +{%- endmacro %} + +{% macro sqlite__synth_distribution_discretize_floor(distribution, type, probabilities) %} + concat( + {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, + substr(printf('%.12e', {{distribution}}), 2) + )::{{type}} +{% endmacro %} + +{% macro duckdb__synth_distribution_discretize_floor(distribution, type, probabilities) %} + concat( + {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, + substring(format('{:E}', {{distribution}}), 2) + )::{{type}} +{% endmacro %} + +{% macro postgres__synth_distribution_discretize_floor(distribution, type, probabilities) %} + concat( + {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, + substring(to_char({{distribution}}, '9.9999999999999999999EEEE') from 2) + )::{{type}} +{% endmacro %} + +{% macro snowflake__synth_distribution_discretize_floor(distribution, type, probabilities) %} + concat( + {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, + substring(to_varchar({{distribution}}, 'TME')::varchar, 2) + {# see https://docs.snowflake.com/en/sql-reference/sql-format-models#text-minimal-format-elements #} + )::{{type}} +{% endmacro%} From 99065ba729301df0fb1a869e303d7036926d35d0 Mon Sep 17 00:00:00 2001 From: Tom Reitz Date: Wed, 3 Jan 2024 07:44:57 -0600 Subject: [PATCH 2/2] bugfix to macro names --- macros/distributions/benfordize.sql | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/macros/distributions/benfordize.sql b/macros/distributions/benfordize.sql index 0c75bd4..c9c50bd 100644 --- a/macros/distributions/benfordize.sql +++ b/macros/distributions/benfordize.sql @@ -4,32 +4,32 @@ {{ return(adapter.dispatch('synth_distribution_benfordize')(distribution, type, probabilities)) }} {% endmacro %} -{% macro default__synth_distribution_discretize_floor(distribution, type, probabilities) -%} +{% macro default__synth_distribution_benfordize(distribution, type, probabilities) -%} {# NOT YET IMPLEMENTED #} {%- endmacro %} -{% macro sqlite__synth_distribution_discretize_floor(distribution, type, probabilities) %} +{% macro sqlite__synth_distribution_benfordize(distribution, type, probabilities) %} concat( {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, substr(printf('%.12e', {{distribution}}), 2) )::{{type}} {% endmacro %} -{% macro duckdb__synth_distribution_discretize_floor(distribution, type, probabilities) %} +{% macro duckdb__synth_distribution_benfordize(distribution, type, probabilities) %} concat( {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, substring(format('{:E}', {{distribution}}), 2) )::{{type}} {% endmacro %} -{% macro postgres__synth_distribution_discretize_floor(distribution, type, probabilities) %} +{% macro postgres__synth_distribution_benfordize(distribution, type, probabilities) %} concat( {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, substring(to_char({{distribution}}, '9.9999999999999999999EEEE') from 2) )::{{type}} {% endmacro %} -{% macro snowflake__synth_distribution_discretize_floor(distribution, type, probabilities) %} +{% macro snowflake__synth_distribution_benfordize(distribution, type, probabilities) %} concat( {{synth_distribution_discrete_probabilities(probabilities=probabilities)}}, substring(to_varchar({{distribution}}, 'TME')::varchar, 2)