Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ inference), they were developed independently and have different trade-offs:

### 1. In-Memory (Local) Mode

**Entry point:** [`dpsynth.generate()`](dpsynth/__init__.py) (backed by
[`data_generation_v2.py`](dpsynth/data_generation_v2.py))
**Entry point:** [`dpsynth.TabularSynthesizer`](dpsynth/__init__.py) (backed by
[`data_generation_v3.py`](dpsynth/data_generation_v3.py))

Designed for **datasets that fit in memory** (e.g., Pandas DataFrames). We have
tested this on datasets up to ~100M rows, though performance will depend on the
Expand Down Expand Up @@ -127,8 +127,9 @@ These modules are used by both the in-memory and pipeline code paths:
* **[`discrete_mechanisms/`](dpsynth/discrete_mechanisms/README.md)**: Local,
single-machine DP mechanisms (AIM, MST, etc.) and shared mathematical
utilities like domain compression.
* **[`data_generation_v2.py`](dpsynth/data_generation_v2.py)**: The end-to-end
in-memory generation pipeline. This is what `dpsynth.generate()` calls.
* **[`data_generation_v3.py`](dpsynth/data_generation_v3.py)**: The
end-to-end in-memory generation pipeline. This is the module that defines
`dpsynth.TabularSynthesizer`.
* **[`local_mode/`](dpsynth/local_mode/)**: Locally-optimized DP primitives
for quantiles and partition selection (NumPy/SciPy-based).
* **[`pydantic_api.py`](dpsynth/pydantic_api.py)**: API for synthesizing
Expand Down Expand Up @@ -166,7 +167,7 @@ These modules are used by both the in-memory and pipeline code paths:

| Scenario | Recommended |
|---|---|
| Fits in memory, Pandas workflow | **In-Memory** (`dpsynth.generate`) |
| Fits in memory, Pandas workflow | **In-Memory** (`dpsynth.TabularSynthesizer`) |
| Discrete data, precomputed marginals | **In-Memory** (`discrete_mechanisms`) |
| Large-scale, distributed processing | **Pipeline** (`data_generation`) |
| Marginals from an external system | **Post-Processing** |
Expand Down
2 changes: 1 addition & 1 deletion dpsynth/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@
__version__ = '0.1.0'
from dpsynth import discrete_mechanisms
from dpsynth import domain
from dpsynth.data_generation_v2 import generate
from dpsynth.data_generation_v3 import TabularSynthesizer
from dpsynth.domain import CategoricalAttribute
from dpsynth.domain import NumericalAttribute
12 changes: 5 additions & 7 deletions dpsynth/bin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,11 @@ def main(_):
case _:
raise ValueError(f'Unknown mechanism: {_MECHANISM.value}')

synthetic_df = dpsynth.generate(
df,
attribute_domains,
epsilon=_EPSILON.value,
delta=_DELTA.value,
discrete_config=mechanism_config,
)
mechanism = dpsynth.TabularSynthesizer(
domains=attribute_domains,
discrete_mechanism=mechanism_config,
).calibrate(epsilon=_EPSILON.value, delta=_DELTA.value)
synthetic_df = mechanism(np.random.default_rng(_SEED.value), df)

synthetic_df.to_csv(_OUTPUT_PATH.value, index=False)

Expand Down
12 changes: 11 additions & 1 deletion dpsynth/data_generation_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@

"""Implementation of an end-to-end DP synthetic data generation mechanism.

This module contains the implementation that runs locally.
.. deprecated::
This module is deprecated. Use
:class:`dpsynth.data_generation_v3.TabularSynthesizer`
instead.
"""

from collections.abc import Mapping, Sequence
from typing import TypeAlias
import warnings

from absl import logging
import dp_accounting
Expand Down Expand Up @@ -133,6 +137,12 @@ def generate(
Returns:
A synthetic dataset.
"""
warnings.warn(
'data_generation_v2.generate() is deprecated. Use'
' data_generation_v3.TabularSynthesizer instead.',
DeprecationWarning,
stacklevel=2,
)
assert 0 <= one_way_marginal_budget_fraction <= 1
if not skip_compression and cross_attribute_constraints:
raise ValueError(
Expand Down
18 changes: 11 additions & 7 deletions dpsynth/data_generation_v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

"""End-to-end DP synthetic data generation using local mode primitives."""
"""End-to-end DP synthetic tabular data generation using local mode primitives."""

from __future__ import annotations

Expand Down Expand Up @@ -72,7 +72,7 @@ def _create_initializers(


@dataclasses.dataclass
class DataGenerationV3(primitives.DPMechanism):
class TabularSynthesizer(primitives.DPMechanism):
"""End-to-end DP synthetic data generation mechanism.

This mechanism encodes input categorical and numerical data into a discrete
Expand All @@ -82,8 +82,8 @@ class DataGenerationV3(primitives.DPMechanism):

Usage::

v3 = DataGenerationV3(domains=domains)
calibrated = v3.calibrate(zcdp_rho=1.0)
synth = TabularSynthesizer(domains=domains)
calibrated = synth.calibrate(zcdp_rho=1.0)
synthetic_df = calibrated(rng, df)

Attributes:
Expand All @@ -110,7 +110,7 @@ def calibrate(
delta: float | None = None,
numerical_bins: int = 32,
init_budget_fraction: float = 0.1,
) -> DataGenerationV3:
) -> TabularSynthesizer:
"""Returns a calibrated copy of this mechanism.

Supports two calibration modes:
Expand All @@ -133,7 +133,7 @@ def calibrate(
init_budget_fraction: Fraction of total budget for initialization.

Returns:
A new DataGenerationV3 instance with calibrated sub-mechanisms.
A new TabularSynthesizer instance with calibrated sub-mechanisms.

Raises:
ValueError: If arguments are invalid or delta is missing when required.
Expand Down Expand Up @@ -224,7 +224,7 @@ def _calibrate_approx_dp(
init_budget_fraction: Fraction of zCDP budget for initialization.

Returns:
A new DataGenerationV3 instance with calibrated sub-mechanisms.
A new TabularSynthesizer instance with calibrated sub-mechanisms.
"""
inits = self.initializers or _create_initializers(
self.domains, numerical_bins, init_delta
Expand Down Expand Up @@ -374,3 +374,7 @@ def __call__(

column_order = [col for col in data.columns if col in self.domains]
return pd.DataFrame(synthetic_columns)[column_order]


# Backward-compatible alias: this class was previously named DataGenerationV3,
# so code that still refers to the old name keeps working after the rename.
DataGenerationV3 = TabularSynthesizer
Loading
Loading