Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ target-version = ['py310']
python_version = "3.10"
check_untyped_defs = true
disallow_untyped_defs = true
ignore_missing_imports = true

[tool.isort]
profile = "black"
Expand Down
88 changes: 84 additions & 4 deletions syndiffix/microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from itertools import islice
from os.path import commonprefix
from random import Random
from typing import Generator, Iterable, Literal, Set, cast
from typing import Generator, Iterable, Literal, Optional, Set, cast

import numpy as np
import pandas as pd
Expand All @@ -14,6 +14,7 @@
is_integer_dtype,
is_string_dtype,
)
from sklearn.preprocessing import MinMaxScaler

from .bucket import Buckets
from .common import ColumnType, Value
Expand All @@ -31,6 +32,9 @@


class DataConvertor(ABC):
def __init__(self) -> None:
    # Optional per-column scaler. Subclasses that normalize their column
    # (real/integer/timestamp) replace this; None disables normalization
    # in `_normalize`, which returns the column unchanged.
    self.scaler: Optional[MinMaxScaler] = None

@abstractmethod
def column_type(self) -> ColumnType:
pass
Expand All @@ -48,6 +52,9 @@ def analyze_tree(self, root: Node) -> None:


class BooleanConvertor(DataConvertor):
def __init__(self) -> None:
    # Booleans are not normalized; keep the inherited `scaler = None`.
    # NOTE(review): this override is redundant — it only delegates to the base class.
    super().__init__()

def column_type(self) -> ColumnType:
    # This convertor handles boolean columns.
    return ColumnType.BOOLEAN

Expand All @@ -61,6 +68,14 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:


class RealConvertor(DataConvertor):
def __init__(self, values: Iterable[Value]) -> None:
    """Prepare normalization and rounding state from the column's raw values."""
    super().__init__()
    # Fit up to 0.9999 so that the max bucket range is 0-1
    self.scaler = MinMaxScaler(feature_range=(0, 0.9999))
    # This value-neutral fitting is only for passing unit tests.
    # (`_normalize` later re-fits the scaler on the actual column data.)
    self.scaler.fit(np.array([[0.0], [0.9999]]))
    # Decimal places to round synthesized values to, taken from the most
    # precise raw value observed in the column.
    self.round_precision = _get_round_precision(values)

def column_type(self) -> ColumnType:
    # This convertor handles real (floating-point) columns.
    return ColumnType.REAL

Expand All @@ -70,10 +85,19 @@ def to_float(self, value: Value) -> float:

def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
    """Sample a real value from a (normalized) bucket interval."""
    # Draw in normalized space, map back to the original data scale, then
    # round to the precision observed in the raw column.
    sampled = _inverse_normalize_value(_generate_float(interval, rng), self.scaler)
    rounded = round(sampled, self.round_precision)
    return (rounded, rounded)


class IntegerConvertor(DataConvertor):
def __init__(self) -> None:
    """Prepare min-max normalization state for an integer column."""
    super().__init__()
    # Fit up to 0.9999 so that the max bucket range is 0-1
    self.scaler = MinMaxScaler(feature_range=(0, 0.9999))
    # This value-neutral fitting is only for passing unit tests.
    # (`_normalize` later re-fits the scaler on the actual column data.)
    self.scaler.fit(np.array([[0.0], [0.9999]]))

def column_type(self) -> ColumnType:
    # This convertor handles integer columns.
    return ColumnType.INTEGER

Expand All @@ -82,11 +106,20 @@ def to_float(self, value: Value) -> float:
return float(value)

def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
value = int(_generate_float(interval, rng))
value = _generate_float(interval, rng)
value = _inverse_normalize_value(value, self.scaler)
value = round(value)
return (value, float(value))


class TimestampConvertor(DataConvertor):
def __init__(self) -> None:
    """Prepare min-max normalization state for a timestamp column."""
    super().__init__()
    # Fit up to 0.9999 so that the max bucket range is 0-1
    self.scaler = MinMaxScaler(feature_range=(0, 0.9999))
    # This value-neutral fitting is only for passing unit tests.
    # (`_normalize` later re-fits the scaler on the actual column data.)
    self.scaler.fit(np.array([[0.0], [0.9999]]))

def column_type(self) -> ColumnType:
    # This convertor handles timestamp (datetime64) columns.
    return ColumnType.TIMESTAMP

Expand All @@ -97,12 +130,14 @@ def to_float(self, value: Value) -> float:

def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
    """Sample a timestamp from a (normalized) bucket interval."""
    # Draw in normalized space and map back to seconds on the original scale.
    seconds = _inverse_normalize_value(_generate_float(interval, rng), self.scaler)
    # The float encodes whole seconds relative to the reference epoch.
    stamp = TIMESTAMP_REFERENCE + np.timedelta64(int(seconds), "s")
    return (stamp, seconds)


class StringConvertor(DataConvertor):
def __init__(self, values: Iterable[Value]) -> None:
super().__init__()
unique_values = set(v for v in values if not pd.isna(v))
for value in unique_values:
if not isinstance(value, str):
Expand Down Expand Up @@ -160,6 +195,21 @@ def _generate_float(interval: Interval, rng: Random) -> float:
return rng.uniform(interval.min, interval.max)


def _get_round_precision(values: Iterable[Value]) -> int:
max_precision = 0
for value in values:
assert isinstance(value, float) or isinstance(value, np.floating)
value_str = str(value)
if "." in value_str:
decimal_part = value_str.split(".")[1]
precision = len(decimal_part)
else:
precision = 0
if precision > max_precision:
max_precision = precision
return max_precision


def _generate(interval: Interval, convertor: DataConvertor, null_mapping: float, rng: Random) -> MicrodataValue:
    """Produce one microdata value for a bucket interval, or a null row."""
    # An interval whose min equals the null mapping marks the null bucket.
    if interval.min == null_mapping:
        return (None, null_mapping)
    return convertor.from_interval(interval, rng)

Expand All @@ -178,7 +228,7 @@ def get_convertor(df: pd.DataFrame, column: str) -> DataConvertor:
if is_integer_dtype(dtype):
return IntegerConvertor()
elif is_float_dtype(dtype):
return RealConvertor()
return RealConvertor(df[column])
elif is_bool_dtype(dtype):
return BooleanConvertor()
elif is_datetime64_dtype(dtype):
Expand All @@ -190,6 +240,32 @@ def get_convertor(df: pd.DataFrame, column: str) -> DataConvertor:
raise TypeError(f"Dtype {dtype} is not supported.")


def _inverse_normalize_value(value: float, scaler: MinMaxScaler) -> float:
    """Undo min-max scaling for a single value (scalar counterpart of `inverse_transform`)."""
    # `inverse_transform` operates on 2-D arrays; wrap the scalar, then unwrap.
    restored = scaler.inverse_transform(np.array([[value]]))
    return float(restored[0, 0])


def _normalize(values: pd.Series, scaler: Optional[MinMaxScaler]) -> pd.Series:
if scaler is None:
# Convertors that don't need normalization
return values

# MinMax normalize values, while retaining the NaN values
values_array = values.to_numpy()
nan_indices = np.isnan(values_array)
if nan_indices.all():
return values
median_value = np.nanmedian(values_array)
values_array[nan_indices] = median_value
values_reshaped = values_array.reshape(-1, 1)
normalized_values = scaler.fit_transform(values_reshaped).flatten()
normalized_values[nan_indices] = np.nan
return pd.Series(normalized_values, index=values.index)


def _apply_convertor(value: Value, convertor: DataConvertor) -> float:
if pd.isna(value):
return np.nan
Expand All @@ -203,7 +279,11 @@ def apply_convertors(convertors: list[DataConvertor], raw_data: pd.DataFrame) ->
for column, convertor in zip(raw_data.columns, convertors)
]

return pd.DataFrame(dict(zip(raw_data.columns, converted_columns)), copy=False)
possibly_normalized_columns = [
_normalize(column, convertor.scaler) for column, convertor in zip(converted_columns, convertors)
]

return pd.DataFrame(dict(zip(raw_data.columns, possibly_normalized_columns)), copy=False)


def generate_microdata(
Expand Down
12 changes: 6 additions & 6 deletions tests/clustering/test_measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ def test_measure_all() -> None:
np.round(measures.dependency_matrix, 2),
np.array(
[
[1.00, 0.21, 0.18, 0.02, 0.05],
[0.21, 1.00, 0.17, 0.02, 0.04],
[0.18, 0.17, 1.00, 0.04, 0.07],
[0.02, 0.02, 0.04, 1.00, 0.01],
[0.05, 0.04, 0.07, 0.01, 1.00],
[1.00, 0.2, 0.15, 0.02, 0.05],
[0.2, 1.00, 0.18, 0.02, 0.05],
[0.15, 0.18, 1.00, 0.04, 0.07],
[0.02, 0.02, 0.04, 1.00, 0],
[0.05, 0.05, 0.07, 0, 1.00],
]
),
)

assert np.array_equal(
np.round(measures.entropy_1dim, 3),
np.array([15.992, 15.987, 5.453, 0.118, 1.350]),
np.array([16.042, 15.886, 5.398, 0.118, 1.350]),
)
30 changes: 15 additions & 15 deletions tests/data/tree.0.json
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
{
"ranges": [[0.0, 8.0]],
"ranges": [[0.0, 1.0]],
"count": 32,
"children": {
"0": {
"ranges": [[0.0, 4.0]],
"ranges": [[0.0, 0.5]],
"count": 16,
"children": {
"0": {
"ranges": [[0.0, 2.0]],
"ranges": [[0.0, 0.25]],
"count": 8,
"children": {
"0": {
"ranges": [[0.0, 1.0]],
"ranges": [[0.0, 0.125]],
"count": 4,
"children": null
},
"1": {
"ranges": [[1.0, 2.0]],
"ranges": [[0.125, 0.25]],
"count": 4,
"children": null
}
}
},
"1": {
"ranges": [[2.0, 4.0]],
"ranges": [[0.25, 0.5]],
"count": 8,
"children": {
"0": {
"ranges": [[2.0, 3.0]],
"ranges": [[0.25, 0.375]],
"count": 4,
"children": null
},
"1": {
"ranges": [[3.0, 4.0]],
"ranges": [[0.375, 0.5]],
"count": 4,
"children": null
}
Expand All @@ -41,36 +41,36 @@
}
},
"1": {
"ranges": [[4.0, 8.0]],
"ranges": [[0.5, 1.0]],
"count": 16,
"children": {
"0": {
"ranges": [[4.0, 6.0]],
"ranges": [[0.5, 0.75]],
"count": 8,
"children": {
"0": {
"ranges": [[4.0, 5.0]],
"ranges": [[0.5, 0.625]],
"count": 4,
"children": null
},
"1": {
"ranges": [[5.0, 6.0]],
"ranges": [[0.625, 0.75]],
"count": 4,
"children": null
}
}
},
"1": {
"ranges": [[6.0, 8.0]],
"ranges": [[0.75, 1.0]],
"count": 8,
"children": {
"0": {
"ranges": [[6.0, 7.0]],
"ranges": [[0.75, 0.875]],
"count": 4,
"children": null
},
"1": {
"ranges": [[7.0, 8.0]],
"ranges": [[0.875, 1.0]],
"count": 4,
"children": null
}
Expand Down
Loading