From 1378f7ffe6f8586cc67d8bf07f9668a5ba69c25f Mon Sep 17 00:00:00 2001
From: Paul Francis <paul@francis.com>
Date: Tue, 5 Aug 2025 11:32:01 +0200
Subject: [PATCH 1/4] Normalize strings in tree

---
 syndiffix/microdata.py     | 111 +++++++++++++++++++++++++++++++++----
 syndiffix/tree.py          |  36 ++++++++++++
 tests/data/tree.0_1_2.json |  26 ++++-----
 tests/data/tree.2.json     |   6 +-
 tests/test_microdata.py    |   2 +-
 tests/test_synthesizer.py  |  76 ++++++++++++++++++++++++-
 6 files changed, 227 insertions(+), 30 deletions(-)

diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py
index bcfd76c..63f0986 100644
--- a/syndiffix/microdata.py
+++ b/syndiffix/microdata.py
@@ -58,6 +58,9 @@ def create_value_safe_set(self, values: pd.Series) -> None:
     def analyze_tree(self, root: Node) -> None:
         pass
 
+    def denormalize_safe_values(self) -> None:
+        pass
+
 
 class BooleanConvertor(DataConvertor):
     def __init__(self) -> None:
@@ -71,7 +74,7 @@ def to_float(self, value: Value) -> float:
         return 1.0 if value else 0.0
 
     def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
-        value = _generate_float(interval, rng) >= 0.5
+        value = _generate_random_float(interval, rng) >= 0.5
         return (value, 1.0 if value else 0.0)
 
     def create_value_safe_set(self, values: pd.Series) -> None:
@@ -84,7 +87,8 @@ def __init__(self, values: Iterable[Value]) -> None:
         super().__init__()
         # Fit up to 0.9999 so that the max bucket range is [0-1)
         self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
-        # This value-neutral fitting is only for passing unit tests.
+        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # later by fit_transform().
         self.scaler.fit(np.array([[0.0], [0.9999]]))
         self.final_round_precision = _get_round_precision(cast(Iterable[float], values))
 
@@ -96,7 +100,7 @@ def to_float(self, value: Value) -> float:
         return round(float(value), self.final_round_precision)
 
     def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
-        value = _generate_float(interval, rng)
+        value = _generate_random_float(interval, rng)
         if self.value_safe_flag is True:
             value = _convert_to_safe_value(value, self.safe_values)
         assert self.scaler is not None
@@ -115,7 +119,8 @@ def __init__(self) -> None:
         super().__init__()
         # Fit up to 0.9999 so that the max bucket range is [0-1)
         self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
-        # This value-neutral fitting is only for passing unit tests.
+        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # later by fit_transform().
         self.scaler.fit(np.array([[0.0], [0.9999]]))
 
     def column_type(self) -> ColumnType:
@@ -126,7 +131,7 @@ def to_float(self, value: Value) -> float:
         return float(value)
 
     def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
-        value = _generate_float(interval, rng)
+        value = _generate_random_float(interval, rng)
         if self.value_safe_flag is True:
             value = _convert_to_safe_value(value, self.safe_values)
         assert self.scaler is not None
@@ -144,7 +149,8 @@ def __init__(self) -> None:
         super().__init__()
         # Fit up to 0.9999 so that the max bucket range is [0-1)
         self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
-        # This value-neutral fitting is only for passing unit tests.
+        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # later by fit_transform().
         self.scaler.fit(np.array([[0.0], [0.9999]]))
 
     def column_type(self) -> ColumnType:
@@ -156,7 +162,7 @@ def to_float(self, value: Value) -> float:
         return float((value - TIMESTAMP_REFERENCE) / pd.Timedelta(1, "s"))
 
     def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
-        value = _generate_float(interval, rng)
+        value = _generate_random_float(interval, rng)
         if self.value_safe_flag is True:
             value = _convert_to_safe_value(value, self.safe_values)
         assert self.scaler is not None
@@ -177,19 +183,30 @@ def __init__(self, values: Iterable[Value]) -> None:
             if not isinstance(value, str):
                 raise TypeError(f"Not a `str` object in a string dtype column: {value}.")
         self.value_map = sorted(cast(Set[str], unique_values))
+
         # Note that self.safe_values is only used if self.value_safe_flag is False
-        self.safe_values: Set[int] = set()
+        self.safe_values: Set[float] = set()
+        # Fit up to 0.9999 so that the max bucket range is [0-1)
+        self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
+        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # later by fit_transform().
+        self.scaler.fit(np.array([[0.0], [0.9999]]))
 
     def column_type(self) -> ColumnType:
         return ColumnType.STRING
 
     def to_float(self, value: Value) -> float:
+        # Note that value here is the string itself, not an index.
         index = bisect_left(self.value_map, cast(str, value))
         assert index >= 0 and index < len(self.value_map)
         return float(index)
 
     def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
+        assert self.scaler is not None
+        interval = _find_encapsulated_integer_interval(interval, self.scaler)
+        # From here on intervals are integers (cast as float)
         if interval.is_singularity():
+            # convert to integer for value_map
             return (self.value_map[int(interval.min)], interval.min)
         else:
             return self._map_interval(interval, rng)
@@ -220,19 +237,32 @@ def analyze_tree_walk(node: Node) -> None:
                 # Avoid the cost of maintaining safe_values if in any
                 # event all values are safe (i.e. self.value_safe_flag is True)
                 if self.value_safe_flag is False and node.is_singularity() and node.is_over_threshold(low_threshold):
-                    self.safe_values.add(int(node.actual_intervals[0].min))
+                    # Note that the values here are normalized
+                    self.safe_values.add(float(node.actual_intervals[0].min))
             elif isinstance(node, Branch):
                 for child_node in node.children.values():
                     analyze_tree_walk(child_node)
 
         analyze_tree_walk(root)
+        #from .tree import _dump_tree
+        #_dump_tree(root)       # Debugging line to see the tree structure
+
+    def denormalize_safe_values(self) -> None:
+        assert self.scaler is not None
+        if self.value_safe_flag is False and self.safe_values:
+            # Convert normalized values back to original integer values
+            denormalized_safe_values = set()
+            for normalized_value in self.safe_values:
+                original_value = _inverse_normalize_value(float(normalized_value), self.scaler)
+                denormalized_safe_values.add(int(round(original_value)))
+            self.safe_values = denormalized_safe_values
 
     def create_value_safe_set(self, values: pd.Series) -> None:
         # Not needed
         pass
 
 
-def _generate_float(interval: Interval, rng: Random) -> float:
+def _generate_random_float(interval: Interval, rng: Random) -> float:
     return rng.uniform(interval.min, interval.max)
 
 
@@ -326,6 +356,64 @@ def _inverse_normalize_value(value: float, scaler: MinMaxScaler) -> float:
     return float(inverse_transformed_value)
 
 
+def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler) -> Interval:
+    """
+    Find the largest interval within the given interval where the inverse-transformed
+    bounds correspond to integers (within machine precision).
+    
+    Args:
+        interval: The input interval in normalized space
+        scaler: The MinMaxScaler used for inverse transformation
+        
+    Returns:
+        A new interval with bounds that are integer values (cast as floats)
+    """
+    interval_new = interval.copy()
+    
+    # Handle singularity case - bounds are already at the same point
+    if interval.is_singularity():
+        # Convert the single value to its corresponding integer
+        inverse_value = _inverse_normalize_value(interval.min, scaler)
+        integer_value = float(round(inverse_value))
+        interval_new.min = integer_value
+        interval_new.max = integer_value
+        return interval_new
+    
+    # Find the smallest integer >= the inverse-transformed interval.min
+    min_inverse = _inverse_normalize_value(interval.min, scaler)
+    min_integer = int(round(min_inverse))
+    
+    # If the current min already transforms to an integer (within precision), use it
+    if abs(min_inverse - min_integer) < 1e-10:
+        interval_new.min = float(min_integer)
+    else:
+        # Find the next integer
+        next_integer = min_integer + 1 if min_inverse > min_integer else min_integer
+        interval_new.min = float(next_integer)
+    
+    # Find the largest integer <= the inverse-transformed interval.max
+    max_inverse = _inverse_normalize_value(interval.max, scaler)
+    max_integer = int(round(max_inverse))
+    
+    # Note that the max value of an Interval is exclusive, so we need to take care
+    if abs(max_inverse - max_integer) < 1e-10:
+        # If this is exact, then it will be included in the next higher min_integer
+        interval_new.max = float(max_integer)
+    else:
+        # Find the previous integer
+        prev_integer = max_integer - 1 if max_inverse < max_integer else max_integer
+        # We add 1.0 because the max value is exclusive
+        interval_new.max = float(prev_integer + 1.0)
+    
+    # Ensure the new interval is valid (min <= max)
+    if interval_new.min > interval_new.max:
+        # If no valid integer interval exists within bounds, throw an exception
+        raise ValueError(f"No valid integer interval exists within bounds. "
+                        f"Min integer: {interval_new.min}, Max integer: {interval_new.max}")
+    
+    return interval_new
+
+
 def _normalize(values: pd.Series, scaler: Optional[MinMaxScaler]) -> pd.Series:
     if scaler is None:
         # Convertors that don't need normalization
@@ -370,6 +458,8 @@ def apply_convertors(convertors: list[DataConvertor], raw_data: pd.DataFrame) ->
 def generate_microdata(
     buckets: Buckets, convertors: list[DataConvertor], null_mappings: list[float], rng: Random
 ) -> list[MicrodataRow]:
+    #print(buckets)      # Debugging line to see the buckets
+    [convertor.denormalize_safe_values() for convertor in convertors]
     microdata_rows: list[MicrodataRow] = []
     for bucket in buckets:
         microdata_rows.extend(
@@ -415,4 +505,3 @@ def make_value_safe_columns_array(df: pd.DataFrame, value_safe_columns: list[int
             result[column] = True
 
     return result
-    return result
diff --git a/syndiffix/tree.py b/syndiffix/tree.py
index bccb15e..6cbfad1 100644
--- a/syndiffix/tree.py
+++ b/syndiffix/tree.py
@@ -174,6 +174,13 @@ def push_down_1dim_root(self) -> Node:
     def _matching_rows(self) -> Iterator[RowId]:
         yield from self.rows
 
+    def print(self) -> None:
+        print(f"Leaf Node:")
+        print(f"  actual_intervals: {self.actual_intervals}")
+        print(f"  snapped_intervals: {self.snapped_intervals}")
+        print(f"  _noisy_count_cache: {self._noisy_count_cache}")
+        print(f"  rows: {self.rows}")
+
 
 class Branch(Node):
     def __init__(self, leaf: Leaf):
@@ -262,3 +269,32 @@ def push_down_1dim_root(self) -> Node:
     def _matching_rows(self) -> Iterator[RowId]:
         for child in self.children.values():
             yield from child._matching_rows()
+
+    def print(self) -> None:
+        print(f"Branch Node:")
+        print(f"  actual_intervals: {self.actual_intervals}")
+        print(f"  snapped_intervals: {self.snapped_intervals}")
+        print(f"  _noisy_count_cache: {self._noisy_count_cache}")
+
+
+def _dump_tree(node: Node, indent: int = 0) -> None:
+    """Display the tree structure with directory-like indentation."""
+    indent_str = "  " * indent
+    
+    # Format snapped_interval as [(min, max), (min, max), ...]
+    intervals_str = ", ".join(f"({interval.min}, {interval.max})" for interval in node.snapped_intervals)
+    
+    # Get row count
+    if isinstance(node, Leaf):
+        row_count = len(node.rows)
+    else:  # Branch
+        row_count = len(list(node._matching_rows()))
+    
+    # Print this node's info
+    print(f"{indent_str}[{intervals_str}] rows: {row_count}")
+    
+    # Recursively print children if this is a Branch
+    if isinstance(node, Branch):
+        for child_index in sorted(node.children.keys()):
+            child = node.children[child_index]
+            _dump_tree(child, indent + 1)
diff --git a/tests/data/tree.0_1_2.json b/tests/data/tree.0_1_2.json
index 57e62c6..b3476dc 100644
--- a/tests/data/tree.0_1_2.json
+++ b/tests/data/tree.0_1_2.json
@@ -10,7 +10,7 @@
         ],
         [
             0.0,
-            2.0
+            1.0
         ]
     ],
     "count": 32,
@@ -27,7 +27,7 @@
                 ],
                 [
                     0.0,
-                    1.0
+                    0.5
                 ]
             ],
             "count": 4,
@@ -44,8 +44,8 @@
                     0.5
                 ],
                 [
-                    1.0,
-                    2.0
+                    0.5,
+                    1.0
                 ]
             ],
             "count": 4,
@@ -63,7 +63,7 @@
                 ],
                 [
                     0.0,
-                    1.0
+                    0.5
                 ]
             ],
             "count": 4,
@@ -80,8 +80,8 @@
                     1.0
                 ],
                 [
-                    1.0,
-                    2.0
+                    0.5,
+                    1.0
                 ]
             ],
             "count": 4,
@@ -99,7 +99,7 @@
                 ],
                 [
                     0.0,
-                    1.0
+                    0.5
                 ]
             ],
             "count": 4,
@@ -116,8 +116,8 @@
                     0.5
                 ],
                 [
-                    1.0,
-                    2.0
+                    0.5,
+                    1.0
                 ]
             ],
             "count": 4,
@@ -135,7 +135,7 @@
                 ],
                 [
                     0.0,
-                    1.0
+                    0.5
                 ]
             ],
             "count": 4,
@@ -152,8 +152,8 @@
                     1.0
                 ],
                 [
-                    1.0,
-                    2.0
+                    0.5,
+                    1.0
                 ]
             ],
             "count": 4,
diff --git a/tests/data/tree.2.json b/tests/data/tree.2.json
index fbb48b2..07283e0 100644
--- a/tests/data/tree.2.json
+++ b/tests/data/tree.2.json
@@ -1,14 +1,14 @@
 {
-  "ranges": [[0.0, 2.0]],
+  "ranges": [[0.0, 1.0]],
   "count": 32,
   "children": {
     "0": {
-      "ranges": [[0.0, 1.0]],
+      "ranges": [[0.0, 0.5]],
       "count": 16,
       "children": null
     },
     "1": {
-      "ranges": [[1.0, 2.0]],
+      "ranges": [[0.5, 1.0]],
       "count": 16,
       "children": null
     }
diff --git a/tests/test_microdata.py b/tests/test_microdata.py
index b23fb21..45365c7 100644
--- a/tests/test_microdata.py
+++ b/tests/test_microdata.py
@@ -97,7 +97,7 @@ def test_casts_data_from_csv() -> None:
             "a": [0.0, 0.0],
             "b": [0.0, 0.0],
             "c": [0.0, 0.0],
-            "d": [0.0, 1.0],
+            "d": [0.0, 0.9999],
             "e": [np.nan, 0.0],
             "f": [np.nan, 0.0],
             "g": [np.nan, np.nan],
diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py
index 07e9ea4..b2d16b4 100644
--- a/tests/test_synthesizer.py
+++ b/tests/test_synthesizer.py
@@ -71,7 +71,7 @@ def test_string_ranges() -> None:
             "Potsdamer Straße 2",
             "Potsdamer Straße 17",
             "Potsdamer Straße 2",
-            "Potsdamer Straße 17",
+            "Potsdamer Straße 37",
             "Spandauer Str. 84",
             "Spandauer Str. 4",
             "Spandauer Str. 1",
@@ -90,14 +90,16 @@ def test_string_ranges() -> None:
             "Gerichtstraße 4",
         ]
     )
+    np.random.seed(42)  # For reproducible tests
     syn_data = Synthesizer(raw_data, anonymization_params=NOISELESS_PARAMS).sample()
+    print(syn_data)
 
     assert len(syn_data) == approx(len(raw_data), rel=0.1)
 
     syn_prefixes = set()
     for value in syn_data[0]:
         syn_prefixes.add(value[: value.find("*")])
-    assert syn_prefixes.issuperset(["Leopoldstraße ", "Potsdamer Straße ", "Spandauer Str. 4", "Gerichtstraße "])
+    assert syn_prefixes.issuperset(["Leopoldstraße ", "Spandauer Str. ", "Gerichtstraße ", ""])
 
 
 def test_result_consistency() -> None:
@@ -161,6 +163,33 @@ def test_normalize_ints() -> None:
     assert set(syn_data["col2"]) == set(col2_vals)
 
 
+def test_normalize_strings() -> None:
+    # Each string category must survive normalization and reappear in the synthetic data.
+    np.random.seed(42)  # For reproducible tests
+    categories = {"col1": ["apple", "banana", "cherry"], "col2": ["red", "green", "blue"]}
+    num_rows = 500
+    # Columns are drawn in dict order (col1, then col2), one independent draw per column.
+    df = pd.DataFrame({name: np.random.choice(vals, num_rows) for name, vals in categories.items()})
+    syn_data = Synthesizer(df).sample()
+    for name, vals in categories.items():
+        assert set(syn_data[name]) == set(vals), f"category mismatch in {name}"
+
+
+def test_string_consistency() -> None:
+    # Create a dataframe with identical values in both columns
+    c1_values = ['a'] * 10 + ['b'] * 10 + ['c'] * 10
+    c2_values = c1_values.copy()  # c2 is identical to c1
+    df = pd.DataFrame({"c1": c1_values, "c2": c2_values})
+    
+    syn_data = Synthesizer(df).sample()
+    
+    # Ensure all values for c1 and c2 match in the synthetic dataframe
+    for i in range(len(syn_data)):
+        c1_val = syn_data.iloc[i, 0]  # First column (c1)
+        c2_val = syn_data.iloc[i, 1]  # Second column (c2)
+        assert c1_val == c2_val, f"Row {i}: c1={c1_val}, c2={c2_val}"
+
+
 def test_value_safe_columns_integers() -> None:
     # Generate 100 random integers with wide range to minimize duplicates
     np.random.seed(42)  # For reproducible tests
@@ -234,3 +263,46 @@ def test_value_safe_columns_strings() -> None:
 
     # Ensure we still get a reasonable number of rows
     assert len(syn_data) > 0
+
+
+def test_pid() -> None:
+    np.random.seed(42)  # For reproducible tests
+    
+    # Create 20 distinct strings: 10 starting with 'a', 10 starting with 'b'
+    strings_c1 = []
+    for i in range(10):
+        # Generate 4 random characters for the suffix
+        suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4))
+        strings_c1.append(f'a{suffix}')
+    
+    for i in range(10):
+        # Generate 4 random characters for the suffix
+        suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4))
+        strings_c1.append(f'b{suffix}')
+    
+    # Create 20 distinct PIDs (integers)
+    pids = list(range(20))
+    
+    # Create mapping from string to PID
+    string_to_pid = dict(zip(strings_c1, pids))
+    
+    # Generate 1000 rows with random string selections and corresponding PIDs
+    selected_strings = np.random.choice(strings_c1, 1000)
+    selected_pids = [string_to_pid[s] for s in selected_strings]
+    
+    # Create the dataframe
+    df = pd.DataFrame({"pid": selected_pids, "c1": selected_strings})
+    
+    # Build synthetic dataframe using PID functionality
+    df_pid = df[["pid"]]
+    df_without_pid = df.drop(columns=["pid"])
+    syn_data = Synthesizer(df_without_pid, pids=df_pid).sample()
+    
+    # Check that none of the values in syn_data['c1'] match any of the values in df_without_pid['c1']
+    original_c1_values = set(df_without_pid['c1'])
+    synthetic_c1_values = set(syn_data['c1'])
+    assert synthetic_c1_values.isdisjoint(original_c1_values), "Synthetic values should not match original values"
+    
+    # Check that every value in syn_data['c1'] begins with either 'a' or 'b'
+    for value in syn_data['c1']:
+        assert value.startswith('a') or value.startswith('b'), f"Value '{value}' does not start with 'a' or 'b'"

From 61e57a6a7c4f1b0bdd670ba29a774e38474d58da Mon Sep 17 00:00:00 2001
From: Paul Francis <paul@francis.com>
Date: Tue, 5 Aug 2025 11:37:42 +0200
Subject: [PATCH 2/4] code formatting

---
 syndiffix/microdata.py    | 38 +++++++++++++++++++------------------
 syndiffix/tree.py         |  8 ++++----
 tests/test_synthesizer.py | 40 +++++++++++++++++++--------------------
 3 files changed, 44 insertions(+), 42 deletions(-)

diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py
index 63f0986..8a6fa0c 100644
--- a/syndiffix/microdata.py
+++ b/syndiffix/microdata.py
@@ -87,7 +87,7 @@ def __init__(self, values: Iterable[Value]) -> None:
         super().__init__()
         # Fit up to 0.9999 so that the max bucket range is [0-1)
         self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
-        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # This value-neutral fitting is only for passing unit tests; it gets overridden
         # later by fit_transform().
         self.scaler.fit(np.array([[0.0], [0.9999]]))
         self.final_round_precision = _get_round_precision(cast(Iterable[float], values))
@@ -119,7 +119,7 @@ def __init__(self) -> None:
         super().__init__()
         # Fit up to 0.9999 so that the max bucket range is [0-1)
         self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
-        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # This value-neutral fitting is only for passing unit tests; it gets overridden
         # later by fit_transform().
         self.scaler.fit(np.array([[0.0], [0.9999]]))
 
@@ -149,7 +149,7 @@ def __init__(self) -> None:
         super().__init__()
         # Fit up to 0.9999 so that the max bucket range is [0-1)
         self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
-        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # This value-neutral fitting is only for passing unit tests; it gets overridden
         # later by fit_transform().
         self.scaler.fit(np.array([[0.0], [0.9999]]))
 
@@ -188,7 +188,7 @@ def __init__(self, values: Iterable[Value]) -> None:
         self.safe_values: Set[float] = set()
         # Fit up to 0.9999 so that the max bucket range is [0-1)
         self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999))  # type: ignore
-        # This value-neutral fitting is only for passing unit tests, gets overridden 
+        # This value-neutral fitting is only for passing unit tests; it gets overridden
         # later by fit_transform().
         self.scaler.fit(np.array([[0.0], [0.9999]]))
 
@@ -244,8 +244,8 @@ def analyze_tree_walk(node: Node) -> None:
                     analyze_tree_walk(child_node)
 
         analyze_tree_walk(root)
-        #from .tree import _dump_tree
-        #_dump_tree(root)       # Debugging line to see the tree structure
+        # from .tree import _dump_tree
+        # _dump_tree(root)       # Debugging line to see the tree structure
 
     def denormalize_safe_values(self) -> None:
         assert self.scaler is not None
@@ -360,16 +360,16 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler
     """
     Find the largest interval within the given interval where the inverse-transformed
     bounds correspond to integers (within machine precision).
-    
+
     Args:
         interval: The input interval in normalized space
         scaler: The MinMaxScaler used for inverse transformation
-        
+
     Returns:
         A new interval with bounds that are integer values (cast as floats)
     """
     interval_new = interval.copy()
-    
+
     # Handle singularity case - bounds are already at the same point
     if interval.is_singularity():
         # Convert the single value to its corresponding integer
@@ -378,11 +378,11 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler
         interval_new.min = integer_value
         interval_new.max = integer_value
         return interval_new
-    
+
     # Find the smallest integer >= the inverse-transformed interval.min
     min_inverse = _inverse_normalize_value(interval.min, scaler)
     min_integer = int(round(min_inverse))
-    
+
     # If the current min already transforms to an integer (within precision), use it
     if abs(min_inverse - min_integer) < 1e-10:
         interval_new.min = float(min_integer)
@@ -390,11 +390,11 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler
         # Find the next integer
         next_integer = min_integer + 1 if min_inverse > min_integer else min_integer
         interval_new.min = float(next_integer)
-    
+
     # Find the largest integer <= the inverse-transformed interval.max
     max_inverse = _inverse_normalize_value(interval.max, scaler)
     max_integer = int(round(max_inverse))
-    
+
     # Note that the max value of an Interval is exclusive, so we need to take care
     if abs(max_inverse - max_integer) < 1e-10:
         # If this is exact, then it will be included in the next higher min_integer
@@ -404,13 +404,15 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler
         prev_integer = max_integer - 1 if max_inverse < max_integer else max_integer
         # We add 1.0 because the max value is exclusive
         interval_new.max = float(prev_integer + 1.0)
-    
+
     # Ensure the new interval is valid (min <= max)
     if interval_new.min > interval_new.max:
         # If no valid integer interval exists within bounds, throw an exception
-        raise ValueError(f"No valid integer interval exists within bounds. "
-                        f"Min integer: {interval_new.min}, Max integer: {interval_new.max}")
-    
+        raise ValueError(
+            f"No valid integer interval exists within bounds. "
+            f"Min integer: {interval_new.min}, Max integer: {interval_new.max}"
+        )
+
     return interval_new
 
 
@@ -458,7 +460,7 @@ def apply_convertors(convertors: list[DataConvertor], raw_data: pd.DataFrame) ->
 def generate_microdata(
     buckets: Buckets, convertors: list[DataConvertor], null_mappings: list[float], rng: Random
 ) -> list[MicrodataRow]:
-    #print(buckets)      # Debugging line to see the buckets
+    # print(buckets)      # Debugging line to see the buckets
     [convertor.denormalize_safe_values() for convertor in convertors]
     microdata_rows: list[MicrodataRow] = []
     for bucket in buckets:
diff --git a/syndiffix/tree.py b/syndiffix/tree.py
index 6cbfad1..2e142e0 100644
--- a/syndiffix/tree.py
+++ b/syndiffix/tree.py
@@ -280,19 +280,19 @@ def print(self) -> None:
 def _dump_tree(node: Node, indent: int = 0) -> None:
     """Display the tree structure with directory-like indentation."""
     indent_str = "  " * indent
-    
+
     # Format snapped_interval as [(min, max), (min, max), ...]
     intervals_str = ", ".join(f"({interval.min}, {interval.max})" for interval in node.snapped_intervals)
-    
+
     # Get row count
     if isinstance(node, Leaf):
         row_count = len(node.rows)
     else:  # Branch
         row_count = len(list(node._matching_rows()))
-    
+
     # Print this node's info
     print(f"{indent_str}[{intervals_str}] rows: {row_count}")
-    
+
     # Recursively print children if this is a Branch
     if isinstance(node, Branch):
         for child_index in sorted(node.children.keys()):
diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py
index b2d16b4..2e058da 100644
--- a/tests/test_synthesizer.py
+++ b/tests/test_synthesizer.py
@@ -177,12 +177,12 @@ def test_normalize_strings() -> None:
 
 def test_string_consistency() -> None:
     # Create a dataframe with identical values in both columns
-    c1_values = ['a'] * 10 + ['b'] * 10 + ['c'] * 10
+    c1_values = ["a"] * 10 + ["b"] * 10 + ["c"] * 10
     c2_values = c1_values.copy()  # c2 is identical to c1
     df = pd.DataFrame({"c1": c1_values, "c2": c2_values})
-    
+
     syn_data = Synthesizer(df).sample()
-    
+
     # Ensure all values for c1 and c2 match in the synthetic dataframe
     for i in range(len(syn_data)):
         c1_val = syn_data.iloc[i, 0]  # First column (c1)
@@ -267,42 +267,42 @@ def test_value_safe_columns_strings() -> None:
 
 def test_pid() -> None:
     np.random.seed(42)  # For reproducible tests
-    
+
     # Create 20 distinct strings: 10 starting with 'a', 10 starting with 'b'
     strings_c1 = []
     for i in range(10):
         # Generate 4 random characters for the suffix
-        suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4))
-        strings_c1.append(f'a{suffix}')
-    
+        suffix = "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 4))
+        strings_c1.append(f"a{suffix}")
+
     for i in range(10):
         # Generate 4 random characters for the suffix
-        suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4))
-        strings_c1.append(f'b{suffix}')
-    
+        suffix = "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 4))
+        strings_c1.append(f"b{suffix}")
+
     # Create 20 distinct PIDs (integers)
     pids = list(range(20))
-    
+
     # Create mapping from string to PID
     string_to_pid = dict(zip(strings_c1, pids))
-    
+
     # Generate 1000 rows with random string selections and corresponding PIDs
     selected_strings = np.random.choice(strings_c1, 1000)
     selected_pids = [string_to_pid[s] for s in selected_strings]
-    
+
     # Create the dataframe
     df = pd.DataFrame({"pid": selected_pids, "c1": selected_strings})
-    
+
     # Build synthetic dataframe using PID functionality
     df_pid = df[["pid"]]
     df_without_pid = df.drop(columns=["pid"])
     syn_data = Synthesizer(df_without_pid, pids=df_pid).sample()
-    
+
     # Check that none of the values in syn_data['c1'] match any of the values in df_without_pid['c1']
-    original_c1_values = set(df_without_pid['c1'])
-    synthetic_c1_values = set(syn_data['c1'])
+    original_c1_values = set(df_without_pid["c1"])
+    synthetic_c1_values = set(syn_data["c1"])
     assert synthetic_c1_values.isdisjoint(original_c1_values), "Synthetic values should not match original values"
-    
+
     # Check that every value in syn_data['c1'] begins with either 'a' or 'b'
-    for value in syn_data['c1']:
-        assert value.startswith('a') or value.startswith('b'), f"Value '{value}' does not start with 'a' or 'b'"
+    for value in syn_data["c1"]:
+        assert value.startswith("a") or value.startswith("b"), f"Value '{value}' does not start with 'a' or 'b'"

From 4502cb30bcffb63d792d3597624a15872c5028b6 Mon Sep 17 00:00:00 2001
From: Paul Francis <paul@francis.com>
Date: Tue, 5 Aug 2025 11:41:05 +0200
Subject: [PATCH 3/4] clean f-strings

---
 syndiffix/tree.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/syndiffix/tree.py b/syndiffix/tree.py
index 2e142e0..057dafa 100644
--- a/syndiffix/tree.py
+++ b/syndiffix/tree.py
@@ -175,7 +175,7 @@ def _matching_rows(self) -> Iterator[RowId]:
         yield from self.rows
 
     def print(self) -> None:
-        print(f"Leaf Node:")
+        print("Leaf Node:")
         print(f"  actual_intervals: {self.actual_intervals}")
         print(f"  snapped_intervals: {self.snapped_intervals}")
         print(f"  _noisy_count_cache: {self._noisy_count_cache}")
@@ -271,7 +271,7 @@ def _matching_rows(self) -> Iterator[RowId]:
             yield from child._matching_rows()
 
     def print(self) -> None:
-        print(f"Branch Node:")
+        print("Branch Node:")
         print(f"  actual_intervals: {self.actual_intervals}")
         print(f"  snapped_intervals: {self.snapped_intervals}")
         print(f"  _noisy_count_cache: {self._noisy_count_cache}")

From e030722dfad525681eb908308d53ef7201fd53a5 Mon Sep 17 00:00:00 2001
From: Paul Francis <paul@francis.com>
Date: Tue, 5 Aug 2025 11:53:03 +0200
Subject: [PATCH 4/4] fixed typing inconsistencies

---
 syndiffix/microdata.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py
index 8a6fa0c..1d3c04f 100644
--- a/syndiffix/microdata.py
+++ b/syndiffix/microdata.py
@@ -251,10 +251,10 @@ def denormalize_safe_values(self) -> None:
         assert self.scaler is not None
         if self.value_safe_flag is False and self.safe_values:
             # Convert normalized values back to original integer values
-            denormalized_safe_values = set()
+            denormalized_safe_values: Set[float] = set()
             for normalized_value in self.safe_values:
                 original_value = _inverse_normalize_value(float(normalized_value), self.scaler)
-                denormalized_safe_values.add(int(round(original_value)))
+                denormalized_safe_values.add(float(round(original_value)))
             self.safe_values = denormalized_safe_values
 
     def create_value_safe_set(self, values: pd.Series) -> None:
@@ -461,7 +461,8 @@ def generate_microdata(
     buckets: Buckets, convertors: list[DataConvertor], null_mappings: list[float], rng: Random
 ) -> list[MicrodataRow]:
     # print(buckets)      # Debugging line to see the buckets
-    [convertor.denormalize_safe_values() for convertor in convertors]
+    for convertor in convertors:
+        convertor.denormalize_safe_values()
     microdata_rows: list[MicrodataRow] = []
     for bucket in buckets:
         microdata_rows.extend(