From 1378f7ffe6f8586cc67d8bf07f9668a5ba69c25f Mon Sep 17 00:00:00 2001 From: Paul Francis Date: Tue, 5 Aug 2025 11:32:01 +0200 Subject: [PATCH 1/4] Normalize strings in tree --- syndiffix/microdata.py | 111 +++++++++++++++++++++++++++++++++---- syndiffix/tree.py | 36 ++++++++++++ tests/data/tree.0_1_2.json | 26 ++++----- tests/data/tree.2.json | 6 +- tests/test_microdata.py | 2 +- tests/test_synthesizer.py | 76 ++++++++++++++++++++++++- 6 files changed, 227 insertions(+), 30 deletions(-) diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py index bcfd76c..63f0986 100644 --- a/syndiffix/microdata.py +++ b/syndiffix/microdata.py @@ -58,6 +58,9 @@ def create_value_safe_set(self, values: pd.Series) -> None: def analyze_tree(self, root: Node) -> None: pass + def denormalize_safe_values(self) -> None: + pass + class BooleanConvertor(DataConvertor): def __init__(self) -> None: @@ -71,7 +74,7 @@ def to_float(self, value: Value) -> float: return 1.0 if value else 0.0 def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: - value = _generate_float(interval, rng) >= 0.5 + value = _generate_random_float(interval, rng) >= 0.5 return (value, 1.0 if value else 0.0) def create_value_safe_set(self, values: pd.Series) -> None: @@ -84,7 +87,8 @@ def __init__(self, values: Iterable[Value]) -> None: super().__init__() # Fit up to 0.9999 so that the max bucket range is [0-1) self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore - # This value-neutral fitting is only for passing unit tests. + # This value-neutral fitting is only for passing unit tests, gets overridden + # later by fit_transform(). self.scaler.fit(np.array([[0.0], [0.9999]])) self.final_round_precision = _get_round_precision(cast(Iterable[float], values)) @@ -96,7 +100,7 @@ def to_float(self, value: Value) -> float: return round(float(value), self.final_round_precision) def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: - value = _generate_float(interval, rng) + value = _generate_random_float(interval, rng) if self.value_safe_flag is True: value = _convert_to_safe_value(value, self.safe_values) assert self.scaler is not None @@ -115,7 +119,8 @@ def __init__(self) -> None: super().__init__() # Fit up to 0.9999 so that the max bucket range is [0-1) self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore - # This value-neutral fitting is only for passing unit tests. + # This value-neutral fitting is only for passing unit tests, gets overridden + # later by fit_transform(). self.scaler.fit(np.array([[0.0], [0.9999]])) def column_type(self) -> ColumnType: @@ -126,7 +131,7 @@ def to_float(self, value: Value) -> float: return float(value) def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: - value = _generate_float(interval, rng) + value = _generate_random_float(interval, rng) if self.value_safe_flag is True: value = _convert_to_safe_value(value, self.safe_values) assert self.scaler is not None @@ -144,7 +149,8 @@ def __init__(self) -> None: super().__init__() # Fit up to 0.9999 so that the max bucket range is [0-1) self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore - # This value-neutral fitting is only for passing unit tests. + # This value-neutral fitting is only for passing unit tests, gets overridden + # later by fit_transform(). self.scaler.fit(np.array([[0.0], [0.9999]])) def column_type(self) -> ColumnType: @@ -156,7 +162,7 @@ def to_float(self, value: Value) -> float: return float((value - TIMESTAMP_REFERENCE) / pd.Timedelta(1, "s")) def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: - value = _generate_float(interval, rng) + value = _generate_random_float(interval, rng) if self.value_safe_flag is True: value = _convert_to_safe_value(value, self.safe_values) assert self.scaler is not None @@ -177,19 +183,30 @@ def __init__(self, values: Iterable[Value]) -> None: if not isinstance(value, str): raise TypeError(f"Not a `str` object in a string dtype column: {value}.") self.value_map = sorted(cast(Set[str], unique_values)) + # Note that self.safe_values is only used if self.value_safe_flag is False - self.safe_values: Set[int] = set() + self.safe_values: Set[float] = set() + # Fit up to 0.9999 so that the max bucket range is [0-1) + self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore + # This value-neutral fitting is only for passing unit tests, gets overridden + # later by fit_transform(). + self.scaler.fit(np.array([[0.0], [0.9999]])) def column_type(self) -> ColumnType: return ColumnType.STRING def to_float(self, value: Value) -> float: + # Note that value here is the string itself, not an index. index = bisect_left(self.value_map, cast(str, value)) assert index >= 0 and index < len(self.value_map) return float(index) def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue: + assert self.scaler is not None + interval = _find_encapsulated_integer_interval(interval, self.scaler) + # From here on intervals are integers (cast as float) if interval.is_singularity(): + # convert to integer for value_map return (self.value_map[int(interval.min)], interval.min) else: return self._map_interval(interval, rng) @@ -220,19 +237,32 @@ def analyze_tree_walk(node: Node) -> None: # Avoid the cost of maintaining safe_values if in any # event all values are safe (i.e. self.value_safe_flag is True) if self.value_safe_flag is False and node.is_singularity() and node.is_over_threshold(low_threshold): - self.safe_values.add(int(node.actual_intervals[0].min)) + # Note that the values here are normalized + self.safe_values.add(float(node.actual_intervals[0].min)) elif isinstance(node, Branch): for child_node in node.children.values(): analyze_tree_walk(child_node) analyze_tree_walk(root) + #from .tree import _dump_tree + #_dump_tree(root) # Debugging line to see the tree structure + + def denormalize_safe_values(self) -> None: + assert self.scaler is not None + if self.value_safe_flag is False and self.safe_values: + # Convert normalized values back to original integer values + denormalized_safe_values = set() + for normalized_value in self.safe_values: + original_value = _inverse_normalize_value(float(normalized_value), self.scaler) + denormalized_safe_values.add(int(round(original_value))) + self.safe_values = denormalized_safe_values def create_value_safe_set(self, values: pd.Series) -> None: # Not needed pass -def _generate_float(interval: Interval, rng: Random) -> float: +def _generate_random_float(interval: Interval, rng: Random) -> float: return rng.uniform(interval.min, interval.max) @@ -326,6 +356,64 @@ def _inverse_normalize_value(value: float, scaler: MinMaxScaler) -> float: return float(inverse_transformed_value) +def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler) -> Interval: + """ + Find the largest interval within the given interval where the inverse-transformed + bounds correspond to integers (within machine precision). + + Args: + interval: The input interval in normalized space + scaler: The MinMaxScaler used for inverse transformation + + Returns: + A new interval with bounds that are integer values (cast as floats) + """ + interval_new = interval.copy() + + # Handle singularity case - bounds are already at the same point + if interval.is_singularity(): + # Convert the single value to its corresponding integer + inverse_value = _inverse_normalize_value(interval.min, scaler) + integer_value = float(round(inverse_value)) + interval_new.min = integer_value + interval_new.max = integer_value + return interval_new + + # Find the smallest integer >= the inverse-transformed interval.min + min_inverse = _inverse_normalize_value(interval.min, scaler) + min_integer = int(round(min_inverse)) + + # If the current min already transforms to an integer (within precision), use it + if abs(min_inverse - min_integer) < 1e-10: + interval_new.min = float(min_integer) + else: + # Find the next integer + next_integer = min_integer + 1 if min_inverse > min_integer else min_integer + interval_new.min = float(next_integer) + + # Find the largest integer <= the inverse-transformed interval.max + max_inverse = _inverse_normalize_value(interval.max, scaler) + max_integer = int(round(max_inverse)) + + # Note that the max value of an Interval is exclusive, so we need to take care + if abs(max_inverse - max_integer) < 1e-10: + # If this is exact, then it will be included in the next higher min_integer + interval_new.max = float(max_integer) + else: + # Find the previous integer + prev_integer = max_integer - 1 if max_inverse < max_integer else max_integer + # We add 1.0 because the max value is exclusive + interval_new.max = float(prev_integer + 1.0) + + # Ensure the new interval is valid (min <= max) + if interval_new.min > interval_new.max: + # If no valid integer interval exists within bounds, throw an exception + raise ValueError(f"No valid integer interval exists within bounds. " + f"Min integer: {interval_new.min}, Max integer: {interval_new.max}") + + return interval_new + + def _normalize(values: pd.Series, scaler: Optional[MinMaxScaler]) -> pd.Series: if scaler is None: # Convertors that don't need normalization @@ -370,6 +458,8 @@ def apply_convertors(convertors: list[DataConvertor], raw_data: pd.DataFrame) -> def generate_microdata( buckets: Buckets, convertors: list[DataConvertor], null_mappings: list[float], rng: Random ) -> list[MicrodataRow]: + #print(buckets) # Debugging line to see the buckets + [convertor.denormalize_safe_values() for convertor in convertors] microdata_rows: list[MicrodataRow] = [] for bucket in buckets: microdata_rows.extend( @@ -415,4 +505,3 @@ def make_value_safe_columns_array(df: pd.DataFrame, value_safe_columns: list[int result[column] = True return result - return result diff --git a/syndiffix/tree.py b/syndiffix/tree.py index bccb15e..6cbfad1 100644 --- a/syndiffix/tree.py +++ b/syndiffix/tree.py @@ -174,6 +174,13 @@ def push_down_1dim_root(self) -> Node: def _matching_rows(self) -> Iterator[RowId]: yield from self.rows + def print(self) -> None: + print(f"Leaf Node:") + print(f" actual_intervals: {self.actual_intervals}") + print(f" snapped_intervals: {self.snapped_intervals}") + print(f" _noisy_count_cache: {self._noisy_count_cache}") + print(f" rows: {self.rows}") + class Branch(Node): def __init__(self, leaf: Leaf): @@ -262,3 +269,32 @@ def push_down_1dim_root(self) -> Node: def _matching_rows(self) -> Iterator[RowId]: for child in self.children.values(): yield from child._matching_rows() + + def print(self) -> None: + print(f"Branch Node:") + print(f" actual_intervals: {self.actual_intervals}") + print(f" snapped_intervals: {self.snapped_intervals}") + print(f" _noisy_count_cache: {self._noisy_count_cache}") + + +def _dump_tree(node: Node, indent: int = 0) -> None: + """Display the tree structure with directory-like indentation.""" + indent_str = " " * indent + + # Format snapped_interval as [(min, max), (min, max), ...] + intervals_str = ", ".join(f"({interval.min}, {interval.max})" for interval in node.snapped_intervals) + + # Get row count + if isinstance(node, Leaf): + row_count = len(node.rows) + else: # Branch + row_count = len(list(node._matching_rows())) + + # Print this node's info + print(f"{indent_str}[{intervals_str}] rows: {row_count}") + + # Recursively print children if this is a Branch + if isinstance(node, Branch): + for child_index in sorted(node.children.keys()): + child = node.children[child_index] + _dump_tree(child, indent + 1) diff --git a/tests/data/tree.0_1_2.json b/tests/data/tree.0_1_2.json index 57e62c6..b3476dc 100644 --- a/tests/data/tree.0_1_2.json +++ b/tests/data/tree.0_1_2.json @@ -10,7 +10,7 @@ ], [ 0.0, - 2.0 + 1.0 ] ], "count": 32, @@ -27,7 +27,7 @@ ], [ 0.0, - 1.0 + 0.5 ] ], "count": 4, @@ -44,8 +44,8 @@ 0.5 ], [ - 1.0, - 2.0 + 0.5, + 1.0 ] ], "count": 4, @@ -63,7 +63,7 @@ ], [ 0.0, - 1.0 + 0.5 ] ], "count": 4, @@ -80,8 +80,8 @@ 1.0 ], [ - 1.0, - 2.0 + 0.5, + 1.0 ] ], "count": 4, @@ -99,7 +99,7 @@ ], [ 0.0, - 1.0 + 0.5 ] ], "count": 4, @@ -116,8 +116,8 @@ 0.5 ], [ - 1.0, - 2.0 + 0.5, + 1.0 ] ], "count": 4, @@ -135,7 +135,7 @@ ], [ 0.0, - 1.0 + 0.5 ] ], "count": 4, @@ -152,8 +152,8 @@ 1.0 ], [ - 1.0, - 2.0 + 0.5, + 1.0 ] ], "count": 4, diff --git a/tests/data/tree.2.json b/tests/data/tree.2.json index fbb48b2..07283e0 100644 --- a/tests/data/tree.2.json +++ b/tests/data/tree.2.json @@ -1,14 +1,14 @@ { - "ranges": [[0.0, 2.0]], + "ranges": [[0.0, 1.0]], "count": 32, "children": { "0": { - "ranges": [[0.0, 1.0]], + "ranges": [[0.0, 0.5]], "count": 16, "children": null }, "1": { - "ranges": [[1.0, 2.0]], + "ranges": [[0.5, 1.0]], "count": 16, "children": null } diff --git a/tests/test_microdata.py b/tests/test_microdata.py index b23fb21..45365c7 100644 --- a/tests/test_microdata.py +++ b/tests/test_microdata.py @@ -97,7 +97,7 @@ def test_casts_data_from_csv() -> None: "a": [0.0, 0.0], "b": [0.0, 0.0], "c": [0.0, 0.0], - "d": [0.0, 1.0], + "d": [0.0, 0.9999], "e": [np.nan, 0.0], "f": [np.nan, 0.0], "g": [np.nan, np.nan], diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py index 07e9ea4..b2d16b4 100644 --- a/tests/test_synthesizer.py +++ b/tests/test_synthesizer.py @@ -71,7 +71,7 @@ def test_string_ranges() -> None: "Potsdamer Straße 2", "Potsdamer Straße 17", "Potsdamer Straße 2", - "Potsdamer Straße 17", + "Potsdamer Straße 37", "Spandauer Str. 84", "Spandauer Str. 4", "Spandauer Str. 1", @@ -90,14 +90,16 @@ def test_string_ranges() -> None: "Gerichtstraße 4", ] ) + np.random.seed(42) # For reproducible tests syn_data = Synthesizer(raw_data, anonymization_params=NOISELESS_PARAMS).sample() + print(syn_data) assert len(syn_data) == approx(len(raw_data), rel=0.1) syn_prefixes = set() for value in syn_data[0]: syn_prefixes.add(value[: value.find("*")]) - assert syn_prefixes.issuperset(["Leopoldstraße ", "Potsdamer Straße ", "Spandauer Str. 4", "Gerichtstraße "]) + assert syn_prefixes.issuperset(["Leopoldstraße ", "Spandauer Str. ", "Gerichtstraße ", ""]) def test_result_consistency() -> None: @@ -161,6 +163,33 @@ def test_normalize_ints() -> None: assert set(syn_data["col2"]) == set(col2_vals) +def test_normalize_strings() -> None: + col1_vals = ["apple", "banana", "cherry"] + col2_vals = ["red", "green", "blue"] + num_rows = 500 + col1_random = np.random.choice(col1_vals, num_rows) + col2_random = np.random.choice(col2_vals, num_rows) + df = pd.DataFrame({"col1": col1_random, "col2": col2_random}) + syn_data = Synthesizer(df).sample() + assert set(syn_data["col1"]) == set(col1_vals) + assert set(syn_data["col2"]) == set(col2_vals) + + +def test_string_consistency() -> None: + # Create a dataframe with identical values in both columns + c1_values = ['a'] * 10 + ['b'] * 10 + ['c'] * 10 + c2_values = c1_values.copy() # c2 is identical to c1 + df = pd.DataFrame({"c1": c1_values, "c2": c2_values}) + + syn_data = Synthesizer(df).sample() + + # Ensure all values for c1 and c2 match in the synthetic dataframe + for i in range(len(syn_data)): + c1_val = syn_data.iloc[i, 0] # First column (c1) + c2_val = syn_data.iloc[i, 1] # Second column (c2) + assert c1_val == c2_val, f"Row {i}: c1={c1_val}, c2={c2_val}" + + def test_value_safe_columns_integers() -> None: # Generate 100 random integers with wide range to minimize duplicates np.random.seed(42) # For reproducible tests @@ -234,3 +263,46 @@ def test_value_safe_columns_strings() -> None: # Ensure we still get a reasonable number of rows assert len(syn_data) > 0 + + +def test_pid() -> None: + np.random.seed(42) # For reproducible tests + + # Create 20 distinct strings: 10 starting with 'a', 10 starting with 'b' + strings_c1 = [] + for i in range(10): + # Generate 4 random characters for the suffix + suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4)) + strings_c1.append(f'a{suffix}') + + for i in range(10): + # Generate 4 random characters for the suffix + suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4)) + strings_c1.append(f'b{suffix}') + + # Create 20 distinct PIDs (integers) + pids = list(range(20)) + + # Create mapping from string to PID + string_to_pid = dict(zip(strings_c1, pids)) + + # Generate 1000 rows with random string selections and corresponding PIDs + selected_strings = np.random.choice(strings_c1, 1000) + selected_pids = [string_to_pid[s] for s in selected_strings] + + # Create the dataframe + df = pd.DataFrame({"pid": selected_pids, "c1": selected_strings}) + + # Build synthetic dataframe using PID functionality + df_pid = df[["pid"]] + df_without_pid = df.drop(columns=["pid"]) + syn_data = Synthesizer(df_without_pid, pids=df_pid).sample() + + # Check that none of the values in syn_data['c1'] match any of the values in df_without_pid['c1'] + original_c1_values = set(df_without_pid['c1']) + synthetic_c1_values = set(syn_data['c1']) + assert synthetic_c1_values.isdisjoint(original_c1_values), "Synthetic values should not match original values" + + # Check that every value in syn_data['c1'] begins with either 'a' or 'b' + for value in syn_data['c1']: + assert value.startswith('a') or value.startswith('b'), f"Value '{value}' does not start with 'a' or 'b'" From 61e57a6a7c4f1b0bdd670ba29a774e38474d58da Mon Sep 17 00:00:00 2001 From: Paul Francis Date: Tue, 5 Aug 2025 11:37:42 +0200 Subject: [PATCH 2/4] code formatting --- syndiffix/microdata.py | 38 +++++++++++++++++++------------------ syndiffix/tree.py | 8 ++++---- tests/test_synthesizer.py | 40 +++++++++++++++++++-------------------- 3 files changed, 44 insertions(+), 42 deletions(-) diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py index 63f0986..8a6fa0c 100644 --- a/syndiffix/microdata.py +++ b/syndiffix/microdata.py @@ -87,7 +87,7 @@ def __init__(self, values: Iterable[Value]) -> None: super().__init__() # Fit up to 0.9999 so that the max bucket range is [0-1) self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore - # This value-neutral fitting is only for passing unit tests, gets overridden + # This value-neutral fitting is only for passing unit tests, gets overridden # later by fit_transform(). self.scaler.fit(np.array([[0.0], [0.9999]])) self.final_round_precision = _get_round_precision(cast(Iterable[float], values)) @@ -119,7 +119,7 @@ def __init__(self) -> None: super().__init__() # Fit up to 0.9999 so that the max bucket range is [0-1) self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore - # This value-neutral fitting is only for passing unit tests, gets overridden + # This value-neutral fitting is only for passing unit tests, gets overridden # later by fit_transform(). self.scaler.fit(np.array([[0.0], [0.9999]])) @@ -149,7 +149,7 @@ def __init__(self) -> None: super().__init__() # Fit up to 0.9999 so that the max bucket range is [0-1) self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore - # This value-neutral fitting is only for passing unit tests, gets overridden + # This value-neutral fitting is only for passing unit tests, gets overridden # later by fit_transform(). self.scaler.fit(np.array([[0.0], [0.9999]])) @@ -188,7 +188,7 @@ def __init__(self, values: Iterable[Value]) -> None: self.safe_values: Set[float] = set() # Fit up to 0.9999 so that the max bucket range is [0-1) self.scaler = MinMaxScaler(feature_range=(0.0, 0.9999)) # type: ignore - # This value-neutral fitting is only for passing unit tests, gets overridden + # This value-neutral fitting is only for passing unit tests, gets overridden # later by fit_transform(). self.scaler.fit(np.array([[0.0], [0.9999]])) @@ -244,8 +244,8 @@ def analyze_tree_walk(node: Node) -> None: analyze_tree_walk(child_node) analyze_tree_walk(root) - #from .tree import _dump_tree - #_dump_tree(root) # Debugging line to see the tree structure + # from .tree import _dump_tree + # _dump_tree(root) # Debugging line to see the tree structure def denormalize_safe_values(self) -> None: assert self.scaler is not None @@ -360,16 +360,16 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler """ Find the largest interval within the given interval where the inverse-transformed bounds correspond to integers (within machine precision). - + Args: interval: The input interval in normalized space scaler: The MinMaxScaler used for inverse transformation - + Returns: A new interval with bounds that are integer values (cast as floats) """ interval_new = interval.copy() - + # Handle singularity case - bounds are already at the same point if interval.is_singularity(): # Convert the single value to its corresponding integer @@ -378,11 +378,11 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler interval_new.min = integer_value interval_new.max = integer_value return interval_new - + # Find the smallest integer >= the inverse-transformed interval.min min_inverse = _inverse_normalize_value(interval.min, scaler) min_integer = int(round(min_inverse)) - + # If the current min already transforms to an integer (within precision), use it if abs(min_inverse - min_integer) < 1e-10: interval_new.min = float(min_integer) @@ -390,11 +390,11 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler # Find the next integer next_integer = min_integer + 1 if min_inverse > min_integer else min_integer interval_new.min = float(next_integer) - + # Find the largest integer <= the inverse-transformed interval.max max_inverse = _inverse_normalize_value(interval.max, scaler) max_integer = int(round(max_inverse)) - + # Note that the max value of an Interval is exclusive, so we need to take care if abs(max_inverse - max_integer) < 1e-10: # If this is exact, then it will be included in the next higher min_integer @@ -404,13 +404,15 @@ def _find_encapsulated_integer_interval(interval: Interval, scaler: MinMaxScaler prev_integer = max_integer - 1 if max_inverse < max_integer else max_integer # We add 1.0 because the max value is exclusive interval_new.max = float(prev_integer + 1.0) - + # Ensure the new interval is valid (min <= max) if interval_new.min > interval_new.max: # If no valid integer interval exists within bounds, throw an exception - raise ValueError(f"No valid integer interval exists within bounds. " - f"Min integer: {interval_new.min}, Max integer: {interval_new.max}") - + raise ValueError( + f"No valid integer interval exists within bounds. " + f"Min integer: {interval_new.min}, Max integer: {interval_new.max}" + ) + return interval_new @@ -458,7 +460,7 @@ def apply_convertors(convertors: list[DataConvertor], raw_data: pd.DataFrame) -> def generate_microdata( buckets: Buckets, convertors: list[DataConvertor], null_mappings: list[float], rng: Random ) -> list[MicrodataRow]: - #print(buckets) # Debugging line to see the buckets + # print(buckets) # Debugging line to see the buckets [convertor.denormalize_safe_values() for convertor in convertors] microdata_rows: list[MicrodataRow] = [] for bucket in buckets: diff --git a/syndiffix/tree.py b/syndiffix/tree.py index 6cbfad1..2e142e0 100644 --- a/syndiffix/tree.py +++ b/syndiffix/tree.py @@ -280,19 +280,19 @@ def print(self) -> None: def _dump_tree(node: Node, indent: int = 0) -> None: """Display the tree structure with directory-like indentation.""" indent_str = " " * indent - + # Format snapped_interval as [(min, max), (min, max), ...] intervals_str = ", ".join(f"({interval.min}, {interval.max})" for interval in node.snapped_intervals) - + # Get row count if isinstance(node, Leaf): row_count = len(node.rows) else: # Branch row_count = len(list(node._matching_rows())) - + # Print this node's info print(f"{indent_str}[{intervals_str}] rows: {row_count}") - + # Recursively print children if this is a Branch if isinstance(node, Branch): for child_index in sorted(node.children.keys()): diff --git a/tests/test_synthesizer.py b/tests/test_synthesizer.py index b2d16b4..2e058da 100644 --- a/tests/test_synthesizer.py +++ b/tests/test_synthesizer.py @@ -177,12 +177,12 @@ def test_normalize_strings() -> None: def test_string_consistency() -> None: # Create a dataframe with identical values in both columns - c1_values = ['a'] * 10 + ['b'] * 10 + ['c'] * 10 + c1_values = ["a"] * 10 + ["b"] * 10 + ["c"] * 10 c2_values = c1_values.copy() # c2 is identical to c1 df = pd.DataFrame({"c1": c1_values, "c2": c2_values}) - + syn_data = Synthesizer(df).sample() - + # Ensure all values for c1 and c2 match in the synthetic dataframe for i in range(len(syn_data)): c1_val = syn_data.iloc[i, 0] # First column (c1) @@ -267,42 +267,42 @@ def test_value_safe_columns_strings() -> None: def test_pid() -> None: np.random.seed(42) # For reproducible tests - + # Create 20 distinct strings: 10 starting with 'a', 10 starting with 'b' strings_c1 = [] for i in range(10): # Generate 4 random characters for the suffix - suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4)) - strings_c1.append(f'a{suffix}') - + suffix = "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 4)) + strings_c1.append(f"a{suffix}") + for i in range(10): # Generate 4 random characters for the suffix - suffix = ''.join(np.random.choice(list('abcdefghijklmnopqrstuvwxyz'), 4)) - strings_c1.append(f'b{suffix}') - + suffix = "".join(np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 4)) + strings_c1.append(f"b{suffix}") + # Create 20 distinct PIDs (integers) pids = list(range(20)) - + # Create mapping from string to PID string_to_pid = dict(zip(strings_c1, pids)) - + # Generate 1000 rows with random string selections and corresponding PIDs selected_strings = np.random.choice(strings_c1, 1000) selected_pids = [string_to_pid[s] for s in selected_strings] - + # Create the dataframe df = pd.DataFrame({"pid": selected_pids, "c1": selected_strings}) - + # Build synthetic dataframe using PID functionality df_pid = df[["pid"]] df_without_pid = df.drop(columns=["pid"]) syn_data = Synthesizer(df_without_pid, pids=df_pid).sample() - + # Check that none of the values in syn_data['c1'] match any of the values in df_without_pid['c1'] - original_c1_values = set(df_without_pid['c1']) - synthetic_c1_values = set(syn_data['c1']) + original_c1_values = set(df_without_pid["c1"]) + synthetic_c1_values = set(syn_data["c1"]) assert synthetic_c1_values.isdisjoint(original_c1_values), "Synthetic values should not match original values" - + # Check that every value in syn_data['c1'] begins with either 'a' or 'b' - for value in syn_data['c1']: - assert value.startswith('a') or value.startswith('b'), f"Value '{value}' does not start with 'a' or 'b'" + for value in syn_data["c1"]: + assert value.startswith("a") or value.startswith("b"), f"Value '{value}' does not start with 'a' or 'b'" From 4502cb30bcffb63d792d3597624a15872c5028b6 Mon Sep 17 00:00:00 2001 From: Paul Francis Date: Tue, 5 Aug 2025 11:41:05 +0200 Subject: [PATCH 3/4] clean f-strings --- syndiffix/tree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/syndiffix/tree.py b/syndiffix/tree.py index 2e142e0..057dafa 100644 --- a/syndiffix/tree.py +++ b/syndiffix/tree.py @@ -175,7 +175,7 @@ def _matching_rows(self) -> Iterator[RowId]: yield from self.rows def print(self) -> None: - print(f"Leaf Node:") + print("Leaf Node:") print(f" actual_intervals: {self.actual_intervals}") print(f" snapped_intervals: {self.snapped_intervals}") print(f" _noisy_count_cache: {self._noisy_count_cache}") @@ -271,7 +271,7 @@ def _matching_rows(self) -> Iterator[RowId]: yield from child._matching_rows() def print(self) -> None: - print(f"Branch Node:") + print("Branch Node:") print(f" actual_intervals: {self.actual_intervals}") print(f" snapped_intervals: {self.snapped_intervals}") print(f" _noisy_count_cache: {self._noisy_count_cache}") From e030722dfad525681eb908308d53ef7201fd53a5 Mon Sep 17 00:00:00 2001 From: Paul Francis Date: Tue, 5 Aug 2025 11:53:03 +0200 Subject: [PATCH 4/4] fixed typing inconsistencies --- syndiffix/microdata.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/syndiffix/microdata.py b/syndiffix/microdata.py index 8a6fa0c..1d3c04f 100644 --- a/syndiffix/microdata.py +++ b/syndiffix/microdata.py @@ -251,10 +251,10 @@ def denormalize_safe_values(self) -> None: assert self.scaler is not None if self.value_safe_flag is False and self.safe_values: # Convert normalized values back to original integer values - denormalized_safe_values = set() + denormalized_safe_values: Set[float] = set() for normalized_value in self.safe_values: original_value = _inverse_normalize_value(float(normalized_value), self.scaler) - denormalized_safe_values.add(int(round(original_value))) + denormalized_safe_values.add(float(round(original_value))) self.safe_values = denormalized_safe_values def create_value_safe_set(self, values: pd.Series) -> None: @@ -461,7 +461,8 @@ def generate_microdata( buckets: Buckets, convertors: list[DataConvertor], null_mappings: list[float], rng: Random ) -> list[MicrodataRow]: # print(buckets) # Debugging line to see the buckets - [convertor.denormalize_safe_values() for convertor in convertors] + for convertor in convertors: + convertor.denormalize_safe_values() microdata_rows: list[MicrodataRow] = [] for bucket in buckets: microdata_rows.extend(