Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ target-version = ['py310']
python_version = "3.10"
check_untyped_defs = true
disallow_untyped_defs = true
ignore_missing_imports = true

[tool.isort]
profile = "black"
Expand Down
88 changes: 84 additions & 4 deletions syndiffix/microdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from itertools import islice
from os.path import commonprefix
from random import Random
from typing import Generator, Iterable, Literal, Set, cast
from typing import Generator, Iterable, Literal, Optional, Set, cast

import numpy as np
import pandas as pd
Expand All @@ -14,6 +14,7 @@
is_integer_dtype,
is_string_dtype,
)
from sklearn.preprocessing import MinMaxScaler

from .bucket import Buckets
from .common import ColumnType, Value
Expand All @@ -31,6 +32,9 @@


class DataConvertor(ABC):
def __init__(self) -> None:
    # Optional per-column scaler. Subclasses that normalize their column
    # (real/integer/timestamp) replace this; None disables normalization
    # in `_normalize`, which returns the column unchanged.
    self.scaler: Optional[MinMaxScaler] = None

@abstractmethod
def column_type(self) -> ColumnType:
pass
Expand All @@ -48,6 +52,9 @@ def analyze_tree(self, root: Node) -> None:


class BooleanConvertor(DataConvertor):
def __init__(self) -> None:
    # Booleans are not normalized; keep the inherited `scaler = None`.
    # NOTE(review): this override is redundant — it only delegates to the base class.
    super().__init__()

def column_type(self) -> ColumnType:
    # This convertor handles boolean columns.
    return ColumnType.BOOLEAN

Expand All @@ -61,6 +68,14 @@ def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:


class RealConvertor(DataConvertor):
def __init__(self, values: Iterable[Value]) -> None:
    """Prepare normalization and rounding state from the column's raw values."""
    super().__init__()
    # Fit up to 0.9999 so that the max bucket range is 0-1
    self.scaler = MinMaxScaler(feature_range=(0, 0.9999))
    # This value-neutral fitting is only for passing unit tests.
    # (`_normalize` later re-fits the scaler on the actual column data.)
    self.scaler.fit(np.array([[0.0], [0.9999]]))
    # Decimal places to round synthesized values to, taken from the most
    # precise raw value observed in the column.
    self.round_precision = _get_round_precision(values)

def column_type(self) -> ColumnType:
    # This convertor handles real (floating-point) columns.
    return ColumnType.REAL

Expand All @@ -70,10 +85,19 @@ def to_float(self, value: Value) -> float:

def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
    """Sample a real value from a (normalized) bucket interval."""
    # Draw in normalized space, map back to the original data scale, then
    # round to the precision observed in the raw column.
    sampled = _inverse_normalize_value(_generate_float(interval, rng), self.scaler)
    rounded = round(sampled, self.round_precision)
    return (rounded, rounded)


class IntegerConvertor(DataConvertor):
def __init__(self) -> None:
    """Prepare min-max normalization state for an integer column."""
    super().__init__()
    # Fit up to 0.9999 so that the max bucket range is 0-1
    self.scaler = MinMaxScaler(feature_range=(0, 0.9999))
    # This value-neutral fitting is only for passing unit tests.
    # (`_normalize` later re-fits the scaler on the actual column data.)
    self.scaler.fit(np.array([[0.0], [0.9999]]))

def column_type(self) -> ColumnType:
    # This convertor handles integer columns.
    return ColumnType.INTEGER

Expand All @@ -82,11 +106,20 @@ def to_float(self, value: Value) -> float:
return float(value)

def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
value = int(_generate_float(interval, rng))
value = _generate_float(interval, rng)
value = _inverse_normalize_value(value, self.scaler)
value = round(value)
return (value, float(value))


class TimestampConvertor(DataConvertor):
def __init__(self) -> None:
    """Prepare min-max normalization state for a timestamp column."""
    super().__init__()
    # Fit up to 0.9999 so that the max bucket range is 0-1
    self.scaler = MinMaxScaler(feature_range=(0, 0.9999))
    # This value-neutral fitting is only for passing unit tests.
    # (`_normalize` later re-fits the scaler on the actual column data.)
    self.scaler.fit(np.array([[0.0], [0.9999]]))

def column_type(self) -> ColumnType:
    # This convertor handles timestamp (datetime64) columns.
    return ColumnType.TIMESTAMP

Expand All @@ -97,12 +130,14 @@ def to_float(self, value: Value) -> float:

def from_interval(self, interval: Interval, rng: Random) -> MicrodataValue:
    """Sample a timestamp from a (normalized) bucket interval."""
    # Draw in normalized space and map back to seconds on the original scale.
    seconds = _inverse_normalize_value(_generate_float(interval, rng), self.scaler)
    # The float encodes whole seconds relative to the reference epoch.
    stamp = TIMESTAMP_REFERENCE + np.timedelta64(int(seconds), "s")
    return (stamp, seconds)


class StringConvertor(DataConvertor):
def __init__(self, values: Iterable[Value]) -> None:
super().__init__()
unique_values = set(v for v in values if not pd.isna(v))
for value in unique_values:
if not isinstance(value, str):
Expand Down Expand Up @@ -160,6 +195,21 @@ def _generate_float(interval: Interval, rng: Random) -> float:
return rng.uniform(interval.min, interval.max)


def _get_round_precision(values: Iterable[Value]) -> int:
max_precision = 0
for value in values:
assert isinstance(value, float) or isinstance(value, np.floating)
value_str = str(value)
if "." in value_str:
decimal_part = value_str.split(".")[1]
precision = len(decimal_part)
else:
precision = 0
if precision > max_precision:
max_precision = precision
return max_precision


def _generate(interval: Interval, convertor: DataConvertor, null_mapping: float, rng: Random) -> MicrodataValue:
    """Produce one microdata value for a bucket interval, or a null row."""
    # An interval whose min equals the null mapping marks the null bucket.
    if interval.min == null_mapping:
        return (None, null_mapping)
    return convertor.from_interval(interval, rng)

Expand All @@ -178,7 +228,7 @@ def get_convertor(df: pd.DataFrame, column: str) -> DataConvertor:
if is_integer_dtype(dtype):
return IntegerConvertor()
elif is_float_dtype(dtype):
return RealConvertor()
return RealConvertor(df[column])
elif is_bool_dtype(dtype):
return BooleanConvertor()
elif is_datetime64_dtype(dtype):
Expand All @@ -190,6 +240,32 @@ def get_convertor(df: pd.DataFrame, column: str) -> DataConvertor:
raise TypeError(f"Dtype {dtype} is not supported.")


def _inverse_normalize_value(value: float, scaler: MinMaxScaler) -> float:
    """Undo min-max scaling for a single value (scalar counterpart of `inverse_transform`)."""
    # `inverse_transform` operates on 2-D arrays; wrap the scalar, then unwrap.
    restored = scaler.inverse_transform(np.array([[value]]))
    return float(restored[0, 0])


def _normalize(values: pd.Series, scaler: Optional[MinMaxScaler]) -> pd.Series:
if scaler is None:
# Convertors that don't need normalization
return values

# MinMax normalize values, while retaining the NaN values
values_array = values.to_numpy()
nan_indices = np.isnan(values_array)
if nan_indices.all():
return values
median_value = np.nanmedian(values_array)
values_array[nan_indices] = median_value
values_reshaped = values_array.reshape(-1, 1)
normalized_values = scaler.fit_transform(values_reshaped).flatten()
normalized_values[nan_indices] = np.nan
return pd.Series(normalized_values, index=values.index)


def _apply_convertor(value: Value, convertor: DataConvertor) -> float:
if pd.isna(value):
return np.nan
Expand All @@ -203,7 +279,11 @@ def apply_convertors(convertors: list[DataConvertor], raw_data: pd.DataFrame) ->
for column, convertor in zip(raw_data.columns, convertors)
]

return pd.DataFrame(dict(zip(raw_data.columns, converted_columns)), copy=False)
possibly_normalized_columns = [
_normalize(column, convertor.scaler) for column, convertor in zip(converted_columns, convertors)
]

return pd.DataFrame(dict(zip(raw_data.columns, possibly_normalized_columns)), copy=False)


def generate_microdata(
Expand Down
12 changes: 6 additions & 6 deletions tests/clustering/test_measures.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@ def test_measure_all() -> None:
np.round(measures.dependency_matrix, 2),
np.array(
[
[1.00, 0.21, 0.18, 0.02, 0.05],
[0.21, 1.00, 0.17, 0.02, 0.04],
[0.18, 0.17, 1.00, 0.04, 0.07],
[0.02, 0.02, 0.04, 1.00, 0.01],
[0.05, 0.04, 0.07, 0.01, 1.00],
[1.00, 0.2, 0.15, 0.02, 0.05],
[0.2, 1.00, 0.18, 0.02, 0.05],
[0.15, 0.18, 1.00, 0.04, 0.07],
[0.02, 0.02, 0.04, 1.00, 0],
[0.05, 0.05, 0.07, 0, 1.00],
]
),
)

assert np.array_equal(
np.round(measures.entropy_1dim, 3),
np.array([15.992, 15.987, 5.453, 0.118, 1.350]),
np.array([16.042, 15.886, 5.398, 0.118, 1.350]),
)
30 changes: 15 additions & 15 deletions tests/data/tree.0.json
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
{
"ranges": [[0.0, 8.0]],
"ranges": [[0.0, 1.0]],
"count": 32,
"children": {
"0": {
"ranges": [[0.0, 4.0]],
"ranges": [[0.0, 0.5]],
"count": 16,
"children": {
"0": {
"ranges": [[0.0, 2.0]],
"ranges": [[0.0, 0.25]],
"count": 8,
"children": {
"0": {
"ranges": [[0.0, 1.0]],
"ranges": [[0.0, 0.125]],
"count": 4,
"children": null
},
"1": {
"ranges": [[1.0, 2.0]],
"ranges": [[0.125, 0.25]],
"count": 4,
"children": null
}
}
},
"1": {
"ranges": [[2.0, 4.0]],
"ranges": [[0.25, 0.5]],
"count": 8,
"children": {
"0": {
"ranges": [[2.0, 3.0]],
"ranges": [[0.25, 0.375]],
"count": 4,
"children": null
},
"1": {
"ranges": [[3.0, 4.0]],
"ranges": [[0.375, 0.5]],
"count": 4,
"children": null
}
Expand All @@ -41,36 +41,36 @@
}
},
"1": {
"ranges": [[4.0, 8.0]],
"ranges": [[0.5, 1.0]],
"count": 16,
"children": {
"0": {
"ranges": [[4.0, 6.0]],
"ranges": [[0.5, 0.75]],
"count": 8,
"children": {
"0": {
"ranges": [[4.0, 5.0]],
"ranges": [[0.5, 0.625]],
"count": 4,
"children": null
},
"1": {
"ranges": [[5.0, 6.0]],
"ranges": [[0.625, 0.75]],
"count": 4,
"children": null
}
}
},
"1": {
"ranges": [[6.0, 8.0]],
"ranges": [[0.75, 1.0]],
"count": 8,
"children": {
"0": {
"ranges": [[6.0, 7.0]],
"ranges": [[0.75, 0.875]],
"count": 4,
"children": null
},
"1": {
"ranges": [[7.0, 8.0]],
"ranges": [[0.875, 1.0]],
"count": 4,
"children": null
}
Expand Down
Loading