From dcd9cb286f32fc82058601a0daab1b52df241e76 Mon Sep 17 00:00:00 2001 From: Tomic Riedel Date: Tue, 6 Jan 2026 14:52:00 +0100 Subject: [PATCH 01/25] feat: add data profiling utilities for single-column cardinality metrics --- metis/utils/data_profiling/__init__.py | 10 ++ .../cardinalities/distinct_values.py | 13 +++ .../cardinalities/null_values.py | 28 ++++++ .../single_column/cardinalities/row_count.py | 16 ++++ .../single_column/cardinalities/uniqueness.py | 22 +++++ .../cardinalities/value_length.py | 96 +++++++++++++++++++ 6 files changed, 185 insertions(+) create mode 100644 metis/utils/data_profiling/__init__.py create mode 100644 metis/utils/data_profiling/single_column/cardinalities/distinct_values.py create mode 100644 metis/utils/data_profiling/single_column/cardinalities/null_values.py create mode 100644 metis/utils/data_profiling/single_column/cardinalities/row_count.py create mode 100644 metis/utils/data_profiling/single_column/cardinalities/uniqueness.py create mode 100644 metis/utils/data_profiling/single_column/cardinalities/value_length.py diff --git a/metis/utils/data_profiling/__init__.py b/metis/utils/data_profiling/__init__.py new file mode 100644 index 0000000..063b6d8 --- /dev/null +++ b/metis/utils/data_profiling/__init__.py @@ -0,0 +1,10 @@ +from .single_column.cardinalities.distinct_values import distinct_count +from .single_column.cardinalities.null_values import null_count, null_percentage +from .single_column.cardinalities.row_count import row_count +from .single_column.cardinalities.uniqueness import uniqueness +from .single_column.cardinalities.value_length import ( + value_length_max, + value_length_mean, + value_length_median, + value_length_min, +) diff --git a/metis/utils/data_profiling/single_column/cardinalities/distinct_values.py b/metis/utils/data_profiling/single_column/cardinalities/distinct_values.py new file mode 100644 index 0000000..df74e11 --- /dev/null +++ b/metis/utils/data_profiling/single_column/cardinalities/distinct_values.py @@ -0,0 +1,13 @@ +from typing import Union +import pandas as pd + + +def distinct_count(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: + """ + Count the number of distinct (unique) values, excluding nulls. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Number of distinct values as int if Series input, Series of ints if DataFrame input. + """ + result = data.nunique() + return int(result) if isinstance(data, pd.Series) else result \ No newline at end of file diff --git a/metis/utils/data_profiling/single_column/cardinalities/null_values.py b/metis/utils/data_profiling/single_column/cardinalities/null_values.py new file mode 100644 index 0000000..99c6a06 --- /dev/null +++ b/metis/utils/data_profiling/single_column/cardinalities/null_values.py @@ -0,0 +1,28 @@ + +from typing import Union +import pandas as pd + + +def null_count(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: + """ + Count the number of null/missing values. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Number of null values as int if Series input, Series of ints if DataFrame input. + """ + result = data.isna().sum() + return int(result) if isinstance(data, pd.Series) else result + + +def null_percentage(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: + """ + Calculate the percentage of null/missing values. + + :param data: Input Series (single column) or DataFrame (multiple columns). 
+ :return: Percentage of null values (0-100) as float if Series input, Series of floats if DataFrame input. + """ + if len(data) == 0: + return 0.0 if isinstance(data, pd.Series) else pd.Series(dtype=float) + + result = (data.isna().sum() / len(data) * 100) + return float(result) if isinstance(data, pd.Series) else result \ No newline at end of file diff --git a/metis/utils/data_profiling/single_column/cardinalities/row_count.py b/metis/utils/data_profiling/single_column/cardinalities/row_count.py new file mode 100644 index 0000000..fd8c9f0 --- /dev/null +++ b/metis/utils/data_profiling/single_column/cardinalities/row_count.py @@ -0,0 +1,16 @@ +from typing import Union +import pandas as pd + + +def row_count(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: + """ + Count the total number of rows (including null values). + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Total number of rows as int for Series input, Series of ints for DataFrame input. + """ + if isinstance(data, pd.Series): + return int(len(data)) + else: + # Each column has the same row count (length of the dataframe) + return pd.Series({col: len(data) for col in data.columns}) diff --git a/metis/utils/data_profiling/single_column/cardinalities/uniqueness.py b/metis/utils/data_profiling/single_column/cardinalities/uniqueness.py new file mode 100644 index 0000000..6b199c2 --- /dev/null +++ b/metis/utils/data_profiling/single_column/cardinalities/uniqueness.py @@ -0,0 +1,22 @@ +from typing import Union +import pandas as pd + + +def uniqueness(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: + """ + Calculate uniqueness as the ratio of distinct values to total rows. + Uniqueness = distinct_count / row_count + + Null values are excluded from the distinct count, meaning a column with all + null values will have uniqueness of 0.0. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Uniqueness ratio (0.0 to 1.0) as float if Series input, Series of floats if DataFrame input. + """ + if len(data) == 0: + return 0.0 if isinstance(data, pd.Series) else pd.Series(dtype=float) + + if isinstance(data, pd.Series): + return float(data.nunique() / len(data)) + else: + return data.nunique() / len(data) diff --git a/metis/utils/data_profiling/single_column/cardinalities/value_length.py b/metis/utils/data_profiling/single_column/cardinalities/value_length.py new file mode 100644 index 0000000..f1ec5d4 --- /dev/null +++ b/metis/utils/data_profiling/single_column/cardinalities/value_length.py @@ -0,0 +1,96 @@ +from typing import Union +import pandas as pd + + +def _get_string_lengths(series: pd.Series) -> pd.Series: + """ + Convert values to strings and calculate their lengths, excluding nulls. + + :param series: Input Series. + :return: Series containing the character lengths of non-null values. + """ + return series.dropna().astype(str).str.len() + + +def value_length_min(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: + """ + Calculate minimum value length in characters. + + All values are converted to their string representation and the minimum + character length is returned. Null values are excluded. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Minimum character length as int if Series input, Series of ints if DataFrame input. 
+ """ + if isinstance(data, pd.Series): + lengths = _get_string_lengths(data) + return int(lengths.min()) if not lengths.empty else 0 + else: + result = pd.Series(dtype=int, index=data.columns) + for col in data.columns: + lengths = _get_string_lengths(data[col]) + result[col] = int(lengths.min()) if not lengths.empty else 0 + return result + + +def value_length_max(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: + """ + Calculate maximum value length in characters. + + All values are converted to their string representation and the maximum + character length is returned. Null values are excluded. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Maximum character length as int if Series input, Series of ints if DataFrame input. + """ + if isinstance(data, pd.Series): + lengths = _get_string_lengths(data) + return int(lengths.max()) if not lengths.empty else 0 + else: + result = pd.Series(dtype=int, index=data.columns) + for col in data.columns: + lengths = _get_string_lengths(data[col]) + result[col] = int(lengths.max()) if not lengths.empty else 0 + return result + + +def value_length_mean(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: + """ + Calculate mean value length in characters. + + All values are converted to their string representation and the mean + character length is returned. Null values are excluded. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Mean character length as float if Series input, Series of floats if DataFrame input. + """ + if isinstance(data, pd.Series): + lengths = _get_string_lengths(data) + return float(lengths.mean()) if not lengths.empty else 0.0 + else: + result = pd.Series(dtype=float, index=data.columns) + for col in data.columns: + lengths = _get_string_lengths(data[col]) + result[col] = float(lengths.mean()) if not lengths.empty else 0.0 + return result + + +def value_length_median(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: + """ + Calculate median value length in characters. + + All values are converted to their string representation and the median + character length is returned. Null values are excluded. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Median character length as float if Series input, Series of floats if DataFrame input. + """ + if isinstance(data, pd.Series): + lengths = _get_string_lengths(data) + return float(lengths.median()) if not lengths.empty else 0.0 + else: + result = pd.Series(dtype=float, index=data.columns) + for col in data.columns: + lengths = _get_string_lengths(data[col]) + result[col] = float(lengths.median()) if not lengths.empty else 0.0 + return result \ No newline at end of file From 8342e3face8b4aee53586faaaa36fccef29aaa88 Mon Sep 17 00:00:00 2001 From: Tomic Riedel Date: Tue, 6 Jan 2026 15:28:28 +0100 Subject: [PATCH 02/25] fix: use dict-based Series construction for int dtype columns pd.Series(dtype=int, index=...) initializes with NaN which forces float64 conversion. Building a dict first preserves int64 dtype. 
--- .../single_column/cardinalities/value_length.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/metis/utils/data_profiling/single_column/cardinalities/value_length.py b/metis/utils/data_profiling/single_column/cardinalities/value_length.py index f1ec5d4..2b84b9a 100644 --- a/metis/utils/data_profiling/single_column/cardinalities/value_length.py +++ b/metis/utils/data_profiling/single_column/cardinalities/value_length.py @@ -26,11 +26,11 @@ def value_length_min(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Seri lengths = _get_string_lengths(data) return int(lengths.min()) if not lengths.empty else 0 else: - result = pd.Series(dtype=int, index=data.columns) + result = {} for col in data.columns: lengths = _get_string_lengths(data[col]) result[col] = int(lengths.min()) if not lengths.empty else 0 - return result + return pd.Series(result) def value_length_max(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: @@ -47,11 +47,11 @@ def value_length_max(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Seri lengths = _get_string_lengths(data) return int(lengths.max()) if not lengths.empty else 0 else: - result = pd.Series(dtype=int, index=data.columns) + result = {} for col in data.columns: lengths = _get_string_lengths(data[col]) result[col] = int(lengths.max()) if not lengths.empty else 0 - return result + return pd.Series(result) def value_length_mean(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: @@ -68,11 +68,11 @@ def value_length_mean(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.S lengths = _get_string_lengths(data) return float(lengths.mean()) if not lengths.empty else 0.0 else: - result = pd.Series(dtype=float, index=data.columns) + result = {} for col in data.columns: lengths = _get_string_lengths(data[col]) result[col] = float(lengths.mean()) if not lengths.empty else 0.0 - return result + return pd.Series(result) def value_length_median(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: @@ -89,8 +89,8 @@ def value_length_median(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd lengths = _get_string_lengths(data) return float(lengths.median()) if not lengths.empty else 0.0 else: - result = pd.Series(dtype=float, index=data.columns) + result = {} for col in data.columns: lengths = _get_string_lengths(data[col]) result[col] = float(lengths.median()) if not lengths.empty else 0.0 - return result \ No newline at end of file + return pd.Series(result) \ No newline at end of file From 8329cafdd8bd7e62068b5bbda3c5991f34f214ad Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Wed, 14 Jan 2026 09:44:40 +0100 Subject: [PATCH 03/25] feat: add value distribution metrics --- metis/utils/data_profiling/__init__.py | 13 ++ .../value_distribution/constancy.py | 76 ++++++++++++ .../value_distribution/histogram.py | 116 ++++++++++++++++++ .../value_distribution/quartiles.py | 83 +++++++++++++ 4 files changed, 288 insertions(+) create mode 100644 metis/utils/data_profiling/single_column/value_distribution/constancy.py create mode 100644 metis/utils/data_profiling/single_column/value_distribution/histogram.py create mode 100644 metis/utils/data_profiling/single_column/value_distribution/quartiles.py diff --git a/metis/utils/data_profiling/__init__.py b/metis/utils/data_profiling/__init__.py index 063b6d8..ef4d367 100644 --- a/metis/utils/data_profiling/__init__.py +++ b/metis/utils/data_profiling/__init__.py @@ -8,3 +8,16 @@ value_length_median, value_length_min, ) + +from 
.single_column.value_distribution.histogram import ( + equi_width_histogram, + equi_depth_histogram, +) +from .single_column.value_distribution.constancy import ( + constancy, + most_frequent_value, +) +from .single_column.value_distribution.quartiles import ( + quartiles, + interquartile_range, +) diff --git a/metis/utils/data_profiling/single_column/value_distribution/constancy.py b/metis/utils/data_profiling/single_column/value_distribution/constancy.py new file mode 100644 index 0000000..7a81945 --- /dev/null +++ b/metis/utils/data_profiling/single_column/value_distribution/constancy.py @@ -0,0 +1,76 @@ +from typing import Any, Union +import pandas as pd + + +def constancy(data: Union[pd.Series, pd.DataFrame], include_nulls: bool = False) -> Union[float, pd.Series]: + """ + Calculate constancy as the ratio of the most frequent value's frequency + to the total number of values. + + Represents the proportion of some constant value compared with the entire + column. High constancy suggests a column with low variability or a dominant + default value. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :param include_nulls: If True, use total row count as denominator (paper definition). + If False (default), use non-null count as denominator. + :return: Constancy ratio (0.0 to 1.0) as float if Series input, Series of + floats if DataFrame input. Returns 0.0 for empty or all-null data. + """ + if isinstance(data, pd.Series): + clean_data = data.dropna() + + if len(clean_data) == 0: + return 0.0 + + max_frequency = clean_data.value_counts().max() + denominator = len(data) if include_nulls else len(clean_data) + + return float(max_frequency / denominator) + else: + result = {} + for col in data.columns: + clean_data = data[col].dropna() + + if len(clean_data) == 0: + result[col] = 0.0 + continue + + max_frequency = clean_data.value_counts().max() + denominator = len(data) if include_nulls else len(clean_data) + + result[col] = float(max_frequency / denominator) + + return pd.Series(result) + + +def most_frequent_value(data: Union[pd.Series, pd.DataFrame]) -> Union[Any, pd.Series]: + """ + Return the most frequent value in the data. + + If multiple values have the same maximum frequency, returns the one that + comes first in sorted order. Null values are excluded. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Most frequent value if Series input, Series of most frequent values + if DataFrame input. Returns None for empty or all-null data. 
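+
+    Illustrative example of the tie-break: ``'a'`` and ``'b'`` are tied
+    at two occurrences each, so the value that sorts first is returned::
+
+        >>> import pandas as pd
+        >>> most_frequent_value(pd.Series(["b", "a", "b", "a", None]))
+        'a'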
+ """ + if isinstance(data, pd.Series): + clean_data = data.dropna() + + if len(clean_data) == 0: + return None + + return clean_data.mode().iloc[0] + else: + result = {} + for col in data.columns: + clean_data = data[col].dropna() + + if len(clean_data) == 0: + result[col] = None + continue + + result[col] = clean_data.mode().iloc[0] + + return pd.Series(result) \ No newline at end of file diff --git a/metis/utils/data_profiling/single_column/value_distribution/histogram.py b/metis/utils/data_profiling/single_column/value_distribution/histogram.py new file mode 100644 index 0000000..fb1bc09 --- /dev/null +++ b/metis/utils/data_profiling/single_column/value_distribution/histogram.py @@ -0,0 +1,116 @@ +from typing import Union, Dict, List, Tuple +import pandas as pd +import numpy as np + + +def equi_width_histogram( + data: Union[pd.Series, pd.DataFrame], bins: int = 10 +) -> Union[Dict[str, Union[List[Tuple[float, float]], List[int]]], Dict[str, Dict[str, Union[List[Tuple[float, float]], List[int]]]]]: + """ + Create an equi-width histogram where buckets span value ranges of same length. + + Divides the range of values into bins of equal width and counts frequencies + in each bin. Only works with numeric data. Non-numeric columns are skipped. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :param bins: Number of bins to create (default: 10). + :return: Dictionary containing 'bin_edges' (list of tuples with min/max of each bin) + and 'frequencies' (counts per bin). For DataFrame input, returns dict of + column names to their histogram dicts. + """ + if isinstance(data, pd.Series): + clean_data = data.dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + return {"bin_edges": [], "frequencies": []} + + counts, bin_edges = np.histogram(clean_data, bins=bins) + bin_ranges = [(float(bin_edges[i]), float(bin_edges[i + 1])) for i in range(len(bin_edges) - 1)] + + return { + "bin_edges": bin_ranges, + "frequencies": [int(c) for c in counts] + } + else: + result = {} + for col in data.columns: + clean_data = data[col].dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + result[col] = {"bin_edges": [], "frequencies": []} + continue + + counts, bin_edges = np.histogram(clean_data, bins=bins) + bin_ranges = [(float(bin_edges[i]), float(bin_edges[i + 1])) for i in range(len(bin_edges) - 1)] + + result[col] = { + "bin_edges": bin_ranges, + "frequencies": [int(c) for c in counts] + } + + return result + + +def equi_depth_histogram( + data: Union[pd.Series, pd.DataFrame], bins: int = 10 +) -> Union[Dict[str, Union[List[Tuple[float, float]], List[int]]], Dict[str, Dict[str, Union[List[Tuple[float, float]], List[int]]]]]: + """ + Create an equi-depth (equi-height) histogram where each bucket represents + approximately the same number of value occurrences. + + Uses quantiles to determine bin boundaries such that each bin contains roughly + the same number of values. Only works with numeric data. Non-numeric columns are skipped. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :param bins: Number of bins to create (default: 10). + :return: Dictionary containing 'bin_edges' (list of tuples with min/max of each bin) + and 'frequencies' (counts per bin). For DataFrame input, returns dict of + column names to their histogram dicts. 
+ """ + if isinstance(data, pd.Series): + clean_data = data.dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + return {"bin_edges": [], "frequencies": []} + + quantiles = np.linspace(0, 1, bins + 1) + bin_edges = clean_data.quantile(quantiles).values + bin_edges = np.unique(bin_edges) + + if len(bin_edges) <= 1: + return {"bin_edges": [], "frequencies": []} + + counts, _ = np.histogram(clean_data, bins=bin_edges) + bin_ranges = [(float(bin_edges[i]), float(bin_edges[i + 1])) for i in range(len(bin_edges) - 1)] + + return { + "bin_edges": bin_ranges, + "frequencies": [int(c) for c in counts] + } + else: + result = {} + for col in data.columns: + clean_data = data[col].dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + result[col] = {"bin_edges": [], "frequencies": []} + continue + + quantiles = np.linspace(0, 1, bins + 1) + bin_edges = clean_data.quantile(quantiles).values + bin_edges = np.unique(bin_edges) + + if len(bin_edges) <= 1: + result[col] = {"bin_edges": [], "frequencies": []} + continue + + counts, _ = np.histogram(clean_data, bins=bin_edges) + bin_ranges = [(float(bin_edges[i]), float(bin_edges[i + 1])) for i in range(len(bin_edges) - 1)] + + result[col] = { + "bin_edges": bin_ranges, + "frequencies": [int(c) for c in counts] + } + + return result + diff --git a/metis/utils/data_profiling/single_column/value_distribution/quartiles.py b/metis/utils/data_profiling/single_column/value_distribution/quartiles.py new file mode 100644 index 0000000..30b382b --- /dev/null +++ b/metis/utils/data_profiling/single_column/value_distribution/quartiles.py @@ -0,0 +1,83 @@ +from typing import Union, Dict +import pandas as pd + + +def quartiles(data: Union[pd.Series, pd.DataFrame]) -> Union[Dict[str, float], Dict[str, Dict[str, float]]]: + """ + Calculate quartiles (Q1, Q2/median, Q3) that divide numeric values into + four equal groups. + + A special case of equi-depth histogram with exactly four buckets. Only works + with numeric data. For non-numeric columns, returns None values. Null values + are excluded from the calculation. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Dictionary with keys 'Q1', 'Q2', 'Q3' containing the quartile values + as floats. For DataFrame input, returns dict of column names to their + quartile dicts. Returns None for quartiles if data is non-numeric or empty. + """ + if isinstance(data, pd.Series): + clean_data = data.dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + return {"Q1": None, "Q2": None, "Q3": None} + + q1 = float(clean_data.quantile(0.25)) + q2 = float(clean_data.quantile(0.50)) + q3 = float(clean_data.quantile(0.75)) + + return {"Q1": q1, "Q2": q2, "Q3": q3} + else: + result = {} + for col in data.columns: + clean_data = data[col].dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + result[col] = {"Q1": None, "Q2": None, "Q3": None} + continue + + q1 = float(clean_data.quantile(0.25)) + q2 = float(clean_data.quantile(0.50)) + q3 = float(clean_data.quantile(0.75)) + + result[col] = {"Q1": q1, "Q2": q2, "Q3": q3} + + return result + + +def interquartile_range(data: Union[pd.Series, pd.DataFrame]) -> Union[float, pd.Series]: + """ + Calculate the interquartile range (IQR = Q3 - Q1). + + The IQR represents the range of the middle 50% of the data and is useful + for detecting outliers. Only works with numeric data. Null values are excluded. 
+ + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: IQR as float if Series input, Series of floats if DataFrame input. + Returns None for non-numeric or empty data. + """ + if isinstance(data, pd.Series): + clean_data = data.dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + return None + + q1 = clean_data.quantile(0.25) + q3 = clean_data.quantile(0.75) + + return float(q3 - q1) + else: + result = {} + for col in data.columns: + clean_data = data[col].dropna() + + if len(clean_data) == 0 or not pd.api.types.is_numeric_dtype(clean_data): + result[col] = None + continue + + q1 = clean_data.quantile(0.25) + q3 = clean_data.quantile(0.75) + + result[col] = float(q3 - q1) + + return pd.Series(result) From 22c9ee070968809c45b630289a8d6a321ae7e0f7 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Wed, 14 Jan 2026 09:45:57 +0100 Subject: [PATCH 04/25] feat: add patterns and data types profiling tasks --- metis/utils/data_profiling/__init__.py | 5 + .../patterns_and_data_types/basic_type.py | 78 +++++++++++ .../patterns_and_data_types/data_type.py | 90 ++++++++++++ .../numeric_precision.py | 128 ++++++++++++++++++ .../patterns_and_data_types/patterns.py | 94 +++++++++++++ 5 files changed, 395 insertions(+) create mode 100644 metis/utils/data_profiling/single_column/patterns_and_data_types/basic_type.py create mode 100644 metis/utils/data_profiling/single_column/patterns_and_data_types/data_type.py create mode 100644 metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py create mode 100644 metis/utils/data_profiling/single_column/patterns_and_data_types/patterns.py diff --git a/metis/utils/data_profiling/__init__.py b/metis/utils/data_profiling/__init__.py index ef4d367..4f45962 100644 --- a/metis/utils/data_profiling/__init__.py +++ b/metis/utils/data_profiling/__init__.py @@ -21,3 +21,8 @@ quartiles, interquartile_range, ) + +from .single_column.patterns_and_data_types.basic_type import basic_type +from .single_column.patterns_and_data_types.data_type import data_type +from .single_column.patterns_and_data_types.numeric_precision import size, decimals +from .single_column.patterns_and_data_types.patterns import patterns diff --git a/metis/utils/data_profiling/single_column/patterns_and_data_types/basic_type.py b/metis/utils/data_profiling/single_column/patterns_and_data_types/basic_type.py new file mode 100644 index 0000000..c52d7e7 --- /dev/null +++ b/metis/utils/data_profiling/single_column/patterns_and_data_types/basic_type.py @@ -0,0 +1,78 @@ +from typing import Union +import pandas as pd + + +def _infer_basic_type(series: pd.Series) -> str: + """ + Infer the basic type of a Series by examining its values. + + :param series: Input Series. + :return: One of: 'numeric', 'alphabetic', 'alphanumeric', 'date', 'time', 'mixed', 'empty'. 
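+
+    Illustrative example: numeric strings are recognized by pattern
+    matching, so the dtype itself does not need to be numeric::
+
+        >>> import pandas as pd
+        >>> _infer_basic_type(pd.Series(["12.5", "-3", "7"]))
+        'numeric'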
+ """ + clean_data = series.dropna() + + if len(clean_data) == 0: + return "empty" + + if pd.api.types.is_numeric_dtype(clean_data): + return "numeric" + + if pd.api.types.is_datetime64_any_dtype(clean_data): + return "date" + + sample_size = min(100, len(clean_data)) + sample = clean_data.sample(n=sample_size, random_state=42).astype(str) + + datetime_pattern = r'^\d{1,4}[-/]\d{1,2}[-/]\d{1,4}([T\s]\d{1,2}:\d{2}(:\d{2})?)?$' + is_datetime = sample.str.match(datetime_pattern, na=False).mean() > 0.8 + + time_pattern = r'^\d{1,2}:\d{2}(:\d{2})?(\s?[AaPp][Mm])?$' + is_time = sample.str.match(time_pattern, na=False).mean() > 0.8 + + if is_datetime: + return "date" + if is_time: + return "time" + + # Scientific notation pattern + scientific_pattern = r'^-?\d+\.?\d*[eE][+-]?\d+$' + is_scientific = sample.str.match(scientific_pattern, na=False).mean() > 0.8 + + # General numeric string pattern (integers, decimals, negatives) + numeric_string_pattern = r'^-?\d+\.?\d*$' + is_numeric_string = sample.str.match(numeric_string_pattern, na=False).mean() > 0.8 + + if is_scientific or is_numeric_string: + return "numeric" + + has_digit = sample.str.contains(r'\d', regex=True, na=False).any() + has_alpha = sample.str.contains(r'[a-zA-Z]', regex=True, na=False).any() + + if has_digit and has_alpha: + return "alphanumeric" + elif has_alpha and not has_digit: + return "alphabetic" + else: + return "mixed" + + +def basic_type(data: Union[pd.Series, pd.DataFrame]) -> Union[str, pd.Series]: + """ + Classify a column as numeric, alphabetic, alphanumeric, date, or time. + + Determined by examining the presence or absence of numeric and non-numeric + characters. Date and time are recognized by numbers within certain ranges + and numbers separated in regular patterns by special symbols. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Basic type as string if Series input ('numeric', 'alphabetic', + 'alphanumeric', 'date', 'time', 'mixed', 'empty'), Series of type + strings if DataFrame input. + """ + if isinstance(data, pd.Series): + return _infer_basic_type(data) + else: + result = {} + for col in data.columns: + result[col] = _infer_basic_type(data[col]) + return pd.Series(result) diff --git a/metis/utils/data_profiling/single_column/patterns_and_data_types/data_type.py b/metis/utils/data_profiling/single_column/patterns_and_data_types/data_type.py new file mode 100644 index 0000000..9b4ddfc --- /dev/null +++ b/metis/utils/data_profiling/single_column/patterns_and_data_types/data_type.py @@ -0,0 +1,90 @@ +from typing import Union +import pandas as pd + + +def _infer_sql_type(series: pd.Series) -> str: + """ + Infer the most specific SQL data type for a Series. + + :param series: Input Series. + :return: SQL type name as string. 
+ """ + clean_data = series.dropna() + + if len(clean_data) == 0: + return "varchar" + + if pd.api.types.is_bool_dtype(clean_data): + return "boolean" + + if pd.api.types.is_datetime64_any_dtype(clean_data): + return "timestamp" + + if pd.api.types.is_integer_dtype(clean_data): + max_val = clean_data.abs().max() + if max_val <= 32767: + return "smallint" + elif max_val <= 2147483647: + return "int" + elif max_val <= 9223372036854775807: + return "bigint" + else: + return "numeric" + + if pd.api.types.is_float_dtype(clean_data): + return "double" + + sample_size = min(100, len(clean_data)) + sample = clean_data.sample(n=sample_size, random_state=42).astype(str) + + date_pattern = r'^\d{4}-\d{2}-\d{2}$' + if sample.str.match(date_pattern, na=False).mean() > 0.8: + return "date" + + time_pattern = r'^\d{2}:\d{2}:\d{2}$' + if sample.str.match(time_pattern, na=False).mean() > 0.8: + return "time" + + timestamp_pattern = r'^\d{4}-\d{2}-\d{2}[T\s]\d{2}:\d{2}:\d{2}$' + if sample.str.match(timestamp_pattern, na=False).mean() > 0.8: + return "timestamp" + + try: + numeric_converted = pd.to_numeric(clean_data, errors='coerce') + valid_numeric = numeric_converted.dropna() + if len(valid_numeric) > 0 and numeric_converted.notna().mean() > 0.8: + if (valid_numeric % 1 == 0).all(): + return "int" + else: + return "double" + except Exception: + pass + + max_length = clean_data.astype(str).str.len().max() + if max_length <= 255: + return "varchar" + else: + return "text" + +def data_type(data: Union[pd.Series, pd.DataFrame]) -> Union[str, pd.Series]: + """ + Infer the concrete DBMS-specific data type (SQL types). + + Data of many types must follow fixed, sometimes DBMS-specific patterns. + When classifying, chooses the most specific data type possible, avoiding + catchalls like char or varchar if possible. + + Possible return values: 'boolean', 'smallint', 'int', 'bigint', 'numeric', + 'double', 'date', 'time', 'timestamp', 'varchar', 'text'. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: SQL type as string if Series input, Series of type strings if + DataFrame input. + """ + if isinstance(data, pd.Series): + return _infer_sql_type(data) + else: + result = {} + for col in data.columns: + result[col] = _infer_sql_type(data[col]) + return pd.Series(result) diff --git a/metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py b/metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py new file mode 100644 index 0000000..64f9101 --- /dev/null +++ b/metis/utils/data_profiling/single_column/patterns_and_data_types/numeric_precision.py @@ -0,0 +1,128 @@ +from typing import Union +from decimal import Decimal, InvalidOperation +import pandas as pd + + +def _to_decimal_string(val: float) -> str: + """Convert float to full decimal string without scientific notation.""" + try: + d = Decimal(str(val)) + sign, digits, exponent = d.as_tuple() + + if exponent >= 0: + return ''.join(str(d) for d in digits) + '0' * exponent + else: + digits_str = ''.join(str(d) for d in digits) + if len(digits_str) <= -exponent: + digits_str = '0' * (-exponent - len(digits_str) + 1) + digits_str + decimal_pos = len(digits_str) + exponent + return digits_str[:decimal_pos] + '.' + digits_str[decimal_pos:] + except (InvalidOperation, ValueError): + return str(val) + + +def _calculate_size(series: pd.Series) -> int: + """ + Calculate the maximum number of digits for numeric values. + + :param series: Input Series. 
+ :return: Maximum number of digits. + """ + clean_data = series.dropna() + + if len(clean_data) == 0: + return 0 + + if not pd.api.types.is_numeric_dtype(clean_data): + try: + clean_data = pd.to_numeric(clean_data, errors='coerce').dropna() + if len(clean_data) == 0: + return 0 + except Exception: + return 0 + + max_digits = 0 + for val in clean_data: + try: + val_str = _to_decimal_string(abs(val)) + val_str = val_str.replace('.', '') + max_digits = max(max_digits, len(val_str)) + except Exception: + continue + + return max_digits + + +def _calculate_decimals(series: pd.Series) -> int: + """ + Calculate the maximum number of decimal places for numeric values. + + :param series: Input Series. + :return: Maximum number of decimal places. + """ + clean_data = series.dropna() + + if len(clean_data) == 0: + return 0 + + if not pd.api.types.is_numeric_dtype(clean_data): + try: + clean_data = pd.to_numeric(clean_data, errors='coerce').dropna() + if len(clean_data) == 0: + return 0 + except Exception: + return 0 + + max_decimals = 0 + for val in clean_data: + try: + val_str = _to_decimal_string(val) + if '.' in val_str: + decimal_part = val_str.split('.')[1].rstrip('0') + max_decimals = max(max_decimals, len(decimal_part)) + except Exception: + continue + + return max_decimals + + +def size(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: + """ + Extract the maximum number of digits for decimal, float, and double data types. + + For numeric columns, determines the maximum total number of digits (excluding + decimal point and sign). Used to determine appropriate data type bounds. + Returns 0 for non-numeric data. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Maximum number of digits as int if Series input, Series of ints if + DataFrame input. + """ + if isinstance(data, pd.Series): + return _calculate_size(data) + else: + result = {} + for col in data.columns: + result[col] = _calculate_size(data[col]) + return pd.Series(result) + + +def decimals(data: Union[pd.Series, pd.DataFrame]) -> Union[int, pd.Series]: + """ + Extract the maximum number of decimal places for decimal, float, and double data types. + + For numeric columns with fractional parts, determines the maximum number of + digits after the decimal point. Used alongside size to determine appropriate + precision specifications. Returns 0 for integer or non-numeric data. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Maximum number of decimal places as int if Series input, Series of + ints if DataFrame input. + """ + if isinstance(data, pd.Series): + return _calculate_decimals(data) + else: + result = {} + for col in data.columns: + result[col] = _calculate_decimals(data[col]) + return pd.Series(result) \ No newline at end of file diff --git a/metis/utils/data_profiling/single_column/patterns_and_data_types/patterns.py b/metis/utils/data_profiling/single_column/patterns_and_data_types/patterns.py new file mode 100644 index 0000000..1ab11c5 --- /dev/null +++ b/metis/utils/data_profiling/single_column/patterns_and_data_types/patterns.py @@ -0,0 +1,94 @@ +from typing import Union, List, Dict +import pandas as pd +from collections import Counter + + +def _extract_pattern(value: str) -> str: + """ + Extract a pattern representation from a string value. + + Pattern codes: A = uppercase letter, a = lowercase letter, 9 = digit, + # = special character, space = space, ? = other letter (e.g., CJK). + + :param value: Input string. 
+ :return: Pattern representation. + """ + pattern = [] + for char in value: + if char.isupper(): + pattern.append('A') + elif char.islower(): + pattern.append('a') + elif char.isdigit(): + pattern.append('9') + elif char.isspace(): + pattern.append(' ') + elif char.isalpha(): + pattern.append('?') + else: + pattern.append('#') + return ''.join(pattern) + + +def _get_top_patterns(series: pd.Series, top_n: int = 5) -> List[Dict[str, Union[str, int, float]]]: + """ + Extract the most frequent patterns from a Series. + + :param series: Input Series. + :param top_n: Number of top patterns to return. + :return: List of dictionaries containing pattern, count, and frequency. + """ + clean_data = series.dropna().astype(str) + + if len(clean_data) == 0: + return [] + + pattern_list = [_extract_pattern(val) for val in clean_data] + pattern_counts = Counter(pattern_list) + + total = len(pattern_list) + top_patterns = [] + + for pattern, count in pattern_counts.most_common(top_n): + top_patterns.append({ + "pattern": pattern, + "count": count, + "frequency": float(count / total) + }) + + return top_patterns + + +def patterns( + data: Union[pd.Series, pd.DataFrame], top_n: int = 5 +) -> Union[List[Dict[str, Union[str, int, float]]], Dict[str, List[Dict[str, Union[str, int, float]]]]]: + """ + Extract frequent patterns observed in the data of a column. + + Patterns are expressed using pattern codes where: + - A = uppercase letter + - a = lowercase letter + - 9 = digit + - # = special character + - ? = other letter (e.g., CJK characters) + - (space) = space + + Example: "John123" -> "Aaaa999", "+1 (555) 123-4567" -> "#9 (#99) 999-9999" + + Data that does not conform to discovered patterns is likely erroneous or + ill-formed. Returns the top N most frequent patterns with their counts + and frequencies. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :param top_n: Number of top patterns to return per column (default: 5). + :return: List of pattern dictionaries if Series input, dict of column names + to pattern lists if DataFrame input. Each pattern dict contains + 'pattern' (str), 'count' (int), and 'frequency' (float). 
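+
+    Illustrative example with three ``AA#99`` codes and one ``a9`` value::
+
+        >>> import pandas as pd
+        >>> patterns(pd.Series(["AB-12", "CD-34", "EF-56", "x9"]), top_n=2)
+        [{'pattern': 'AA#99', 'count': 3, 'frequency': 0.75}, {'pattern': 'a9', 'count': 1, 'frequency': 0.25}]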
+ """ + if isinstance(data, pd.Series): + return _get_top_patterns(data, top_n) + else: + result = {} + for col in data.columns: + result[col] = _get_top_patterns(data[col], top_n) + return result \ No newline at end of file From 213a506b8165134f9fb9546b3ba8d600d9ca2b30 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Wed, 14 Jan 2026 09:46:36 +0100 Subject: [PATCH 05/25] feat: add summaries and sketches profiling tasks --- metis/utils/data_profiling/__init__.py | 9 +++ .../jaccard_similarity.py | 65 +++++++++++++++++++ .../summaries_and_sketches/minhash.py | 51 +++++++++++++++ 3 files changed, 125 insertions(+) create mode 100644 metis/utils/data_profiling/single_column/summaries_and_sketches/jaccard_similarity.py create mode 100644 metis/utils/data_profiling/single_column/summaries_and_sketches/minhash.py diff --git a/metis/utils/data_profiling/__init__.py b/metis/utils/data_profiling/__init__.py index 4f45962..33cb077 100644 --- a/metis/utils/data_profiling/__init__.py +++ b/metis/utils/data_profiling/__init__.py @@ -26,3 +26,12 @@ from .single_column.patterns_and_data_types.data_type import data_type from .single_column.patterns_and_data_types.numeric_precision import size, decimals from .single_column.patterns_and_data_types.patterns import patterns + +from .single_column.summaries_and_sketches.jaccard_similarity import ( + jaccard_similarity, + jaccard_similarity_ngrams, +) +from .single_column.summaries_and_sketches.minhash import ( + minhash_signature, + estimate_jaccard_from_minhash, +) diff --git a/metis/utils/data_profiling/single_column/summaries_and_sketches/jaccard_similarity.py b/metis/utils/data_profiling/single_column/summaries_and_sketches/jaccard_similarity.py new file mode 100644 index 0000000..2c133d9 --- /dev/null +++ b/metis/utils/data_profiling/single_column/summaries_and_sketches/jaccard_similarity.py @@ -0,0 +1,65 @@ +from typing import Union +import pandas as pd + + +def jaccard_similarity(column1: pd.Series, column2: pd.Series) -> float: + """ + Calculate Jaccard similarity between two columns based on their distinct values. + + Jaccard similarity of columns A and B is defined as: + |A ∩ B| / |A ∪ B| + + This gives the relative number of distinct values appearing in both columns + divided by the total number of distinct values in either column. Null values + are excluded from the calculation. + + :param column1: First input Series. + :param column2: Second input Series. + :return: Jaccard similarity score between 0.0 and 1.0, where 1.0 means + identical sets of distinct values and 0.0 means no overlap. + """ + set1 = set(column1.dropna().unique()) + set2 = set(column2.dropna().unique()) + + if len(set1) == 0 and len(set2) == 0: + return 1.0 + + intersection = len(set1.intersection(set2)) + union = len(set1.union(set2)) + + return float(intersection / union) + + +def jaccard_similarity_ngrams(column1: pd.Series, column2: pd.Series, n: int = 2) -> float: + """ + Calculate Jaccard similarity between two columns based on n-gram distributions. + + Since semantically similar values may have different formats, this function + computes Jaccard similarity of n-gram distributions in the two columns instead + of exact value matches. This captures similarity at the character level. + + Example: "John" and "Jon" have no exact match, but share an n-gram like "Jo". + + :param column1: First input Series. + :param column2: Second input Series. + :param n: Size of n-grams to extract (default: 2 for bigrams). 
+ :return: Jaccard similarity score between 0.0 and 1.0 based on n-gram overlap. + """ + def get_ngrams(series: pd.Series, n: int) -> set: + """Extract all n-grams from a Series.""" + ngrams = set() + for val in series.dropna().astype(str): + for i in range(len(val) - n + 1): + ngrams.add(val[i:i+n]) + return ngrams + + ngrams1 = get_ngrams(column1, n) + ngrams2 = get_ngrams(column2, n) + + if len(ngrams1) == 0 and len(ngrams2) == 0: + return 1.0 + + intersection = len(ngrams1.intersection(ngrams2)) + union = len(ngrams1.union(ngrams2)) + + return float(intersection / union) diff --git a/metis/utils/data_profiling/single_column/summaries_and_sketches/minhash.py b/metis/utils/data_profiling/single_column/summaries_and_sketches/minhash.py new file mode 100644 index 0000000..d29646b --- /dev/null +++ b/metis/utils/data_profiling/single_column/summaries_and_sketches/minhash.py @@ -0,0 +1,51 @@ +from typing import Union +import pandas as pd +from datasketch import MinHash + + +def minhash_signature(data: Union[pd.Series, pd.DataFrame], num_perm: int = 128) -> Union[MinHash, dict]: + """ + Create a MinHash signature for efficient Jaccard similarity estimation. + + MinHash creates compact signatures that can be compared efficiently to + estimate set similarity without computing full set intersections. If distinct + value sets are not directly available, Jaccard similarity can be estimated + using MinHash signatures. + + The signature size (num_perm) controls the trade-off between accuracy and + memory: more permutations give more accurate estimates but use more space. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :param num_perm: Number of permutations for MinHash (default: 128). Higher + values provide more accurate similarity estimates. + :return: MinHash object if Series input, dict of column names to MinHash + objects if DataFrame input. + """ + if isinstance(data, pd.Series): + m = MinHash(num_perm=num_perm) + for value in data.dropna().astype(str).unique(): + m.update(value.encode('utf-8')) + return m + else: + result = {} + for col in data.columns: + m = MinHash(num_perm=num_perm) + for value in data[col].dropna().astype(str).unique(): + m.update(value.encode('utf-8')) + result[col] = m + return result + + +def estimate_jaccard_from_minhash(minhash1: MinHash, minhash2: MinHash) -> float: + """ + Estimate Jaccard similarity between two columns using their MinHash signatures. + + This is much faster than computing exact Jaccard similarity for large datasets, + while providing a good approximation. The accuracy depends on the num_perm + parameter used when creating the MinHash signatures. + + :param minhash1: MinHash signature of the first column. + :param minhash2: MinHash signature of the second column. + :return: Estimated Jaccard similarity between 0.0 and 1.0. 
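+
+    Illustrative usage: the exact Jaccard similarity of these two columns
+    is 2/5 = 0.4, and the MinHash estimate lands close to (but generally
+    not exactly at) that value::
+
+        >>> import pandas as pd
+        >>> m1 = minhash_signature(pd.Series(["a", "b", "c"]))
+        >>> m2 = minhash_signature(pd.Series(["b", "c", "d", "e"]))
+        >>> 0.0 <= estimate_jaccard_from_minhash(m1, m2) <= 1.0
+        True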
+ """ + return float(minhash1.jaccard(minhash2)) From 0f8e9b7fe74d67824332ba066b374c42552f474f Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Wed, 14 Jan 2026 09:47:08 +0100 Subject: [PATCH 06/25] feat: add domain classification profiling tasks --- metis/utils/data_profiling/__init__.py | 3 + .../domain_classification/data_class.py | 77 ++++++++++++ .../domain_classification/domain.py | 119 ++++++++++++++++++ 3 files changed, 199 insertions(+) create mode 100644 metis/utils/data_profiling/single_column/domain_classification/data_class.py create mode 100644 metis/utils/data_profiling/single_column/domain_classification/domain.py diff --git a/metis/utils/data_profiling/__init__.py b/metis/utils/data_profiling/__init__.py index 33cb077..eb9c0c0 100644 --- a/metis/utils/data_profiling/__init__.py +++ b/metis/utils/data_profiling/__init__.py @@ -35,3 +35,6 @@ minhash_signature, estimate_jaccard_from_minhash, ) + +from .single_column.domain_classification.data_class import data_class +from .single_column.domain_classification.domain import domain diff --git a/metis/utils/data_profiling/single_column/domain_classification/data_class.py b/metis/utils/data_profiling/single_column/domain_classification/data_class.py new file mode 100644 index 0000000..1209beb --- /dev/null +++ b/metis/utils/data_profiling/single_column/domain_classification/data_class.py @@ -0,0 +1,77 @@ +from typing import Union +import pandas as pd + + +def _classify_data_class(series: pd.Series) -> str: + """ + Classify the semantic data class of a Series. + + :param series: Input Series. + :return: Data class as string. + """ + clean_data = series.dropna() + + if len(clean_data) == 0: + return "unknown" + + if pd.api.types.is_datetime64_any_dtype(clean_data): + return "date/time" + + if pd.api.types.is_bool_dtype(clean_data): + return "indicator" + + if pd.api.types.is_numeric_dtype(clean_data): + distinct_count = clean_data.nunique() + total_count = len(clean_data) + + if distinct_count == total_count or distinct_count / total_count > 0.95: + return "identifier" + else: + return "quantity" + + sample_size = min(100, len(clean_data)) + sample = clean_data.sample(n=sample_size, random_state=42).astype(str) + + avg_length = sample.str.len().mean() + distinct_ratio = clean_data.nunique() / len(clean_data) + + if distinct_ratio > 0.95: + return "identifier" + + if distinct_ratio < 0.05: + return "code" + + code_pattern = r'^[A-Za-z0-9]{2,10}$' + if avg_length <= 10 and sample.str.match(code_pattern, na=False).mean() > 0.7: + return "code" + + if avg_length > 50: + return "text" + + return "text" + + +def data_class(data: Union[pd.Series, pd.DataFrame]) -> Union[str, pd.Series]: + """ + Classify the semantic, generic data type. + + Goes beyond syntactic patterns to categorize the semantic role of the column. + Possible classifications: + - code: Short alphanumeric codes with low cardinality (e.g., country codes, status codes) + - indicator: Boolean or binary values + - text: Free-form text with variable length + - date/time: Temporal data + - quantity: Numeric measurements or amounts + - identifier: High-cardinality unique or near-unique values (e.g., IDs, keys) + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Data class as string if Series input, Series of data class strings + if DataFrame input. 
+ """ + if isinstance(data, pd.Series): + return _classify_data_class(data) + else: + result = {} + for col in data.columns: + result[col] = _classify_data_class(data[col]) + return pd.Series(result) \ No newline at end of file diff --git a/metis/utils/data_profiling/single_column/domain_classification/domain.py b/metis/utils/data_profiling/single_column/domain_classification/domain.py new file mode 100644 index 0000000..6cb4f6c --- /dev/null +++ b/metis/utils/data_profiling/single_column/domain_classification/domain.py @@ -0,0 +1,119 @@ +from typing import Union, Optional, Dict, List +import pandas as pd + + +DOMAIN_PATTERNS: Dict[str, str] = { + "email": r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', + "url": r'^https?://[^\s/$.?#].[^\s]*$', + "ssn": r'^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$', + "date_iso": r'^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$', + "time": r'^([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?$', + "ip_address": r'^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$', + "zip_code": r'^\d{5}(-\d{4})?$', + "credit_card": r'^(?:\d[ -]*){13,19}$', + "phone": r'^(\+\d{1,3}[-.\s]?\(?\d{1,4}\)?[-.\s]?\d{1,4}[-.\s]?\d{1,9}|\(\d{3}\)[-.\s]?\d{3}[-.\s]?\d{4}|\d{3}[-.\s]\d{3}[-.\s]\d{4})$', + "currency": r'^[($€£¥₹-]?[$€£¥₹]?\s?-?\d{1,3}(?:[,.\s]\d{3})*(?:[,.]\d{1,2})?\s?[$€£¥₹]?[)]?$', +} + +NAME_INDICATORS: Dict[str, List[str]] = { + "first_name": ["firstname", "fname", "givenname"], + "last_name": ["lastname", "lname", "surname", "familyname"], + "full_name": ["fullname"], + "city": ["city", "town", "municipality"], + "state": ["state", "province", "region"], + "country": ["country", "nation"], + "address": ["address", "street", "addr"], + "postal_code": ["postal", "postcode", "zipcode"], +} + + +def _detect_domain_by_pattern(series: pd.Series, threshold: float = 0.8) -> Optional[str]: + """ + Detect domain by matching against known patterns. + + :param series: Input Series. + :param threshold: Minimum proportion of values that must match a pattern. + :return: Domain name if detected, None otherwise. + """ + clean_data = series.dropna() + + if len(clean_data) == 0: + return None + + sample_size = min(100, len(clean_data)) + sample = clean_data.sample(n=sample_size, random_state=42).astype(str) + + for domain, pattern in DOMAIN_PATTERNS.items(): + match_ratio = sample.str.match(pattern, na=False).mean() + if match_ratio >= threshold: + return domain + + return None + + +def _detect_domain_by_column_name(column_name: str) -> Optional[str]: + """ + Detect domain by matching column name against known indicators. + + :param column_name: Name of the column. + :return: Domain name if detected, None otherwise. + """ + column_normalized = column_name.lower().replace('_', '').replace(' ', '').replace('-', '') + + for domain, indicators in NAME_INDICATORS.items(): + for indicator in indicators: + if column_normalized == indicator or column_normalized.endswith(indicator): + return domain + + return None + + +def _classify_domain(series: pd.Series, column_name: Optional[str] = None) -> str: + """ + Classify the semantic domain of a Series. + + :param series: Input Series. + :param column_name: Optional column name for additional context. + :return: Domain classification as string. 
+ """ + clean_data = series.dropna() + + if len(clean_data) == 0: + return "unknown" + + domain_by_pattern = _detect_domain_by_pattern(clean_data) + if domain_by_pattern: + return domain_by_pattern + + if column_name: + domain_by_name = _detect_domain_by_column_name(column_name) + if domain_by_name: + return domain_by_name + + return "unknown" + + +def domain(data: Union[pd.Series, pd.DataFrame]) -> Union[str, pd.Series]: + """ + Classify the semantic domain of columns. + + Attempts to identify specific domains such as: email, phone, url, ip_address, + credit_card, ssn, zip_code, date_iso, time, currency, first_name, last_name, + full_name, city, state, country, address, postal_code. + + Uses a combination of pattern matching on data values and column name analysis. + Pattern matching takes priority over column name inference. + Returns "unknown" if no domain can be confidently identified. + + :param data: Input Series (single column) or DataFrame (multiple columns). + :return: Domain name as string if Series input, Series of domain names if + DataFrame input. + """ + if isinstance(data, pd.Series): + column_name = data.name if hasattr(data, 'name') else None + return _classify_domain(data, column_name) + else: + result = {} + for col in data.columns: + result[col] = _classify_domain(data[col], col) + return pd.Series(result) \ No newline at end of file From 07e16d144fd16e04cdec7a0b2f181a002d270050 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Wed, 14 Jan 2026 09:50:21 +0100 Subject: [PATCH 07/25] feat: update requirements to include numpy and datasketch --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 8d1a079..b092ffc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,5 @@ psycopg2-binary sqlite3 ; sys_platform == "win32" # sqlite3 is included with Python, but this line is for completeness sqlalchemy==2.0.44 nltk +numpy +datasketch From 0750a4dcbf30c931a6b01b1d03ea2edaba745516 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 10:25:17 +0100 Subject: [PATCH 08/25] ref: rename mesTime to timestamp in DQResult --- README.md | 4 ++-- metis/database_models.py | 2 +- .../completeness/completeness_nullRatio.py | 2 +- .../consistency_countFDViolations.py | 2 +- .../minimality/minimality_duplicateCount.py | 2 +- .../validity/validity_outOfVocabulary.py | 2 +- metis/utils/result.py | 18 +++++++++--------- metis/writer/database_writer.py | 2 +- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 76f2d2a..efae405 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Examples: `completeness_NullRatio`, `minimality_DuplicateCount` class DQResult: def __init__( self, - mesTime: pd.Timestamp, + timestamp: pd.Timestamp, DQvalue: float, DQdimension: str, DQmetric: str, @@ -57,7 +57,7 @@ class DQResult: ```` To create a new instance of DQResult, one needs to provide at least the following arguments: -- **mesTime: pd.Timestamp**: The time at which a result was assessed. +- **timestamp: pd.Timestamp**: The time at which a result was assessed. - **DQvalue: float**: The result of the assessment. This currently only supports quantitative assessments. - **DQdimension: str**: The name of the data quality dimension that was assessed e.g. completeness, accuracy, etc. - **DQmetric: str**: The name of the specific metric inside the given dimension that was assessed. 
diff --git a/metis/database_models.py b/metis/database_models.py index 5785ece..5e7a7dd 100644 --- a/metis/database_models.py +++ b/metis/database_models.py @@ -14,7 +14,7 @@ class DQResultModel(Base): __table_args__ = {"extend_existing": True} id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) - mes_time: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now()) + timestamp: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now()) dq_dimension: Mapped[str] dq_metric: Mapped[str] dq_granularity: Mapped[str] diff --git a/metis/metric/completeness/completeness_nullRatio.py b/metis/metric/completeness/completeness_nullRatio.py index 9534cd3..56cdcc7 100644 --- a/metis/metric/completeness/completeness_nullRatio.py +++ b/metis/metric/completeness/completeness_nullRatio.py @@ -21,7 +21,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None completeness = (total_rows - int(null_count)) / total_rows result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQdimension="Completeness", DQmetric="NullRatio", DQgranularity="column", diff --git a/metis/metric/consistency/consistency_countFDViolations.py b/metis/metric/consistency/consistency_countFDViolations.py index 160b204..ec0ba06 100644 --- a/metis/metric/consistency/consistency_countFDViolations.py +++ b/metis/metric/consistency/consistency_countFDViolations.py @@ -40,7 +40,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None consistency = 1 - (len(violations) / len(data[determinant])) result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQdimension="Consistency", DQmetric="CountFDViolations", DQgranularity="table", diff --git a/metis/metric/minimality/minimality_duplicateCount.py b/metis/metric/minimality/minimality_duplicateCount.py index 5dd59d7..35ea646 100644 --- a/metis/metric/minimality/minimality_duplicateCount.py +++ b/metis/metric/minimality/minimality_duplicateCount.py @@ -27,7 +27,7 @@ def assess(self, data: pd.DataFrame, reference: Union[pd.DataFrame, None] = None annotations = {"CandidateKey": "CandidateKey"} result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQdimension="Minimality", DQmetric="DuplicateCount", DQgranularity="column", diff --git a/metis/metric/validity/validity_outOfVocabulary.py b/metis/metric/validity/validity_outOfVocabulary.py index b918d25..8c5289f 100644 --- a/metis/metric/validity/validity_outOfVocabulary.py +++ b/metis/metric/validity/validity_outOfVocabulary.py @@ -62,7 +62,7 @@ def is_valid(text: str) -> bool: } result = DQResult( - mesTime=pd.Timestamp.now(), + timestamp=pd.Timestamp.now(), DQdimension="Validity", DQmetric="OutOfVocabulary", DQgranularity="column", diff --git a/metis/utils/result.py b/metis/utils/result.py index 841ce2f..ef7ec2e 100644 --- a/metis/utils/result.py +++ b/metis/utils/result.py @@ -4,7 +4,7 @@ class DQResult: def __init__( self, - mesTime: pd.Timestamp, + timestamp: pd.Timestamp, DQdimension: str, DQmetric: str, DQgranularity: str, @@ -20,7 +20,7 @@ def __init__( """Create a data-quality result representing a single assessed value. Required arguments - - `mesTime: pd.Timestamp`: The time at which the result was assessed. + - `timestamp: pd.Timestamp`: The time at which the result was assessed. - `DQdimension: str`: Data quality dimension assessed (e.g. 'completeness', 'accuracy'). - `DQmetric: str`: Name of the specific metric within the dimension. 
- `DQgranularity: str`: Granularity of the metric (e.g. 'column', 'table', 'cell'). @@ -50,7 +50,7 @@ def __init__( need to encode non-numeric outcomes consider using `DQexplanation` to store auxiliary information while keeping `DQvalue` numeric. """ - self._mesTime = mesTime + self._timestamp = timestamp self._DQdimension = DQdimension self._DQmetric = DQmetric self._DQgranularity = DQgranularity @@ -64,12 +64,12 @@ def __init__( self._configJson = configJson @property - def mesTime(self): - return self._mesTime + def timestamp(self): + return self._timestamp - @mesTime.setter - def mesTime(self, value): - self._mesTime = value + @timestamp.setter + def timestamp(self, value): + self._timestamp = value @property def DQdimension(self): @@ -161,7 +161,7 @@ def configJson(self, value): def as_json(self): return { - "mesTime": self._mesTime, + "timestamp": self._timestamp, "DQdimension": self._DQdimension, "DQmetric": self._DQmetric, "DQgranularity": self._DQgranularity, diff --git a/metis/writer/database_writer.py b/metis/writer/database_writer.py index 0d1e838..d6f29bd 100644 --- a/metis/writer/database_writer.py +++ b/metis/writer/database_writer.py @@ -22,7 +22,7 @@ def write(self, results: List[DQResult]) -> None: with Session(self.engine) as session: db_entities = [ self.DQResultModel( - mes_time=result.mesTime.to_pydatetime(), + timestamp=result.timestamp.to_pydatetime(), dq_dimension=result.DQdimension, dq_metric=result.DQmetric, dq_granularity=result.DQgranularity, From a41ac036539622f76049502712a59af051eef7d1 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 10:42:34 +0100 Subject: [PATCH 09/25] feat: add DataProfile model for caching profiling results --- metis/database_models.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/metis/database_models.py b/metis/database_models.py index 5e7a7dd..3e802e0 100644 --- a/metis/database_models.py +++ b/metis/database_models.py @@ -28,3 +28,40 @@ class DQResultModel(Base): config_json: Mapped[dict | None] = mapped_column(JSON) return DQResultModel + +class DataProfile(Base): + """Stores data profiling results for caching and manual imports. + + Covers single-column statistics (null_count, distinct_count, histograms, ...), + multi-column dependencies (FDs, UCCs, INDs, ...), and any other profiling + result type. The result payload is stored as JSON so the schema stays + flexible across different task types. + """ + + __tablename__ = "data_profiles" + __table_args__ = {"extend_existing": True} + + id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True) + timestamp: Mapped[datetime] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + + # --- identifiers --- + dataset: Mapped[str] + table_name: Mapped[str] + column_names: Mapped[List[str]] = mapped_column(JSON) + dp_task_name: Mapped[str] # e.g. 
"null_count", "fd", "ucc" + task_config: Mapped[dict | None] = mapped_column(JSON) # extra params used + + # --- category --- + profile_type: Mapped[str] = mapped_column(default="single_column") + # "single_column" | "multi_column" | "dependency" | "custom" + + # --- result --- + dp_result_value: Mapped[dict | None] = mapped_column(JSON) # {"v": } + result_type: Mapped[str] = mapped_column(default="scalar") + # "scalar" | "list" | "dict" | "series" — for deserialization hint + + # --- provenance --- + source: Mapped[str] = mapped_column(default="computed") + # "computed" | "imported:hyfd" | "imported:manual" | … From 76b7e6fd889ecfb633bedc0c2960165e2154ba8c Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 10:58:17 +0100 Subject: [PATCH 10/25] feat: add DataProfileManager singleton for profile caching --- metis/profiling/__init__.py | 52 ++++ metis/profiling/data_profile_manager.py | 328 ++++++++++++++++++++++++ 2 files changed, 380 insertions(+) create mode 100644 metis/profiling/__init__.py create mode 100644 metis/profiling/data_profile_manager.py diff --git a/metis/profiling/__init__.py b/metis/profiling/__init__.py new file mode 100644 index 0000000..1adf70e --- /dev/null +++ b/metis/profiling/__init__.py @@ -0,0 +1,52 @@ +"""Cached data-profiling functions and the ``DataProfileManager``. + +Import profiling functions from this package instead of +``metis.utils.data_profiling`` to get automatic caching:: + + from metis.profiling import null_count # cached version + +The original (uncached) implementations remain accessible via +``metis.utils.data_profiling``. +""" + +from .data_profile_manager import DataProfileManager + +# --- Cardinalities --- +from .single_column.cardinalities.distinct_values import distinct_count +from .single_column.cardinalities.null_values import null_count, null_percentage +from .single_column.cardinalities.row_count import row_count +from .single_column.cardinalities.uniqueness import uniqueness +from .single_column.cardinalities.value_length import ( + value_length_max, + value_length_mean, + value_length_median, + value_length_min, +) + +# --- Value Distribution --- +from .single_column.value_distribution.histogram import ( + equi_width_histogram, + equi_depth_histogram, +) +from .single_column.value_distribution.constancy import constancy, most_frequent_value +from .single_column.value_distribution.quartiles import quartiles, interquartile_range + +# --- Patterns & Data Types --- +from .single_column.patterns_and_data_types.basic_type import basic_type +from .single_column.patterns_and_data_types.data_type import data_type +from .single_column.patterns_and_data_types.numeric_precision import size, decimals +from .single_column.patterns_and_data_types.patterns import patterns + +# --- Summaries & Sketches --- +from .single_column.summaries_and_sketches.jaccard_similarity import ( + jaccard_similarity, + jaccard_similarity_ngrams, +) +from .single_column.summaries_and_sketches.minhash import ( + minhash_signature, + estimate_jaccard_from_minhash, +) + +# --- Domain Classification --- +from .single_column.domain_classification.data_class import data_class +from .single_column.domain_classification.domain import domain diff --git a/metis/profiling/data_profile_manager.py b/metis/profiling/data_profile_manager.py new file mode 100644 index 0000000..539d9bb --- /dev/null +++ b/metis/profiling/data_profile_manager.py @@ -0,0 +1,328 @@ +from __future__ import annotations + +import json +import threading +from typing import Any, Dict, List, 
Optional + +from sqlalchemy import Engine, create_engine as sa_create_engine, select +from sqlalchemy.orm import Session + +from metis.database_models import Base, DataProfile + + +class DataProfileManager: + """Singleton that manages data-profile caching and storage. + + Typical lifecycle: + 1. ``DQOrchestrator`` calls ``DataProfileManager.initialize(engine)`` + to hand over a SQLAlchemy engine (or a connection string). + 2. Before running metrics for a dataset the orchestrator calls + ``set_context(dataset=..., table=...)`` so cached wrappers know + which dataset is active. + 3. Cached profiling functions use ``get_instance()`` internally. + """ + + _instance: Optional[DataProfileManager] = None + _lock = threading.Lock() + + # ------------------------------------------------------------------ # + # Singleton access + # ------------------------------------------------------------------ # + @classmethod + def initialize(cls, engine_or_url: Engine | str) -> DataProfileManager: + """Create (or re-create) the singleton with the given engine.""" + with cls._lock: + if isinstance(engine_or_url, str): + engine = sa_create_engine(engine_or_url) + else: + engine = engine_or_url + cls._instance = cls(engine) + Base.metadata.create_all(engine) + return cls._instance + + @classmethod + def get_instance(cls) -> DataProfileManager: + """Return the current singleton. Raises if not initialized.""" + if cls._instance is None: + raise RuntimeError( + "DataProfileManager has not been initialized. " + "Call DataProfileManager.initialize(engine) first." + ) + return cls._instance + + @classmethod + def is_initialized(cls) -> bool: + return cls._instance is not None + + @classmethod + def shutdown(cls) -> None: + """Shutdown the singleton and dispose the engine.""" + with cls._lock: + if cls._instance is not None: + cls._instance._engine.dispose() + cls._instance = None + + def __init__(self, engine: Engine) -> None: + self._engine = engine + self._dataset: Optional[str] = None + self._table: Optional[str] = None + # In-memory cache for the current run to avoid repeated DB queries + self._mem_cache: Dict[str, Any] = {} + + # ------------------------------------------------------------------ # + # Context management (called by DQOrchestrator) + # ------------------------------------------------------------------ # + def set_context(self, dataset: str, table: str) -> None: + """Set the currently active dataset / table.""" + self._dataset = dataset + self._table = table + self._mem_cache.clear() + + @property + def dataset(self) -> Optional[str]: + return self._dataset + + @property + def table(self) -> Optional[str]: + return self._table + + # ------------------------------------------------------------------ # + # Cache lookup + # ------------------------------------------------------------------ # + @staticmethod + def _cache_key( + dataset: str, + table: str, + column_names: List[str], + dp_task_name: str, + task_config: Optional[dict] = None, + ) -> str: + config_str = json.dumps(task_config, sort_keys=True) if task_config else "" + return f"{dataset}|{table}|{','.join(sorted(column_names))}|{dp_task_name}|{config_str}" + + def lookup( + self, + column_names: List[str], + dp_task_name: str, + task_config: Optional[dict] = None, + ) -> Optional[Any]: + """Look up a cached profiling result. 
Returns ``None`` on miss.""" + if self._dataset is None or self._table is None: + return None + + key = self._cache_key( + self._dataset, self._table, column_names, dp_task_name, task_config + ) + + # fast path: in-memory + if key in self._mem_cache: + return self._mem_cache[key] + + # slow path: DB + with Session(self._engine) as session: + stmt = ( + select(DataProfile) + .where(DataProfile.dataset == self._dataset) + .where(DataProfile.table_name == self._table) + .where(DataProfile.dp_task_name == dp_task_name) + ) + for row in session.execute(stmt).scalars(): + if sorted(row.column_names) == sorted(column_names): + cfg = row.task_config or {} + if cfg == (task_config or {}): + value = self._deserialize(row.dp_result_value, row.result_type) + self._mem_cache[key] = value + return value + return None + + # ------------------------------------------------------------------ # + # Store results + # ------------------------------------------------------------------ # + def store( + self, + column_names: List[str], + dp_task_name: str, + value: Any, + task_config: Optional[dict] = None, + profile_type: str = "single_column", + source: str = "computed", + dataset: Optional[str] = None, + table: Optional[str] = None, + ) -> None: + """Persist a profiling result to the database.""" + ds = dataset or self._dataset + tbl = table or self._table + if ds is None or tbl is None: + raise RuntimeError( + "Cannot store profiling result: dataset/table context not set." + ) + + serialized, result_type = self._serialize(value) + + profile = DataProfile( + dataset=ds, + table_name=tbl, + column_names=column_names, + dp_task_name=dp_task_name, + task_config=task_config, + profile_type=profile_type, + dp_result_value=serialized, + result_type=result_type, + source=source, + ) + + with Session(self._engine) as session: + session.add(profile) + session.commit() + + # update in-memory cache + key = self._cache_key(ds, tbl, column_names, dp_task_name, task_config) + self._mem_cache[key] = value + + # ------------------------------------------------------------------ # + # Convenience helpers for dependency storage + # ------------------------------------------------------------------ # + def store_fd( + self, + lhs: List[str], + rhs: str, + dataset: Optional[str] = None, + table: Optional[str] = None, + source: str = "computed", + ) -> None: + """Store a functional dependency lhs -> rhs.""" + self.store( + column_names=sorted(lhs + [rhs]), + dp_task_name="fd", + value={"lhs": lhs, "rhs": rhs}, + profile_type="dependency", + source=source, + dataset=dataset, + table=table, + ) + + def store_ucc( + self, + columns: List[str], + dataset: Optional[str] = None, + table: Optional[str] = None, + source: str = "computed", + ) -> None: + """Store a unique column combination.""" + self.store( + column_names=sorted(columns), + dp_task_name="ucc", + value={"columns": columns}, + profile_type="dependency", + source=source, + dataset=dataset, + table=table, + ) + + def store_ind( + self, + dependent: List[str], + referenced: List[str], + referenced_table: Optional[str] = None, + dataset: Optional[str] = None, + table: Optional[str] = None, + source: str = "computed", + ) -> None: + """Store an inclusion dependency.""" + self.store( + column_names=sorted(dependent + referenced), + dp_task_name="ind", + value={ + "dependent": dependent, + "referenced": referenced, + "referenced_table": referenced_table, + }, + profile_type="dependency", + source=source, + dataset=dataset, + table=table, + ) + + def get_fds( + self, + dataset: 
Optional[str] = None, + table: Optional[str] = None, + ) -> List[dict]: + """Return all stored FDs for a dataset/table.""" + return self._query_by_task("fd", dataset, table) + + def get_uccs( + self, + dataset: Optional[str] = None, + table: Optional[str] = None, + ) -> List[dict]: + return self._query_by_task("ucc", dataset, table) + + def get_inds( + self, + dataset: Optional[str] = None, + table: Optional[str] = None, + ) -> List[dict]: + return self._query_by_task("ind", dataset, table) + + # ------------------------------------------------------------------ # + # Internal helpers + # ------------------------------------------------------------------ # + def _query_by_task( + self, dp_task_name: str, dataset: Optional[str], table: Optional[str] + ) -> List[dict]: + ds = dataset or self._dataset + tbl = table or self._table + if ds is None or tbl is None: + return [] + with Session(self._engine) as session: + stmt = ( + select(DataProfile) + .where(DataProfile.dataset == ds) + .where(DataProfile.table_name == tbl) + .where(DataProfile.dp_task_name == dp_task_name) + ) + return [ + self._deserialize(row.dp_result_value, row.result_type) + for row in session.execute(stmt).scalars() + ] + + @staticmethod + def _serialize(value: Any) -> tuple[dict, str]: + """Wrap *value* into a JSON-safe dict and return (payload, type_tag).""" + import numpy as np + import pandas as pd + + def to_json_safe(v: Any) -> Any: + """Convert numpy types to native Python types.""" + if isinstance(v, (np.integer,)): + return int(v) + if isinstance(v, (np.floating,)): + return float(v) + if isinstance(v, np.ndarray): + return v.tolist() + if isinstance(v, dict): + return {k: to_json_safe(val) for k, val in v.items()} + if isinstance(v, list): + return [to_json_safe(item) for item in v] + return v + + if isinstance(value, pd.Series): + return {"v": to_json_safe(value.to_dict())}, "series" + if isinstance(value, dict): + return {"v": to_json_safe(value)}, "dict" + if isinstance(value, list): + return {"v": to_json_safe(value)}, "list" + # scalar (int, float, str, bool, None …) + return {"v": to_json_safe(value)}, "scalar" + + @staticmethod + def _deserialize(payload: Optional[dict], result_type: str) -> Any: + if payload is None: + return None + import pandas as pd + + raw = payload.get("v") + if result_type == "series": + return pd.Series(raw) + return raw From df0239b38f4160f30c17f4771b579bdfe2e4c025 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 11:21:53 +0100 Subject: [PATCH 11/25] feat: add caching decorator for profiling functions --- metis/profiling/cache.py | 93 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 metis/profiling/cache.py diff --git a/metis/profiling/cache.py b/metis/profiling/cache.py new file mode 100644 index 0000000..9c61353 --- /dev/null +++ b/metis/profiling/cache.py @@ -0,0 +1,93 @@ +"""Transparent caching decorator for data-profiling functions. + +Usage:: + + from metis.profiling.cache import cached + from metis.utils.data_profiling.single_column.cardinalities.null_values import ( + null_count as _null_count, + ) + + null_count = cached(_null_count) + +The wrapped function has the *exact same* signature as the original. +When ``DataProfileManager`` is initialised and a context (dataset / table) +is active, results are automatically looked up in – and written to – the +profile database. 
When the manager is *not* initialised the function simply +delegates to the original implementation (zero overhead apart from the +isinstance check). +""" + +from __future__ import annotations + +import functools +from typing import Any, Callable + +import pandas as pd + + +def cached(fn: Callable) -> Callable: + """Return a wrapper around *fn* that caches via ``DataProfileManager``.""" + + @functools.wraps(fn) + def wrapper(data: pd.Series | pd.DataFrame, *args: Any, **kwargs: Any) -> Any: + from metis.profiling.data_profile_manager import DataProfileManager + + # If the manager is not active, fall through to the original function. + if not DataProfileManager.is_initialized(): + return fn(data, *args, **kwargs) + + manager = DataProfileManager.get_instance() + if manager.dataset is None or manager.table is None: + return fn(data, *args, **kwargs) + + # Derive column names from the data argument. + if isinstance(data, pd.Series): + if data.name is None: + return fn(data, *args, **kwargs) + column_names = [str(data.name)] + else: + column_names = [str(c) for c in data.columns] + + # Build optional config dict from extra arguments (if any). + task_config = _build_config(fn, args, kwargs) or None + + # Cache lookup + cached_value = manager.lookup(column_names, fn.__name__, task_config) + if cached_value is not None: + return cached_value + + # Compute + result = fn(data, *args, **kwargs) + + # Store + manager.store( + column_names=column_names, + dp_task_name=fn.__name__, + value=result, + task_config=task_config, + ) + + return result + + return wrapper + + +def _build_config(fn: Callable, args: tuple, kwargs: dict) -> dict | None: + """Turn extra positional/keyword args into a JSON-safe config dict.""" + if not args and not kwargs: + return None + + import inspect + + sig = inspect.signature(fn) + params = list(sig.parameters.keys())[1:] # skip 'data' + + config: dict = {} + for i, val in enumerate(args): + if i < len(params): + config[params[i]] = val + config.update(kwargs) + + if not config: + return None + return config From cd0140791968b08cc1968594a8e8df58c9cf359d Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 11:43:26 +0100 Subject: [PATCH 12/25] feat: wrap existing profiling functions with caching --- metis/profiling/single_column/__init__.py | 0 .../single_column/cardinalities/__init__.py | 0 .../single_column/cardinalities/distinct_values.py | 6 ++++++ .../single_column/cardinalities/null_values.py | 8 ++++++++ .../single_column/cardinalities/row_count.py | 6 ++++++ .../single_column/cardinalities/uniqueness.py | 6 ++++++ .../single_column/cardinalities/value_length.py | 12 ++++++++++++ .../single_column/domain_classification/__init__.py | 0 .../domain_classification/data_class.py | 6 ++++++ .../single_column/domain_classification/domain.py | 6 ++++++ .../patterns_and_data_types/__init__.py | 0 .../patterns_and_data_types/basic_type.py | 6 ++++++ .../patterns_and_data_types/data_type.py | 6 ++++++ .../patterns_and_data_types/numeric_precision.py | 8 ++++++++ .../patterns_and_data_types/patterns.py | 6 ++++++ .../single_column/summaries_and_sketches/__init__.py | 0 .../summaries_and_sketches/jaccard_similarity.py | 8 ++++++++ .../single_column/summaries_and_sketches/minhash.py | 8 ++++++++ .../single_column/value_distribution/__init__.py | 0 .../single_column/value_distribution/constancy.py | 8 ++++++++ .../single_column/value_distribution/histogram.py | 8 ++++++++ .../single_column/value_distribution/quartiles.py | 8 ++++++++ 22 files changed, 116 insertions(+) 
create mode 100644 metis/profiling/single_column/__init__.py create mode 100644 metis/profiling/single_column/cardinalities/__init__.py create mode 100644 metis/profiling/single_column/cardinalities/distinct_values.py create mode 100644 metis/profiling/single_column/cardinalities/null_values.py create mode 100644 metis/profiling/single_column/cardinalities/row_count.py create mode 100644 metis/profiling/single_column/cardinalities/uniqueness.py create mode 100644 metis/profiling/single_column/cardinalities/value_length.py create mode 100644 metis/profiling/single_column/domain_classification/__init__.py create mode 100644 metis/profiling/single_column/domain_classification/data_class.py create mode 100644 metis/profiling/single_column/domain_classification/domain.py create mode 100644 metis/profiling/single_column/patterns_and_data_types/__init__.py create mode 100644 metis/profiling/single_column/patterns_and_data_types/basic_type.py create mode 100644 metis/profiling/single_column/patterns_and_data_types/data_type.py create mode 100644 metis/profiling/single_column/patterns_and_data_types/numeric_precision.py create mode 100644 metis/profiling/single_column/patterns_and_data_types/patterns.py create mode 100644 metis/profiling/single_column/summaries_and_sketches/__init__.py create mode 100644 metis/profiling/single_column/summaries_and_sketches/jaccard_similarity.py create mode 100644 metis/profiling/single_column/summaries_and_sketches/minhash.py create mode 100644 metis/profiling/single_column/value_distribution/__init__.py create mode 100644 metis/profiling/single_column/value_distribution/constancy.py create mode 100644 metis/profiling/single_column/value_distribution/histogram.py create mode 100644 metis/profiling/single_column/value_distribution/quartiles.py diff --git a/metis/profiling/single_column/__init__.py b/metis/profiling/single_column/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/profiling/single_column/cardinalities/__init__.py b/metis/profiling/single_column/cardinalities/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/profiling/single_column/cardinalities/distinct_values.py b/metis/profiling/single_column/cardinalities/distinct_values.py new file mode 100644 index 0000000..d46c54d --- /dev/null +++ b/metis/profiling/single_column/cardinalities/distinct_values.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.cardinalities.distinct_values import ( + distinct_count as _distinct_count, +) + +distinct_count = cached(_distinct_count) diff --git a/metis/profiling/single_column/cardinalities/null_values.py b/metis/profiling/single_column/cardinalities/null_values.py new file mode 100644 index 0000000..27f0c27 --- /dev/null +++ b/metis/profiling/single_column/cardinalities/null_values.py @@ -0,0 +1,8 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.cardinalities.null_values import ( + null_count as _null_count, + null_percentage as _null_percentage, +) + +null_count = cached(_null_count) +null_percentage = cached(_null_percentage) diff --git a/metis/profiling/single_column/cardinalities/row_count.py b/metis/profiling/single_column/cardinalities/row_count.py new file mode 100644 index 0000000..ebe0004 --- /dev/null +++ b/metis/profiling/single_column/cardinalities/row_count.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.cardinalities.row_count import ( + row_count 
as _row_count, +) + +row_count = cached(_row_count) diff --git a/metis/profiling/single_column/cardinalities/uniqueness.py b/metis/profiling/single_column/cardinalities/uniqueness.py new file mode 100644 index 0000000..61f49d5 --- /dev/null +++ b/metis/profiling/single_column/cardinalities/uniqueness.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.cardinalities.uniqueness import ( + uniqueness as _uniqueness, +) + +uniqueness = cached(_uniqueness) diff --git a/metis/profiling/single_column/cardinalities/value_length.py b/metis/profiling/single_column/cardinalities/value_length.py new file mode 100644 index 0000000..69707ac --- /dev/null +++ b/metis/profiling/single_column/cardinalities/value_length.py @@ -0,0 +1,12 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.cardinalities.value_length import ( + value_length_max as _value_length_max, + value_length_mean as _value_length_mean, + value_length_median as _value_length_median, + value_length_min as _value_length_min, +) + +value_length_max = cached(_value_length_max) +value_length_mean = cached(_value_length_mean) +value_length_median = cached(_value_length_median) +value_length_min = cached(_value_length_min) diff --git a/metis/profiling/single_column/domain_classification/__init__.py b/metis/profiling/single_column/domain_classification/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/profiling/single_column/domain_classification/data_class.py b/metis/profiling/single_column/domain_classification/data_class.py new file mode 100644 index 0000000..27603fe --- /dev/null +++ b/metis/profiling/single_column/domain_classification/data_class.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.domain_classification.data_class import ( + data_class as _data_class, +) + +data_class = cached(_data_class) diff --git a/metis/profiling/single_column/domain_classification/domain.py b/metis/profiling/single_column/domain_classification/domain.py new file mode 100644 index 0000000..071ee4c --- /dev/null +++ b/metis/profiling/single_column/domain_classification/domain.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.domain_classification.domain import ( + domain as _domain, +) + +domain = cached(_domain) diff --git a/metis/profiling/single_column/patterns_and_data_types/__init__.py b/metis/profiling/single_column/patterns_and_data_types/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/profiling/single_column/patterns_and_data_types/basic_type.py b/metis/profiling/single_column/patterns_and_data_types/basic_type.py new file mode 100644 index 0000000..7ff2214 --- /dev/null +++ b/metis/profiling/single_column/patterns_and_data_types/basic_type.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.patterns_and_data_types.basic_type import ( + basic_type as _basic_type, +) + +basic_type = cached(_basic_type) diff --git a/metis/profiling/single_column/patterns_and_data_types/data_type.py b/metis/profiling/single_column/patterns_and_data_types/data_type.py new file mode 100644 index 0000000..40c6f87 --- /dev/null +++ b/metis/profiling/single_column/patterns_and_data_types/data_type.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.patterns_and_data_types.data_type import ( + data_type as 
_data_type, +) + +data_type = cached(_data_type) diff --git a/metis/profiling/single_column/patterns_and_data_types/numeric_precision.py b/metis/profiling/single_column/patterns_and_data_types/numeric_precision.py new file mode 100644 index 0000000..5c4aeb5 --- /dev/null +++ b/metis/profiling/single_column/patterns_and_data_types/numeric_precision.py @@ -0,0 +1,8 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.patterns_and_data_types.numeric_precision import ( + size as _size, + decimals as _decimals, +) + +size = cached(_size) +decimals = cached(_decimals) diff --git a/metis/profiling/single_column/patterns_and_data_types/patterns.py b/metis/profiling/single_column/patterns_and_data_types/patterns.py new file mode 100644 index 0000000..304dce6 --- /dev/null +++ b/metis/profiling/single_column/patterns_and_data_types/patterns.py @@ -0,0 +1,6 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.patterns_and_data_types.patterns import ( + patterns as _patterns, +) + +patterns = cached(_patterns) diff --git a/metis/profiling/single_column/summaries_and_sketches/__init__.py b/metis/profiling/single_column/summaries_and_sketches/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/profiling/single_column/summaries_and_sketches/jaccard_similarity.py b/metis/profiling/single_column/summaries_and_sketches/jaccard_similarity.py new file mode 100644 index 0000000..134e9d5 --- /dev/null +++ b/metis/profiling/single_column/summaries_and_sketches/jaccard_similarity.py @@ -0,0 +1,8 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.summaries_and_sketches.jaccard_similarity import ( + jaccard_similarity as _jaccard_similarity, + jaccard_similarity_ngrams as _jaccard_similarity_ngrams, +) + +jaccard_similarity = cached(_jaccard_similarity) +jaccard_similarity_ngrams = cached(_jaccard_similarity_ngrams) diff --git a/metis/profiling/single_column/summaries_and_sketches/minhash.py b/metis/profiling/single_column/summaries_and_sketches/minhash.py new file mode 100644 index 0000000..b8d307a --- /dev/null +++ b/metis/profiling/single_column/summaries_and_sketches/minhash.py @@ -0,0 +1,8 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.summaries_and_sketches.minhash import ( + minhash_signature as _minhash_signature, + estimate_jaccard_from_minhash as _estimate_jaccard_from_minhash, +) + +minhash_signature = cached(_minhash_signature) +estimate_jaccard_from_minhash = cached(_estimate_jaccard_from_minhash) diff --git a/metis/profiling/single_column/value_distribution/__init__.py b/metis/profiling/single_column/value_distribution/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/metis/profiling/single_column/value_distribution/constancy.py b/metis/profiling/single_column/value_distribution/constancy.py new file mode 100644 index 0000000..97cdddc --- /dev/null +++ b/metis/profiling/single_column/value_distribution/constancy.py @@ -0,0 +1,8 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.value_distribution.constancy import ( + constancy as _constancy, + most_frequent_value as _most_frequent_value, +) + +constancy = cached(_constancy) +most_frequent_value = cached(_most_frequent_value) diff --git a/metis/profiling/single_column/value_distribution/histogram.py b/metis/profiling/single_column/value_distribution/histogram.py new file mode 100644 index 0000000..948e1f5 --- 
/dev/null +++ b/metis/profiling/single_column/value_distribution/histogram.py @@ -0,0 +1,8 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.value_distribution.histogram import ( + equi_width_histogram as _equi_width_histogram, + equi_depth_histogram as _equi_depth_histogram, +) + +equi_width_histogram = cached(_equi_width_histogram) +equi_depth_histogram = cached(_equi_depth_histogram) diff --git a/metis/profiling/single_column/value_distribution/quartiles.py b/metis/profiling/single_column/value_distribution/quartiles.py new file mode 100644 index 0000000..e606ced --- /dev/null +++ b/metis/profiling/single_column/value_distribution/quartiles.py @@ -0,0 +1,8 @@ +from metis.profiling.cache import cached +from metis.utils.data_profiling.single_column.value_distribution.quartiles import ( + quartiles as _quartiles, + interquartile_range as _interquartile_range, +) + +quartiles = cached(_quartiles) +interquartile_range = cached(_interquartile_range) From e418d0ea82214255bf9f91431f112bffd2bb550a Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 12:09:41 +0100 Subject: [PATCH 13/25] feat: add base importer class for data profiles --- metis/profiling/importers/__init__.py | 72 +++++++++++++++ metis/profiling/importers/base.py | 124 ++++++++++++++++++++++++++ 2 files changed, 196 insertions(+) create mode 100644 metis/profiling/importers/__init__.py create mode 100644 metis/profiling/importers/base.py diff --git a/metis/profiling/importers/__init__.py b/metis/profiling/importers/__init__.py new file mode 100644 index 0000000..bb0425f --- /dev/null +++ b/metis/profiling/importers/__init__.py @@ -0,0 +1,72 @@ +"""Data profile importers registry.""" + +from typing import Dict + +from .base import BaseImporter +from .fd_importer import FDImporter +from .histogram_importer import HistogramImporter +from .ind_importer import INDImporter +from .jaccard_importer import JaccardImporter +from .patterns_importer import PatternsImporter +from .quartiles_importer import QuartilesImporter +from .scalar_importer import ScalarImporter, create_scalar_importers +from .ucc_importer import UCCImporter + + +def _build_registry() -> Dict[str, BaseImporter]: + """Build the complete importer registry.""" + registry: Dict[str, BaseImporter] = {} + + # Scalar tasks (column, value) + registry.update(create_scalar_importers()) + + # Histogram tasks + registry["equi_width_histogram"] = HistogramImporter("equi_width_histogram") + registry["equi_depth_histogram"] = HistogramImporter("equi_depth_histogram") + + # Other complex tasks + registry["patterns"] = PatternsImporter() + registry["quartiles"] = QuartilesImporter() + + # 2-column tasks + registry["jaccard_similarity"] = JaccardImporter("jaccard_similarity") + registry["jaccard_similarity_ngrams"] = JaccardImporter("jaccard_similarity_ngrams") + + # Dependencies + registry["fd"] = FDImporter() + registry["ucc"] = UCCImporter() + registry["ind"] = INDImporter() + + return registry + + +IMPORTER_REGISTRY: Dict[str, BaseImporter] = _build_registry() + + +def get_importer(task_name: str) -> BaseImporter: + """Get the importer for a given task name. + + Raises: + KeyError: If no importer exists for the task + """ + if task_name not in IMPORTER_REGISTRY: + raise KeyError( + f"No importer registered for task '{task_name}'. 
" + f"Available tasks: {list(IMPORTER_REGISTRY.keys())}" + ) + return IMPORTER_REGISTRY[task_name] + + +__all__ = [ + "BaseImporter", + "IMPORTER_REGISTRY", + "get_importer", + "FDImporter", + "HistogramImporter", + "INDImporter", + "JaccardImporter", + "PatternsImporter", + "QuartilesImporter", + "ScalarImporter", + "UCCImporter", +] diff --git a/metis/profiling/importers/base.py b/metis/profiling/importers/base.py new file mode 100644 index 0000000..e70b05f --- /dev/null +++ b/metis/profiling/importers/base.py @@ -0,0 +1,124 @@ +"""Base class for data profile importers.""" + +from __future__ import annotations + +import csv +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Any, Dict, List, TYPE_CHECKING + +if TYPE_CHECKING: + from metis.profiling.data_profile_manager import DataProfileManager + + +def auto_detect_type(value: str) -> int | float | bool | str: + """Auto-detect Python type from string value.""" + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + pass + if value.lower() in ("true", "false"): + return value.lower() == "true" + return value + + +class BaseImporter(ABC): + """Abstract base class for data profile importers. + + Each importer handles one or more related task types (e.g., ScalarImporter + handles all simple column->value tasks). + """ + + @property + @abstractmethod + def task_name(self) -> str: + """The profiling task name this importer handles.""" + ... + + @property + def profile_type(self) -> str: + """Profile type for storage (single_column, dependency, etc.).""" + return "single_column" + + @abstractmethod + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse an external file and return a list of profile dicts. + + Args: + file_path: Path to the file to parse + table_name: Name of the table (for column name extraction) + + Returns: + List of dicts with keys: column_names, value, task_config (optional) + """ + ... + + @abstractmethod + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + """Parse inline JSON values. + + Args: + values: List of value dicts from the config + table_name: Name of the table + + Returns: + List of dicts with keys: column_names, value, task_config (optional) + """ + ... + + def import_to_manager( + self, + config: Dict[str, Any], + manager: DataProfileManager, + dataset: str, + table: str, + ) -> int: + """Import profiles from config into the DataProfileManager. 
+ + Args: + config: The task config dict with 'source' and either 'file' or 'values' + manager: DataProfileManager instance + dataset: Dataset identifier + table: Table name + + Returns: + Number of profiles imported + """ + source = config.get("source", "imported") + + if "file" in config: + profiles = self.parse_file(config["file"], table) + elif "values" in config: + profiles = self.parse_inline(config["values"], table) + else: + raise ValueError( + f"Config for {self.task_name} must have 'file' or 'values'" + ) + + for profile in profiles: + manager.store( + column_names=profile["column_names"], + dp_task_name=self.task_name, + value=profile["value"], + task_config=profile.get("task_config"), + profile_type=self.profile_type, + source=source, + dataset=dataset, + table=table, + ) + + return len(profiles) + + @staticmethod + def read_csv(file_path: str) -> List[Dict[str, str]]: + """Read a CSV file and return list of row dicts.""" + path = Path(file_path) + with path.open("r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + return list(reader) From d706943c94db5cbc2fe7a74caf7a1d4d848048f6 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 12:26:08 +0100 Subject: [PATCH 14/25] feat: add scalar value importer for simple column profiles --- metis/profiling/importers/scalar_importer.py | 70 ++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 metis/profiling/importers/scalar_importer.py diff --git a/metis/profiling/importers/scalar_importer.py b/metis/profiling/importers/scalar_importer.py new file mode 100644 index 0000000..03e3197 --- /dev/null +++ b/metis/profiling/importers/scalar_importer.py @@ -0,0 +1,70 @@ +"""Importer for simple column->value profiling tasks.""" + +from typing import Any, Dict, List + +from .base import BaseImporter, auto_detect_type + + +class ScalarImporter(BaseImporter): + """Importer for scalar profiling tasks (column, value pairs). 
+ + Handles: null_count, null_percentage, distinct_count, row_count, uniqueness, + value_length_min/max/mean/median, constancy, most_frequent_value, + interquartile_range, basic_type, data_type, size, decimals, data_class, domain + """ + + def __init__(self, task_name: str): + self._task_name = task_name + + @property + def task_name(self) -> str: + return self._task_name + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + rows = self.read_csv(file_path) + return [ + { + "column_names": [row["column"]], + "value": auto_detect_type(row["value"]), + } + for row in rows + ] + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + return [ + { + "column_names": [v["column"]], + "value": v["value"], + } + for v in values + ] + + +# Pre-instantiated importers for scalar tasks +SCALAR_TASKS = [ + "null_count", + "null_percentage", + "distinct_count", + "row_count", + "uniqueness", + "value_length_min", + "value_length_max", + "value_length_mean", + "value_length_median", + "constancy", + "most_frequent_value", + "interquartile_range", + "basic_type", + "data_type", + "size", + "decimals", + "data_class", + "domain", +] + + +def create_scalar_importers() -> Dict[str, ScalarImporter]: + """Create ScalarImporter instances for all scalar tasks.""" + return {task: ScalarImporter(task) for task in SCALAR_TASKS} From 55679f196df0d36e7377bda1ec3822abefb7f362 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 12:47:52 +0100 Subject: [PATCH 15/25] feat: add histogram, patterns, quartiles, and jaccard importers --- .../profiling/importers/histogram_importer.py | 62 +++++++++++++++++++ metis/profiling/importers/jaccard_importer.py | 57 +++++++++++++++++ .../profiling/importers/patterns_importer.py | 48 ++++++++++++++ .../profiling/importers/quartiles_importer.py | 39 ++++++++++++ 4 files changed, 206 insertions(+) create mode 100644 metis/profiling/importers/histogram_importer.py create mode 100644 metis/profiling/importers/jaccard_importer.py create mode 100644 metis/profiling/importers/patterns_importer.py create mode 100644 metis/profiling/importers/quartiles_importer.py diff --git a/metis/profiling/importers/histogram_importer.py b/metis/profiling/importers/histogram_importer.py new file mode 100644 index 0000000..a9db520 --- /dev/null +++ b/metis/profiling/importers/histogram_importer.py @@ -0,0 +1,62 @@ +"""Importer for histogram profiling tasks.""" + +from collections import defaultdict +from typing import Any, Dict, List + +from .base import BaseImporter + + +class HistogramImporter(BaseImporter): + """Importer for equi_width_histogram and equi_depth_histogram tasks.""" + + def __init__(self, task_name: str): + self._task_name = task_name + + @property + def task_name(self) -> str: + return self._task_name + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse CSV with columns: column, bin_min, bin_max, count""" + rows = self.read_csv(file_path) + + # Group bins by column + column_bins: Dict[str, List[Dict]] = defaultdict(list) + for row in rows: + column_bins[row["column"]].append( + { + "min": float(row["bin_min"]), + "max": float(row["bin_max"]), + "count": int(row["count"]), + } + ) + + return [ + { + "column_names": [col], + "value": self._bins_to_histogram(bins), + } + for col, bins in column_bins.items() + ] + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + return [ + { + "column_names": [v["column"]], 
+ "value": self._bins_to_histogram(v["bins"]), + } + for v in values + ] + + @staticmethod + def _bins_to_histogram(bins: List[Dict]) -> Dict: + """Convert list of bin dicts to histogram format. + + Input: [{"min": 0, "max": 30, "count": 100}, ...] + Output: {"bin_edges": [(0, 30), ...], "frequencies": [100, ...]} + """ + bin_edges = [(b["min"], b["max"]) for b in bins] + frequencies = [b["count"] for b in bins] + return {"bin_edges": bin_edges, "frequencies": frequencies} diff --git a/metis/profiling/importers/jaccard_importer.py b/metis/profiling/importers/jaccard_importer.py new file mode 100644 index 0000000..8240103 --- /dev/null +++ b/metis/profiling/importers/jaccard_importer.py @@ -0,0 +1,57 @@ +"""Importer for Jaccard similarity tasks.""" + +from typing import Any, Dict, List + +from .base import BaseImporter, auto_detect_type + + +class JaccardImporter(BaseImporter): + """Importer for jaccard_similarity and jaccard_similarity_ngrams tasks.""" + + def __init__(self, task_name: str): + self._task_name = task_name + + @property + def task_name(self) -> str: + return self._task_name + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse CSV with columns: column1, column2, [n], value.""" + rows = self.read_csv(file_path) + results = [] + + for row in rows: + col1 = row["column1"] + col2 = row["column2"] + value = float(row["value"]) + + profile = { + "column_names": sorted([col1, col2]), + "value": value, + } + + # For ngrams, include n as task_config + if "n" in row and row["n"]: + profile["task_config"] = {"n": int(row["n"])} + + results.append(profile) + + return results + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + results = [] + + for v in values: + profile = { + "column_names": sorted([v["column1"], v["column2"]]), + "value": v["value"], + } + + if "n" in v: + profile["task_config"] = {"n": v["n"]} + + results.append(profile) + + return results diff --git a/metis/profiling/importers/patterns_importer.py b/metis/profiling/importers/patterns_importer.py new file mode 100644 index 0000000..9166787 --- /dev/null +++ b/metis/profiling/importers/patterns_importer.py @@ -0,0 +1,48 @@ +"""Importer for patterns profiling task.""" + +from collections import defaultdict +from typing import Any, Dict, List + +from .base import BaseImporter + + +class PatternsImporter(BaseImporter): + """Importer for the patterns task.""" + + @property + def task_name(self) -> str: + return "patterns" + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse CSV with columns: column, pattern, count, frequency""" + rows = self.read_csv(file_path) + + # Group patterns by column + column_patterns: Dict[str, List[Dict]] = defaultdict(list) + for row in rows: + column_patterns[row["column"]].append( + { + "pattern": row["pattern"], + "count": int(row["count"]), + "frequency": float(row["frequency"]), + } + ) + + return [ + { + "column_names": [col], + "value": patterns, + } + for col, patterns in column_patterns.items() + ] + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + return [ + { + "column_names": [v["column"]], + "value": v["patterns"], + } + for v in values + ] diff --git a/metis/profiling/importers/quartiles_importer.py b/metis/profiling/importers/quartiles_importer.py new file mode 100644 index 0000000..7d4c01c --- /dev/null +++ b/metis/profiling/importers/quartiles_importer.py @@ -0,0 +1,39 @@ +"""Importer for 
quartiles profiling task.""" + +from typing import Any, Dict, List + +from .base import BaseImporter + + +class QuartilesImporter(BaseImporter): + """Importer for the quartiles task.""" + + @property + def task_name(self) -> str: + return "quartiles" + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse CSV with columns: column, Q1, Q2, Q3""" + rows = self.read_csv(file_path) + return [ + { + "column_names": [row["column"]], + "value": { + "Q1": float(row["Q1"]), + "Q2": float(row["Q2"]), + "Q3": float(row["Q3"]), + }, + } + for row in rows + ] + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + return [ + { + "column_names": [v["column"]], + "value": {"Q1": v["Q1"], "Q2": v["Q2"], "Q3": v["Q3"]}, + } + for v in values + ] From 7eb5d823303f2c8d0337486dc3c681db229f51fc Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 13:14:29 +0100 Subject: [PATCH 16/25] feat: add dependency importers for FD, UCC, and IND --- metis/profiling/importers/fd_importer.py | 145 ++++++++++++++++++++++ metis/profiling/importers/ind_importer.py | 55 ++++++++ metis/profiling/importers/ucc_importer.py | 44 +++++++ 3 files changed, 244 insertions(+) create mode 100644 metis/profiling/importers/fd_importer.py create mode 100644 metis/profiling/importers/ind_importer.py create mode 100644 metis/profiling/importers/ucc_importer.py diff --git a/metis/profiling/importers/fd_importer.py b/metis/profiling/importers/fd_importer.py new file mode 100644 index 0000000..39b6725 --- /dev/null +++ b/metis/profiling/importers/fd_importer.py @@ -0,0 +1,145 @@ +"""Importer for functional dependencies with HyFD/AIDFD/CFDFinder parsers.""" + +from __future__ import annotations + +import re +from pathlib import Path +from typing import Any, Dict, List, TYPE_CHECKING + +from .base import BaseImporter + +if TYPE_CHECKING: + from metis.profiling.data_profile_manager import DataProfileManager + + +class FDImporter(BaseImporter): + """Importer for functional dependencies (fd task). 
+ + Supports: + - JSON inline: {"lhs": ["col1"], "rhs": "col2"} + - HyFD/AIDFD format: [table.col1, table.col2]->table.col3 + - CFDFinder format: [table.col1]->table.col2#(pattern1);(pattern2) + """ + + # HyFD/AIDFD: [table.col1, table.col2]->table.col3 + HYFD_PATTERN = re.compile(r"\[([^\]]+)\]->([^\s\[#]+)") + + # CFDFinder: [table.col1]->table.col2#(pattern) + CFD_PATTERN = re.compile(r"\[([^\]]+)\]->([^#\s]+)#(.+)") + + @property + def task_name(self) -> str: + return "fd" + + @property + def profile_type(self) -> str: + return "dependency" + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse FD output file (HyFD, AIDFD, or CFDFinder format).""" + path = Path(file_path) + content = path.read_text(encoding="utf-8") + + fds: List[Dict[str, Any]] = [] + + # Try CFDFinder first (has # pattern) + for match in self.CFD_PATTERN.finditer(content): + lhs_raw, rhs_raw, pattern_tableau = match.groups() + lhs = self._parse_columns(lhs_raw, table_name) + rhs = self._parse_column(rhs_raw, table_name) + fds.append( + { + "column_names": sorted(lhs + [rhs]), + "value": {"lhs": lhs, "rhs": rhs}, + "task_config": {"pattern_tableau": pattern_tableau}, + } + ) + + # If no CFD matches, try HyFD/AIDFD + if not fds: + for match in self.HYFD_PATTERN.finditer(content): + lhs_raw, rhs_raw = match.groups() + lhs = self._parse_columns(lhs_raw, table_name) + rhs = self._parse_column(rhs_raw, table_name) + fds.append( + { + "column_names": sorted(lhs + [rhs]), + "value": {"lhs": lhs, "rhs": rhs}, + } + ) + + return fds + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + """Parse inline FD definitions.""" + return [ + { + "column_names": sorted(v["lhs"] + [v["rhs"]]), + "value": {"lhs": v["lhs"], "rhs": v["rhs"]}, + } + for v in values + ] + + def import_to_manager( + self, + config: Dict[str, Any], + manager: DataProfileManager, + dataset: str, + table: str, + ) -> int: + """Import FDs using the dedicated store_fd method.""" + source = config.get("source", "imported") + + if "file" in config: + profiles = self.parse_file(config["file"], table) + elif "values" in config: + profiles = self.parse_inline(config["values"], table) + else: + raise ValueError("FD config must have 'file' or 'values'") + + for profile in profiles: + fd_value = profile["value"] + manager.store_fd( + lhs=fd_value["lhs"], + rhs=fd_value["rhs"], + dataset=dataset, + table=table, + source=source, + ) + + return len(profiles) + + @staticmethod + def _parse_columns(raw: str, table_name: str) -> List[str]: + """Parse comma-separated column list, stripping table prefix.""" + cols = [c.strip() for c in raw.split(",")] + return [FDImporter._strip_table_prefix(c, table_name) for c in cols] + + @staticmethod + def _parse_column(raw: str, table_name: str) -> str: + """Parse single column, stripping table prefix.""" + return FDImporter._strip_table_prefix(raw.strip(), table_name) + + @staticmethod + def _strip_table_prefix(col: str, table_name: str) -> str: + """Strip table.csv. or table. prefix from column name.""" + # Handle both "table.csv.col" and "table.col" formats + prefixes = [ + f"{table_name}.csv.", + f"{table_name}.", + ] + for prefix in prefixes: + if col.startswith(prefix): + return col[len(prefix) :] + # Also try without extension + table_base = table_name.rsplit(".", 1)[0] if "." 
in table_name else table_name + prefixes = [ + f"{table_base}.csv.", + f"{table_base}.", + ] + for prefix in prefixes: + if col.startswith(prefix): + return col[len(prefix) :] + return col diff --git a/metis/profiling/importers/ind_importer.py b/metis/profiling/importers/ind_importer.py new file mode 100644 index 0000000..1bd16ef --- /dev/null +++ b/metis/profiling/importers/ind_importer.py @@ -0,0 +1,55 @@ +"""Importer for inclusion dependencies.""" + +from typing import Any, Dict, List + +from .base import BaseImporter + + +class INDImporter(BaseImporter): + """Importer for inclusion dependencies (ind task).""" + + @property + def task_name(self) -> str: + return "ind" + + @property + def profile_type(self) -> str: + return "dependency" + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse CSV with columns: dependent, referenced, referenced_table.""" + rows = self.read_csv(file_path) + return [ + { + "column_names": sorted( + self._parse_columns(row["dependent"]) + + self._parse_columns(row["referenced"]) + ), + "value": { + "dependent": self._parse_columns(row["dependent"]), + "referenced": self._parse_columns(row["referenced"]), + "referenced_table": row.get("referenced_table"), + }, + } + for row in rows + ] + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + return [ + { + "column_names": sorted(v["dependent"] + v["referenced"]), + "value": { + "dependent": v["dependent"], + "referenced": v["referenced"], + "referenced_table": v.get("referenced_table"), + }, + } + for v in values + ] + + @staticmethod + def _parse_columns(raw: str) -> List[str]: + """Parse column list (may be comma-separated).""" + return [c.strip() for c in raw.split(",")] diff --git a/metis/profiling/importers/ucc_importer.py b/metis/profiling/importers/ucc_importer.py new file mode 100644 index 0000000..b2b16cb --- /dev/null +++ b/metis/profiling/importers/ucc_importer.py @@ -0,0 +1,44 @@ +"""Importer for unique column combinations.""" + +from typing import Any, Dict, List + +from .base import BaseImporter + + +class UCCImporter(BaseImporter): + """Importer for unique column combinations (ucc task).""" + + @property + def task_name(self) -> str: + return "ucc" + + @property + def profile_type(self) -> str: + return "dependency" + + def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]: + """Parse CSV with column: columns (comma-separated in quotes for multi).""" + rows = self.read_csv(file_path) + return [ + { + "column_names": self._parse_columns(row["columns"]), + "value": {"columns": self._parse_columns(row["columns"])}, + } + for row in rows + ] + + def parse_inline( + self, values: List[Dict[str, Any]], table_name: str + ) -> List[Dict[str, Any]]: + return [ + { + "column_names": sorted(v["columns"]), + "value": {"columns": v["columns"]}, + } + for v in values + ] + + @staticmethod + def _parse_columns(raw: str) -> List[str]: + """Parse column list (may be comma-separated).""" + return sorted([c.strip() for c in raw.split(",")]) From 961d75697844ae944c85255e02e92a9ae1b16b42 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 13:38:44 +0100 Subject: [PATCH 17/25] feat: add data_profiles field to DataConfig --- metis/utils/data_config.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/metis/utils/data_config.py b/metis/utils/data_config.py index 49fa368..4ca1969 100644 --- a/metis/utils/data_config.py +++ b/metis/utils/data_config.py @@ -1,5 +1,5 @@ 
from pathlib import Path -from typing import Dict, List +from typing import Any, Dict, List import metis.globals @@ -29,5 +29,8 @@ def __init__( self.parse_dates: bool = config.get("parse_dates", False) self.decimals: str = config.get("decimals", ".") self.thousands: str | None = config.get("thousands") - self.decimals: str = config.get("decimals", ".") - self.thousands: str | None = config.get("thousands") + + # Data profiling imports + self.data_profiles: Dict[str, Dict[str, Any]] | None = config.get( + "data_profiles" + ) From 8027d44bff27edd636dabac4cdef09518d1dee67 Mon Sep 17 00:00:00 2001 From: Tomic-Riedel Date: Sun, 22 Feb 2026 13:56:18 +0100 Subject: [PATCH 18/25] feat: integrate data profile import into DQOrchestrator --- metis/dq_orchestrator.py | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/metis/dq_orchestrator.py b/metis/dq_orchestrator.py index 0d699a3..0df957b 100644 --- a/metis/dq_orchestrator.py +++ b/metis/dq_orchestrator.py @@ -5,6 +5,8 @@ from metis.loader.csv_loader import CSVLoader from metis.metric import Metric +from metis.profiling.data_profile_manager import DataProfileManager +from metis.profiling.importers import get_importer from metis.utils.data_config import DataConfig from metis.utils.result import DQResult from metis.writer.console_writer import ConsoleWriter @@ -32,6 +34,11 @@ def __init__(self, writer_config_path: str | None = None) -> None: elif writer_config["writer_name"] == "postgres": self.writer = PostgresWriter(writer_config) + # Initialize profile cache using the same DB as the writer. + # No caching if no DB writer is configured. + if hasattr(self.writer, "engine"): + DataProfileManager.initialize(self.writer.engine) + def load(self, data_loader_configs: List[str]) -> None: for config_path in data_loader_configs: with open(config_path, "r") as f: @@ -49,6 +56,12 @@ def load(self, data_loader_configs: List[str]) -> None: reference_config.file_name = config.reference_file_name reference_dataframe = loader.load(reference_config) self.reference_dataframes[config.name] = reference_dataframe + + # Import pre-computed data profiles + if config.data_profiles: + self._import_data_profiles( + config.data_profiles, config_path, config.name + ) else: raise ValueError( f"Unsupported loader type: {config_data.get('loader', None)}" @@ -63,6 +76,11 @@ def assess(self, metrics: List[str], metric_configs: List[str | None]) -> None: raise ValueError(f"Metric {metric} is not registered.") metric_instance: Metric = metric_class() for df_name, df in self.dataframes.items(): + # Set profiling context so cached functions know the active dataset. + if DataProfileManager.is_initialized(): + DataProfileManager.get_instance().set_context( + dataset=self.data_paths[df_name], table=df_name + ) incomplete_metric_results = metric_instance.assess( data=df, reference=self.reference_dataframes.get(df_name), @@ -77,3 +95,29 @@ def assess(self, metrics: List[str], metric_configs: List[str | None]) -> None: def get_dq_result(self, query: str) -> List[DQResult]: return [] + + def _import_data_profiles( + self, profiles: dict, dataset: str, table: str + ) -> None: + """Import pre-computed data profiles from config. 
+
+        Args:
+            profiles: Dict mapping task_name -> {source, file|values}
+            dataset: Dataset identifier (config path)
+            table: Table name
+        """
+        if not DataProfileManager.is_initialized():
+            return
+
+        manager = DataProfileManager.get_instance()
+
+        for task_name, task_config in profiles.items():
+            try:
+                importer = get_importer(task_name)
+            except KeyError as e:
+                raise ValueError(
+                    f"Unknown data profile task: {task_name}"
+                ) from e
+            importer.import_to_manager(
+                task_config, manager, dataset, table
+            )

From 03c3cae42b587da3a412ee2686c2d20ca7ff5551 Mon Sep 17 00:00:00 2001
From: Tomic-Riedel
Date: Sun, 22 Feb 2026 14:23:05 +0100
Subject: [PATCH 19/25] docs: add data profile import format documentation

---
 README.md                           |  40 +++
 docs/DATA_PROFILE_IMPORT_FORMATS.md | 429 ++++++++++++++++++++++++++++
 2 files changed, 469 insertions(+)
 create mode 100644 docs/DATA_PROFILE_IMPORT_FORMATS.md

diff --git a/README.md b/README.md
index efae405..57ceb3b 100644
--- a/README.md
+++ b/README.md
@@ -67,5 +67,45 @@ Furthermore, there are more optional arguments that might need to be set dependi
 - **rowIndex: Optional[int]**: Index of the row this result is associated with. This can either be used together with columnNames to assess data quality on a cell level or for row based metrics.
 - **DQannotations: Optional[dict]**: To allow metrics to save additional information or annotations, this dictionary can store all additional information that might need to be saved. This currently does not need to follow a predefined structure.

+## Data Profiling
+Metis includes a data profiling system that caches computed statistics and supports importing pre-computed profiles.
+
+### Cached Profiling Functions
+
+Use cached profiling functions from `metis.profiling` for automatic caching:
+
+```python
+from metis.profiling import null_count, distinct_count, data_type
+
+# These are automatically cached when DataProfileManager is initialized
+nulls = null_count(df["column"])
+```
+
+### Importing Pre-computed Profiles
+
+You can import pre-computed data profiles (from external tools like HyFD, CFDFinder, etc.) via the data loader config:
+
+```json
+{
+  "loader": "CSV",
+  "name": "Adult",
+  "file_name": "adult.csv",
+  "data_profiles": {
+    "fd": {
+      "source": "hyfd",
+      "file": "outputs/adult_hyfd.txt"
+    },
+    "null_count": {
+      "source": "manual",
+      "values": [
+        {"column": "age", "value": 0},
+        {"column": "workclass", "value": 1836}
+      ]
+    }
+  }
+}
+```
+
+For complete documentation of all supported import formats, see [Data Profile Import Formats](docs/DATA_PROFILE_IMPORT_FORMATS.md).

diff --git a/docs/DATA_PROFILE_IMPORT_FORMATS.md b/docs/DATA_PROFILE_IMPORT_FORMATS.md
new file mode 100644
index 0000000..c2a69d4
--- /dev/null
+++ b/docs/DATA_PROFILE_IMPORT_FORMATS.md
@@ -0,0 +1,429 @@
+# Data Profile Import Formats
+
+This document describes all supported formats for importing pre-computed data profiling results into Metis.
+
+## Overview
+
+Data profiles are defined in the data loader config (e.g., `adult.json`) under the `data_profiles` key. Each task type supports two import methods:
+
+1. **Inline values** (`values`): Define results directly in the JSON config
+2. **External file** (`file`): Reference a CSV or TXT file with the results
+
+```json
+{
+  "loader": "CSV",
+  "name": "Adult",
+  "file_name": "adult.csv",
+  "data_profiles": {
+    "<task_name>": {
+      "source": "<source>",
+      "file": "path/to/file.csv",   // OR
+      "values": [...]               // inline values
+    }
+  }
+}
+```
+
+---
+
+## Cardinalities
+
+### null_count, null_percentage, distinct_count, row_count, uniqueness
+
+**JSON inline:**
+```json
+"null_count": {
+  "source": "manual",
+  "values": [
+    {"column": "age", "value": 150},
+    {"column": "income", "value": 42}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,value
+age,150
+income,42
+```
+
+### value_length_min, value_length_max, value_length_mean, value_length_median
+
+**JSON inline:**
+```json
+"value_length_max": {
+  "source": "manual",
+  "values": [
+    {"column": "name", "value": 50},
+    {"column": "email", "value": 100}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,value
+name,50
+email,100
+```
+
+---
+
+## Value Distribution
+
+### constancy, interquartile_range
+
+**JSON inline:**
+```json
+"constancy": {
+  "source": "manual",
+  "values": [
+    {"column": "status", "value": 0.85}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,value
+status,0.85
+```
+
+### most_frequent_value
+
+Values are auto-detected as int, float, bool, or string.
+
+**JSON inline:**
+```json
+"most_frequent_value": {
+  "source": "manual",
+  "values": [
+    {"column": "status", "value": "active"},
+    {"column": "count", "value": 42}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,value
+status,active
+count,42
+```
+
+### quartiles
+
+**JSON inline:**
+```json
+"quartiles": {
+  "source": "manual",
+  "values": [
+    {"column": "age", "Q1": 25.0, "Q2": 35.0, "Q3": 50.0},
+    {"column": "income", "Q1": 30000, "Q2": 50000, "Q3": 80000}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,Q1,Q2,Q3
+age,25.0,35.0,50.0
+income,30000,50000,80000
+```
+
+### equi_width_histogram, equi_depth_histogram
+
+**JSON inline:**
+```json
+"equi_width_histogram": {
+  "source": "manual",
+  "values": [
+    {
+      "column": "age",
+      "bins": [
+        {"min": 0, "max": 30, "count": 1500},
+        {"min": 30, "max": 60, "count": 2200},
+        {"min": 60, "max": 90, "count": 800}
+      ]
+    }
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,bin_min,bin_max,count
+age,0,30,1500
+age,30,60,2200
+age,60,90,800
+income,0,50000,3000
+income,50000,100000,2500
+```
+
+---
+
+## Patterns and Data Types
+
+### basic_type, data_type, data_class, domain
+
+**JSON inline:**
+```json
+"basic_type": {
+  "source": "manual",
+  "values": [
+    {"column": "age", "value": "numeric"},
+    {"column": "name", "value": "alphabetic"}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,value
+age,numeric
+name,alphabetic
+```
+
+Valid values for `basic_type`: `numeric`, `alphabetic`, `alphanumeric`, `date`, `time`, `mixed`, `empty`
+
+Valid values for `data_type`: `boolean`, `smallint`, `int`, `bigint`, `numeric`, `double`, `date`, `time`, `timestamp`, `varchar`, `text`
+
+Valid values for `data_class`: `code`, `indicator`, `text`, `date/time`, `quantity`, `identifier`
+
+Valid values for `domain`: `email`, `url`, `ssn`, `date_iso`, `time`, `ip_address`, `zip_code`, `credit_card`, `phone`, `currency`, `first_name`, `last_name`, `full_name`, `city`, `state`, `country`, `address`, `postal_code`, `unknown`
+
+### size, decimals
+
+**JSON inline:**
+```json
+"size": {
+  "source": "manual",
+  "values": [
+    {"column": "price", "value": 10}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,value
+price,10
+```
+
+### patterns
+
+**JSON inline:**
+```json
+"patterns": {
+  "source": "manual",
+  "values": [
+    {
+      "column": "phone",
+      "patterns": [
+        {"pattern": "999-999-9999", "count": 500, "frequency": 0.8},
+        {"pattern": "(999) 999-9999", "count": 100, "frequency": 0.16}
+      ]
+    }
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column,pattern,count,frequency
+phone,999-999-9999,500,0.8
+phone,(999) 999-9999,100,0.16
+```
+
+Pattern codes: `A`=uppercase, `a`=lowercase, `9`=digit, `#`=special char, `?`=other letter, ` `=space
+
+---
+
+## Summaries and Sketches
+
+### jaccard_similarity
+
+**JSON inline:**
+```json
+"jaccard_similarity": {
+  "source": "manual",
+  "values": [
+    {"column1": "name", "column2": "alias", "value": 0.85}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column1,column2,value
+name,alias,0.85
+```
+
+### jaccard_similarity_ngrams
+
+**JSON inline:**
+```json
+"jaccard_similarity_ngrams": {
+  "source": "manual",
+  "values": [
+    {"column1": "name", "column2": "alias", "n": 2, "value": 0.78}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+column1,column2,n,value
+name,alias,2,0.78
+```
+
+### minhash_signature
+
+Not importable (returns MinHash objects).
+
+---
+
+## Dependencies
+
+### Functional Dependencies (fd)
+
+#### JSON inline
+
+```json
+"fd": {
+  "source": "manual",
+  "values": [
+    {"lhs": ["zip"], "rhs": "city"},
+    {"lhs": ["id"], "rhs": "name"},
+    {"lhs": ["city", "street"], "rhs": "zip"}
+  ]
+}
+```
+
+#### External file (HyFD / AIDFD format)
+
+```json
+"fd": {
+  "source": "hyfd",
+  "file": "outputs/adult_hyfd.txt"
+}
+```
+
+**HyFD/AIDFD output format:**
+```
+[table.col1]->table.col2
+[table.col1, table.col2]->table.col3
+```
+
+One FD per line. The table prefix is stripped automatically.
+
+#### External file (CFDFinder format)
+
+```json
+"fd": {
+  "source": "cfdfinder",
+  "file": "outputs/adult_cfd.txt"
+}
+```
+
+**CFDFinder output format:**
+```
+[table.col1]->table.col2#(pattern1);(pattern2)
+```
+
+The pattern tableau is stored as additional metadata.
+
+### Unique Column Combinations (ucc)
+
+**JSON inline:**
+```json
+"ucc": {
+  "source": "manual",
+  "values": [
+    {"columns": ["id"]},
+    {"columns": ["name", "birthdate"]}
+  ]
+}
+```
+
+**CSV file:**
+```csv
+columns
+id
+"name,birthdate"
+```
+
+### Inclusion Dependencies (ind)
+
+**JSON inline:**
+```json
+"ind": {
+  "source": "manual",
+  "values": [
+    {
+      "dependent": ["customer_id"],
+      "referenced": ["id"],
+      "referenced_table": "customers"
+    }
+  ]
+}
+```
+
+**CSV file:**
+```csv
+dependent,referenced,referenced_table
+customer_id,id,customers
+"order_id,product_id","id,id","orders,products"
+```
+
+---
+
+## Source Identifiers
+
+The `source` field tracks where the data came from:
+
+| Source | Description |
+|--------|-------------|
+| `manual` | Manually entered values |
+| `hyfd` | HyFD algorithm output |
+| `aidfd` | AIDFD algorithm output |
+| `cfdfinder` | CFDFinder algorithm output |
+| `computed` | Computed by Metis (automatic) |
+| `imported:<source>` | Custom import source |
+
+---
+
+## Example: Complete Config
+
+```json
+{
+  "loader": "CSV",
+  "name": "Adult",
+  "file_name": "adult.csv",
+  "data_profiles": {
+    "fd": {
+      "source": "hyfd",
+      "file": "outputs/adult_hyfd.txt"
+    },
+    "null_count": {
+      "source": "manual",
+      "values": [
+        {"column": "age", "value": 0},
+        {"column": "workclass", "value": 1836}
+      ]
+    },
+    "equi_width_histogram": {
+      "source": "manual",
+      "file": "outputs/adult_histograms.csv"
+    },
+    "basic_type": {
+      "source": "manual",
+      "values": [
+        {"column": "age", "value": "numeric"},
+        {"column": "education", "value": "alphabetic"}
+      ]
+    }
+  }
+}
+```

From bc87bbfd999d3ec7530eb2f19303a6415e52f2fc Mon Sep 17 00:00:00 2001
From: Tomic Riedel
Date: Wed, 25 Feb 2026 11:01:23 +0100
Subject: [PATCH 20/25] fix: remove incompatible cached wrapper from estimate_jaccard_from_minhash

---
 metis/profiling/single_column/summaries_and_sketches/minhash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metis/profiling/single_column/summaries_and_sketches/minhash.py b/metis/profiling/single_column/summaries_and_sketches/minhash.py
index b8d307a..da1ad28 100644
--- a/metis/profiling/single_column/summaries_and_sketches/minhash.py
+++ b/metis/profiling/single_column/summaries_and_sketches/minhash.py
@@ -5,4 +5,4 @@
 )

 minhash_signature = cached(_minhash_signature)
-estimate_jaccard_from_minhash = cached(_estimate_jaccard_from_minhash)
+estimate_jaccard_from_minhash = _estimate_jaccard_from_minhash

From 07d4e00e032d2bdf926c31755f1ec310e7d81009 Mon Sep 17 00:00:00 2001
From: Tomic Riedel
Date: Wed, 25 Feb 2026 11:14:47 +0100
Subject: [PATCH 21/25] fix: handle multi-Series args in cached decorator

---
 metis/profiling/cache.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/metis/profiling/cache.py b/metis/profiling/cache.py
index 9c61353..459cffe 100644
--- a/metis/profiling/cache.py
+++ b/metis/profiling/cache.py
@@ -48,6 +48,12 @@ def wrapper(data: pd.Series | pd.DataFrame, *args: Any, **kwargs: Any) -> Any:
         else:
             column_names = [str(c) for c in data.columns]

+        # Include column names from any extra Series args (e.g. jaccard's column2)
+        # so that the cache key distinguishes different column pairs.
+        for arg in args:
+            if isinstance(arg, pd.Series) and arg.name is not None:
+                column_names.append(str(arg.name))
+
         # Build optional config dict from extra arguments (if any).
         task_config = _build_config(fn, args, kwargs) or None

@@ -84,6 +90,8 @@ def _build_config(fn: Callable, args: tuple, kwargs: dict) -> dict | None:

     config: dict = {}
     for i, val in enumerate(args):
+        if isinstance(val, pd.Series):
+            continue
         if i < len(params):
             config[params[i]] = val
     config.update(kwargs)

From e70b4ee2285db64fae8e3e04becc48ece5218fcd Mon Sep 17 00:00:00 2001
From: Tomic Riedel
Date: Wed, 25 Feb 2026 11:31:09 +0100
Subject: [PATCH 22/25] fix: add MinHash serialize/deserialize support in DataProfileManager

---
 metis/profiling/data_profile_manager.py | 44 +++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/metis/profiling/data_profile_manager.py b/metis/profiling/data_profile_manager.py
index 539d9bb..f29509a 100644
--- a/metis/profiling/data_profile_manager.py
+++ b/metis/profiling/data_profile_manager.py
@@ -309,6 +309,31 @@ def to_json_safe(v: Any) -> Any:

         if isinstance(value, pd.Series):
             return {"v": to_json_safe(value.to_dict())}, "series"
+
+        # MinHash support (for minhash_signature results)
+        from datasketch import MinHash as _MinHash
+
+        if isinstance(value, _MinHash):
+            return {
+                "v": {
+                    "hashvalues": value.hashvalues.tolist(),
+                    "num_perm": int(value.num_perm),
+                    "seed": int(value.seed),
+                }
+            }, "minhash"
+
+        if isinstance(value, dict) and value and isinstance(next(iter(value.values())), _MinHash):
+            return {
+                "v": {
+                    k: {
+                        "hashvalues": v.hashvalues.tolist(),
+                        "num_perm": int(v.num_perm),
+                        "seed": int(v.seed),
+                    }
+                    for k, v in value.items()
+                }
+            }, "minhash_dict"
+
         if isinstance(value, dict):
             return {"v": to_json_safe(value)}, "dict"
         if isinstance(value, list):
@@ -325,4 +350,23 @@ def _deserialize(payload: Optional[dict], result_type: str) -> Any:
     raw = payload.get("v")
     if result_type == "series":
         return pd.Series(raw)
+    if result_type == "minhash":
+        from datasketch import MinHash as _MinHash
+        import numpy as np
+        return _MinHash(
+            num_perm=raw["num_perm"],
+            seed=raw["seed"],
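+            # datasketch MinHash hash values are unsigned 64-bit integers,
+            # so the dtype is restored explicitly when rebuilding the sketch.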
+            hashvalues=np.array(raw["hashvalues"], dtype=np.uint64),
+        )
+    if result_type == "minhash_dict":
+        from datasketch import MinHash as _MinHash
+        import numpy as np
+        return {
+            k: _MinHash(
+                num_perm=v["num_perm"],
+                seed=v["seed"],
+                hashvalues=np.array(v["hashvalues"], dtype=np.uint64),
+            )
+            for k, v in raw.items()
+        }
     return raw

From 214fdff4a9d79ce79a3d1f55e7ce4b98c790e6b8 Mon Sep 17 00:00:00 2001
From: Tomic Riedel
Date: Wed, 25 Feb 2026 11:36:35 +0100
Subject: [PATCH 23/25] fix: escape dot and add quantifier in URL domain pattern

---
 .../single_column/domain_classification/domain.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/metis/utils/data_profiling/single_column/domain_classification/domain.py b/metis/utils/data_profiling/single_column/domain_classification/domain.py
index 6cb4f6c..dd5f0b3 100644
--- a/metis/utils/data_profiling/single_column/domain_classification/domain.py
+++ b/metis/utils/data_profiling/single_column/domain_classification/domain.py
@@ -4,7 +4,7 @@

 DOMAIN_PATTERNS: Dict[str, str] = {
     "email": r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
-    "url": r'^https?://[^\s/$.?#].[^\s]*$',
+    "url": r'^https?://[^\s/$.?#]+\.[^\s]*$',
     "ssn": r'^(?!000|666|9\d{2})\d{3}-(?!00)\d{2}-(?!0000)\d{4}$',
     "date_iso": r'^\d{4}-(0[1-9]|1[0-2])-(0[1-9]|[12]\d|3[01])$',
     "time": r'^([01]\d|2[0-3]):[0-5]\d(:[0-5]\d)?$',

From 0a8ce25dea5b0942d4a4c7ba3887d722acc49620 Mon Sep 17 00:00:00 2001
From: Tomic Riedel
Date: Wed, 25 Feb 2026 11:51:42 +0100
Subject: [PATCH 24/25] fix: use upsert in DataProfileManager.store() to prevent duplicate rows

---
 metis/profiling/data_profile_manager.py | 45 ++++++++++++++++++-------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/metis/profiling/data_profile_manager.py b/metis/profiling/data_profile_manager.py
index f29509a..b233851 100644
--- a/metis/profiling/data_profile_manager.py
+++ b/metis/profiling/data_profile_manager.py
@@ -159,20 +159,39 @@ def store(

         serialized, result_type = self._serialize(value)

-        profile = DataProfile(
-            dataset=ds,
-            table_name=tbl,
-            column_names=column_names,
-            dp_task_name=dp_task_name,
-            task_config=task_config,
-            profile_type=profile_type,
-            dp_result_value=serialized,
-            result_type=result_type,
-            source=source,
-        )
-
         with Session(self._engine) as session:
-            session.add(profile)
+            # Find existing row with same logical key
+            stmt = (
+                select(DataProfile)
+                .where(DataProfile.dataset == ds)
+                .where(DataProfile.table_name == tbl)
+                .where(DataProfile.dp_task_name == dp_task_name)
+            )
+            existing = None
+            for row in session.execute(stmt).scalars():
+                if sorted(row.column_names) == sorted(column_names):
+                    cfg = row.task_config or {}
+                    if cfg == (task_config or {}):
+                        existing = row
+                        break
+
+            if existing is not None:
+                existing.dp_result_value = serialized
+                existing.result_type = result_type
+                existing.profile_type = profile_type
+                existing.source = source
+            else:
+                session.add(DataProfile(
+                    dataset=ds,
+                    table_name=tbl,
+                    column_names=column_names,
+                    dp_task_name=dp_task_name,
+                    task_config=task_config,
+                    profile_type=profile_type,
+                    dp_result_value=serialized,
+                    result_type=result_type,
+                    source=source,
+                ))
             session.commit()

         # update in-memory cache

From 7846ef51fa8e08584b5c185020cc4cc72de93a5e Mon Sep 17 00:00:00 2001
From: Tomic Riedel
Date: Wed, 25 Feb 2026 12:05:18 +0100
Subject: [PATCH 25/25] fix: parse FD formats per-line to avoid silent HyFD drop

---
 metis/profiling/importers/fd_importer.py | 33 +++++++++++++-----------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/metis/profiling/importers/fd_importer.py b/metis/profiling/importers/fd_importer.py
index 39b6725..a6f2bba 100644
--- a/metis/profiling/importers/fd_importer.py
+++ b/metis/profiling/importers/fd_importer.py
@@ -42,22 +42,25 @@ def parse_file(self, file_path: str, table_name: str) -> List[Dict[str, Any]]:

         fds: List[Dict[str, Any]] = []

-        # Try CFDFinder first (has # pattern)
-        for match in self.CFD_PATTERN.finditer(content):
-            lhs_raw, rhs_raw, pattern_tableau = match.groups()
-            lhs = self._parse_columns(lhs_raw, table_name)
-            rhs = self._parse_column(rhs_raw, table_name)
-            fds.append(
-                {
-                    "column_names": sorted(lhs + [rhs]),
-                    "value": {"lhs": lhs, "rhs": rhs},
-                    "task_config": {"pattern_tableau": pattern_tableau},
-                }
-            )
+        for line in content.splitlines():
+            # Try CFDFinder first (more specific: requires #)
+            match = self.CFD_PATTERN.search(line)
+            if match:
+                lhs_raw, rhs_raw, pattern_tableau = match.groups()
+                lhs = self._parse_columns(lhs_raw, table_name)
+                rhs = self._parse_column(rhs_raw, table_name)
+                fds.append(
+                    {
+                        "column_names": sorted(lhs + [rhs]),
+                        "value": {"lhs": lhs, "rhs": rhs},
+                        "task_config": {"pattern_tableau": pattern_tableau},
+                    }
+                )
+                continue

-        # If no CFD matches, try HyFD/AIDFD
-        if not fds:
-            for match in self.HYFD_PATTERN.finditer(content):
+            # Fall back to HyFD/AIDFD
+            match = self.HYFD_PATTERN.search(line)
+            if match:
                 lhs_raw, rhs_raw = match.groups()
                 lhs = self._parse_columns(lhs_raw, table_name)
                 rhs = self._parse_column(rhs_raw, table_name)
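
To see why the per-line dispatch in this final patch matters, here is a minimal, self-contained sketch of the same strategy. The regexes and result shapes below are simplified stand-ins for the importer's actual `CFD_PATTERN`, `HYFD_PATTERN`, and table-prefix stripping, so treat it as an illustration rather than the module itself:

```python
import re

# Simplified stand-ins for the patterns in fd_importer.py (assumed shapes):
# CFDFinder lines carry a "#"-separated pattern tableau; HyFD/AIDFD lines do not.
CFD_LINE = re.compile(r"\[([^\]]+)\]->([^#\s]+)#(.+)")
HYFD_LINE = re.compile(r"\[([^\]]+)\]->(\S+)")


def parse_fd_lines(content: str) -> list:
    """Try the more specific CFD pattern on each line, then fall back to HyFD."""
    fds = []
    for line in content.splitlines():
        match = CFD_LINE.search(line)
        if match:
            lhs_raw, rhs, tableau = match.groups()
            fds.append({
                "lhs": [c.strip() for c in lhs_raw.split(",")],
                "rhs": rhs,
                "pattern_tableau": tableau,
            })
            continue
        match = HYFD_LINE.search(line)
        if match:
            lhs_raw, rhs = match.groups()
            fds.append({"lhs": [c.strip() for c in lhs_raw.split(",")], "rhs": rhs})
    return fds


# A file mixing both formats keeps its plain HyFD-style FDs. With the old
# whole-content approach, a single CFD match suppressed every HyFD result.
mixed = "[t.zip]->t.city\n[t.a, t.b]->t.c#(p1);(p2)\n"
assert len(parse_fd_lines(mixed)) == 2
```

The design point mirrors the commit message: deciding the format once for the whole file made one CFDFinder-style line hide all plain FDs, whereas deciding per line lets mixed or mislabeled result files import completely.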