diff --git a/InteroperabilityEnabler/utils/annotation_dataset.py b/InteroperabilityEnabler/utils/annotation_dataset.py deleted file mode 100644 index f3c0be7..0000000 --- a/InteroperabilityEnabler/utils/annotation_dataset.py +++ /dev/null @@ -1,65 +0,0 @@ -""" -Add quality annotations to a Dataframe - -Author: Shahin ABDOUL SOUKOUR - Inria -Maintainer: Shahin ABDOUL SOUKOUR - Inria -""" - -import pandas as pd - - -def add_quality_annotations_to_df( - context_df, time_series_df, sep="__", assessed_attrs=None -): - """ - Add NGSI-LD quality annotations to either the context (instance-level) - or the time series (attribute-level). - - Args: - context_df (pd.DataFrame): Single-row DataFrame with 'id' and 'type'. - time_series_df (pd.DataFrame): Flattened time series DataFrame. - sep (str): Separator used in flattened column names (default: "__"). - assessed_attrs (list of str, optional): List of attributes to annotate. - If None, annotate the context (instance-level). - - Returns: - Tuple[pd.DataFrame, pd.DataFrame]: (updated context_df, updated time_series_df) - """ - # Copy inputs to avoid mutation - context_df = context_df.copy() - time_series_df = time_series_df.copy() - - entity_id = context_df.loc[0, "id"] - entity_type = context_df.loc[0, "type"] - - if assessed_attrs is None: - # Instance-level annotation → attach to context - context_df[f"hasQuality{sep}type"] = "Relationship" - context_df[f"hasQuality{sep}object"] = ( - f"urn:ngsi-ld:DataQualityAssessment:{entity_type}:{entity_id}" - ) - else: - # Attribute-level annotation → apply per-attribute, per-row - for attr in assessed_attrs: - attr_cols = [ - col for col in time_series_df.columns if col.startswith(f"{attr}{sep}") - ] - if not attr_cols: - raise ValueError(f"Attribute '{attr}' not found in DataFrame.") - - rows_to_annotate = time_series_df[attr_cols].notna().any(axis=1) - - quality_type_col = f"{attr}{sep}hasQuality{sep}type" - quality_obj_col = f"{attr}{sep}hasQuality{sep}object" - - # Initialize empty columns with None - time_series_df[quality_type_col] = None - time_series_df[quality_obj_col] = None - - # Apply values only to relevant rows - time_series_df.loc[rows_to_annotate, quality_type_col] = "Relationship" - time_series_df.loc[rows_to_annotate, quality_obj_col] = ( - f"urn:ngsi-ld:DataQualityAssessment:{entity_type}:{entity_id}:{attr}" - ) - - return context_df, time_series_df diff --git a/InteroperabilityEnabler/utils/data_mapper.py b/InteroperabilityEnabler/utils/data_mapper.py index 77a4340..dcceb00 100644 --- a/InteroperabilityEnabler/utils/data_mapper.py +++ b/InteroperabilityEnabler/utils/data_mapper.py @@ -11,13 +11,10 @@ def data_mapper( - context_df: pd.DataFrame, time_series_df: pd.DataFrame, sep="__" -) -> dict: + context_df: pd.DataFrame, time_series_df: pd.DataFrame, sep="__") -> dict: """ Maps data from context and time series DataFrames into a structured dictionary format, - while organizing instance-level quality annotations and grouping attributes from time - series data. The function ensures proper nesting of "hasQuality" fields, utilizes a - custom separator for splitting field names, and preserves timestamp data in ISO 8601 format. + while grouping attributes from time series data. Args: context_df (pd.DataFrame): The context DataFrame, expected to contain a single row @@ -32,35 +29,21 @@ def data_mapper( dict: A dictionary containing context-level attributes along with grouped and timestamped attribute data from the time series DataFrame. """ - # Extract context as dict + # Extract context as dictionary (single row) context = context_df.iloc[0].to_dict() - # Handle instance-level hasQuality annotation from context - instance_type_key = f"hasQuality{sep}type" - instance_object_key = f"hasQuality{sep}object" - if instance_type_key in context and instance_object_key in context: - if pd.notna(context[instance_type_key]) and pd.notna( - context[instance_object_key] - ): - context["hasQuality"] = { - "type": context.pop(instance_type_key), - "object": context.pop(instance_object_key), - } - else: - context.pop(instance_type_key, None) - context.pop(instance_object_key, None) - - # Prepare time series attribute grouping + # Initialize attribute grouping: attr -> list of observation dicts attribute_groups = {} for _, row in time_series_df.iterrows(): ts = row["observedAt"] ts_iso = datetime.utcfromtimestamp(int(ts)).strftime("%Y-%m-%dT%H:%M:%SZ") + # Temporarily collect fields for each attribute attr_temp = {} for col, val in row.items(): - if col == "observedAt": + if col == "observedAt": # or pd.isna(val): continue if sep in col: @@ -69,29 +52,13 @@ def data_mapper( attr_temp[attr] = {} attr_temp[attr][field] = val + # Add observedAt to each attribute's dict for attr, data in attr_temp.items(): data["observedAt"] = ts_iso - - # Detect and nest hasQuality fields if present - hq_type_key = "hasQuality" + sep + "type" - hq_obj_key = "hasQuality" + sep + "object" - - hq_type = data.pop(hq_type_key, None) - hq_obj = data.pop(hq_obj_key, None) - - # if pd.notna(hq_type) and pd.notna(hq_obj): - # data["hasQuality"] = {"type": hq_type, "object": hq_obj} - - # Always add hasQuality key, with None if missing - data["hasQuality"] = { - "type": None if pd.isna(hq_type) else hq_type, - "object": None if pd.isna(hq_obj) else hq_obj, - } - - # Store observations per attribute if attr not in attribute_groups: attribute_groups[attr] = [] attribute_groups[attr].append(data) - # Merge and return - return {**context, **attribute_groups} + # Merge context and time-series attributes + final_json = {**context, **attribute_groups} + return final_json \ No newline at end of file diff --git a/README.md b/README.md index 4fa63c7..b97d0ae 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ Interoperability Enabler (IE) component is designed to facilitate seamless integ ## Key Feature - Data Formatter - Convert JSON data (time-series data) into the SEDIMARK internal processing format (pandas DataFrames) -- Data Quality Annotations - Enable adding any kind of quality annotations to data inside pandas DataFrames - Data Mapper – Convert data from pandas DataFrames into JSON - Data Extractor – Extract relevant data from a pandas DataFrame - Metadata Restorer – Restore metadata to a pandas DataFrame @@ -37,7 +36,6 @@ InteroperabilityEnabler │ └── utils │ ├── __init__.py │ ├── add_metadata.py -│ ├── annotation_dataset.py │ ├── data_formatter.py │ ├── data_mapper.py │ ├── extract_data.py @@ -64,34 +62,12 @@ FILE_PATH="sample.json" context_df, time_series_df = data_formatter(FILE_PATH) ``` - -#### Data Quality Annotations (to enrich pandas DataFrames by adding quality annotations) - -Instance-level annotations: -```python -from InteroperabilityEnabler.utils.annotation_dataset import add_quality_annotations_to_df - -context_df, annotated_df = add_quality_annotations_to_df( - context_df, time_series_df, assessed_attrs=None -) -``` - -Attribute-level annotation: -```python -from InteroperabilityEnabler.utils.annotation_dataset import add_quality_annotations_to_df - -assessed_attrs = ["no"] # Base attribute name -context_df, annotated_df = add_quality_annotations_to_df( - context_df, time_series_df, assessed_attrs=assessed_attrs -) -``` - #### Data Mapper (to convert the DataFrame into JSON format) ```python from InteroperabilityEnabler.utils.data_mapper import data_mapper -data_json = data_mapper(context_df, annotated_df) +data_json = data_mapper(context_df, time_series_df) ``` #### Data Extractor (to extract and return specific columns from a pandas DataFrame) diff --git a/README_package.md b/README_package.md index 1e513a8..20b9834 100644 --- a/README_package.md +++ b/README_package.md @@ -6,7 +6,6 @@ Interoperability Enabler (IE) component is designed to facilitate seamless integ ## Key Feature - Data Formatter - Convert JSON data (time-series data) into the SEDIMARK internal processing format (pandas DataFrames) -- Data Quality Annotations - Enable adding any kind of quality annotations to data inside pandas DataFrames - Data Mapper – Convert data from pandas DataFrames into JSON - Data Extractor – Extract relevant data from a pandas DataFrame - Metadata Restorer – Restore metadata to a pandas DataFrame @@ -33,33 +32,12 @@ FILE_PATH="sample.json" context_df, time_series_df = data_formatter(FILE_PATH) ``` -#### Data Quality Annotations (to enrich pandas DataFrames by adding quality annotations) - -Instance-level annotations: -```python -from InteroperabilityEnabler.utils.annotation_dataset import add_quality_annotations_to_df - -context_df, annotated_df = add_quality_annotations_to_df( - context_df, time_series_df, assessed_attrs=None -) -``` - -Attribute-level annotation: -```python -from InteroperabilityEnabler.utils.annotation_dataset import add_quality_annotations_to_df - -assessed_attrs = ["no"] # Base attribute name -context_df, annotated_df = add_quality_annotations_to_df( - context_df, time_series_df, assessed_attrs=assessed_attrs -) -``` - #### Data Mapper (to convert the DataFrame into JSON format) ```python from InteroperabilityEnabler.utils.data_mapper import data_mapper -data_json = data_mapper(context_df, annotated_df) +data_json = data_mapper(context_df, time_series_df) ``` #### Data Extractor (to extract and return specific columns from a pandas DataFrame) diff --git a/tests/test_basic.py b/tests/test_basic.py index 339816c..bb94205 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,7 +1,6 @@ import pandas as pd import pytest from InteroperabilityEnabler.utils.data_formatter import data_formatter -from InteroperabilityEnabler.utils.annotation_dataset import add_quality_annotations_to_df from io import StringIO from InteroperabilityEnabler.utils.merge_data import merge_predicted_data from InteroperabilityEnabler.utils.extract_data import extract_columns @@ -58,81 +57,6 @@ def test_data_formatter(file_path): -@pytest.mark.parametrize("file_path", [FILE_PATH_JSON]) -def test_instance_level_annotation(file_path): - """ - Data quality annotation component tests. - Instance-level annotations. - """ - # Load JSON data - with open(file_path, "r", encoding="utf-8") as f: - json_data = json.load(f) - - # Convert to DataFrames - context_df, time_series_df = data_formatter(json_data, sep="__") - - # Apply instance-level annotation - updated_context_df, updated_time_series_df = add_quality_annotations_to_df( - context_df, time_series_df, sep="__", assessed_attrs=None - ) - - # Assertions for context-level quality annotation - assert "hasQuality__type" in updated_context_df.columns - assert "hasQuality__object" in updated_context_df.columns - - assert updated_context_df.loc[0, "hasQuality__type"] == "Relationship" - assert updated_context_df.loc[0, "hasQuality__object"] == ( - "urn:ngsi-ld:DataQualityAssessment:MonitoringSite:urn:sedimark:station:1" - ) - - # Time-series DataFrame should remain unchanged - assert "pm10__hasQuality__type" not in updated_time_series_df.columns - - - -@pytest.mark.parametrize("file_path", [FILE_PATH_JSON]) -def test_attribute_level_annotation(file_path): - """ - Data quality annotation component tests. - Attribut-level annotation. - """ - # Load JSON data from file - with open(file_path, "r", encoding="utf-8") as f: - json_data = json.load(f) - - # Convert to DataFrames - context_df, time_series_df = data_formatter(json_data, sep="__") - - # Apply attribute-level annotation on 'pm10' - updated_context_df, updated_time_series_df = add_quality_annotations_to_df( - context_df, - time_series_df, - sep="__", - assessed_attrs=["pm10"] - ) - - # Check that new quality columns are added for 'pm10' - assert "pm10__hasQuality__type" in updated_time_series_df.columns - assert "pm10__hasQuality__object" in updated_time_series_df.columns - - # Ensure all annotated rows have correct values - expected_object_uri = ( - "urn:ngsi-ld:DataQualityAssessment:MonitoringSite:urn:sedimark:station:1:pm10" - ) - - for i in range(len(updated_time_series_df)): - has_value = pd.notna(updated_time_series_df.loc[i, "pm10__value"]) - expected_type = "Relationship" if has_value else None - expected_obj = expected_object_uri if has_value else None - - assert updated_time_series_df.loc[i, "pm10__hasQuality__type"] == expected_type - assert updated_time_series_df.loc[i, "pm10__hasQuality__object"] == expected_obj - - # Confirm context_df is unchanged (no instance-level fields) - assert "hasQuality__type" not in updated_context_df.columns - - - @pytest.mark.parametrize("file_path", [FILE_PATH_JSON]) def test_data_mapper(file_path): """ @@ -146,14 +70,6 @@ def test_data_mapper(file_path): # Format data context_df, time_series_df = data_formatter(json_data, sep="__") - # Apply attribute-level annotation on 'no2' - context_df, time_series_df = add_quality_annotations_to_df( - context_df, - time_series_df, - sep="__", - assessed_attrs=["no2"] - ) - # Map back to JSON structure mapped_data = data_mapper(context_df, time_series_df, sep="__") @@ -163,18 +79,6 @@ def test_data_mapper(file_path): assert mapped_data["type"] == "MonitoringSite" assert "no2" in mapped_data - # Check at least one annotation exists for 'no2' - no2_values = mapped_data["no2"] - assert isinstance(no2_values, list) - - found_annotated = any( - "hasQuality" in item and - item["hasQuality"]["type"] == "Relationship" and - item["hasQuality"]["object"].endswith(":no2") - for item in no2_values - ) - assert found_annotated, "No attribute-level annotation found for no2" - def test_extract_columns_valid_indices(): """