From aa03d5402449b93b31b77130fca6a7ad8d1f3915 Mon Sep 17 00:00:00 2001 From: NeurArk Date: Tue, 20 May 2025 12:16:11 +0200 Subject: [PATCH] Fix package discovery --- .gitignore | 2 + README.md | 5 +++ datamorpher/cleaner.py | 80 +++++++++++++++++++++++++++++------- datamorpher/converter.py | 54 +++++++++++++++++++++--- datamorpher/streamlit_app.py | 1 + pyproject.toml | 3 ++ sample_data/messy_data.csv | 11 +++++ tests/test_cleaner.py | 2 +- 8 files changed, 137 insertions(+), 21 deletions(-) create mode 100644 sample_data/messy_data.csv diff --git a/.gitignore b/.gitignore index 480b11c..2ea7546 100644 --- a/.gitignore +++ b/.gitignore @@ -74,6 +74,8 @@ report_dirty.md *.csv !requirements.txt !sample_report.md +!sample_data/ +!sample_data/*.csv # OS specific .DS_Store diff --git a/README.md b/README.md index 64a1287..47aaca4 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,11 @@ The report also lists transformations applied, for example: - NaN -> 76166.67 (mean) ``` +## Sample Data + +A messy CSV file is included in `sample_data/messy_data.csv` to test the +cleaning features and edge cases. + ## Development ### Running Tests diff --git a/datamorpher/cleaner.py b/datamorpher/cleaner.py index c721f2f..a85ab67 100644 --- a/datamorpher/cleaner.py +++ b/datamorpher/cleaner.py @@ -29,34 +29,46 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]: for col in df.columns: dtype = infer_dtype(df[col], skipna=True) series = df[col] + if dtype in {"string", "mixed"}: + bools = _normalize_booleans(series) + if bools.notna().sum() / max(series.notna().sum(), 1) >= 0.5: + df[col] = bools + dtype = "boolean" + series = df[col] if dtype in {"string", "mixed"}: validated, invalid = _validate_numeric( series, col, transformations ) if validated is not None: - df[col] = validated - if invalid: - info["invalid"][col] = invalid - dtype = "floating" - else: + ratio = validated.notna().sum() / max(series.notna().sum(), 1) + if ratio >= 0.5: + df[col] = validated + if invalid: + info["invalid"][col] = invalid + dtype = "floating" + if dtype in {"string", "mixed"}: validated, invalid = _validate_dates( series, column=col, transformations=transformations ) if validated is not None: - df[col] = validated - if invalid: - info["invalid"][col] = invalid - dtype = "date" + ratio = validated.notna().sum() / max( + series.notna().sum(), 1 + ) + if ratio >= 0.5: + df[col] = validated + if invalid: + info["invalid"][col] = invalid + dtype = "date" if df[col].isna().any(): if dtype in {"integer", "floating"}: - value = df[col].mean() + value = df[col].median() df[col] = df[col].fillna(value) - info["imputed"][col] = "mean" + info["imputed"][col] = "median" transformations.setdefault(col, []).append( - f"NaN -> {value:.2f} (mean)" + f"NaN -> {value:.2f} (median)" ) - elif dtype in {"string", "categorical", "boolean"}: + elif dtype in {"string", "categorical"}: mode = df[col].mode(dropna=True) if not mode.empty: value = mode.iloc[0] @@ -65,6 +77,15 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]: transformations.setdefault(col, []).append( f"NaN -> {value} (mode)" ) + elif dtype == "boolean": + mode = df[col].mode(dropna=True) + if not mode.empty: + value = bool(mode.iloc[0]) + df[col] = df[col].fillna(value) + info["imputed"][col] = "mode" + transformations.setdefault(col, []).append( + f"NaN -> {value} (mode)" + ) return df, info @@ -116,6 +137,18 @@ def _words_to_num(text: str | None) -> float | None: return float(value) +def _normalize_booleans(series: pd.Series) -> pd.Series: + """Normalize various boolean representations.""" + true_values = {"true", "yes", "1", "t", "y"} + false_values = {"false", "no", "0", "f", "n", "inactive"} + + lowercase = series.astype(str).str.lower() + result = pd.Series(index=series.index, dtype="object") + result[lowercase.isin(true_values)] = True + result[lowercase.isin(false_values)] = False + return result + + def _validate_numeric( series: pd.Series, column: str | None = None, @@ -138,7 +171,7 @@ def _validate_numeric( extracted = ( series.where(mask) .where(~series.str.contains(r"[/-]", na=False)) - .str.extract(r"(\d+\.?\d*)")[0] + .str.extract(r"(\d+\.\d+|\d+)")[0] ) extracted_numeric = pd.to_numeric(extracted, errors="coerce") if transformations is not None and column is not None: @@ -151,6 +184,10 @@ def _validate_numeric( if converted.notna().sum() == 0: return None, 0 + ratio = converted.notna().sum() / max(series.notna().sum(), 1) + if ratio < 0.5: + return None, 0 + invalid = int((converted.isna() & series.notna()).sum()) return converted, invalid @@ -164,7 +201,16 @@ def _validate_dates( ) -> tuple[pd.Series | None, int]: """Return series of ISO formatted dates if convertible.""" if formats is None: - formats = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d"] + formats = [ + "%Y-%m-%d", + "%d/%m/%Y", + "%m/%d/%Y", + "%Y/%m/%d", + "%d-%m-%Y", + "%m-%d-%Y", + "%B %d %Y", + "%d %B %Y", + ] parsed = pd.Series(pd.NaT, index=series.index) for fmt in formats: @@ -174,6 +220,10 @@ def _validate_dates( if parsed.notna().sum() == 0: return None, 0 + ratio = parsed.notna().sum() / max(series.notna().sum(), 1) + if ratio < 0.5: + return None, 0 + if transformations is not None and column is not None: for idx in series.index: if pd.isna(parsed[idx]) or pd.isna(series[idx]): diff --git a/datamorpher/converter.py b/datamorpher/converter.py index 71bfdec..2e34278 100644 --- a/datamorpher/converter.py +++ b/datamorpher/converter.py @@ -6,7 +6,17 @@ from typing import Dict import pandas as pd -from pandas.api.types import infer_dtype + +_DATE_FORMATS = [ + "%Y-%m-%d", + "%d/%m/%Y", + "%m/%d/%Y", + "%Y/%m/%d", + "%d-%m-%Y", + "%m-%d-%Y", + "%B %d %Y", + "%d %B %Y", +] class convert: @@ -40,10 +50,7 @@ def write(df: pd.DataFrame, path: Path) -> None: def detect_types(df: pd.DataFrame) -> Dict[str, str]: types: Dict[str, str] = {} for col in df.columns: - dtype = infer_dtype(df[col], skipna=True) - if dtype == "string" and _looks_like_date(df[col].astype(str)): - dtype = "date" - types[col] = dtype + types[col] = _infer_column_type(df, col) return types @@ -52,6 +59,43 @@ def _looks_like_date(series: pd.Series) -> bool: return sample.str.match(r"\d{4}-\d{2}-\d{2}").all() +def _infer_column_type(df: pd.DataFrame, col_name: str) -> str: + """Infer a column's semantic type.""" + series = df[col_name] + sample = series.dropna().astype(str).head(10) + + if col_name.lower() in {"name", "product", "title", "description"}: + return "string" + + bool_values = { + "true", + "false", + "yes", + "no", + "1", + "0", + "t", + "f", + "y", + "n", + "inactive", + } + if sample.str.lower().isin(bool_values).mean() > 0.5: + return "boolean" + + for fmt in _DATE_FORMATS: + parsed = pd.to_datetime(sample, errors="coerce", format=fmt) + if parsed.notna().mean() > 0.5: + return "date" + + if sample.str.match(r"^-?\d+(\.\d+)?$").mean() > 0.8: + if sample.str.contains("\.", regex=False).any(): + return "floating" + return "integer" + + return "string" + + def _read_json(path: Path) -> pd.DataFrame: """Return DataFrame from JSON, auto-detecting newline-delimited format.""" with path.open("r", encoding="utf-8") as f: diff --git a/datamorpher/streamlit_app.py b/datamorpher/streamlit_app.py index 8e2671a..0b23c52 100644 --- a/datamorpher/streamlit_app.py +++ b/datamorpher/streamlit_app.py @@ -10,6 +10,7 @@ # Add parent directory to sys.path when running directly if __name__ == "__main__": import os + # Get the absolute path of the parent directory parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) # Add to sys.path if not already there diff --git a/pyproject.toml b/pyproject.toml index 9f4e2aa..aff7b73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,3 +27,6 @@ line-length = 79 [build-system] requires = ["setuptools>=61"] build-backend = "setuptools.build_meta" + +[tool.setuptools.packages.find] +include = ["datamorpher*"] diff --git a/sample_data/messy_data.csv b/sample_data/messy_data.csv new file mode 100644 index 0000000..2ff23d6 --- /dev/null +++ b/sample_data/messy_data.csv @@ -0,0 +1,11 @@ +product_name,price,launch_date,active +"Mountain Bike 29",450.00,20th Feb 2023,TRUE +"Tablet Pro 11",699,March 15 2023,yes +"Smart Watch Series 5",299.99,2018/09/15,T +"Laptop X",,2020-07-01,1 +"",200,,false +"95ABC.50",95ABC.50,2019-13-01,N +"Another product",150,05/06/2021,active +"",,, +"Phone Ten",ten,January 5 2022,y +"Headset 2.0","two hundred",,inactive diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py index 10e89ab..d92002f 100644 --- a/tests/test_cleaner.py +++ b/tests/test_cleaner.py @@ -7,7 +7,7 @@ def test_cleaning_removes_duplicates_and_imputes() -> None: df = pd.DataFrame({"a": [1, 1, None], "b": ["x", "x", None]}) cleaned, info = clean_data(df) assert info["duplicates"] == 1 - assert info["imputed"] == {"a": "mean", "b": "mode"} + assert info["imputed"] == {"a": "median", "b": "mode"} assert len(cleaned) == 2 assert not cleaned.isna().any().any()