From aa03d5402449b93b31b77130fca6a7ad8d1f3915 Mon Sep 17 00:00:00 2001
From: NeurArk <guillaume.rospape@neurark.com>
Date: Tue, 20 May 2025 12:16:11 +0200
Subject: [PATCH] Fix package discovery; improve type inference and cleaning

---
 .gitignore                   |  2 +
 README.md                    |  5 +++
 datamorpher/cleaner.py       | 80 +++++++++++++++++++++++++++++-------
 datamorpher/converter.py     | 54 +++++++++++++++++++++---
 datamorpher/streamlit_app.py |  1 +
 pyproject.toml               |  3 ++
 sample_data/messy_data.csv   | 11 +++++
 tests/test_cleaner.py        |  2 +-
 8 files changed, 137 insertions(+), 21 deletions(-)
 create mode 100644 sample_data/messy_data.csv

diff --git a/.gitignore b/.gitignore
index 480b11c..2ea7546 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,6 +74,8 @@ report_dirty.md
 *.csv
 !requirements.txt
 !sample_report.md
+!sample_data/
+!sample_data/*.csv
 
 # OS specific
 .DS_Store
diff --git a/README.md b/README.md
index 64a1287..47aaca4 100644
--- a/README.md
+++ b/README.md
@@ -136,6 +136,11 @@ The report also lists transformations applied, for example:
 - NaN -> 76166.67 (mean)
 ```
 
+## Sample Data
+
+A messy CSV file is included in `sample_data/messy_data.csv` to test the
+cleaning features and edge cases.
+
 ## Development
 
 ### Running Tests
diff --git a/datamorpher/cleaner.py b/datamorpher/cleaner.py
index c721f2f..a85ab67 100644
--- a/datamorpher/cleaner.py
+++ b/datamorpher/cleaner.py
@@ -29,34 +29,46 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
     for col in df.columns:
         dtype = infer_dtype(df[col], skipna=True)
         series = df[col]
+        if dtype in {"string", "mixed"}:
+            bools = _normalize_booleans(series)
+            if bools.notna().sum() / max(series.notna().sum(), 1) >= 0.5:
+                df[col] = bools
+                dtype = "boolean"
+                series = df[col]
         if dtype in {"string", "mixed"}:
             validated, invalid = _validate_numeric(
                 series, col, transformations
             )
             if validated is not None:
-                df[col] = validated
-                if invalid:
-                    info["invalid"][col] = invalid
-                dtype = "floating"
-            else:
+                ratio = validated.notna().sum() / max(series.notna().sum(), 1)
+                if ratio >= 0.5:
+                    df[col] = validated
+                    if invalid:
+                        info["invalid"][col] = invalid
+                    dtype = "floating"
+            if dtype in {"string", "mixed"}:
                 validated, invalid = _validate_dates(
                     series, column=col, transformations=transformations
                 )
                 if validated is not None:
-                    df[col] = validated
-                    if invalid:
-                        info["invalid"][col] = invalid
-                    dtype = "date"
+                    ratio = validated.notna().sum() / max(
+                        series.notna().sum(), 1
+                    )
+                    if ratio >= 0.5:
+                        df[col] = validated
+                        if invalid:
+                            info["invalid"][col] = invalid
+                        dtype = "date"
 
         if df[col].isna().any():
             if dtype in {"integer", "floating"}:
-                value = df[col].mean()
+                value = df[col].median()
                 df[col] = df[col].fillna(value)
-                info["imputed"][col] = "mean"
+                info["imputed"][col] = "median"
                 transformations.setdefault(col, []).append(
-                    f"NaN -> {value:.2f} (mean)"
+                    f"NaN -> {value:.2f} (median)"
                 )
-            elif dtype in {"string", "categorical", "boolean"}:
+            elif dtype in {"string", "categorical"}:
                 mode = df[col].mode(dropna=True)
                 if not mode.empty:
                     value = mode.iloc[0]
@@ -65,6 +77,15 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
                     transformations.setdefault(col, []).append(
                         f"NaN -> {value} (mode)"
                     )
+            elif dtype == "boolean":
+                mode = df[col].mode(dropna=True)
+                if not mode.empty:
+                    value = bool(mode.iloc[0])
+                    df[col] = df[col].fillna(value)
+                    info["imputed"][col] = "mode"
+                    transformations.setdefault(col, []).append(
+                        f"NaN -> {value} (mode)"
+                    )
     return df, info
 
 
@@ -116,6 +137,18 @@ def _words_to_num(text: str | None) -> float | None:
     return float(value)
 
 
+def _normalize_booleans(series: pd.Series) -> pd.Series:
+    """Map truthy/falsy strings to True/False; unmatched values stay NaN."""
+    true_values = {"true", "yes", "1", "t", "y", "active"}
+    false_values = {"false", "no", "0", "f", "n", "inactive"}
+    # Compare trimmed, lower-cased text so " Yes" and "YES" both match.
+    lowercase = series.astype(str).str.strip().str.lower()
+    result = pd.Series(index=series.index, dtype="object")
+    result[lowercase.isin(true_values)] = True
+    result[lowercase.isin(false_values)] = False
+    return result
+
+
 def _validate_numeric(
     series: pd.Series,
     column: str | None = None,
@@ -138,7 +171,7 @@ def _validate_numeric(
         extracted = (
             series.where(mask)
             .where(~series.str.contains(r"[/-]", na=False))
-            .str.extract(r"(\d+\.?\d*)")[0]
+            .str.extract(r"(\d+\.\d+|\d+)")[0]
         )
         extracted_numeric = pd.to_numeric(extracted, errors="coerce")
         if transformations is not None and column is not None:
@@ -151,6 +184,10 @@ def _validate_numeric(
     if converted.notna().sum() == 0:
         return None, 0
 
+    ratio = converted.notna().sum() / max(series.notna().sum(), 1)
+    if ratio < 0.5:
+        return None, 0
+
     invalid = int((converted.isna() & series.notna()).sum())
     return converted, invalid
 
@@ -164,7 +201,16 @@ def _validate_dates(
 ) -> tuple[pd.Series | None, int]:
     """Return series of ISO formatted dates if convertible."""
     if formats is None:
-        formats = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d"]
+        formats = [
+            "%Y-%m-%d",
+            "%d/%m/%Y",
+            "%m/%d/%Y",
+            "%Y/%m/%d",
+            "%d-%m-%Y",
+            "%m-%d-%Y",
+            "%B %d %Y",
+            "%d %B %Y",
+        ]
 
     parsed = pd.Series(pd.NaT, index=series.index)
     for fmt in formats:
@@ -174,6 +220,10 @@ def _validate_dates(
     if parsed.notna().sum() == 0:
         return None, 0
 
+    ratio = parsed.notna().sum() / max(series.notna().sum(), 1)
+    if ratio < 0.5:
+        return None, 0
+
     if transformations is not None and column is not None:
         for idx in series.index:
             if pd.isna(parsed[idx]) or pd.isna(series[idx]):
diff --git a/datamorpher/converter.py b/datamorpher/converter.py
index 71bfdec..2e34278 100644
--- a/datamorpher/converter.py
+++ b/datamorpher/converter.py
@@ -6,7 +6,17 @@
 from typing import Dict
 
 import pandas as pd
-from pandas.api.types import infer_dtype
+
+_DATE_FORMATS = [
+    "%Y-%m-%d",  # ISO, e.g. 2023-03-15
+    "%d/%m/%Y",  # day first, e.g. 15/03/2023
+    "%m/%d/%Y",  # month first, e.g. 03/15/2023
+    "%Y/%m/%d",  # e.g. 2023/03/15
+    "%d-%m-%Y",  # day first, e.g. 15-03-2023
+    "%m-%d-%Y",  # month first, e.g. 03-15-2023
+    "%B %d %Y",  # full month name, e.g. March 15 2023
+    "%d %B %Y",  # full month name, e.g. 15 March 2023
+]
 
 
 class convert:
@@ -40,10 +50,7 @@ def write(df: pd.DataFrame, path: Path) -> None:
     def detect_types(df: pd.DataFrame) -> Dict[str, str]:
         types: Dict[str, str] = {}
         for col in df.columns:
-            dtype = infer_dtype(df[col], skipna=True)
-            if dtype == "string" and _looks_like_date(df[col].astype(str)):
-                dtype = "date"
-            types[col] = dtype
+            types[col] = _infer_column_type(df, col)
         return types
 
 
@@ -52,6 +59,43 @@ def _looks_like_date(series: pd.Series) -> bool:
     return sample.str.match(r"\d{4}-\d{2}-\d{2}").all()
 
 
+def _infer_column_type(df: pd.DataFrame, col_name: str) -> str:
+    """Infer a column's type from its name and first 10 non-null values."""
+    sample = df[col_name].dropna().astype(str).head(10)
+    # Columns with free-text names are always reported as strings.
+    if col_name.lower() in {"name", "product", "title", "description"}:
+        return "string"
+    # Boolean: more than half of the sample are known true/false tokens.
+    bool_values = {
+        "true",
+        "false",
+        "yes",
+        "no",
+        "1",
+        "0",
+        "t",
+        "f",
+        "y",
+        "n",
+        "active",
+        "inactive",
+    }
+    if sample.str.lower().isin(bool_values).mean() > 0.5:
+        return "boolean"
+    # Date: any one known format parses more than half of the sample.
+    for fmt in _DATE_FORMATS:
+        parsed = pd.to_datetime(sample, errors="coerce", format=fmt)
+        if parsed.notna().mean() > 0.5:
+            return "date"
+    # Numeric: over 80% plain decimal literals; any fraction means float.
+    if sample.str.match(r"^-?\d+(\.\d+)?$").mean() > 0.8:
+        if sample.str.match(r"^-?\d+\.\d+$").any():
+            return "floating"
+        return "integer"
+
+    return "string"
+
+
 def _read_json(path: Path) -> pd.DataFrame:
     """Return DataFrame from JSON, auto-detecting newline-delimited format."""
     with path.open("r", encoding="utf-8") as f:
diff --git a/datamorpher/streamlit_app.py b/datamorpher/streamlit_app.py
index 8e2671a..0b23c52 100644
--- a/datamorpher/streamlit_app.py
+++ b/datamorpher/streamlit_app.py
@@ -10,6 +10,7 @@
 # Add parent directory to sys.path when running directly
 if __name__ == "__main__":
     import os
+
     # Get the absolute path of the parent directory
     parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
     # Add to sys.path if not already there
diff --git a/pyproject.toml b/pyproject.toml
index 9f4e2aa..aff7b73 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,3 +27,6 @@ line-length = 79
 [build-system]
 requires = ["setuptools>=61"]
 build-backend = "setuptools.build_meta"
+
+[tool.setuptools.packages.find]
+include = ["datamorpher*"]
diff --git a/sample_data/messy_data.csv b/sample_data/messy_data.csv
new file mode 100644
index 0000000..2ff23d6
--- /dev/null
+++ b/sample_data/messy_data.csv
@@ -0,0 +1,11 @@
+product_name,price,launch_date,active
+"Mountain Bike 29",450.00,20th Feb 2023,TRUE
+"Tablet Pro 11",699,March 15 2023,yes
+"Smart Watch Series 5",299.99,2018/09/15,T
+"Laptop X",,2020-07-01,1
+"",200,,false
+"95ABC.50",95ABC.50,2019-13-01,N
+"Another product",150,05/06/2021,active
+"",,,
+"Phone Ten",ten,January 5 2022,y
+"Headset 2.0","two hundred",,inactive
diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py
index 10e89ab..d92002f 100644
--- a/tests/test_cleaner.py
+++ b/tests/test_cleaner.py
@@ -7,7 +7,7 @@ def test_cleaning_removes_duplicates_and_imputes() -> None:
     df = pd.DataFrame({"a": [1, 1, None], "b": ["x", "x", None]})
     cleaned, info = clean_data(df)
     assert info["duplicates"] == 1
-    assert info["imputed"] == {"a": "mean", "b": "mode"}
+    assert info["imputed"] == {"a": "median", "b": "mode"}
     assert len(cleaned) == 2
     assert not cleaned.isna().any().any()