Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ report_dirty.md
*.csv
!requirements.txt
!sample_report.md
!sample_data/
!sample_data/*.csv

# OS specific
.DS_Store
Expand Down
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,11 @@ The report also lists transformations applied, for example:
- NaN -> 76166.67 (mean)
```

## Sample Data

A messy CSV file is included in `sample_data/messy_data.csv` to test the
cleaning features and edge cases.

## Development

### Running Tests
Expand Down
80 changes: 65 additions & 15 deletions datamorpher/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,34 +29,46 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
for col in df.columns:
dtype = infer_dtype(df[col], skipna=True)
series = df[col]
if dtype in {"string", "mixed"}:
bools = _normalize_booleans(series)
if bools.notna().sum() / max(series.notna().sum(), 1) >= 0.5:
df[col] = bools
dtype = "boolean"
series = df[col]
if dtype in {"string", "mixed"}:
validated, invalid = _validate_numeric(
series, col, transformations
)
if validated is not None:
df[col] = validated
if invalid:
info["invalid"][col] = invalid
dtype = "floating"
else:
ratio = validated.notna().sum() / max(series.notna().sum(), 1)
if ratio >= 0.5:
df[col] = validated
if invalid:
info["invalid"][col] = invalid
dtype = "floating"
if dtype in {"string", "mixed"}:
validated, invalid = _validate_dates(
series, column=col, transformations=transformations
)
if validated is not None:
df[col] = validated
if invalid:
info["invalid"][col] = invalid
dtype = "date"
ratio = validated.notna().sum() / max(
series.notna().sum(), 1
)
if ratio >= 0.5:
df[col] = validated
if invalid:
info["invalid"][col] = invalid
dtype = "date"

if df[col].isna().any():
if dtype in {"integer", "floating"}:
value = df[col].mean()
value = df[col].median()
df[col] = df[col].fillna(value)
info["imputed"][col] = "mean"
info["imputed"][col] = "median"
transformations.setdefault(col, []).append(
f"NaN -> {value:.2f} (mean)"
f"NaN -> {value:.2f} (median)"
)
elif dtype in {"string", "categorical", "boolean"}:
elif dtype in {"string", "categorical"}:
mode = df[col].mode(dropna=True)
if not mode.empty:
value = mode.iloc[0]
Expand All @@ -65,6 +77,15 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
transformations.setdefault(col, []).append(
f"NaN -> {value} (mode)"
)
elif dtype == "boolean":
mode = df[col].mode(dropna=True)
if not mode.empty:
value = bool(mode.iloc[0])
df[col] = df[col].fillna(value)
info["imputed"][col] = "mode"
transformations.setdefault(col, []).append(
f"NaN -> {value} (mode)"
)
return df, info


Expand Down Expand Up @@ -116,6 +137,18 @@ def _words_to_num(text: str | None) -> float | None:
return float(value)


def _normalize_booleans(series: pd.Series) -> pd.Series:
    """Map common textual boolean spellings onto ``True``/``False``.

    Every value is converted to ``str``, stripped of surrounding whitespace
    and lower-cased before being compared with the recognised spellings.
    Values that match neither set (including missing values) stay NaN.

    Args:
        series: Column whose values should be normalized.

    Returns:
        An object-dtype Series sharing the index of ``series`` and holding
        ``True``, ``False`` or NaN.
    """
    true_values = {"true", "yes", "1", "t", "y"}
    # NOTE(review): "inactive" maps to False but "active" has no True
    # counterpart -- confirm whether that asymmetry is intentional.
    false_values = {"false", "no", "0", "f", "n", "inactive"}

    # Hand-edited data often carries padding (" yes", "No "); strip it so
    # those cells are still recognised.
    normalized = series.astype(str).str.strip().str.lower()

    # Start from an all-NaN object column so unrecognised cells remain
    # missing rather than being coerced to a boolean.
    result = pd.Series(index=series.index, dtype="object")
    result[normalized.isin(true_values)] = True
    result[normalized.isin(false_values)] = False
    return result


def _validate_numeric(
series: pd.Series,
column: str | None = None,
Expand All @@ -138,7 +171,7 @@ def _validate_numeric(
extracted = (
series.where(mask)
.where(~series.str.contains(r"[/-]", na=False))
.str.extract(r"(\d+\.?\d*)")[0]
.str.extract(r"(\d+\.\d+|\d+)")[0]
)
extracted_numeric = pd.to_numeric(extracted, errors="coerce")
if transformations is not None and column is not None:
Expand All @@ -151,6 +184,10 @@ def _validate_numeric(
if converted.notna().sum() == 0:
return None, 0

ratio = converted.notna().sum() / max(series.notna().sum(), 1)
if ratio < 0.5:
return None, 0

invalid = int((converted.isna() & series.notna()).sum())
return converted, invalid

Expand All @@ -164,7 +201,16 @@ def _validate_dates(
) -> tuple[pd.Series | None, int]:
"""Return series of ISO formatted dates if convertible."""
if formats is None:
formats = ["%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d"]
formats = [
"%Y-%m-%d",
"%d/%m/%Y",
"%m/%d/%Y",
"%Y/%m/%d",
"%d-%m-%Y",
"%m-%d-%Y",
"%B %d %Y",
"%d %B %Y",
]

parsed = pd.Series(pd.NaT, index=series.index)
for fmt in formats:
Expand All @@ -174,6 +220,10 @@ def _validate_dates(
if parsed.notna().sum() == 0:
return None, 0

ratio = parsed.notna().sum() / max(series.notna().sum(), 1)
if ratio < 0.5:
return None, 0

if transformations is not None and column is not None:
for idx in series.index:
if pd.isna(parsed[idx]) or pd.isna(series[idx]):
Expand Down
54 changes: 49 additions & 5 deletions datamorpher/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,17 @@
from typing import Dict

import pandas as pd
from pandas.api.types import infer_dtype

# strptime-style patterns that _infer_column_type tries, in this order, when
# checking whether a column's sampled values parse as dates.
# Day-first variants are listed ahead of their month-first counterparts.
_DATE_FORMATS = [
"%Y-%m-%d",  # ISO style, e.g. 2020-07-01
"%d/%m/%Y",  # day/month/year with slashes
"%m/%d/%Y",  # month/day/year with slashes
"%Y/%m/%d",  # year first with slashes, e.g. 2018/09/15
"%d-%m-%Y",  # day-month-year with dashes
"%m-%d-%Y",  # month-day-year with dashes
"%B %d %Y",  # full month name first, e.g. March 15 2023
"%d %B %Y",  # day before full month name, e.g. 15 March 2023
]


class convert:
Expand Down Expand Up @@ -40,10 +50,7 @@ def write(df: pd.DataFrame, path: Path) -> None:
def detect_types(df: pd.DataFrame) -> Dict[str, str]:
    """Return the inferred semantic type of every column in ``df``.

    Each column is classified by ``_infer_column_type``; no other inference
    is performed here, so the result always reflects that helper's verdict.

    Args:
        df: Frame whose columns should be classified.

    Returns:
        A dict mapping each column label to its inferred type name. An
        empty frame yields an empty dict.
    """
    return {col: _infer_column_type(df, col) for col in df.columns}


Expand All @@ -52,6 +59,43 @@ def _looks_like_date(series: pd.Series) -> bool:
return sample.str.match(r"\d{4}-\d{2}-\d{2}").all()


def _infer_column_type(df: pd.DataFrame, col_name: str) -> str:
    """Infer a column's semantic type from its name and first values.

    The decision is based on at most the first ten non-missing values,
    rendered as strings. Checks run in this order and the first hit wins:
    free-text column names, boolean spellings, date formats, plain numbers.

    Args:
        df: Frame containing the column.
        col_name: Label of the column to classify.

    Returns:
        One of ``"string"``, ``"boolean"``, ``"date"``, ``"floating"`` or
        ``"integer"``. ``"string"`` is the fallback, including for columns
        with no non-missing values.

    Raises:
        KeyError: If ``col_name`` is not a column of ``df``.
    """
    series = df[col_name]
    sample = series.dropna().astype(str).head(10)

    # Columns with these names are treated as free text regardless of their
    # content. ``str()`` keeps non-string labels (e.g. integer positions
    # from header-less data) from raising on ``.lower()``.
    if str(col_name).lower() in {"name", "product", "title", "description"}:
        return "string"

    # Nothing to inspect: every ratio below would be NaN and fall through
    # to the same answer, so return it directly.
    if sample.empty:
        return "string"

    bool_values = {
        "true",
        "false",
        "yes",
        "no",
        "1",
        "0",
        "t",
        "f",
        "y",
        "n",
        "inactive",
    }
    if sample.str.lower().isin(bool_values).mean() > 0.5:
        return "boolean"

    # A single format that parses more than half of the sample is enough.
    for fmt in _DATE_FORMATS:
        parsed = pd.to_datetime(sample, errors="coerce", format=fmt)
        if parsed.notna().mean() > 0.5:
            return "date"

    if sample.str.match(r"^-?\d+(\.\d+)?$").mean() > 0.8:
        # Literal search for a decimal point. With ``regex=False`` the
        # needle must be "." itself; an escaped "\." would look for a
        # backslash and never match, classifying every float as integer.
        if sample.str.contains(".", regex=False).any():
            return "floating"
        return "integer"

    return "string"


def _read_json(path: Path) -> pd.DataFrame:
"""Return DataFrame from JSON, auto-detecting newline-delimited format."""
with path.open("r", encoding="utf-8") as f:
Expand Down
1 change: 1 addition & 0 deletions datamorpher/streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
# Add parent directory to sys.path when running directly
if __name__ == "__main__":
import os

# Get the absolute path of the parent directory
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
# Add to sys.path if not already there
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,6 @@ line-length = 79
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"

[tool.setuptools.packages.find]
include = ["datamorpher*"]
11 changes: 11 additions & 0 deletions sample_data/messy_data.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
product_name,price,launch_date,active
"Mountain Bike 29",450.00,20th Feb 2023,TRUE
"Tablet Pro 11",699,March 15 2023,yes
"Smart Watch Series 5",299.99,2018/09/15,T
"Laptop X",,2020-07-01,1
"",200,,false
"95ABC.50",95ABC.50,2019-13-01,N
"Another product",150,05/06/2021,active
"",,,
"Phone Ten",ten,January 5 2022,y
"Headset 2.0","two hundred",,inactive
2 changes: 1 addition & 1 deletion tests/test_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def test_cleaning_removes_duplicates_and_imputes() -> None:
df = pd.DataFrame({"a": [1, 1, None], "b": ["x", "x", None]})
cleaned, info = clean_data(df)
assert info["duplicates"] == 1
assert info["imputed"] == {"a": "mean", "b": "mode"}
assert info["imputed"] == {"a": "median", "b": "mode"}
assert len(cleaned) == 2
assert not cleaned.isna().any().any()

Expand Down