Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ A compact Python utility for seamless data conversion and cleaning, featuring bo
- **Multi-format Support**: Convert between CSV, Excel (.xlsx/.xls), and JSON formats
- **Automatic Type Detection**: Intelligently identifies numeric, categorical, date, and boolean columns
- **Data Cleaning**: Remove duplicates and impute missing values
- **Smart JSON Detection**: Handles standard and newline-delimited JSON
- **Robust Validation**: Cleans malformed numbers and dates
- **Detailed Reports**: Generate Markdown reports with conversion statistics
- **Dual Interface**: Choose between CLI for power users or Streamlit web app for non-technical users

Expand Down Expand Up @@ -93,6 +95,7 @@ When enabled, DataMorpher performs:
- Categorical columns: Mode (most frequent value)
- Date columns: Left as missing
- Boolean columns: Mode
- Invalid values: Counted per column and reported for review

## Conversion Report

Expand Down
91 changes: 88 additions & 3 deletions datamorpher/cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,34 @@


def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
"""Remove duplicates and impute missing values."""
info: Dict[str, object] = {"duplicates": 0, "imputed": {}}
"""Clean a DataFrame and report actions taken."""
info: Dict[str, object] = {"duplicates": 0, "imputed": {}, "invalid": {}}

before = len(df)
df = df.drop_duplicates()
info["duplicates"] = before - len(df)

# Make a copy to avoid SettingWithCopyWarning
df = df.copy()

for col in df.columns:
dtype = infer_dtype(df[col], skipna=True)
series = df[col]
if dtype in {"string", "mixed"}:
validated, invalid = _validate_numeric(series)
if validated is not None:
df[col] = validated
if invalid:
info["invalid"][col] = invalid
dtype = "floating"
else:
validated, invalid = _validate_dates(series)
if validated is not None:
df[col] = validated
if invalid:
info["invalid"][col] = invalid
dtype = "date"

if df[col].isna().any():
if dtype in {"integer", "floating"}:
df[col] = df[col].fillna(df[col].mean())
Expand All @@ -31,3 +47,72 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
df[col] = df[col].fillna(mode.iloc[0])
info["imputed"][col] = "mode"
return df, info


_NUMBER_WORDS = {
"zero": 0,
"one": 1,
"two": 2,
"three": 3,
"four": 4,
"five": 5,
"six": 6,
"seven": 7,
"eight": 8,
"nine": 9,
"ten": 10,
"eleven": 11,
"twelve": 12,
"thirteen": 13,
"fourteen": 14,
"fifteen": 15,
"sixteen": 16,
"seventeen": 17,
"eighteen": 18,
"nineteen": 19,
"twenty": 20,
"thirty": 30,
"forty": 40,
"fifty": 50,
"sixty": 60,
"seventy": 70,
"eighty": 80,
"ninety": 90,
}


def _words_to_num(text: str | None) -> float | None:
"""Convert simple number words to a float."""
if not isinstance(text, str):
return None
text = text.lower().replace("-", " ")
parts = text.split()
if not parts:
return None
value = 0
for part in parts:
if part not in _NUMBER_WORDS:
return None
value += _NUMBER_WORDS[part]
return float(value)


def _validate_numeric(series: pd.Series) -> tuple[pd.Series | None, int]:
    """Coerce ``series`` to numbers and count the entries that could not be.

    Values are first parsed with ``pd.to_numeric``; anything that comes back
    missing is then retried through ``_words_to_num``.

    Returns:
        ``(None, 0)`` when not a single value converts. Otherwise the
        converted series together with the number of entries that were
        present in ``series`` but are missing after conversion.
    """
    numeric = pd.to_numeric(series, errors="coerce")
    unparsed = numeric.isna()
    if unparsed.any():
        # Only entries rejected by to_numeric are retried as number words;
        # update() copies over just the non-missing results.
        from_words = series.where(unparsed).apply(_words_to_num)
        numeric.update(from_words)

    if not numeric.notna().any():
        return None, 0

    lost = numeric.isna() & series.notna()
    return numeric, int(lost.sum())


def _validate_dates(series: pd.Series) -> tuple[pd.Series | None, int]:
"""Return series of ISO formatted dates if convertible."""
parsed = pd.to_datetime(series, errors="coerce")
if parsed.notna().sum() == 0:
return None, 0
invalid = int(parsed.isna().sum())
return parsed.dt.strftime("%Y-%m-%d"), invalid
27 changes: 23 additions & 4 deletions datamorpher/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,14 @@ def read(path: Path) -> pd.DataFrame:
if suffix in {".xlsx", ".xls"}:
return pd.read_excel(path)
if suffix == ".json":
try:
return pd.read_json(path, lines=True)
except ValueError:
return pd.read_json(path)
return _read_json(path)
raise ValueError(f"Unsupported input format: {path.suffix}")

@staticmethod
def write(df: pd.DataFrame, path: Path) -> None:
suffix = path.suffix.lower()
if suffix == ".csv":
df = _flatten(df)
df.to_csv(path, index=False)
elif suffix in {".xlsx", ".xls"}:
df.to_excel(path, index=False)
Expand All @@ -52,3 +50,24 @@ def detect_types(df: pd.DataFrame) -> Dict[str, str]:
def _looks_like_date(series: pd.Series) -> bool:
    """Return True when the first non-null values start like ISO dates.

    Up to ten non-null values are converted to strings and each must begin
    with a ``YYYY-MM-DD`` pattern. A series with no non-null values is never
    considered a date column (``.all()`` on an empty sample would otherwise
    be vacuously true).
    """
    sample = series.dropna().astype(str).head(10)
    if sample.empty:
        return False
    return bool(sample.str.match(r"\d{4}-\d{2}-\d{2}").all())


def _read_json(path: Path) -> pd.DataFrame:
    """Load a JSON file, auto-detecting the newline-delimited format.

    The start of the file is sniffed: content beginning with ``[`` is read
    as a standard JSON array. Anything else is first tried as
    newline-delimited JSON; if that fails (for example a single
    pretty-printed object spanning several lines) the file is read as
    standard JSON instead.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed data as a DataFrame.

    Raises:
        ValueError: If the file cannot be parsed in either format.
    """
    with path.open("r", encoding="utf-8") as f:
        head = f.read(1024).lstrip()
    if head.startswith("["):
        return pd.read_json(path)
    try:
        return pd.read_json(path, lines=True)
    except ValueError:
        # Not line-delimited after all; fall back to standard parsing.
        return pd.read_json(path)


def _flatten(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with nested cells normalized into flat columns.

    If any column holds a ``dict`` or ``list`` value, the frame is rebuilt
    through ``pd.json_normalize`` from its row records; otherwise the input
    frame is returned unchanged.
    """

    def _is_nested(cell: object) -> bool:
        return isinstance(cell, (dict, list))

    for name in df.columns:
        if df[name].map(_is_nested).any():
            records = df.to_dict(orient="records")
            return pd.json_normalize(records)
    return df
9 changes: 5 additions & 4 deletions datamorpher/reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@ def build_report(
headers=["Column", "Detected Type"],
tablefmt="github",
)
imputed = "\n".join(
f"- {c}: {m}" for c, m in clean_info["imputed"].items()
) or "None"
imputed = (
"\n".join(f"- {c}: {m}" for c, m in clean_info["imputed"].items())
or "None"
)
report = f"""# DataMorpher Report

## Summary
- Input: {input_path.name} ({rows_in} rows)
- Output: {output_path.name} ({rows_out} rows)
- Duplicates removed: {clean_info['duplicates']}
- Duplicates removed: {clean_info["duplicates"]}
- Values imputed:\n{imputed}
- Duration: {duration:.2f}s

Expand Down
14 changes: 14 additions & 0 deletions tests/test_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,17 @@ def test_cleaning_removes_duplicates_and_imputes() -> None:
assert info["imputed"] == {"a": "mean", "b": "mode"}
assert len(cleaned) == 2
assert not cleaned.isna().any().any()


def test_numeric_and_date_validation() -> None:
    """Malformed numbers and dates are counted; a number word is converted."""
    raw = {
        "num": ["1", "twenty-eight", "8000foo0"],
        "date": ["2020-01-01", "invalid_date", "2021-02-03"],
    }

    cleaned, info = clean_data(pd.DataFrame(raw))

    # One unparseable entry is expected in each column.
    expected_invalid = {"num": 1, "date": 1}
    assert info["invalid"] == expected_invalid
    assert cleaned.loc[1, "num"] == 28
    assert cleaned["date"].isna().sum() == 1
26 changes: 26 additions & 0 deletions tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,29 @@ def test_csv_to_excel(tmp_path: Path) -> None:

df_x = pd.read_excel(out)
assert df_x.equals(df)


def test_json_reading_variants(tmp_path: Path) -> None:
    """Newline-delimited and array-style JSON files both load two rows."""
    lines_file = tmp_path / "data_lines.json"
    with lines_file.open("w", encoding="utf-8") as handle:
        handle.write('{"a":1}\n{"a":2}\n')

    array_file = tmp_path / "data_array.json"
    array_file.write_text('[{"a":1},{"a":2}]', encoding="utf-8")

    # Read both files first, then check the row counts.
    frames = [convert.read(source) for source in (lines_file, array_file)]

    assert len(frames[0]) == 2
    assert len(frames[1]) == 2


def test_json_to_csv_flatten(tmp_path: Path) -> None:
    """Nested JSON objects are written to CSV under dotted column names."""
    source = tmp_path / "nested.json"
    source.write_text('{"a": 1, "b": {"c": 2}}\n{"a": 3, "b": {"c": 4}}')
    loaded = convert.read(source)

    target = tmp_path / "out.csv"
    convert.write(loaded, target)

    written = pd.read_csv(target)
    assert "b.c" in written.columns
    assert written.loc[0, "b.c"] == 2