diff --git a/README.md b/README.md
index fe047e2..49df249 100644
--- a/README.md
+++ b/README.md
@@ -10,6 +10,8 @@ A compact Python utility for seamless data conversion and cleaning, featuring bo
 - **Multi-format Support**: Convert between CSV, Excel (.xlsx/.xls), and JSON formats
 - **Automatic Type Detection**: Intelligently identifies numeric, categorical, date, and boolean columns
 - **Data Cleaning**: Remove duplicates and impute missing values
+- **Smart JSON Detection**: Handles standard and newline-delimited JSON
+- **Robust Validation**: Cleans malformed numbers and dates
 - **Detailed Reports**: Generate Markdown reports with conversion statistics
 - **Dual Interface**: Choose between CLI for power users or Streamlit web app for non-technical users
 
@@ -93,6 +95,7 @@ When enabled, DataMorpher performs:
    - Categorical columns: Mode (most frequent value)
    - Date columns: Left as missing
    - Boolean columns: Mode
+   - Invalid values reported for review
 
 ## Conversion Report
 
diff --git a/datamorpher/cleaner.py b/datamorpher/cleaner.py
index 7076e64..f8161d5 100644
--- a/datamorpher/cleaner.py
+++ b/datamorpher/cleaner.py
@@ -9,8 +9,8 @@
 
 
 def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
-    """Remove duplicates and impute missing values."""
-    info: Dict[str, object] = {"duplicates": 0, "imputed": {}}
+    """Clean a DataFrame and report actions taken."""
+    info: Dict[str, object] = {"duplicates": 0, "imputed": {}, "invalid": {}}
     before = len(df)
     df = df.drop_duplicates()
 
@@ -18,9 +18,25 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
 
     # Make a copy to avoid SettingWithCopyWarning
     df = df.copy()
-    
+
     for col in df.columns:
         dtype = infer_dtype(df[col], skipna=True)
+        series = df[col]
+        if dtype in {"string", "mixed"}:
+            validated, invalid = _validate_numeric(series)
+            if validated is not None:
+                df[col] = validated
+                if invalid:
+                    info["invalid"][col] = invalid
+                dtype = "floating"
+            else:
+                validated, invalid = _validate_dates(series)
+                if validated is not None:
+                    df[col] = validated
+                    if invalid:
+                        info["invalid"][col] = invalid
+                    dtype = "date"
+
         if df[col].isna().any():
             if dtype in {"integer", "floating"}:
                 df[col] = df[col].fillna(df[col].mean())
@@ -31,3 +47,95 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
                 df[col] = df[col].fillna(mode.iloc[0])
                 info["imputed"][col] = "mode"
     return df, info
+
+
+_NUMBER_WORDS = {
+    "zero": 0,
+    "one": 1,
+    "two": 2,
+    "three": 3,
+    "four": 4,
+    "five": 5,
+    "six": 6,
+    "seven": 7,
+    "eight": 8,
+    "nine": 9,
+    "ten": 10,
+    "eleven": 11,
+    "twelve": 12,
+    "thirteen": 13,
+    "fourteen": 14,
+    "fifteen": 15,
+    "sixteen": 16,
+    "seventeen": 17,
+    "eighteen": 18,
+    "nineteen": 19,
+    "twenty": 20,
+    "thirty": 30,
+    "forty": 40,
+    "fifty": 50,
+    "sixty": 60,
+    "seventy": 70,
+    "eighty": 80,
+    "ninety": 90,
+}
+
+
+def _words_to_num(text: str | None) -> float | None:
+    """Convert a simple number word (zero to ninety-nine) to a float.
+
+    Accepts one word ("seven", "forty") or a tens word followed by a units
+    word ("twenty-eight", "twenty eight"). Anything else, including
+    non-strings, yields ``None`` so that arbitrary word runs such as
+    "one two" are not silently summed into a number.
+    """
+    if not isinstance(text, str):
+        return None
+    parts = text.lower().replace("-", " ").split()
+    if len(parts) == 1:
+        value = _NUMBER_WORDS.get(parts[0])
+        return None if value is None else float(value)
+    if len(parts) == 2:
+        # Zero as the default fails both range checks below.
+        tens = _NUMBER_WORDS.get(parts[0], 0)
+        units = _NUMBER_WORDS.get(parts[1], 0)
+        if tens >= 20 and 1 <= units <= 9:
+            return float(tens + units)
+    return None
+
+
+def _validate_numeric(series: pd.Series) -> tuple[pd.Series | None, int]:
+    """Return a numeric copy of ``series`` and the count of invalid entries.
+
+    Values are parsed with ``pd.to_numeric``; entries that fail are retried
+    as number words. The column only counts as numeric when more than half
+    of its non-missing values convert, so a text column containing a stray
+    number is left alone instead of being blanked out. Returns ``(None, 0)``
+    when the column is not numeric.
+    """
+    converted = pd.to_numeric(series, errors="coerce")
+    if converted.isna().any():
+        as_words = series.where(converted.isna()).apply(_words_to_num)
+        converted.update(as_words)
+    present = int(series.notna().sum())
+    valid = int(converted.notna().sum())
+    if 2 * valid <= present:
+        return None, 0
+    invalid = int((converted.isna() & series.notna()).sum())
+    return converted, invalid
+
+
+def _validate_dates(series: pd.Series) -> tuple[pd.Series | None, int]:
+    """Return ``series`` as ISO ``YYYY-MM-DD`` strings and its invalid count.
+
+    The column only counts as dates when more than half of its non-missing
+    values parse; otherwise ``(None, 0)`` is returned. Values that were
+    already missing are not counted as invalid.
+    """
+    parsed = pd.to_datetime(series, errors="coerce")
+    present = int(series.notna().sum())
+    valid = int(parsed.notna().sum())
+    if 2 * valid <= present:
+        return None, 0
+    invalid = int((parsed.isna() & series.notna()).sum())
+    return parsed.dt.strftime("%Y-%m-%d"), invalid
diff --git a/datamorpher/converter.py b/datamorpher/converter.py
index eb36e9c..71bfdec 100644
--- a/datamorpher/converter.py
+++ b/datamorpher/converter.py
@@ -20,16 +20,14 @@ def read(path: Path) -> pd.DataFrame:
         if suffix in {".xlsx", ".xls"}:
             return pd.read_excel(path)
         if suffix == ".json":
-            try:
-                return pd.read_json(path, lines=True)
-            except ValueError:
-                return pd.read_json(path)
+            return _read_json(path)
         raise ValueError(f"Unsupported input format: {path.suffix}")
 
     @staticmethod
     def write(df: pd.DataFrame, path: Path) -> None:
         suffix = path.suffix.lower()
         if suffix == ".csv":
+            df = _flatten(df)
             df.to_csv(path, index=False)
         elif suffix in {".xlsx", ".xls"}:
             df.to_excel(path, index=False)
@@ -52,3 +50,32 @@ def detect_types(df: pd.DataFrame) -> Dict[str, str]:
 def _looks_like_date(series: pd.Series) -> bool:
     sample = series.dropna().astype(str).head(10)
     return sample.str.match(r"\d{4}-\d{2}-\d{2}").all()
+
+
+def _read_json(path: Path) -> pd.DataFrame:
+    """Return DataFrame from JSON, auto-detecting newline-delimited format.
+
+    A document whose first non-blank character is "[" is tried as standard
+    JSON first; anything else is tried as newline-delimited JSON first. The
+    sniff is only a heuristic (a pretty-printed top-level object also starts
+    with "{"), so a ``ValueError`` from the first attempt triggers a retry in
+    the other mode, preserving the fallback the inline code used to have.
+    """
+    with path.open("r", encoding="utf-8") as f:
+        start = f.read(1024)
+    lines_first = not start.lstrip().startswith("[")
+    try:
+        return pd.read_json(path, lines=lines_first)
+    except ValueError:
+        return pd.read_json(path, lines=not lines_first)
+
+
+def _flatten(df: pd.DataFrame) -> pd.DataFrame:
+    """Flatten nested columns if present."""
+    has_nested = any(
+        df[col].map(lambda x: isinstance(x, (dict, list))).any()
+        for col in df.columns
+    )
+    if has_nested:
+        df = pd.json_normalize(df.to_dict(orient="records"))
+    return df
diff --git a/datamorpher/reporter.py b/datamorpher/reporter.py
index e87bc3a..05a28ca 100644
--- a/datamorpher/reporter.py
+++ b/datamorpher/reporter.py
@@ -23,15 +23,16 @@ def build_report(
         headers=["Column", "Detected Type"],
         tablefmt="github",
     )
-    imputed = "\n".join(
-        f"- {c}: {m}" for c, m in clean_info["imputed"].items()
-    ) or "None"
+    imputed = (
+        "\n".join(f"- {c}: {m}" for c, m in clean_info["imputed"].items())
+        or "None"
+    )
     report = f"""# DataMorpher Report
 
 ## Summary
 - Input: {input_path.name} ({rows_in} rows)
 - Output: {output_path.name} ({rows_out} rows)
-- Duplicates removed: {clean_info['duplicates']}
+- Duplicates removed: {clean_info["duplicates"]}
 - Values imputed:\n{imputed}
 - Duration: {duration:.2f}s
 
diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py
index 38a3d9b..16510d6 100644
--- a/tests/test_cleaner.py
+++ b/tests/test_cleaner.py
@@ -10,3 +10,32 @@ def test_cleaning_removes_duplicates_and_imputes() -> None:
     assert info["imputed"] == {"a": "mean", "b": "mode"}
     assert len(cleaned) == 2
     assert not cleaned.isna().any().any()
+
+
+def test_numeric_and_date_validation() -> None:
+    df = pd.DataFrame(
+        {
+            "num": ["1", "twenty-eight", "8000foo0"],
+            "date": ["2020-01-01", "invalid_date", "2021-02-03"],
+        }
+    )
+    cleaned, info = clean_data(df)
+
+    assert info["invalid"] == {"num": 1, "date": 1}
+    assert cleaned.loc[1, "num"] == 28
+    assert cleaned["date"].isna().sum() == 1
+
+
+def test_text_column_with_stray_number_is_left_untouched() -> None:
+    df = pd.DataFrame({"name": ["alice", "bob", "7"]})
+    cleaned, info = clean_data(df)
+
+    assert info["invalid"] == {}
+    assert cleaned["name"].tolist() == ["alice", "bob", "7"]
+
+
+def test_missing_dates_are_not_reported_as_invalid() -> None:
+    df = pd.DataFrame({"date": ["2020-01-01", None, "2021-02-03"]})
+    _, info = clean_data(df)
+
+    assert info["invalid"] == {}
diff --git a/tests/test_converter.py b/tests/test_converter.py
index 7e207b7..da116dc 100644
--- a/tests/test_converter.py
+++ b/tests/test_converter.py
@@ -18,3 +18,36 @@ def test_csv_to_excel(tmp_path: Path) -> None:
     df_x = pd.read_excel(out)
 
     assert df_x.equals(df)
+
+
+def test_json_reading_variants(tmp_path: Path) -> None:
+    ndjson = tmp_path / "data_lines.json"
+    with ndjson.open("w", encoding="utf-8") as f:
+        f.write('{"a":1}\n{"a":2}\n')
+
+    array_json = tmp_path / "data_array.json"
+    array_json.write_text('[{"a":1},{"a":2}]', encoding="utf-8")
+
+    df_lines = convert.read(ndjson)
+    df_array = convert.read(array_json)
+
+    assert len(df_lines) == 2
+    assert len(df_array) == 2
+
+
+def test_json_to_csv_flatten(tmp_path: Path) -> None:
+    src = tmp_path / "nested.json"
+    src.write_text('{"a": 1, "b": {"c": 2}}\n{"a": 3, "b": {"c": 4}}', encoding="utf-8")
+    df = convert.read(src)
+    out = tmp_path / "out.csv"
+    convert.write(df, out)
+    df_out = pd.read_csv(out)
+    assert "b.c" in df_out.columns
+    assert df_out.loc[0, "b.c"] == 2
+
+
+def test_json_reading_pretty_printed_object(tmp_path: Path) -> None:
+    src = tmp_path / "pretty.json"
+    src.write_text('{\n  "a": {"0": 1, "1": 2}\n}\n', encoding="utf-8")
+    df = convert.read(src)
+    assert df["a"].tolist() == [1, 2]