NeurArk · NeurArk · May 19, 2025 · May 19, 2025
diff --git a/README.md b/README.md
@@ -10,6 +10,8 @@ A compact Python utility for seamless data conversion and cleaning, featuring bo
 - **Multi-format Support**: Convert between CSV, Excel (.xlsx/.xls), and JSON formats
 - **Automatic Type Detection**: Intelligently identifies numeric, categorical, date, and boolean columns
 - **Data Cleaning**: Remove duplicates and impute missing values
+- **Smart JSON Detection**: Handles standard and newline-delimited JSON
+- **Robust Validation**: Cleans malformed numbers and dates
 - **Detailed Reports**: Generate Markdown reports with conversion statistics
 - **Dual Interface**: Choose between CLI for power users or Streamlit web app for non-technical users
 
@@ -93,6 +95,7 @@ When enabled, DataMorpher performs:
   - Categorical columns: Mode (most frequent value)
   - Date columns: Left as missing
   - Boolean columns: Mode
+  - Invalid numeric or date values: Coerced to missing and counted per column for review
 
 ## Conversion Report
 

diff --git a/datamorpher/cleaner.py b/datamorpher/cleaner.py
@@ -9,18 +9,34 @@
 
 
 def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
-    """Remove duplicates and impute missing values."""
-    info: Dict[str, object] = {"duplicates": 0, "imputed": {}}
+    """Clean a DataFrame and report actions taken."""
+    info: Dict[str, object] = {"duplicates": 0, "imputed": {}, "invalid": {}}
 
     before = len(df)
     df = df.drop_duplicates()
     info["duplicates"] = before - len(df)
 
     # Make a copy to avoid SettingWithCopyWarning
     df = df.copy()
-    
+
     for col in df.columns:
         dtype = infer_dtype(df[col], skipna=True)
+        series = df[col]
+        if dtype in {"string", "mixed"}:
+            validated, invalid = _validate_numeric(series)
+            if validated is not None:
+                df[col] = validated
+                if invalid:
+                    info["invalid"][col] = invalid
+                dtype = "floating"
+            else:
+                validated, invalid = _validate_dates(series)
+                if validated is not None:
+                    df[col] = validated
+                    if invalid:
+                        info["invalid"][col] = invalid
+                    dtype = "date"
+
         if df[col].isna().any():
             if dtype in {"integer", "floating"}:
                 df[col] = df[col].fillna(df[col].mean())
@@ -31,3 +47,72 @@ def clean_data(df: pd.DataFrame) -> tuple[pd.DataFrame, Dict[str, object]]:
                     df[col] = df[col].fillna(mode.iloc[0])
                     info["imputed"][col] = "mode"
     return df, info
+
+
+# Lookup table for the English number words understood by _words_to_num.
+_NUMBER_WORDS = {
+    "zero": 0,
+    "one": 1,
+    "two": 2,
+    "three": 3,
+    "four": 4,
+    "five": 5,
+    "six": 6,
+    "seven": 7,
+    "eight": 8,
+    "nine": 9,
+    "ten": 10,
+    "eleven": 11,
+    "twelve": 12,
+    "thirteen": 13,
+    "fourteen": 14,
+    "fifteen": 15,
+    "sixteen": 16,
+    "seventeen": 17,
+    "eighteen": 18,
+    "nineteen": 19,
+    "twenty": 20,
+    "thirty": 30,
+    "forty": 40,
+    "fifty": 50,
+    "sixty": 60,
+    "seventy": 70,
+    "eighty": 80,
+    "ninety": 90,
+}
+
+
+def _words_to_num(text: str | None) -> float | None:
+    """Convert a simple English number word (0-99) to a float.
+
+    Accepts a single known word ("seven", "Forty") or a tens word followed
+    by a units word, separated by whitespace or a hyphen ("twenty-eight").
+    Any other combination, such as "one two" or "twenty twenty", is
+    rejected instead of being summed, so nonsensical phrases are not
+    mistaken for numbers.
+
+    Returns the value as a float, or None when *text* is not a string or
+    is not a recognised number word.
+    """
+    if not isinstance(text, str):
+        return None
+    parts = text.lower().replace("-", " ").split()
+    if len(parts) == 1:
+        single = _NUMBER_WORDS.get(parts[0])
+        return None if single is None else float(single)
+    if len(parts) == 2:
+        tens = _NUMBER_WORDS.get(parts[0])
+        units = _NUMBER_WORDS.get(parts[1])
+        if tens is None or units is None:
+            return None
+        # The only valid compound is twenty..ninety followed by one..nine.
+        if tens >= 20 and 1 <= units <= 9:
+            return float(tens + units)
+    return None
+
+
+def _validate_numeric(series: pd.Series) -> tuple[pd.Series | None, int]:
+    """Coerce *series* to numbers and count the entries that could not be.
+
+    Values rejected by ``pd.to_numeric`` get a second chance through
+    ``_words_to_num``. Returns ``(None, 0)`` when nothing converts;
+    otherwise the numeric series together with the number of originally
+    non-null values that are still missing after conversion.
+    """
+    numeric = pd.to_numeric(series, errors="coerce")
+    unparsed = numeric.isna()
+    if unparsed.any():
+        # Only unparsed positions keep their raw value; the others become
+        # NaN, for which _words_to_num returns None.
+        from_words = series.where(unparsed).apply(_words_to_num)
+        numeric.update(from_words)
+    if not numeric.notna().any():
+        return None, 0
+    still_bad = numeric.isna() & series.notna()
+    return numeric, int(still_bad.sum())
+
+
+def _validate_dates(series: pd.Series) -> tuple[pd.Series | None, int]:
+    """Return ISO-formatted dates and the number of invalid entries.
+
+    Values are parsed with ``pd.to_datetime(errors="coerce")``. Returns
+    ``(None, 0)`` when no value parses at all; otherwise a series of
+    ``YYYY-MM-DD`` strings (missing where parsing failed) together with
+    the count of values that were present but could not be parsed.
+    """
+    parsed = pd.to_datetime(series, errors="coerce")
+    if not parsed.notna().any():
+        return None, 0
+    # Entries that were already missing are not "invalid"; count only the
+    # ones that held a value and failed to parse, as _validate_numeric does.
+    invalid = int((parsed.isna() & series.notna()).sum())
+    return parsed.dt.strftime("%Y-%m-%d"), invalid
diff --git a/datamorpher/converter.py b/datamorpher/converter.py
@@ -20,16 +20,14 @@ def read(path: Path) -> pd.DataFrame:
         if suffix in {".xlsx", ".xls"}:
             return pd.read_excel(path)
         if suffix == ".json":
-            try:
-                return pd.read_json(path, lines=True)
-            except ValueError:
-                return pd.read_json(path)
+            return _read_json(path)
         raise ValueError(f"Unsupported input format: {path.suffix}")
 
     @staticmethod
     def write(df: pd.DataFrame, path: Path) -> None:
         suffix = path.suffix.lower()
         if suffix == ".csv":
+            df = _flatten(df)
             df.to_csv(path, index=False)
         elif suffix in {".xlsx", ".xls"}:
             df.to_excel(path, index=False)
@@ -52,3 +50,24 @@ def detect_types(df: pd.DataFrame) -> Dict[str, str]:
 def _looks_like_date(series: pd.Series) -> bool:
+    """Return True if the sampled values all start with a YYYY-MM-DD date.
+
+    The sample is up to the first ten non-null values, cast to ``str``.
+    The pattern is only anchored at the start, so trailing text after the
+    date still matches.
+    NOTE(review): an empty sample presumably yields True via ``.all()``;
+    confirm callers guard against all-missing columns.
+    """
     sample = series.dropna().astype(str).head(10)
     return sample.str.match(r"\d{4}-\d{2}-\d{2}").all()
+
+
+def _read_json(path: Path) -> pd.DataFrame:
+    """Return a DataFrame from JSON, auto-detecting newline-delimited input.
+
+    The first non-whitespace character of the file decides the strategy:
+    a leading ``[`` is a standard JSON array and is parsed directly.
+    Anything else is tried as newline-delimited JSON first and, if that
+    fails, as a standard JSON document, so a pretty-printed or
+    column-oriented top-level object still loads.
+
+    Raises:
+        ValueError: if the file parses as neither form.
+    """
+    with path.open("r", encoding="utf-8") as f:
+        # A small prefix is enough to see the first significant character.
+        head = f.read(1024).lstrip()
+    if head.startswith("["):
+        return pd.read_json(path)
+    try:
+        return pd.read_json(path, lines=True)
+    except ValueError:
+        # Not valid NDJSON (e.g. one object spread over several lines).
+        return pd.read_json(path)
+
+
+def _flatten(df: pd.DataFrame) -> pd.DataFrame:
+    """Expand nested values into flat columns when any column has them.
+
+    If at least one cell holds a dict or list, the frame is rebuilt via
+    ``pd.json_normalize`` from its records; otherwise the input frame is
+    returned unchanged.
+    """
+
+    def _is_nested(value: object) -> bool:
+        return isinstance(value, (dict, list))
+
+    for name in df.columns:
+        if df[name].map(_is_nested).any():
+            # One nested column is enough to normalise the whole frame.
+            return pd.json_normalize(df.to_dict(orient="records"))
+    return df
diff --git a/datamorpher/reporter.py b/datamorpher/reporter.py
@@ -23,15 +23,16 @@ def build_report(
         headers=["Column", "Detected Type"],
         tablefmt="github",
     )
-    imputed = "\n".join(
-        f"- {c}: {m}" for c, m in clean_info["imputed"].items()
-    ) or "None"
+    imputed = (
+        "\n".join(f"- {c}: {m}" for c, m in clean_info["imputed"].items())
+        or "None"
+    )
     report = f"""# DataMorpher Report
 
 ## Summary
 - Input: {input_path.name} ({rows_in} rows)
 - Output: {output_path.name} ({rows_out} rows)
-- Duplicates removed: {clean_info['duplicates']}
+- Duplicates removed: {clean_info["duplicates"]}
 - Values imputed:\n{imputed}
 - Duration: {duration:.2f}s
 

diff --git a/tests/test_cleaner.py b/tests/test_cleaner.py
@@ -10,3 +10,17 @@ def test_cleaning_removes_duplicates_and_imputes() -> None:
     assert info["imputed"] == {"a": "mean", "b": "mode"}
     assert len(cleaned) == 2
     assert not cleaned.isna().any().any()
+
+
+def test_numeric_and_date_validation() -> None:
+    """Malformed numbers and dates are coerced and counted per column."""
+    raw = {
+        "num": ["1", "twenty-eight", "8000foo0"],
+        "date": ["2020-01-01", "invalid_date", "2021-02-03"],
+    }
+    cleaned, info = clean_data(pd.DataFrame(raw))
+
+    # One bad entry in each column; the number word is still understood.
+    assert info["invalid"] == {"num": 1, "date": 1}
+    assert cleaned.loc[1, "num"] == 28
+    # The unparseable date stays missing.
+    assert cleaned["date"].isna().sum() == 1
diff --git a/tests/test_converter.py b/tests/test_converter.py
@@ -18,3 +18,29 @@ def test_csv_to_excel(tmp_path: Path) -> None:
 
     df_x = pd.read_excel(out)
     assert df_x.equals(df)
+
+
+def test_json_reading_variants(tmp_path: Path) -> None:
+    """Both newline-delimited and array JSON files load with two rows."""
+    lines_file = tmp_path / "data_lines.json"
+    lines_file.write_text('{"a":1}\n{"a":2}\n', encoding="utf-8")
+
+    array_file = tmp_path / "data_array.json"
+    array_file.write_text('[{"a":1},{"a":2}]', encoding="utf-8")
+
+    loaded = [convert.read(lines_file), convert.read(array_file)]
+
+    for frame in loaded:
+        assert len(frame) == 2
+
+
+def test_json_to_csv_flatten(tmp_path: Path) -> None:
+    """Nested JSON objects become dotted columns when written to CSV."""
+    source = tmp_path / "nested.json"
+    records = ['{"a": 1, "b": {"c": 2}}', '{"a": 3, "b": {"c": 4}}']
+    source.write_text("\n".join(records))
+    target = tmp_path / "out.csv"
+
+    convert.write(convert.read(source), target)
+
+    round_trip = pd.read_csv(target)
+    assert "b.c" in round_trip.columns
+    assert round_trip.loc[0, "b.c"] == 2