From 4e3920631d224baa1755a62a49db8e00aea115b8 Mon Sep 17 00:00:00 2001
From: NeurArk <guillaume.rospape@neurark.com>
Date: Thu, 22 May 2025 13:54:44 +0200
Subject: [PATCH] Improve error handling for data loading and transformation

---
 TODO.md                  |  7 +++++
 pages/data_explorer.py   | 59 ++++++++++++++++++++++------------------
 pages/prediction.py      |  9 ++++--
 pages/report.py          |  9 ++++--
 tests/test_data_utils.py | 15 ++++++++++
 tests/test_transform.py  | 12 ++++++++
 utils/data.py            | 14 +++++++++-
 utils/transform.py       | 14 ++++++++++
 8 files changed, 106 insertions(+), 33 deletions(-)

diff --git a/TODO.md b/TODO.md
index f13797f..6291e97 100644
--- a/TODO.md
+++ b/TODO.md
@@ -181,6 +181,13 @@ After completing a milestone, create a pull request with your changes for review
 - [x] Integrated evaluation metrics and plots into the Data Explorer page
 - [x] Implemented modeling page with model selection, training, cross-validation, and export functionality
 
+## PR17: Robust Error Handling
+
+- [x] Improve data-loading functions to validate inputs and raise descriptive errors
+- [x] Add checks for missing columns and types in transformation utilities
+- [x] Surface error messages in UI pages using `st.error`
+- [x] Add tests for new error handling in data and transform modules
+
 ## Notes for Development
 
 - Create comprehensive commit messages that clearly describe changes
diff --git a/pages/data_explorer.py b/pages/data_explorer.py
index e9d43fc..aef9d4a 100644
--- a/pages/data_explorer.py
+++ b/pages/data_explorer.py
@@ -45,14 +45,17 @@ def main() -> None:
         st.subheader("Sample Datasets")
         for name, path in config.SAMPLE_DATASETS.items():
             if st.button(f"Load {name}"):
-                st.session_state["data"] = data_utils.load_data(path)
-                st.session_state["data"] = data_utils.convert_dtypes(
-                    st.session_state["data"]
-                )
-                st.session_state["datetime_cols"] = eda.detect_datetime_columns(
-                    st.session_state["data"]
-                )
-                st.success(f"{name} loaded!")
+                try:
+                    st.session_state["data"] = data_utils.load_data(path)
+                    st.session_state["data"] = data_utils.convert_dtypes(
+                        st.session_state["data"]
+                    )
+                    st.session_state["datetime_cols"] = eda.detect_datetime_columns(
+                        st.session_state["data"]
+                    )
+                    st.success(f"{name} loaded!")
+                except (ValueError, TypeError) as exc:
+                    st.error(f"Failed to load sample data: {exc}")
 
         with st.expander("Help"):
             st.markdown(ui.help_markdown())
@@ -64,7 +67,7 @@ def main() -> None:
             st.session_state["data"] = df
             st.session_state["datetime_cols"] = eda.detect_datetime_columns(df)
             st.success("File loaded successfully!")
-        except ValueError as exc:
+        except (ValueError, TypeError) as exc:
             st.error(f"Failed to load file: {exc}")
 
     data = st.session_state.get("data")
@@ -171,23 +174,27 @@ def _corr(df):
             )
         if st.button("Apply Transformations"):
             df_trans = data.copy()
-            if missing_strategy == "Drop rows":
-                df_trans = transform.handle_missing_values(df_trans, strategy="drop")
-            elif missing_strategy == "Fill Mean":
-                df_trans = transform.handle_missing_values(df_trans, strategy="mean")
-            elif missing_strategy == "Fill Median":
-                df_trans = transform.handle_missing_values(df_trans, strategy="median")
-            elif missing_strategy == "Fill Mode":
-                df_trans = transform.handle_missing_values(df_trans, strategy="mode")
-            if encode_cols:
-                method = "onehot" if encode_method == "One-Hot" else "label"
-                df_trans = transform.encode_features(df_trans, encode_cols, method=method)
-            if scale_cols:
-                method = "standard" if scale_method == "Standard" else "minmax"
-                df_trans = transform.scale_features(df_trans, scale_cols, method=method)
-            st.session_state["data"] = df_trans
-            data = df_trans
-            st.success("Transformations applied!")
+            try:
+                if missing_strategy == "Drop rows":
+                    df_trans = transform.handle_missing_values(df_trans, strategy="drop")
+                elif missing_strategy == "Fill Mean":
+                    df_trans = transform.handle_missing_values(df_trans, strategy="mean")
+                elif missing_strategy == "Fill Median":
+                    df_trans = transform.handle_missing_values(df_trans, strategy="median")
+                elif missing_strategy == "Fill Mode":
+                    df_trans = transform.handle_missing_values(df_trans, strategy="mode")
+                if encode_cols:
+                    method = "onehot" if encode_method == "One-Hot" else "label"
+                    df_trans = transform.encode_features(df_trans, encode_cols, method=method)
+                if scale_cols:
+                    method = "standard" if scale_method == "Standard" else "minmax"
+                    df_trans = transform.scale_features(df_trans, scale_cols, method=method)
+            except (ValueError, KeyError, TypeError) as exc:
+                st.error(f"Transformation error: {exc}")
+            else:
+                st.session_state["data"] = df_trans
+                data = df_trans
+                st.success("Transformations applied!")
 
         st.subheader("Model Training - Classification")
         target = st.selectbox(
diff --git a/pages/prediction.py b/pages/prediction.py
index 25f1a8f..82e87b2 100644
--- a/pages/prediction.py
+++ b/pages/prediction.py
@@ -33,9 +33,12 @@ def main() -> None:
             model_obj = predict.load_model(Path(tmp.name))
 
     if data_file is not None:
-        df = data_utils.load_data(data_file)
-        df = data_utils.convert_dtypes(df)
-        st.session_state["pred_data"] = df
+        try:
+            df = data_utils.load_data(data_file)
+            df = data_utils.convert_dtypes(df)
+            st.session_state["pred_data"] = df
+        except (ValueError, TypeError) as exc:
+            st.error(f"Failed to load data: {exc}")
 
     data = st.session_state.get("pred_data")
 
diff --git a/pages/report.py b/pages/report.py
index 8b795da..61ea129 100644
--- a/pages/report.py
+++ b/pages/report.py
@@ -26,9 +26,12 @@ def main() -> None:
         )
 
     if data_file is not None:
-        df = data_utils.load_data(data_file)
-        df = data_utils.convert_dtypes(df)
-        st.session_state["report_data"] = df
+        try:
+            df = data_utils.load_data(data_file)
+            df = data_utils.convert_dtypes(df)
+            st.session_state["report_data"] = df
+        except (ValueError, TypeError) as exc:
+            st.error(f"Failed to load data: {exc}")
 
     df = st.session_state.get("report_data")
     if df is not None:
diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py
index d3dcff2..04e0de4 100644
--- a/tests/test_data_utils.py
+++ b/tests/test_data_utils.py
@@ -46,3 +46,18 @@ def test_sample_dataset_loads():
         df = data.load_data(path)
         assert not df.empty
 
+
+def test_load_data_invalid_type():
+    with pytest.raises(ValueError):
+        data.load_data(123)
+
+
+def test_convert_dtypes_invalid():
+    with pytest.raises(TypeError):
+        data.convert_dtypes([1, 2, 3])
+
+
+def test_data_summary_empty():
+    with pytest.raises(ValueError):
+        data.data_summary(pd.DataFrame())
+
diff --git a/tests/test_transform.py b/tests/test_transform.py
index b860ead..1f13fbe 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -55,3 +55,15 @@ def test_transformation_workflow():
     df = transform.scale_features(df, ["num"], method="minmax")
     assert df["num"].min() == 0
     assert "cat" in df.columns
+
+
+def test_encode_features_missing_column():
+    df = sample_df().fillna({"cat": "b"})
+    with pytest.raises(KeyError):
+        transform.encode_features(df, ["missing"], method="onehot")
+
+
+def test_scale_features_non_numeric():
+    df = sample_df().fillna({"cat": "b"})
+    with pytest.raises(TypeError):
+        transform.scale_features(df, ["cat"], method="standard")
diff --git a/utils/data.py b/utils/data.py
index 32b338a..c627b5c 100644
--- a/utils/data.py
+++ b/utils/data.py
@@ -10,10 +10,16 @@
 
 def load_data(file: Any) -> pd.DataFrame:
     """Load a CSV or Excel file into a DataFrame."""
+    if file is None:
+        raise ValueError("No file provided")
+
     if hasattr(file, "name"):
         ext = Path(file.name).suffix.lower()
     else:
-        ext = Path(str(file)).suffix.lower()
+        path = Path(str(file))
+        ext = path.suffix.lower()
+        if not path.exists():
+            raise ValueError("Invalid file path")
 
     try:
         if ext == ".csv":
@@ -27,6 +33,8 @@ def load_data(file: Any) -> pd.DataFrame:
 
 def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:
     """Attempt to convert object columns to numeric or datetime."""
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("df must be a pandas DataFrame")
     for column in df.columns:
         if df[column].dtype == object:
             df[column] = pd.to_numeric(df[column], errors="ignore")
@@ -37,4 +45,8 @@ def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:
 
 def data_summary(df: pd.DataFrame) -> pd.DataFrame:
     """Return a statistical summary of the dataframe."""
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("df must be a pandas DataFrame")
+    if df.empty:
+        raise ValueError("DataFrame is empty")
     return df.describe(include="all")
diff --git a/utils/transform.py b/utils/transform.py
index 36882d4..e4a5685 100644
--- a/utils/transform.py
+++ b/utils/transform.py
@@ -15,6 +15,8 @@
 
 def handle_missing_values(df: pd.DataFrame, *, strategy: str = "drop") -> pd.DataFrame:
     """Handle missing values according to the specified strategy."""
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("df must be a pandas DataFrame")
     if strategy not in MISSING_STRATEGIES:
         raise ValueError(f"Invalid strategy: {strategy}")
     if strategy == "drop":
@@ -40,6 +42,11 @@ def encode_features(
     method: str = "onehot",
 ) -> pd.DataFrame:
     """Encode categorical features using the given method."""
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("df must be a pandas DataFrame")
+    missing = set(columns) - set(df.columns)  # requested labels absent from df
+    if missing:  # stringify + sort: labels may be non-str and set order is arbitrary
+        raise KeyError(f"Columns not found: {', '.join(sorted(map(str, missing)))}")
     if method not in ENCODING_METHODS:
         raise ValueError(f"Invalid encoding method: {method}")
     df = df.copy()
@@ -58,6 +65,13 @@ def scale_features(
     method: str = "standard",
 ) -> pd.DataFrame:
     """Scale numeric features with the given method."""
+    if not isinstance(df, pd.DataFrame):
+        raise TypeError("df must be a pandas DataFrame")
+    missing = set(columns) - set(df.columns)  # requested labels absent from df
+    if missing:  # stringify + sort: labels may be non-str and set order is arbitrary
+        raise KeyError(f"Columns not found: {', '.join(sorted(map(str, missing)))}")
+    if not all(pd.api.types.is_numeric_dtype(df[c]) for c in columns):
+        raise TypeError("Scale features require numeric columns")
     if method not in SCALING_METHODS:
         raise ValueError(f"Invalid scaling method: {method}")
     df = df.copy()