From 4e3920631d224baa1755a62a49db8e00aea115b8 Mon Sep 17 00:00:00 2001 From: NeurArk Date: Thu, 22 May 2025 13:54:44 +0200 Subject: [PATCH] Improve error handling for data loading and transformation --- TODO.md | 7 +++++ pages/data_explorer.py | 59 ++++++++++++++++++++++------------------ pages/prediction.py | 9 ++++-- pages/report.py | 9 ++++-- tests/test_data_utils.py | 15 ++++++++++ tests/test_transform.py | 12 ++++++++ utils/data.py | 14 +++++++++- utils/transform.py | 14 ++++++++++ 8 files changed, 106 insertions(+), 33 deletions(-) diff --git a/TODO.md b/TODO.md index f13797f..6291e97 100644 --- a/TODO.md +++ b/TODO.md @@ -181,6 +181,13 @@ After completing a milestone, create a pull request with your changes for review - [x] Integrated evaluation metrics and plots into the Data Explorer page - [x] Implemented modeling page with model selection, training, cross-validation, and export functionality +## PR17: Robust Error Handling + +- [x] Improve data-loading functions to validate inputs and raise descriptive errors +- [x] Add checks for missing columns and types in transformation utilities +- [x] Surface error messages in UI pages using `st.error` +- [x] Add tests for new error handling in data and transform modules + ## Notes for Development - Create comprehensive commit messages that clearly describe changes diff --git a/pages/data_explorer.py b/pages/data_explorer.py index e9d43fc..aef9d4a 100644 --- a/pages/data_explorer.py +++ b/pages/data_explorer.py @@ -45,14 +45,17 @@ def main() -> None: st.subheader("Sample Datasets") for name, path in config.SAMPLE_DATASETS.items(): if st.button(f"Load {name}"): - st.session_state["data"] = data_utils.load_data(path) - st.session_state["data"] = data_utils.convert_dtypes( - st.session_state["data"] - ) - st.session_state["datetime_cols"] = eda.detect_datetime_columns( - st.session_state["data"] - ) - st.success(f"{name} loaded!") + try: + st.session_state["data"] = data_utils.load_data(path) + st.session_state["data"] = data_utils.convert_dtypes( + st.session_state["data"] + ) + st.session_state["datetime_cols"] = eda.detect_datetime_columns( + st.session_state["data"] + ) + st.success(f"{name} loaded!") + except (ValueError, TypeError) as exc: + st.error(f"Failed to load sample data: {exc}") with st.expander("Help"): st.markdown(ui.help_markdown()) @@ -64,7 +67,7 @@ def main() -> None: st.session_state["data"] = df st.session_state["datetime_cols"] = eda.detect_datetime_columns(df) st.success("File loaded successfully!") - except ValueError as exc: + except (ValueError, TypeError) as exc: st.error(f"Failed to load file: {exc}") data = st.session_state.get("data") @@ -171,23 +174,27 @@ def _corr(df): ) if st.button("Apply Transformations"): df_trans = data.copy() - if missing_strategy == "Drop rows": - df_trans = transform.handle_missing_values(df_trans, strategy="drop") - elif missing_strategy == "Fill Mean": - df_trans = transform.handle_missing_values(df_trans, strategy="mean") - elif missing_strategy == "Fill Median": - df_trans = transform.handle_missing_values(df_trans, strategy="median") - elif missing_strategy == "Fill Mode": - df_trans = transform.handle_missing_values(df_trans, strategy="mode") - if encode_cols: - method = "onehot" if encode_method == "One-Hot" else "label" - df_trans = transform.encode_features(df_trans, encode_cols, method=method) - if scale_cols: - method = "standard" if scale_method == "Standard" else "minmax" - df_trans = transform.scale_features(df_trans, scale_cols, method=method) - st.session_state["data"] = df_trans - data = df_trans - st.success("Transformations applied!") + try: + if missing_strategy == "Drop rows": + df_trans = transform.handle_missing_values(df_trans, strategy="drop") + elif missing_strategy == "Fill Mean": + df_trans = transform.handle_missing_values(df_trans, strategy="mean") + elif missing_strategy == "Fill Median": + df_trans = transform.handle_missing_values(df_trans, strategy="median") + elif missing_strategy == "Fill Mode": + df_trans = transform.handle_missing_values(df_trans, strategy="mode") + if encode_cols: + method = "onehot" if encode_method == "One-Hot" else "label" + df_trans = transform.encode_features(df_trans, encode_cols, method=method) + if scale_cols: + method = "standard" if scale_method == "Standard" else "minmax" + df_trans = transform.scale_features(df_trans, scale_cols, method=method) + except (ValueError, KeyError, TypeError) as exc: + st.error(f"Transformation error: {exc}") + else: + st.session_state["data"] = df_trans + data = df_trans + st.success("Transformations applied!") st.subheader("Model Training - Classification") target = st.selectbox( diff --git a/pages/prediction.py b/pages/prediction.py index 25f1a8f..82e87b2 100644 --- a/pages/prediction.py +++ b/pages/prediction.py @@ -33,9 +33,12 @@ def main() -> None: model_obj = predict.load_model(Path(tmp.name)) if data_file is not None: - df = data_utils.load_data(data_file) - df = data_utils.convert_dtypes(df) - st.session_state["pred_data"] = df + try: + df = data_utils.load_data(data_file) + df = data_utils.convert_dtypes(df) + st.session_state["pred_data"] = df + except (ValueError, TypeError) as exc: + st.error(f"Failed to load data: {exc}") data = st.session_state.get("pred_data") diff --git a/pages/report.py b/pages/report.py index 8b795da..61ea129 100644 --- a/pages/report.py +++ b/pages/report.py @@ -26,9 +26,12 @@ def main() -> None: ) if data_file is not None: - df = data_utils.load_data(data_file) - df = data_utils.convert_dtypes(df) - st.session_state["report_data"] = df + try: + df = data_utils.load_data(data_file) + df = data_utils.convert_dtypes(df) + st.session_state["report_data"] = df + except (ValueError, TypeError) as exc: + st.error(f"Failed to load data: {exc}") df = st.session_state.get("report_data") if df is not None: diff --git a/tests/test_data_utils.py b/tests/test_data_utils.py index d3dcff2..04e0de4 100644 --- a/tests/test_data_utils.py +++ b/tests/test_data_utils.py @@ -46,3 +46,18 @@ def test_sample_dataset_loads(): df = data.load_data(path) assert not df.empty + +def test_load_data_invalid_type(): + with pytest.raises(ValueError): + data.load_data(123) + + +def test_convert_dtypes_invalid(): + with pytest.raises(TypeError): + data.convert_dtypes([1, 2, 3]) + + +def test_data_summary_empty(): + with pytest.raises(ValueError): + data.data_summary(pd.DataFrame()) + diff --git a/tests/test_transform.py b/tests/test_transform.py index b860ead..1f13fbe 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -55,3 +55,15 @@ def test_transformation_workflow(): df = transform.scale_features(df, ["num"], method="minmax") assert df["num"].min() == 0 assert "cat" in df.columns + + +def test_encode_features_missing_column(): + df = sample_df().fillna({"cat": "b"}) + with pytest.raises(KeyError): + transform.encode_features(df, ["missing"], method="onehot") + + +def test_scale_features_non_numeric(): + df = sample_df().fillna({"cat": "b"}) + with pytest.raises(TypeError): + transform.scale_features(df, ["cat"], method="standard") diff --git a/utils/data.py b/utils/data.py index 32b338a..c627b5c 100644 --- a/utils/data.py +++ b/utils/data.py @@ -10,10 +10,16 @@ def load_data(file: Any) -> pd.DataFrame: """Load a CSV or Excel file into a DataFrame.""" + if file is None: + raise ValueError("No file provided") + if hasattr(file, "name"): ext = Path(file.name).suffix.lower() else: - ext = Path(str(file)).suffix.lower() + path = Path(str(file)) + ext = path.suffix.lower() + if not path.exists(): + raise ValueError("Invalid file path") try: if ext == ".csv": @@ -27,6 +33,8 @@ def load_data(file: Any) -> pd.DataFrame: def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame: """Attempt to convert object columns to numeric or datetime.""" + if not isinstance(df, pd.DataFrame): + raise TypeError("df must be a pandas DataFrame") for column in df.columns: if df[column].dtype == object: df[column] = pd.to_numeric(df[column], errors="ignore") @@ -37,4 +45,8 @@ def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame: def data_summary(df: pd.DataFrame) -> pd.DataFrame: """Return a statistical summary of the dataframe.""" + if not isinstance(df, pd.DataFrame): + raise TypeError("df must be a pandas DataFrame") + if df.empty: + raise ValueError("DataFrame is empty") return df.describe(include="all") diff --git a/utils/transform.py b/utils/transform.py index 36882d4..e4a5685 100644 --- a/utils/transform.py +++ b/utils/transform.py @@ -15,6 +15,8 @@ def handle_missing_values(df: pd.DataFrame, *, strategy: str = "drop") -> pd.DataFrame: """Handle missing values according to the specified strategy.""" + if not isinstance(df, pd.DataFrame): + raise TypeError("df must be a pandas DataFrame") if strategy not in MISSING_STRATEGIES: raise ValueError(f"Invalid strategy: {strategy}") if strategy == "drop": @@ -40,6 +42,11 @@ def encode_features( method: str = "onehot", ) -> pd.DataFrame: """Encode categorical features using the given method.""" + if not isinstance(df, pd.DataFrame): + raise TypeError("df must be a pandas DataFrame") + missing = set(columns) - set(df.columns) + if missing: + raise KeyError(f"Columns not found: {', '.join(missing)}") if method not in ENCODING_METHODS: raise ValueError(f"Invalid encoding method: {method}") df = df.copy() @@ -58,6 +65,13 @@ def scale_features( method: str = "standard", ) -> pd.DataFrame: """Scale numeric features with the given method.""" + if not isinstance(df, pd.DataFrame): + raise TypeError("df must be a pandas DataFrame") + missing = set(columns) - set(df.columns) + if missing: + raise KeyError(f"Columns not found: {', '.join(missing)}") + if not all(pd.api.types.is_numeric_dtype(df[c]) for c in columns): + raise TypeError("Scale features require numeric columns") if method not in SCALING_METHODS: raise ValueError(f"Invalid scaling method: {method}") df = df.copy()