Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,13 @@ After completing a milestone, create a pull request with your changes for review
- [x] Integrated evaluation metrics and plots into the Data Explorer page
- [x] Implemented modeling page with model selection, training, cross-validation, and export functionality

## PR17: Robust Error Handling

- [x] Improve data-loading functions to validate inputs and raise descriptive errors
- [x] Add checks for missing columns and types in transformation utilities
- [x] Surface error messages in UI pages using `st.error`
- [x] Add tests for new error handling in data and transform modules

## Notes for Development

- Create comprehensive commit messages that clearly describe changes
Expand Down
59 changes: 33 additions & 26 deletions pages/data_explorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,17 @@ def main() -> None:
st.subheader("Sample Datasets")
for name, path in config.SAMPLE_DATASETS.items():
if st.button(f"Load {name}"):
st.session_state["data"] = data_utils.load_data(path)
st.session_state["data"] = data_utils.convert_dtypes(
st.session_state["data"]
)
st.session_state["datetime_cols"] = eda.detect_datetime_columns(
st.session_state["data"]
)
st.success(f"{name} loaded!")
try:
st.session_state["data"] = data_utils.load_data(path)
st.session_state["data"] = data_utils.convert_dtypes(
st.session_state["data"]
)
st.session_state["datetime_cols"] = eda.detect_datetime_columns(
st.session_state["data"]
)
st.success(f"{name} loaded!")
except (ValueError, TypeError) as exc:
st.error(f"Failed to load sample data: {exc}")

with st.expander("Help"):
st.markdown(ui.help_markdown())
Expand All @@ -64,7 +67,7 @@ def main() -> None:
st.session_state["data"] = df
st.session_state["datetime_cols"] = eda.detect_datetime_columns(df)
st.success("File loaded successfully!")
except ValueError as exc:
except (ValueError, TypeError) as exc:
st.error(f"Failed to load file: {exc}")

data = st.session_state.get("data")
Expand Down Expand Up @@ -171,23 +174,27 @@ def _corr(df):
)
if st.button("Apply Transformations"):
df_trans = data.copy()
if missing_strategy == "Drop rows":
df_trans = transform.handle_missing_values(df_trans, strategy="drop")
elif missing_strategy == "Fill Mean":
df_trans = transform.handle_missing_values(df_trans, strategy="mean")
elif missing_strategy == "Fill Median":
df_trans = transform.handle_missing_values(df_trans, strategy="median")
elif missing_strategy == "Fill Mode":
df_trans = transform.handle_missing_values(df_trans, strategy="mode")
if encode_cols:
method = "onehot" if encode_method == "One-Hot" else "label"
df_trans = transform.encode_features(df_trans, encode_cols, method=method)
if scale_cols:
method = "standard" if scale_method == "Standard" else "minmax"
df_trans = transform.scale_features(df_trans, scale_cols, method=method)
st.session_state["data"] = df_trans
data = df_trans
st.success("Transformations applied!")
try:
if missing_strategy == "Drop rows":
df_trans = transform.handle_missing_values(df_trans, strategy="drop")
elif missing_strategy == "Fill Mean":
df_trans = transform.handle_missing_values(df_trans, strategy="mean")
elif missing_strategy == "Fill Median":
df_trans = transform.handle_missing_values(df_trans, strategy="median")
elif missing_strategy == "Fill Mode":
df_trans = transform.handle_missing_values(df_trans, strategy="mode")
if encode_cols:
method = "onehot" if encode_method == "One-Hot" else "label"
df_trans = transform.encode_features(df_trans, encode_cols, method=method)
if scale_cols:
method = "standard" if scale_method == "Standard" else "minmax"
df_trans = transform.scale_features(df_trans, scale_cols, method=method)
except (ValueError, KeyError, TypeError) as exc:
st.error(f"Transformation error: {exc}")
else:
st.session_state["data"] = df_trans
data = df_trans
st.success("Transformations applied!")

st.subheader("Model Training - Classification")
target = st.selectbox(
Expand Down
9 changes: 6 additions & 3 deletions pages/prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,12 @@ def main() -> None:
model_obj = predict.load_model(Path(tmp.name))

if data_file is not None:
df = data_utils.load_data(data_file)
df = data_utils.convert_dtypes(df)
st.session_state["pred_data"] = df
try:
df = data_utils.load_data(data_file)
df = data_utils.convert_dtypes(df)
st.session_state["pred_data"] = df
except (ValueError, TypeError) as exc:
st.error(f"Failed to load data: {exc}")

data = st.session_state.get("pred_data")

Expand Down
9 changes: 6 additions & 3 deletions pages/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ def main() -> None:
)

if data_file is not None:
df = data_utils.load_data(data_file)
df = data_utils.convert_dtypes(df)
st.session_state["report_data"] = df
try:
df = data_utils.load_data(data_file)
df = data_utils.convert_dtypes(df)
st.session_state["report_data"] = df
except (ValueError, TypeError) as exc:
st.error(f"Failed to load data: {exc}")

df = st.session_state.get("report_data")
if df is not None:
Expand Down
15 changes: 15 additions & 0 deletions tests/test_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,18 @@ def test_sample_dataset_loads():
df = data.load_data(path)
assert not df.empty


def test_load_data_invalid_type():
    """A non-path, non-file argument must be rejected with ValueError."""
    bad_input = 123
    with pytest.raises(ValueError):
        data.load_data(bad_input)


def test_convert_dtypes_invalid():
    """Anything that is not a DataFrame must be rejected with TypeError."""
    not_a_frame = [1, 2, 3]
    with pytest.raises(TypeError):
        data.convert_dtypes(not_a_frame)


def test_data_summary_empty():
    """Summarising an empty DataFrame must raise ValueError."""
    empty_frame = pd.DataFrame()
    with pytest.raises(ValueError):
        data.data_summary(empty_frame)

12 changes: 12 additions & 0 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,15 @@ def test_transformation_workflow():
df = transform.scale_features(df, ["num"], method="minmax")
assert df["num"].min() == 0
assert "cat" in df.columns


def test_encode_features_missing_column():
    """Requesting encoding of an absent column must raise KeyError."""
    frame = sample_df().fillna({"cat": "b"})
    with pytest.raises(KeyError):
        transform.encode_features(frame, ["missing"], method="onehot")


def test_scale_features_non_numeric():
    """Scaling a non-numeric column must raise TypeError."""
    frame = sample_df().fillna({"cat": "b"})
    with pytest.raises(TypeError):
        transform.scale_features(frame, ["cat"], method="standard")
14 changes: 13 additions & 1 deletion utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,16 @@

def load_data(file: Any) -> pd.DataFrame:
"""Load a CSV or Excel file into a DataFrame."""
if file is None:
raise ValueError("No file provided")

if hasattr(file, "name"):
ext = Path(file.name).suffix.lower()
else:
ext = Path(str(file)).suffix.lower()
path = Path(str(file))
ext = path.suffix.lower()
if not path.exists():
raise ValueError("Invalid file path")

try:
if ext == ".csv":
Expand All @@ -27,6 +33,8 @@ def load_data(file: Any) -> pd.DataFrame:

def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:
"""Attempt to convert object columns to numeric or datetime."""
if not isinstance(df, pd.DataFrame):
raise TypeError("df must be a pandas DataFrame")
for column in df.columns:
if df[column].dtype == object:
df[column] = pd.to_numeric(df[column], errors="ignore")
Expand All @@ -37,4 +45,8 @@ def convert_dtypes(df: pd.DataFrame) -> pd.DataFrame:

def data_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Return descriptive statistics for every column of *df*.

    Parameters
    ----------
    df:
        The frame to summarise.

    Returns
    -------
    pd.DataFrame
        The result of ``df.describe(include="all")``.

    Raises
    ------
    TypeError
        If *df* is not a pandas DataFrame.
    ValueError
        If *df* contains no data.
    """
    if isinstance(df, pd.DataFrame):
        if df.empty:
            raise ValueError("DataFrame is empty")
        return df.describe(include="all")
    raise TypeError("df must be a pandas DataFrame")
14 changes: 14 additions & 0 deletions utils/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

def handle_missing_values(df: pd.DataFrame, *, strategy: str = "drop") -> pd.DataFrame:
"""Handle missing values according to the specified strategy."""
if not isinstance(df, pd.DataFrame):
raise TypeError("df must be a pandas DataFrame")
if strategy not in MISSING_STRATEGIES:
raise ValueError(f"Invalid strategy: {strategy}")
if strategy == "drop":
Expand All @@ -40,6 +42,11 @@ def encode_features(
method: str = "onehot",
) -> pd.DataFrame:
"""Encode categorical features using the given method."""
if not isinstance(df, pd.DataFrame):
raise TypeError("df must be a pandas DataFrame")
missing = set(columns) - set(df.columns)
if missing:
raise KeyError(f"Columns not found: {', '.join(missing)}")
if method not in ENCODING_METHODS:
raise ValueError(f"Invalid encoding method: {method}")
df = df.copy()
Expand All @@ -58,6 +65,13 @@ def scale_features(
method: str = "standard",
) -> pd.DataFrame:
"""Scale numeric features with the given method."""
if not isinstance(df, pd.DataFrame):
raise TypeError("df must be a pandas DataFrame")
missing = set(columns) - set(df.columns)
if missing:
raise KeyError(f"Columns not found: {', '.join(missing)}")
if not all(pd.api.types.is_numeric_dtype(df[c]) for c in columns):
raise TypeError("Scale features require numeric columns")
if method not in SCALING_METHODS:
raise ValueError(f"Invalid scaling method: {method}")
df = df.copy()
Expand Down