diff --git a/src/clean_data.py b/src/clean_data.py
index da613640a..50af97091 100644
--- a/src/clean_data.py
+++ b/src/clean_data.py
@@ -1,7 +1,25 @@
+import pandas as pd
+
+
 def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
     """
     Clean sensor data by handling missing or invalid values.
 
     Returns:
         pd.DataFrame: Cleaned data.
     """
+    # Work on a copy so the caller's frame is left untouched.
+    cleaned_df = df.copy()
+
+    # Turn textual missing-value markers into NaN. NaN is passed explicitly
+    # because pandas releases before 1.4 ignore an explicit value=None and
+    # forward-fill the matched cells from the previous row instead.
+    cleaned_df = cleaned_df.replace(['', 'NULL', 'null', 'NAN'], float('nan'))
+
+    # Force the measurement columns to a numeric dtype; any value that
+    # still cannot be parsed becomes NaN.
+    numeric_columns = ['pH', 'turbidity', 'temperature', 'dissolved_oxygen']
+    for col in numeric_columns:
+        cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')
+
+    return cleaned_df
diff --git a/src/evaluate.py b/src/evaluate.py
index 006256224..a5a882165 100644
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -1,9 +1,46 @@
+import pandas as pd
+
+
 class WaterQualityEvaluator:
     def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
         self.ph_range = ph_range
         self.turbidity_threshold = turbidity_threshold
 
-    def is_safe(self, row: pd.Series) -> bool:
+    def is_safe(self, row: pd.Series, location_name: str) -> str:
         """
         Determine if a row of water data is safe.
         """
+        # Every outcome is a display-ready message that names the sensor,
+        # the location and, when unsafe, the reason.
+        sensor_id = row['sensor_id']
+        ph = row['pH']
+        turbidity = row['turbidity']
+        prefix = f"Sensor {sensor_id} at {location_name}: "
+
+        # A missing reading is treated as unsafe.
+        if pd.isna(ph):
+            return prefix + "❌ Unsafe (missing pH)"
+        if pd.isna(turbidity):
+            return prefix + "❌ Unsafe (missing turbidity)"
+
+        # pH is acceptable anywhere in the inclusive [min_ph, max_ph] range.
+        min_ph, max_ph = self.ph_range
+        if ph < min_ph:
+            return prefix + "❌ Unsafe (pH too low)"
+        if ph > max_ph:
+            return prefix + "❌ Unsafe (pH too high)"
+
+        # A turbidity equal to the threshold still passes.
+        if turbidity > self.turbidity_threshold:
+            return prefix + "❌ Unsafe (turbidity too high)"
+
+        return prefix + "✅ Safe"
+
+    def evaluate_dataframe(self, df: pd.DataFrame, location_name: str) -> list:
+        """
+        Evaluate all rows in the dataframe.
+
+        Returns:
+            list: One is_safe() message per row, in row order.
+        """
+        return [self.is_safe(row, location_name) for _, row in df.iterrows()]
diff --git a/src/load_data.py b/src/load_data.py
index c0126703a..939731dba 100644
--- a/src/load_data.py
+++ b/src/load_data.py
@@ -1,3 +1,6 @@
+import pandas as pd
+
+
 def load_csv(filepath: str) -> pd.DataFrame:
     """
     Load sensor data from a CSV file.
@@ -8,3 +11,4 @@ def load_csv(filepath: str) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Loaded data as a pandas DataFrame.
     """
+    return pd.read_csv(filepath)
diff --git a/src/main.py b/src/main.py
index e69de29bb..69c57032e 100644
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,58 @@
+import pandas as pd
+
+from clean_data import clean_sensor_data
+from evaluate import WaterQualityEvaluator
+from load_data import load_csv
+
+
+def main():
+    """Prompt for a CSV path and location, then report water safety."""
+    # Get input from user
+    file_path = input("Enter the path to your CSV file: ")
+    location_name = input("Enter the location name (e.g., Lake A): ")
+
+    # Load data. pandas signals an unreadable file by raising, not by
+    # returning None, so failures are caught here.
+    print("Loading data...")
+    try:
+        df = load_csv(file_path)
+    except (OSError, UnicodeDecodeError, pd.errors.EmptyDataError,
+            pd.errors.ParserError) as exc:
+        print(f"Error: {exc}")
+        print("Failed to load data. Exiting.")
+        return
+
+    print(f"Loaded {len(df)} rows of data")
+
+    # Clean data
+    print("Cleaning data...")
+    cleaned_df = clean_sensor_data(df)
+
+    # Evaluate data
+    print("Evaluating water safety...")
+    evaluator = WaterQualityEvaluator()
+    results = evaluator.evaluate_dataframe(cleaned_df, location_name)
+
+    # Print results
+    print("\n--- Water Safety Results ---")
+    for result in results:
+        print(result)
+
+    # Count safe vs unsafe; the verdict is always the tail of the message.
+    safe_count = sum(1 for result in results if result.endswith("✅ Safe"))
+    unsafe_count = len(results) - safe_count
+
+    print(f"\nSummary: {safe_count} safe, {unsafe_count} unsafe")
+
+    # Save to CSV. Split on the LAST ': ' so a location name (or sensor id)
+    # that itself contains ': ' cannot shift which part is kept.
+    results_df = pd.DataFrame({
+        'sensor_id': cleaned_df['sensor_id'],
+        'result': [r.rsplit(': ', 1)[-1] for r in results],
+    })
+    results_df.to_csv('results.csv', index=False)
+    print("Results saved to results.csv")
+
+
+if __name__ == "__main__":
+    main()