diff --git a/.gitignore b/.gitignore
index fa30cb2fb..760a202d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,42 @@
-index.py
-env/
\ No newline at end of file
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environment
+venv/
+env/
+ENV/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Data files (optional - uncomment if you don't want to track data)
+# data/*.csv
+# results.csv
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 1411a4a0b..3f24da678 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,2 @@
-pandas
\ No newline at end of file
+pandas>=1.5.0
+numpy>=1.21.0
diff --git a/src/clean_data.py b/src/clean_data.py
index da613640a..e1a4b6131 100644
--- a/src/clean_data.py
+++ b/src/clean_data.py
@@ -1,3 +1,7 @@
+import numpy as np
+import pandas as pd
+
+
 def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
     """
     Clean sensor data by handling missing or invalid values.
@@ -5,3 +9,41 @@ def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Cleaned data.
     """
+    # Create a copy to avoid modifying original data
+
+    cleaned_df = df.copy()
+
+    # Add flags for missing data
+    cleaned_df['missing_ph'] = cleaned_df['ph'].isna()
+    cleaned_df['missing_turbidity'] = cleaned_df['turbidity'].isna()
+    cleaned_df['missing_temperature'] = cleaned_df['temperature'].isna()
+
+    # Handle invalid pH values (should be between 0-14)
+    invalid_ph_mask = (cleaned_df['ph'] < 0) | (cleaned_df['ph'] > 14)
+    cleaned_df.loc[invalid_ph_mask, 'ph'] = np.nan
+    cleaned_df.loc[invalid_ph_mask, 'missing_ph'] = True
+
+    # Handle invalid turbidity values (should be non-negative)
+    invalid_turbidity_mask = cleaned_df['turbidity'] < 0
+    cleaned_df.loc[invalid_turbidity_mask, 'turbidity'] = np.nan
+    cleaned_df.loc[invalid_turbidity_mask, 'missing_turbidity'] = True
+
+    # Handle invalid temperature values (reasonable range: -10 to 50°C)
+    invalid_temp_mask = (cleaned_df['temperature'] < -10) | (cleaned_df['temperature'] > 50)
+    cleaned_df.loc[invalid_temp_mask, 'temperature'] = np.nan
+    cleaned_df.loc[invalid_temp_mask, 'missing_temperature'] = True
+
+    # Fill missing sensor_id if needed
+    if cleaned_df['sensor_id'].isna().any():
+        cleaned_df['sensor_id'] = cleaned_df['sensor_id'].fillna('Unknown')
+
+    # Fill missing location if needed
+    if cleaned_df['location'].isna().any():
+        cleaned_df['location'] = cleaned_df['location'].fillna('Unknown Location')
+
+    print(f"Data cleaning completed. Found {cleaned_df['missing_ph'].sum()} missing pH values, "
+          f"{cleaned_df['missing_turbidity'].sum()} missing turbidity values, "
+          f"{cleaned_df['missing_temperature'].sum()} missing temperature values.")
+
+    return cleaned_df
+
diff --git a/src/evaluate.py b/src/evaluate.py
index 006256224..0aab0ebcf 100644
--- a/src/evaluate.py
+++ b/src/evaluate.py
@@ -1,3 +1,8 @@
+import pandas as pd
+from clean_data import clean_sensor_data
+import os
+
+
 class WaterQualityEvaluator:
     def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
         self.ph_range = ph_range
@@ -7,3 +12,68 @@ def is_safe(self, row: pd.Series) -> bool:
         """
         Determine if a row of water data is safe.
         """
+        return self.evaluate_single_reading(row)['is_safe']
+
+    def evaluate_single_reading(self, row):
+        """
+        Evaluate a single water quality reading.
+
+        Args:
+            row (pd.Series): Single row of water quality data
+
+        Returns:
+            dict: Evaluation results with safety status and reasons
+        """
+        # Thresholds come from the constructor; ph_range is a (min, max) pair.
+        ph_min, ph_max = self.ph_range
+        # NOTE(review): assumes __init__ stores turbidity_threshold under this
+        # name (that line is outside the hunks shown here) - confirm.
+        turbidity_max = self.turbidity_threshold
+        issues = []
+
+        # Check for missing pH
+        if row.get('missing_ph', False) or pd.isna(row.get('ph')):
+            issues.append('missing pH')
+        elif row['ph'] < ph_min:
+            issues.append('pH too low')
+        elif row['ph'] > ph_max:
+            issues.append('pH too high')
+
+        # Check for missing turbidity
+        if row.get('missing_turbidity', False) or pd.isna(row.get('turbidity')):
+            issues.append('missing turbidity')
+        elif row['turbidity'] > turbidity_max:
+            issues.append('turbidity too high')
+
+        is_safe = len(issues) == 0
+
+        return {
+            'is_safe': is_safe,
+            'issues': issues,
+            'status': 'Safe' if is_safe else 'Unsafe',
+            'reason': ', '.join(issues) if issues else 'All parameters within safe range'
+        }
+
+    def evaluate_all_readings(self, df):
+        """
+        Evaluate all water quality readings in the DataFrame.
+
+        Args:
+            df (pd.DataFrame): Water quality data
+
+        Returns:
+            pd.DataFrame: Original data with evaluation results
+        """
+        results = []
+
+        for _, row in df.iterrows():
+            evaluation = self.evaluate_single_reading(row)
+            results.append(evaluation)
+
+        # Add evaluation results to DataFrame
+        results_df = df.copy()
+        results_df['is_safe'] = [r['is_safe'] for r in results]
+        results_df['status'] = [r['status'] for r in results]
+        results_df['reason'] = [r['reason'] for r in results]
+
+        return results_df
diff --git a/src/load_data.py b/src/load_data.py
index c0126703a..0faf590ba 100644
--- a/src/load_data.py
+++ b/src/load_data.py
@@ -1,4 +1,10 @@
-def load_csv(filepath: str) -> pd.DataFrame:
+import pandas as pd
+from clean_data import clean_sensor_data
+import os
+
+
+def sensor_data_csv(filepath: str) -> pd.DataFrame:
+
     """
     Load sensor data from a CSV file.
 
@@ -8,3 +14,18 @@ def load_csv(filepath: str) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Loaded data as a pandas DataFrame.
     """
+    import pandas as pd
+
+    try:
+        data = pd.read_csv(filepath)
+        return data
+    except FileNotFoundError:
+        print(f"File not found: {filepath}")
+        return pd.DataFrame()
+    except pd.errors.EmptyDataError:
+        print(f"No data in file: {filepath}")
+        return pd.DataFrame()
+    except Exception as e:
+        print(f"An error occurred while loading the file: {e}")
+        return pd.DataFrame()
+
diff --git a/src/main.py b/src/main.py
index e69de29bb..ea4023dfd 100644
--- a/src/main.py
+++ b/src/main.py
@@ -0,0 +1,113 @@
+import pandas as pd
+import os
+import sys
+from datetime import datetime
+from load_data import sensor_data_csv
+from clean_data import clean_sensor_data
+from evaluate import WaterQualityEvaluator
+
+
+def print_results(df):
+    """Print results in the expected format."""
+    print("\n" + "="*50)
+    print("WATER QUALITY ASSESSMENT RESULTS")
+    print("="*50)
+
+    for _, row in df.iterrows():
+        sensor_id = row['sensor_id']
+        location = row['location']
+        status_emoji = "✅" if row['is_safe'] else "❌"
+        status = row['status']
+
+        if row['is_safe']:
+            print(f"Sensor {sensor_id} at {location}: {status_emoji} {status}")
+        else:
+            print(f"Sensor {sensor_id} at {location}: {status_emoji} {status} ({row['reason']})")
+
+def save_results_to_csv(df, output_file='results.csv'):
+    """Save results to CSV file."""
+    # Select relevant columns for output
+    output_columns = ['sensor_id', 'location', 'ph', 'turbidity', 'temperature',
+                      'is_safe', 'status', 'reason']
+
+    results_df = df[output_columns].copy()
+    results_df.to_csv(output_file, index=False)
+    print(f"\nResults saved to {output_file}")
+
+def print_summary(df):
+    """Print summary statistics."""
+    total_sensors = len(df)
+    safe_sensors = df['is_safe'].sum()
+    unsafe_sensors = total_sensors - safe_sensors
+    # An empty frame would otherwise raise ZeroDivisionError here.
+    safety_rate = (safe_sensors / total_sensors) * 100 if total_sensors else 0.0
+
+    print("\n" + "="*30)
+    print("SUMMARY")
+    print("="*30)
+    print(f"Total sensors: {total_sensors}")
+    print(f"Safe sensors: {safe_sensors}")
+    print(f"Unsafe sensors: {unsafe_sensors}")
+    print(f"Safety rate: {safety_rate:.1f}%")
+
+def filter_by_location(df, location_filter):
+    """Filter data by location (case-insensitive literal substring match)."""
+    if location_filter:
+        # regex=False: the filter comes from the command line and must not be parsed as a pattern.
+        filtered_df = df[df['location'].str.contains(location_filter, case=False, na=False, regex=False)]
+        if len(filtered_df) == 0:
+            print(f"No sensors found for location: {location_filter}")
+            return df
+        else:
+            print(f"Filtered to {len(filtered_df)} sensors for location: {location_filter}")
+            return filtered_df
+    return df
+
+def main():
+    """Main function to run the water quality monitoring pipeline."""
+    print("Water Quality Monitoring System")
+    print("="*40)
+
+    # Configuration
+    data_file = 'clean_sensor_data.csv'
+
+    # Check if user wants to filter by location
+    location_filter = None
+    if len(sys.argv) > 1:
+        location_filter = sys.argv[1]
+        print(f"Filtering by location: {location_filter}")
+
+    try:
+        # Step 1: Load data
+        print(f"\n1. Loading data from {data_file}...")
+        df = sensor_data_csv(data_file)
+
+        # Step 2: Clean data
+        print("\n2. Cleaning data...")
+        cleaned_df = clean_sensor_data(df)
+
+        # Step 3: Filter by location if specified
+        if location_filter:
+            cleaned_df = filter_by_location(cleaned_df, location_filter)
+
+        # Step 4: Evaluate water quality
+        print("\n3. Evaluating water quality...")
+        evaluator = WaterQualityEvaluator()
+        results_df = evaluator.evaluate_all_readings(cleaned_df)
+
+        # Step 5: Display results
+        print_results(results_df)
+
+        # Step 6: Print summary
+        print_summary(results_df)
+
+        # Step 7: Save results (bonus feature)
+        save_results_to_csv(results_df)
+
+        print(f"\nProcessing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()