SamDewriter · datasavvysarah · Jun 17, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,43 @@
-index.py
-env/
+# Files and directories excluded from version control
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual Environment
+venv/
+env/
+ENV/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Data files (optional - uncomment if you don't want to track data)
+# data/*.csv
+# results.csv
diff --git a/requirements.txt b/requirements.txt
@@ -1 +1,4 @@
-pandas
+# Runtime dependencies
+pandas>=1.5.0
+numpy>=1.21.0
diff --git a/src/clean_data.py b/src/clean_data.py
@@ -1,7 +1,49 @@
+import numpy as np
+import pandas as pd
+
+
 def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
     """
     Clean sensor data by handling missing or invalid values.
 
+    Readings outside the accepted range of their column are replaced with
+    NaN and recorded, together with values that were already NaN, in a
+    boolean ``missing_<column>`` flag. Blank ``sensor_id`` / ``location``
+    entries receive placeholder labels. The input frame is not modified.
+
+    Args:
+        df (pd.DataFrame): Readings with ``ph``, ``turbidity``,
+            ``temperature``, ``sensor_id`` and ``location`` columns.
+
     Returns:
         pd.DataFrame: Cleaned data.
     """
+    # (column, lowest accepted value, highest accepted value or None):
+    # pH 0-14, turbidity non-negative, temperature -10 to 50 degrees C.
+    accepted_ranges = (
+        ('ph', 0, 14),
+        ('turbidity', 0, None),
+        ('temperature', -10, 50),
+    )
+    placeholders = {'sensor_id': 'Unknown', 'location': 'Unknown Location'}
+
+    # Work on a copy so the caller's frame is never mutated.
+    cleaned_df = df.copy()
+
+    for column, lowest, highest in accepted_ranges:
+        flag = f'missing_{column}'
+        cleaned_df[flag] = cleaned_df[column].isna()
+
+        out_of_range = cleaned_df[column] < lowest
+        if highest is not None:
+            out_of_range = out_of_range | (cleaned_df[column] > highest)
+
+        # Blank out the implausible reading and record it as missing.
+        cleaned_df.loc[out_of_range, column] = np.nan
+        cleaned_df.loc[out_of_range, flag] = True
+
+    # Label rows whose identifying fields are blank.
+    for column, label in placeholders.items():
+        if cleaned_df[column].isna().any():
+            cleaned_df[column] = cleaned_df[column].fillna(label)
+
+    ph_gaps = cleaned_df['missing_ph'].sum()
+    turbidity_gaps = cleaned_df['missing_turbidity'].sum()
+    temperature_gaps = cleaned_df['missing_temperature'].sum()
+    print(f"Data cleaning completed. Found {ph_gaps} missing pH values, "
+          f"{turbidity_gaps} missing turbidity values, "
+          f"{temperature_gaps} missing temperature values.")
+
+    return cleaned_df
+
diff --git a/src/evaluate.py b/src/evaluate.py
@@ -1,3 +1,8 @@
+import pandas as pd
+from clean_data import clean_sensor_data
+import os
+
+
 class WaterQualityEvaluator:
     def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
         self.ph_range = ph_range
@@ -7,3 +12,60 @@ def is_safe(self, row: pd.Series) -> bool:
         """
         Determine if a row of water data is safe.
         """
+    def evaluate_single_reading(self, row):
+        """
+        Evaluate a single water quality reading.
+
+        A reading is unsafe when pH or turbidity is missing (flagged through
+        ``missing_ph`` / ``missing_turbidity`` or NaN), when pH lies outside
+        ``self.ph_range``, or when turbidity exceeds the configured threshold.
+
+        Args:
+            row (pd.Series): Single row of water quality data
+
+        Returns:
+            dict: Evaluation results with safety status and reasons
+                (keys ``is_safe``, ``issues``, ``status``, ``reason``).
+        """
+        # The visible part of __init__ stores the pH limits as ``ph_range``;
+        # it does not assign ``ph_min`` / ``ph_max``.
+        ph_min, ph_max = self.ph_range
+        # NOTE(review): assumes __init__ stores its ``turbidity_threshold``
+        # argument under the same attribute name - confirm against the class.
+        turbidity_max = self.turbidity_threshold
+
+        issues = []
+
+        # pH: missing takes precedence over range checks.
+        if row.get('missing_ph', False) or pd.isna(row.get('ph')):
+            issues.append('missing pH')
+        elif row['ph'] < ph_min:
+            issues.append('pH too low')
+        elif row['ph'] > ph_max:
+            issues.append('pH too high')
+
+        # Turbidity: missing takes precedence over the threshold check.
+        if row.get('missing_turbidity', False) or pd.isna(row.get('turbidity')):
+            issues.append('missing turbidity')
+        elif row['turbidity'] > turbidity_max:
+            issues.append('turbidity too high')
+
+        is_safe = not issues
+
+        return {
+            'is_safe': is_safe,
+            'issues': issues,
+            'status': 'Safe' if is_safe else 'Unsafe',
+            'reason': ', '.join(issues) if issues else 'All parameters within safe range'
+        }
+
+    def evaluate_all_readings(self, df):
+        """
+        Evaluate all water quality readings in the DataFrame.
+
+        Args:
+            df (pd.DataFrame): Water quality data
+
+        Returns:
+            pd.DataFrame: Copy of the data with ``is_safe``, ``status`` and
+                ``reason`` columns added.
+        """
+        results = [self.evaluate_single_reading(row) for _, row in df.iterrows()]
+
+        # Attach the per-row verdicts to a copy so the input stays untouched.
+        results_df = df.copy()
+        results_df['is_safe'] = [r['is_safe'] for r in results]
+        results_df['status'] = [r['status'] for r in results]
+        results_df['reason'] = [r['reason'] for r in results]
+
+        return results_df
diff --git a/src/load_data.py b/src/load_data.py
@@ -1,4 +1,10 @@
-def load_csv(filepath: str) -> pd.DataFrame:
+import pandas as pd
+from clean_data import clean_sensor_data
+import os
+
+
+def sensor_data_csv(filepath: str) -> pd.DataFrame:
+
     """
     Load sensor data from a CSV file.
 
@@ -8,3 +14,18 @@ def load_csv(filepath: str) -> pd.DataFrame:
     Returns:
         pd.DataFrame: Loaded data as a pandas DataFrame.
     """
+    import pandas as pd
+
+    try:
+        # pandas exposes read_csv; pd.load_csv does not exist, so the old
+        # call always ended up in the generic handler below.
+        return pd.read_csv(filepath)
+    except FileNotFoundError:
+        print(f"File not found: {filepath}")
+        return pd.DataFrame()
+    except pd.errors.EmptyDataError:
+        print(f"No data in file: {filepath}")
+        return pd.DataFrame()
+    except Exception as e:
+        # Best-effort loader: report the problem and hand back an empty
+        # frame instead of propagating the error.
+        print(f"An error occurred while loading the file: {e}")
+        return pd.DataFrame()
+
diff --git a/src/main.py b/src/main.py
@@ -0,0 +1,117 @@
+import pandas as pd
+import os
+import sys
+from datetime import datetime
+from load_data import sensor_data_csv
+from clean_data import clean_sensor_data
+from evaluate import WaterQualityEvaluator
+
+# Import our custom modules
+# from load_data import load_csv
+
+def print_results(df):
+    """Print a results banner followed by one status line per sensor.
+
+    Safe rows show only the status; unsafe rows also show the reason in
+    parentheses.
+    """
+    banner = "=" * 50
+    print("\n" + banner)
+    print("WATER QUALITY ASSESSMENT RESULTS")
+    print(banner)
+
+    for _, reading in df.iterrows():
+        sensor_id, location = reading['sensor_id'], reading['location']
+        unsafe = not reading['is_safe']
+        marker = "❌" if unsafe else "✅"
+
+        line = f"Sensor {sensor_id} at {location}: {marker} {reading['status']}"
+        if unsafe:
+            line += f" ({reading['reason']})"
+        print(line)
+
+def save_results_to_csv(df, output_file='results.csv'):
+    """Write the report columns of ``df`` to ``output_file`` without the index."""
+    # Only these columns, in this order, are written to the report.
+    report_fields = [
+        'sensor_id', 'location', 'ph', 'turbidity', 'temperature',
+        'is_safe', 'status', 'reason',
+    ]
+
+    df[report_fields].copy().to_csv(output_file, index=False)
+    print(f"\nResults saved to {output_file}")
+
+def print_summary(df):
+    """Print summary statistics.
+
+    Reports the number of rows, how many have a truthy ``is_safe`` value,
+    how many do not, and the resulting safety rate. With no rows the rate
+    is shown as ``N/A`` rather than dividing by zero.
+
+    Args:
+        df (pd.DataFrame): Evaluated readings with an ``is_safe`` column.
+    """
+    total_sensors = len(df)
+    safe_sensors = int(df['is_safe'].sum())
+    unsafe_sensors = total_sensors - safe_sensors
+
+    print("\n" + "=" * 30)
+    print("SUMMARY")
+    print("=" * 30)
+    print(f"Total sensors: {total_sensors}")
+    print(f"Safe sensors: {safe_sensors}")
+    print(f"Unsafe sensors: {unsafe_sensors}")
+    if total_sensors:
+        print(f"Safety rate: {safe_sensors / total_sensors * 100:.1f}%")
+    else:
+        # An empty frame has no meaningful rate.
+        print("Safety rate: N/A")
+
+def filter_by_location(df, location_filter):
+    """Filter data by location.
+
+    Keeps the rows whose ``location`` contains ``location_filter`` as a
+    literal, case-insensitive substring; rows with a missing location never
+    match.
+
+    Args:
+        df (pd.DataFrame): Data with a ``location`` column.
+        location_filter (str | None): Text to look for; a falsy value
+            disables filtering.
+
+    Returns:
+        pd.DataFrame: The matching rows, or ``df`` unchanged when no filter
+            is given or nothing matches.
+    """
+    if not location_filter:
+        return df
+
+    # regex=False: the filter comes from the command line and must be matched
+    # literally, so names containing "(", "+" or "." neither mis-match nor
+    # raise a regex error.
+    matches = df['location'].str.contains(
+        location_filter, case=False, na=False, regex=False
+    )
+    filtered_df = df[matches]
+
+    if filtered_df.empty:
+        # Nothing matched: report it and fall back to the unfiltered data.
+        print(f"No sensors found for location: {location_filter}")
+        return df
+
+    print(f"Filtered to {len(filtered_df)} sensors for location: {location_filter}")
+    return filtered_df
+
+def main(location_filter=None):
+    """Main function to run the water quality monitoring pipeline.
+
+    Loads ``clean_sensor_data.csv``, cleans it, optionally filters it by
+    location, evaluates every reading, prints the per-sensor results and a
+    summary, and saves the results to CSV. Any failure is reported as
+    ``Error: ...`` and the process exits with status 1.
+
+    Args:
+        location_filter (str | None): Location text to filter on. When
+            omitted, the first command-line argument is used if present.
+    """
+    print("Water Quality Monitoring System")
+    print("="*40)
+
+    # Configuration
+    data_file = 'clean_sensor_data.csv'
+
+    # Fall back to the command line when no filter was passed in.
+    if location_filter is None and len(sys.argv) > 1:
+        location_filter = sys.argv[1]
+    if location_filter is not None:
+        print(f"Filtering by location: {location_filter}")
+
+    try:
+        # Step 1: Load data
+        print(f"\n1. Loading data from {data_file}...")
+        df = sensor_data_csv(data_file)
+        if df.empty:
+            # The loader returns an empty frame when it cannot read the
+            # file; stop here instead of failing later on a missing column.
+            raise ValueError(f"No sensor data could be loaded from {data_file}")
+
+        # Step 2: Clean data (and narrow to the requested location;
+        # filter_by_location is a no-op for a falsy filter)
+        print("\n2. Cleaning data...")
+        cleaned_df = clean_sensor_data(df)
+        cleaned_df = filter_by_location(cleaned_df, location_filter)
+
+        # Step 3: Evaluate water quality
+        print("\n3. Evaluating water quality...")
+        evaluator = WaterQualityEvaluator()
+        results_df = evaluator.evaluate_all_readings(cleaned_df)
+
+        # Display results, print the summary, then save (bonus feature)
+        print_results(results_df)
+        print_summary(results_df)
+        save_results_to_csv(results_df)
+
+        print(f"\nProcessing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
+
+    except Exception as e:
+        # Top-level boundary: report any pipeline failure and exit non-zero.
+        print(f"Error: {str(e)}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    # main() picks the optional location filter up from sys.argv[1] itself,
+    # so a single call is enough; a second call would run the whole
+    # pipeline again.
+    main()
+