Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cd Water_Quality_Monitoring
2. **Create a virtual environment and install dependencies**
```bash
python -m venv venv
source venv/bin/activate # On Windows: venv\Scripts\activate
source venv/bin/activate  # Windows (cmd/PowerShell): venv\Scripts\activate — Git Bash on Windows: source venv/Scripts/activate
pip install -r requirements.txt
```

Expand Down Expand Up @@ -101,4 +101,4 @@ Water_Quality_Monitoring/
2. Create a new branch (`git checkout -b feature-name`)
3. Commit your changes (`git commit -am 'Add something'`)
4. Push to the branch (`git push origin feature-name`)
5. Open a pull request
5. Open a pull request
Binary file added src/__pycache__/clean_data.cpython-313.pyc
Binary file not shown.
Binary file added src/__pycache__/evaluate.cpython-313.pyc
Binary file not shown.
Binary file added src/__pycache__/load_data.cpython-313.pyc
Binary file not shown.
40 changes: 39 additions & 1 deletion src/clean_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,45 @@
import pandas as pd

def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean sensor data by handling missing or invalid values.

    Args:
        df (pd.DataFrame): Raw sensor data. Expected to contain
            (case-insensitive) 'ph', 'turbidity', and 'sensor_id' columns.

    Returns:
        pd.DataFrame: A cleaned copy of the data with numeric 'ph' and
        'turbidity' columns, rows with missing readings dropped, and a
        'location' column mapped from 'sensor_id' (NaN for unknown sensors).
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    # (The original renamed the caller's columns in place.)
    df = df.copy()

    # Normalize column names
    df.columns = df.columns.str.lower()

    # Convert pH and turbidity to numeric values, coercing errors to NaN
    df['ph'] = pd.to_numeric(df['ph'], errors='coerce')
    df['turbidity'] = pd.to_numeric(df['turbidity'], errors='coerce')

    # Drop rows with missing pH or turbidity values
    df = df.dropna(subset=['ph', 'turbidity'])

    # Map sensor_id to location: SENSOR_001..SENSOR_020 -> Lake A..Lake T.
    # Generated instead of hand-written to avoid typos in a 20-entry table;
    # unmapped sensor ids become NaN, as with the original dict.
    lake_map = {
        f"SENSOR_{i:03d}": f"Lake {chr(ord('A') + i - 1)}"
        for i in range(1, 21)
    }
    df['location'] = df['sensor_id'].map(lake_map)

    return df
37 changes: 37 additions & 0 deletions src/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pandas as pd

class WaterQualityEvaluator:
def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
self.ph_range = ph_range
Expand All @@ -6,4 +8,39 @@ def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
def is_safe(self, row: pd.Series) -> bool:
"""
Determine if a row of water data is safe.

Args:
row (pd.Series): A row of sensor data.

Returns:
bool: True if safe, False otherwise.
"""
if pd.isna(row['ph']) or pd.isna(row['turbidity']):
return False
if not (self.ph_range[0] <= row['ph'] <= self.ph_range[1]):
return False
if row['turbidity'] > self.turbidity_threshold:
return False
return True

def get_reason(self, row: pd.Series) -> str:
"""
Provide reason for safety status of the sensor data row.

Args:
row (pd.Series): A row of sensor data.

Returns:
str: Reason message including emoji.
"""
if pd.isna(row['ph']):
return "❌ Unsafe (missing pH)"
if pd.isna(row['turbidity']):
return "❌ Unsafe (missing turbidity)"
if row['ph'] < self.ph_range[0]:
return "❌ Unsafe (pH too low)"
if row['ph'] > self.ph_range[1]:
return "❌ Unsafe (pH too high)"
if row['turbidity'] > self.turbidity_threshold:
return "❌ Unsafe (turbidity too high)"
return "✅ Safe"
10 changes: 10 additions & 0 deletions src/load_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import pandas as pd

def load_csv(filepath: str) -> pd.DataFrame:
    """
    Load sensor data from a CSV file.

    Args:
        filepath (str): Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded data as a pandas DataFrame; an empty
        DataFrame when the file is missing or cannot be parsed.
    """
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        # Best-effort fallback: report and hand back an empty frame.
        print(f"Error: File not found at {filepath}")
        df = pd.DataFrame()
    except pd.errors.ParserError:
        print(f"Error: Failed to parse CSV file at {filepath}")
        df = pd.DataFrame()
    return df
45 changes: 45 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import os
import pandas as pd
from load_data import load_csv
from clean_data import clean_sensor_data
from evaluate import WaterQualityEvaluator


def main():
    """Run the water-quality pipeline: load, clean, evaluate, and report.

    Reads ../data/sensor_data.csv, cleans it, prints a per-sensor status
    line, writes all results to results.csv, and prints a safe/unsafe
    summary count.
    """
    # Step 1: Load data
    data_path = os.path.join("..", "data", "sensor_data.csv")
    df = load_csv(data_path)

    if df.empty:
        print("No data to process.")
        return

    # Step 2: Clean data
    cleaned_df = clean_sensor_data(df)

    # Step 3: Evaluate each cleaned row and collect per-sensor status
    evaluator = WaterQualityEvaluator()
    results = []

    for _, row in cleaned_df.iterrows():
        status = evaluator.get_reason(row)
        print(f"Sensor {row['sensor_id']} at {row['location']}: {status}")
        results.append({
            "sensor_id": row["sensor_id"],
            "location": row["location"],
            "status": status
        })

    # Step 4 (Bonus): Save results to CSV
    results_df = pd.DataFrame(results)
    results_df.to_csv("results.csv", index=False)

    # Step 5 (Bonus): Summary counts.
    # Bug fix: the original re-printed the last per-row status line here
    # instead of the summary it had just computed.
    safe_count = results_df["status"].str.contains("✅").sum()
    unsafe_count = results_df.shape[0] - safe_count
    print(f"Summary: {safe_count} safe, {unsafe_count} unsafe")


if __name__ == "__main__":
    main()
Loading