Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
The diff you're trying to view is too large. We only load the first 3000 changed files.
52 changes: 52 additions & 0 deletions data/results.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
sensor_id,timestamp,ph,turbidity,dissolved_oxygen,temperature,is_safe,status_text
SENSOR_012,2023-01-01 15:30:00,6.28,3.04,6.14,25.22,False,"Unsafe (pH too low, turbidity too high)"
SENSOR_012,2023-01-01 18:00:00,7.55,2.04,7.5,17.29,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-01 18:30:00,7.26,3.99,7.36,23.04,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-01 23:00:00,7.31,0.93,5.94,22.57,True,Safe
SENSOR_012,2023-01-02 04:30:00,7.07,2.16,9.76,18.64,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-02 05:30:00,7.63,2.45,12.9,22.73,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-02 11:30:00,6.72,2.77,10.35,17.33,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-02 19:30:00,7.17,2.58,6.96,21.31,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-02 23:30:00,6.73,2.85,7.11,18.13,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-03 04:00:00,7.91,3.56,8.03,18.71,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-03 04:30:00,6.4,3.49,8.31,26.82,False,"Unsafe (pH too low, turbidity too high)"
SENSOR_012,2023-01-04 03:00:00,6.91,5.35,6.9,21.06,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-04 04:00:00,6.84,3.47,7.54,27.52,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-04 09:00:00,6.9,2.71,8.8,22.45,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-04 11:30:00,7.07,2.11,6.38,22.1,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-05 07:00:00,6.85,2.45,9.24,26.85,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-05 16:30:00,6.47,2.88,6.15,25.64,False,"Unsafe (pH too low, turbidity too high)"
SENSOR_012,2023-01-06 05:30:00,7.5,2.74,8.79,19.96,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-06 08:00:00,7.4,3.69,7.22,24.31,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-06 12:30:00,7.46,2.18,6.4,24.21,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-08 08:00:00,7.32,2.55,7.78,19.93,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-09 07:30:00,6.77,2.42,6.71,23.09,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-09 16:00:00,6.42,1.82,9.14,17.86,False,"Unsafe (pH too low, turbidity too high)"
SENSOR_012,2023-01-10 03:30:00,6.97,3.74,9.06,22.74,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-10 05:30:00,7.01,0.41,5.51,24.14,True,Safe
SENSOR_012,2023-01-10 09:30:00,7.09,2.62,10.24,20.55,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-10 12:30:00,7.06,3.61,3.82,19.73,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-10 22:30:00,6.63,2.61,6.7,24.34,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-13 04:30:00,7.05,3.63,6.46,20.66,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-13 18:30:00,7.71,2.55,6.14,22.63,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-13 20:30:00,7.1,3.44,11.03,21.04,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-14 19:00:00,7.45,2.75,8.01,25.29,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-14 21:30:00,7.27,3.88,8.66,19.66,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-15 19:30:00,6.53,2.48,7.47,24.84,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-15 23:30:00,6.92,3.5,7.23,20.18,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-16 03:00:00,7.03,4.1,6.97,17.72,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-16 05:30:00,7.0,2.8,5.68,21.35,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-16 16:00:00,6.63,4.32,8.78,25.5,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-17 11:30:00,6.75,2.93,4.22,24.29,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-17 12:00:00,7.65,2.67,7.63,24.06,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-17 21:30:00,7.04,2.92,6.29,21.71,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-18 15:00:00,6.92,4.35,6.93,21.82,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-19 06:00:00,6.63,1.81,6.07,24.59,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-19 09:00:00,7.14,3.32,8.98,20.48,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-20 09:30:00,6.55,4.07,8.53,22.39,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-20 15:00:00,6.1,4.57,6.96,21.7,False,"Unsafe (pH too low, turbidity too high)"
SENSOR_012,2023-01-20 16:30:00,7.7,2.07,10.47,21.49,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-21 05:00:00,6.8,2.56,6.9,22.3,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-21 08:30:00,6.71,3.33,8.6,15.33,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-21 10:00:00,7.46,3.65,8.99,17.92,False,Unsafe (turbidity too high)
SENSOR_012,2023-01-21 16:00:00,7.03,3.51,6.97,19.53,False,Unsafe (turbidity too high)
Binary file added src/__pycache__/clean_data.cpython-312.pyc
Binary file not shown.
Binary file added src/__pycache__/evaluate.cpython-312.pyc
Binary file not shown.
Binary file added src/__pycache__/load_data.cpython-312.pyc
Binary file not shown.
Binary file added src/__pycache__/sensor.cpython-312.pyc
Binary file not shown.
42 changes: 38 additions & 4 deletions src/clean_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,41 @@
# clean_data.py

import pandas as pd

def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the data but keep rows (problems are flagged later by the evaluator):
    - trim location text
    - convert ph/turbidity/dissolved_oxygen/temperature to numeric
    - set physically impossible values to NA so they read as "missing"
    - drop duplicate readings (same sensor_id + timestamp), keeping the last

    Args:
        df: Raw sensor readings.

    Returns:
        pd.DataFrame: A cleaned copy; the caller's frame is not modified.
    """
    df = df.copy()  # never mutate the caller's frame

    if "location" in df.columns:
        df["location"] = df["location"].astype(str).str.strip()

    # Non-numeric entries (e.g. "n/a") become NaN instead of raising.
    numeric_cols = ["ph", "turbidity", "dissolved_oxygen", "temperature"]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # Plausibility ranges: out-of-range readings are sensor faults, so blank them.
    if "ph" in df.columns:
        df.loc[(df["ph"] < 0) | (df["ph"] > 14), "ph"] = pd.NA
    if "turbidity" in df.columns:
        df.loc[df["turbidity"] < 0, "turbidity"] = pd.NA
    # BUG FIX: dissolved_oxygen was converted to numeric but never range-checked,
    # despite the docstring's promise; negative concentrations are impossible,
    # so treat them like negative turbidity.
    if "dissolved_oxygen" in df.columns:
        df.loc[df["dissolved_oxygen"] < 0, "dissolved_oxygen"] = pd.NA
    if "temperature" in df.columns:
        df.loc[(df["temperature"] < -5) | (df["temperature"] > 60), "temperature"] = pd.NA

    # Drop duplicate readings (keep last — later rows are assumed corrections).
    dup_subset = [c for c in ["sensor_id", "timestamp"] if c in df.columns]
    if dup_subset:
        df = df.drop_duplicates(subset=dup_subset, keep="last")

    return df





36 changes: 34 additions & 2 deletions src/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,41 @@

# evaluate.py — mark each reading Safe/Unsafe with a reason

import pandas as pd

class WaterQualityEvaluator:
    """Flags each reading Safe/Unsafe with a human-readable reason string."""

    def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
        self.ph_range = ph_range
        self.turbidity_threshold = turbidity_threshold

    def is_safe(self, row: pd.Series) -> tuple[bool, str]:
        """
        Determine if a row of water data is safe.

        A row is safe iff:
        - pH is present and within ph_range (inclusive)
        - turbidity is present and <= turbidity_threshold
        Missing values = unsafe, with their own reason.

        Returns:
            (ok, message): ok is True only when no reasons were collected;
            message is "Safe" or "Unsafe (<comma-joined reasons>)".
        """
        reasons = []

        ph = row.get("ph")
        turb = row.get("turbidity")

        # pd.isna handles NaN, None and pd.NA uniformly.
        ph_missing = pd.isna(ph)
        turb_missing = pd.isna(turb)

        if ph_missing:
            reasons.append("missing pH")
        if turb_missing:
            reasons.append("missing turbidity")

        # BUG FIX: the range checks used to run only when *nothing* was
        # missing (`if not reasons:`), so e.g. a missing pH hid a too-high
        # turbidity. Each field is now range-checked independently whenever
        # it is present.
        if not ph_missing:
            lo, hi = self.ph_range
            if ph < lo:
                reasons.append("pH too low")
            elif ph > hi:
                reasons.append("pH too high")
        if not turb_missing and turb > self.turbidity_threshold:
            reasons.append("turbidity too high")

        ok = not reasons
        message = "Safe" if ok else "Unsafe (" + ", ".join(reasons) + ")"
        return ok, message
18 changes: 12 additions & 6 deletions src/load_data.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import pandas as pd

def load_csv(filepath: str) -> pd.DataFrame:
    """
    Load sensor data from a CSV file into a pandas DataFrame.

    - Normalizes column names to lower_snake_case (e.g. "Sensor ID" -> "sensor_id")
      so downstream code can rely on exact names.
    - Parses 'timestamp' to datetime if present; unparseable values become NaT
      rather than raising (errors="coerce").

    Args:
        filepath: Path to the CSV file.

    Returns:
        pd.DataFrame: Loaded data with normalized columns.
    """
    df = pd.read_csv(filepath)
    df.columns = [c.strip().lower().replace(" ", "_") for c in df.columns]

    if "timestamp" in df.columns:
        df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

    return df
92 changes: 92 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# main.py — run the water quality pipeline using sensor_id (dataset has no location)

from load_data import load_csv
from clean_data import clean_sensor_data
from evaluate import WaterQualityEvaluator
from sensor import SensorReading

def display_sensor_id(raw):
    """Make IDs like 'SENSOR_005' print as 5. If no digits, return the original."""
    text = str(raw)
    digits = "".join(filter(str.isdigit, text))
    if not digits:
        return text
    return int(digits)

def main():
    """Run the pipeline: load -> clean -> optional sensor filter -> evaluate -> report -> save CSV."""
    # 1) load (path is relative, so this script expects to be run from src/)
    df = load_csv("../data/sensor_data.csv")

    # 2) clean
    df = clean_sensor_data(df)

    # Ensure sensor_id is present and printable
    if "sensor_id" not in df.columns:
        raise ValueError("CSV must contain 'sensor_id' when there is no 'location' column.")
    df["sensor_id"] = df["sensor_id"].astype(str).str.strip()

    # 2b) user input: filter by sensor_id (press Enter for all)
    print("Enter a sensor_id to filter (e.g., SENSOR_005).")
    print("Press Enter to use ALL sensors.")
    user_sensor = input("sensor_id: ").strip()
    if user_sensor:
        df = df[df["sensor_id"] == user_sensor]
        if df.empty:
            # nothing to report for an unknown sensor; bail out early
            print(f"No rows found for sensor_id: {user_sensor}")
            return

    # 3) evaluate each row and build SensorReading objects
    evaluator = WaterQualityEvaluator(ph_range=(6.5, 8.5), turbidity_threshold=1.0)

    is_safe_list = []       # bool per row, becomes df["is_safe"]
    status_text_list = []   # message per row, becomes df["status_text"]
    sample_readings = []    # first two rows wrapped as SensorReading, for the demo printout

    for i, (_, row) in enumerate(df.iterrows(), start=1):
        # class usage
        reading = SensorReading.from_row(row)
        if i <= 2:
            sample_readings.append(reading)

        # evaluator is_safe might return (ok, text) or just ok
        result = evaluator.is_safe(row)
        if isinstance(result, tuple):
            ok, text = result
        else:
            ok = bool(result)
            text = "Safe" if ok else "Unsafe"

        is_safe_list.append(ok)
        status_text_list.append(text)

    # lists are aligned with df's rows because they were built in iteration order
    df["is_safe"] = is_safe_list
    df["status_text"] = status_text_list

    # 4) use the class
    if sample_readings:
        print("\n(example SensorReading objects)")
        for r in sample_readings:
            print("  ", r)

    # 5) report
    print("\n=== Water Quality Report (first 10 rows) ===")
    for _, r in df.head(10).iterrows():
        sid_num = display_sensor_id(r.get("sensor_id", ""))
        ok = bool(r.get("is_safe", False))
        text = r.get("status_text", "Safe" if ok else "Unsafe")
        # NOTE(review): the startswith(" X ") guard looks dead — the evaluator's
        # messages never begin with " X " — confirm before removing.
        pretty = "Safe" if ok else (text if text.startswith(" X ") else f" X {text}")
        print(f"Sensor {sid_num}: {pretty}")

    # 6) row-level counts
    total = len(df)
    safe_count = int(df["is_safe"].sum())
    unsafe_count = total - safe_count
    print(f"\nSummary: {safe_count} safe, {unsafe_count} unsafe (out of {total})")

    # 7) save results
    out_path = "../data/results.csv"
    df.to_csv(out_path, index=False)
    print(f"\nSaved detailed results to {out_path}")

if __name__ == "__main__":
    main()


39 changes: 39 additions & 0 deletions src/sensor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# sensor.py — a tiny class to model one sensor reading

from dataclasses import dataclass, asdict
from typing import Optional

@dataclass
class SensorReading:
    """One water-quality reading from a single sensor."""

    sensor_id: Optional[str]
    location: str
    ph: Optional[float]
    turbidity: Optional[float]
    temperature: Optional[float] = None

    @classmethod
    def from_row(cls, row) -> "SensorReading":
        """
        Build a SensorReading from a pandas row (Series) or any mapping.

        Missing columns and NA/non-numeric values become None, so the object
        is always constructible from partially clean data.
        """
        def val(col):
            return row[col] if col in row else None

        def num(col):
            # BUG FIX: the old `v is None or v != v` check raised
            # "boolean value of NA is ambiguous" for pandas' pd.NA and
            # crashed float() on non-numeric strings. Convert first and
            # treat any failure (or NaN) as None.
            v = val(col)
            if v is None:
                return None
            try:
                f = float(v)
            except (TypeError, ValueError):
                return None
            return None if f != f else f  # NaN != NaN

        # keep sensor IDs as strings; NA-like values become None
        sensor = val("sensor_id")
        try:
            sensor = None if sensor is None or sensor != sensor else str(sensor)
        except TypeError:
            # pd.NA comparisons are not boolean-evaluable
            sensor = None

        return cls(
            sensor_id=sensor,
            location=str(val("location") or ""),
            ph=num("ph"),
            turbidity=num("turbidity"),
            temperature=num("temperature"),
        )

    def to_dict(self) -> dict:
        """Plain dict (handy for JSON/CSV or debugging)."""
        return asdict(self)
Loading