Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added src/__pycache__/clean_data.cpython-313.pyc
Binary file not shown.
Binary file added src/__pycache__/evaluate.cpython-313.pyc
Binary file not shown.
Binary file added src/__pycache__/load_data.cpython-313.pyc
Binary file not shown.
14 changes: 13 additions & 1 deletion src/clean_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,19 @@
import pandas as pd
def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean sensor data by handling missing or invalid values.

    Placeholder strings ('', 'NULL', 'null', 'NAN') are converted to
    missing values, and the known sensor columns are coerced to numeric
    (unparseable entries become NaN).

    Args:
        df: Raw sensor data.

    Returns:
        pd.DataFrame: Cleaned copy of the input; the original DataFrame
        is left untouched.
    """
    cleaned_df = df.copy()

    # Normalize placeholder strings to real missing values. pd.NA (not
    # None) is used because value=None historically triggered pandas'
    # pad-fill behavior instead of a plain replacement.
    cleaned_df = cleaned_df.replace(['', 'NULL', 'null', 'NAN'], pd.NA)

    # Coerce the expected numeric columns; skip columns that are absent
    # so a partial file does not raise KeyError.
    numeric_columns = ['pH', 'turbidity', 'temperature', 'dissolved_oxygen']
    for col in numeric_columns:
        if col in cleaned_df.columns:
            cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce')

    return cleaned_df
36 changes: 35 additions & 1 deletion src/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,43 @@
import pandas as pd
class WaterQualityEvaluator:
    """Evaluate water sensor readings against pH and turbidity limits."""

    def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
        # ph_range is the (min, max) acceptable pH; readings with
        # turbidity above turbidity_threshold are flagged unsafe.
        self.ph_range = ph_range
        self.turbidity_threshold = turbidity_threshold

    def is_safe(self, row: pd.Series, location_name) -> str:
        """
        Evaluate one sensor reading and produce a verdict message.

        Args:
            row: Must contain 'sensor_id', 'pH' and 'turbidity'.
            location_name: Human-readable site name for the message.

        Returns:
            str: A verdict line (despite the method name this is a
            message string, not a bool - callers print and count these).
        """
        sensor_id = row['sensor_id']
        ph = row['pH']
        turbidity = row['turbidity']

        # Missing readings are treated as unsafe.
        if pd.isna(ph):
            return f"Sensor {sensor_id} at {location_name}: ❌ Unsafe (missing pH)"
        if pd.isna(turbidity):
            return f"Sensor {sensor_id} at {location_name}: ❌ Unsafe (missing turbidity)"

        # pH must fall within the configured inclusive range.
        min_ph, max_ph = self.ph_range
        if ph < min_ph:
            return f"Sensor {sensor_id} at {location_name}: ❌ Unsafe (pH too low)"
        if ph > max_ph:
            return f"Sensor {sensor_id} at {location_name}: ❌ Unsafe (pH too high)"

        # Turbidity above the threshold is unsafe.
        if turbidity > self.turbidity_threshold:
            return f"Sensor {sensor_id} at {location_name}: ❌ Unsafe (turbidity too high)"

        return f"Sensor {sensor_id} at {location_name}: ✅ Safe"

    def evaluate_dataframe(self, df, location_name):
        """
        Evaluate all rows in the dataframe.

        Returns:
            list[str]: One verdict string per row, in row order.
        """
        return [self.is_safe(row, location_name) for _, row in df.iterrows()]
3 changes: 3 additions & 0 deletions src/load_data.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pandas as pd
def load_csv(filepath: str) -> pd.DataFrame:
"""
Load sensor data from a CSV file.
Expand All @@ -8,3 +9,5 @@ def load_csv(filepath: str) -> pd.DataFrame:
Returns:
pd.DataFrame: Loaded data as a pandas DataFrame.
"""
df = pd.read_csv(filepath)
return df
50 changes: 50 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas as pd
from load_data import load_csv
from clean_data import clean_sensor_data
from evaluate import WaterQualityEvaluator

def main():
    """Interactive entry point: load, clean, evaluate, and report results."""
    # Get input from the user
    file_path = input("Enter the path to your CSV file: ")
    location_name = input("Enter the location name (e.g., Lake A): ")

    # Load data; load_csv reports its own error and returns None on failure
    print("Loading data...")
    df = load_csv(file_path)
    if df is None:
        print("Failed to load data. Exiting.")
        return

    print(f"Loaded {len(df)} rows of data")

    # Clean data
    print("Cleaning data...")
    cleaned_df = clean_sensor_data(df)

    # Evaluate each row against the default safety thresholds
    print("Evaluating water safety...")
    evaluator = WaterQualityEvaluator()
    results = evaluator.evaluate_dataframe(cleaned_df, location_name)

    # Print results
    print("\n--- Water Safety Results ---")
    for result in results:
        print(result)

    # Count safe vs unsafe
    safe_count = sum(1 for result in results if "✅ Safe" in result)
    unsafe_count = len(results) - safe_count
    print(f"\nSummary: {safe_count} safe, {unsafe_count} unsafe")

    # Save to CSV. rsplit(': ', 1) keeps only the verdict after the LAST
    # ': ', so a location name containing ': ' cannot corrupt the split.
    # reset_index keeps sensor_id positionally aligned with the results
    # list regardless of the DataFrame's original index.
    results_df = pd.DataFrame({
        'sensor_id': cleaned_df['sensor_id'].reset_index(drop=True),
        'result': [r.rsplit(': ', 1)[-1] for r in results],
    })
    results_df.to_csv('results.csv', index=False)
    print("Results saved to results.csv")


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Loading