Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 43 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,43 @@
index.py
env/
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Virtual Environment
venv/
env/
ENV/

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Data files (optional - uncomment if you don't want to track data)
# data/*.csv
# results.csv
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
pandas
pandas>=1.5.0
numpy>=1.21.0
42 changes: 42 additions & 0 deletions src/clean_data.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,49 @@
import numpy as np
import pandas as pd


def clean_sensor_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean sensor data by handling missing or invalid values.

    For each of the ``ph``, ``turbidity`` and ``temperature`` columns the
    readings are coerced to numbers, values outside the accepted range are
    replaced with NaN, and a boolean ``missing_<column>`` flag column is
    added that is True wherever the cleaned reading is NaN. Missing
    ``sensor_id`` / ``location`` entries are filled with placeholder text.
    A one-line summary of the missing counts is printed.

    Args:
        df (pd.DataFrame): Raw readings. Must contain the columns
            ``ph``, ``turbidity``, ``temperature``, ``sensor_id`` and
            ``location``; the input frame itself is not modified.

    Returns:
        pd.DataFrame: Cleaned data.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # (column, lowest valid value, highest valid value or None for no cap)
    valid_ranges = (
        ('ph', 0, 14),
        ('turbidity', 0, None),
        ('temperature', -10, 50),
    )
    placeholders = (
        ('sensor_id', 'Unknown'),
        ('location', 'Unknown Location'),
    )

    # Work on a copy so the caller's frame is left untouched.
    cleaned_df = df.copy()

    for column, low, high in valid_ranges:
        # Non-numeric text (e.g. "error") would make the comparisons below
        # raise TypeError; coercing turns such readings into NaN instead.
        values = pd.to_numeric(cleaned_df[column], errors='coerce')

        # NaN compares False on both sides, so it is never "out of range";
        # it is picked up by the isna() flag below.
        out_of_range = values < low
        if high is not None:
            out_of_range |= values > high
        if out_of_range.any():
            values = values.mask(out_of_range)

        cleaned_df[column] = values
        cleaned_df[f'missing_{column}'] = values.isna()

    for column, placeholder in placeholders:
        if cleaned_df[column].isna().any():
            cleaned_df[column] = cleaned_df[column].fillna(placeholder)

    print(f"Data cleaning completed. Found {cleaned_df['missing_ph'].sum()} missing pH values, "
          f"{cleaned_df['missing_turbidity'].sum()} missing turbidity values, "
          f"{cleaned_df['missing_temperature'].sum()} missing temperature values.")

    return cleaned_df

62 changes: 62 additions & 0 deletions src/evaluate.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import pandas as pd
from clean_data import clean_sensor_data
import os


class WaterQualityEvaluator:
def __init__(self, ph_range=(6.5, 8.5), turbidity_threshold=1.0):
self.ph_range = ph_range
Expand All @@ -7,3 +12,60 @@ def is_safe(self, row: pd.Series) -> bool:
"""
Determine if a row of water data is safe.
"""
def evaluate_single_reading(self, row):
"""
Evaluate a single water quality reading.

Args:
row (pd.Series): Single row of water quality data

Returns:
dict: Evaluation results with safety status and reasons
"""
issues = []

# Check for missing pH
if row.get('missing_ph', False) or pd.isna(row.get('ph')):
issues.append('missing pH')
elif row['ph'] < self.ph_min:
issues.append('pH too low')
elif row['ph'] > self.ph_max:
issues.append('pH too high')

# Check for missing turbidity
if row.get('missing_turbidity', False) or pd.isna(row.get('turbidity')):
issues.append('missing turbidity')
elif row['turbidity'] > self.turbidity_max:
issues.append('turbidity too high')

is_safe = len(issues) == 0

return {
'is_safe': is_safe,
'issues': issues,
'status': 'Safe' if is_safe else 'Unsafe',
'reason': ', '.join(issues) if issues else 'All parameters within safe range'
}
def evaluate_all_readings(self, df):
"""
Evaluate all water quality readings in the DataFrame.

Args:
df (pd.DataFrame): Water quality data

Returns:
pd.DataFrame: Original data with evaluation results
"""
results = []

for _, row in df.iterrows():
evaluation = self.evaluate_single_reading(row)
results.append(evaluation)

# Add evaluation results to DataFrame
results_df = df.copy()
results_df['is_safe'] = [r['is_safe'] for r in results]
results_df['status'] = [r['status'] for r in results]
results_df['reason'] = [r['reason'] for r in results]

return results_df
23 changes: 22 additions & 1 deletion src/load_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
def load_csv(filepath: str) -> pd.DataFrame:
import pandas as pd
from clean_data import clean_sensor_data
import os


def sensor_data_csv(filepath: str) -> pd.DataFrame:
    """
    Load sensor data from a CSV file.

    Loading is best-effort: any failure is reported on stdout and an
    empty DataFrame is returned instead of raising, so callers should
    check ``.empty`` before using the result.

    Args:
        filepath (str): Path of the CSV file to read.

    Returns:
        pd.DataFrame: Loaded data as a pandas DataFrame, or an empty
        DataFrame when the file is missing, empty or unreadable.
    """
    try:
        # pandas' CSV reader is read_csv; there is no pd.load_csv.
        return pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return pd.DataFrame()
    except pd.errors.EmptyDataError:
        print(f"No data in file: {filepath}")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate catch-all: the contract is "never raise, return empty".
        print(f"An error occurred while loading the file: {e}")
        return pd.DataFrame()

117 changes: 117 additions & 0 deletions src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
import pandas as pd
import os
import sys
from datetime import datetime
from load_data import sensor_data_csv
from clean_data import clean_sensor_data
from evaluate import WaterQualityEvaluator

# Import our custom modules
# from load_data import load_csv

def print_results(df):
    """Print one formatted line per sensor under a results banner.

    Safe sensors show only their status; unsafe sensors also show the
    reason in parentheses. Reads the ``sensor_id``, ``location``,
    ``is_safe``, ``status`` and (for unsafe rows) ``reason`` columns.
    """
    banner = "=" * 50
    print("\n" + banner)
    print("WATER QUALITY ASSESSMENT RESULTS")
    print(banner)

    for _, reading in df.iterrows():
        safe = reading['is_safe']
        marker = "✅" if safe else "❌"
        line = (
            f"Sensor {reading['sensor_id']} at {reading['location']}: "
            f"{marker} {reading['status']}"
        )
        if not safe:
            # Only unsafe readings carry an explanation.
            line += f" ({reading['reason']})"
        print(line)

def save_results_to_csv(df, output_file='results.csv'):
    """Write the report columns of ``df`` to ``output_file`` as CSV.

    Only the sensor identity, the three measurements and the evaluation
    columns are exported; the index is not written. A confirmation line
    is printed afterwards. A missing report column raises KeyError.
    """
    report_columns = [
        'sensor_id', 'location',
        'ph', 'turbidity', 'temperature',
        'is_safe', 'status', 'reason',
    ]

    # Label-based selection keeps the fixed column order of the report.
    df.loc[:, report_columns].to_csv(output_file, index=False)
    print(f"\nResults saved to {output_file}")

def print_summary(df):
    """Print summary statistics.

    Reports the total number of rows, how many have a truthy ``is_safe``
    value, how many do not, and the resulting safety rate as a
    percentage. For an empty frame the rate is reported as "N/A" rather
    than dividing by zero.

    Args:
        df (pd.DataFrame): Evaluated readings with an ``is_safe`` column.
    """
    total_sensors = len(df)
    # Skip the column lookup on an empty frame; there is nothing to count.
    safe_sensors = int(df['is_safe'].sum()) if total_sensors else 0
    unsafe_sensors = total_sensors - safe_sensors

    print("\n" + "="*30)
    print("SUMMARY")
    print("="*30)
    print(f"Total sensors: {total_sensors}")
    print(f"Safe sensors: {safe_sensors}")
    print(f"Unsafe sensors: {unsafe_sensors}")
    if total_sensors:
        print(f"Safety rate: {(safe_sensors/total_sensors)*100:.1f}%")
    else:
        # A rate is undefined with zero sensors.
        print("Safety rate: N/A")

def filter_by_location(df, location_filter):
    """Filter data by location.

    Keeps the rows whose ``location`` contains ``location_filter`` as a
    literal, case-insensitive substring. Rows with a missing location
    never match.

    Args:
        df (pd.DataFrame): Readings with a ``location`` column.
        location_filter (str | None): Text to look for; a falsy value
            disables filtering.

    Returns:
        pd.DataFrame: The matching rows. When the filter is falsy or
        matches nothing, the unfiltered ``df`` is returned (a notice is
        printed in the no-match case).
    """
    if not location_filter:
        return df

    # regex=False: the filter comes from the command line, so characters
    # such as "(" or "." must be matched literally, not parsed as a pattern.
    matches = df['location'].str.contains(
        location_filter, case=False, na=False, regex=False
    )
    filtered_df = df[matches]

    if filtered_df.empty:
        print(f"No sensors found for location: {location_filter}")
        return df

    print(f"Filtered to {len(filtered_df)} sensors for location: {location_filter}")
    return filtered_df

def main(location_filter=None, data_file='clean_sensor_data.csv'):
    """Main function to run the water quality monitoring pipeline.

    Loads the sensor CSV, cleans it, optionally filters by location,
    evaluates every reading, prints the per-sensor results and a summary,
    and saves the results to CSV.

    Args:
        location_filter (str | None): Location text to filter on. When
            omitted, the first command-line argument (if any) is used.
        data_file (str): Path of the sensor CSV to process.

    Exits the process with status 1 when no data could be loaded or when
    any pipeline step raises.
    """
    print("Water Quality Monitoring System")
    print("="*40)

    # Fall back to the command line when the caller passed no filter.
    if location_filter is None and len(sys.argv) > 1:
        location_filter = sys.argv[1]
    if location_filter is not None:
        print(f"Filtering by location: {location_filter}")

    try:
        # Step 1: Load data
        print(f"\n1. Loading data from {data_file}...")
        df = sensor_data_csv(data_file)

        # The loader reports failures by returning an empty frame; stop
        # here with a clear message instead of failing later on a missing
        # column.
        if df.empty:
            print(f"Error: no sensor data could be loaded from {data_file}")
            sys.exit(1)

        # Step 2: Clean data
        print("\n2. Cleaning data...")
        cleaned_df = clean_sensor_data(df)

        # Step 3: Filter by location if specified
        if location_filter:
            cleaned_df = filter_by_location(cleaned_df, location_filter)

        # Step 4: Evaluate water quality
        print("\n3. Evaluating water quality...")
        evaluator = WaterQualityEvaluator()
        results_df = evaluator.evaluate_all_readings(cleaned_df)

        # Step 5: Display results
        print_results(results_df)

        # Step 6: Print summary
        print_summary(results_df)

        # Step 7: Save results (bonus feature)
        save_results_to_csv(results_df)

        print(f"\nProcessing completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    except Exception as e:
        # Top-level boundary: report the failure and signal it via exit code.
        print(f"Error: {str(e)}")
        sys.exit(1)

if __name__ == "__main__":
    # Run the pipeline exactly once. main() picks up the optional location
    # name from sys.argv itself, so nothing needs to be passed here.
    main()