Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions data_cleaning/clean_col_status_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""
The UDF skims through categorised lists of words that belong to either Partially Functional, Functional or
Not Functional.
The UDF goes through the lists sequentially: list_partlydamaged, list_functional1, list_notfunctional, list_functional2.
The lists of words were obtained by testing samples after filtering to the top 10 countries, which together provide
a cumulative 85% of the data.

The UDF stops searching at the first occurrence of a word from the categorised lists (partially functional,
functional, not functional).
This is to reduce processing time.
"""

def categorise_status(x):
    """
    Map a free-text status description onto a functional category.

    The keyword lists are tried in a fixed priority order and the category of
    the first list containing a keyword that occurs anywhere in ``x``
    (plain substring match) is returned.

    Args:
        x: status text. All keywords are lower case, so the text is expected
            to be lower-cased by the caller.

    Returns:
        "Partially Functional with Damages", "Functional" or "Not Functional"
        when a keyword matches, otherwise "Unknown".
    """
    list_partlydamaged = (
        "partly damaged",
        "yes but",
        "yes- but",
        "yes - but",
        "partially functional",
    )

    # Phrases that negate a problem. They contain not-functional keywords
    # ("no damage" contains 'damage', "not breakdown" contains 'breakdown'),
    # so they must be tested before list_notfunctional.
    list_functional1 = (
        "not developed mechanical problems",
        "not breakdown",
        ":functional",
        "yes – functional",
        "no damages",
        "no damage",
    )

    # Exact duplicate entries were dropped; matching is unaffected because a
    # single occurrence of a keyword is enough for a match.
    list_notfunctional = (
        'not function',
        'poor',
        'dry',
        'problem',
        'not delivering',
        'plugged',
        'abandoned',
        'non-operational',
        'partially functional',
        'non-functional',
        'broken',
        'with problems',
        'spoiled',
        'damaged ',
        'damage',
        'not giving',
        'defective',
        'not well dug',
        'malfunction',
        'stolen',
        'fault',
        'spoilt',
        'spoid',
        'spioled',
        'rusting',
        'weak',
        'limited',
        'breaking',
        'not completed',
        'not working',
        'not commissioned',
        'undeveloped',
        'incomplete',
        'lack of',
        'non- functional',
        'in bad shape',
        'technical breakdown',
        'faulty',
        'in bad state',
        'no operation',
        'dried',
        'no water',
        'fallen',
        'chocked',
        'choked',
        'no funds',
        'lack of funds',
        'no hundle',
        'not cpmplete',
        'stop flowing',
        'breakdown',
        'broking',
        'not complete',
        'serious',
        'no head',
        'contaminated',
        'yet to be completed',
        'broken down',
        'leakage',
        'disconnected',
        'stopped',
        'does not flow',
        'long time',
        'not well function',
        'disconnection',
        'no money',
        'dirty',
        'brakage',
        'worn out',
        'brokedown',
        'water stops',
        'no connection',
        'removed',
        'sunk',
        'did not work',
        'too oldbreackdown',
        'collapse',
        'break',
        'not installed',
        'under construction',
        'dysfunction',
        'desamorsage',
    )

    # Generic positive words. Tested last because e.g. "functional" is also a
    # substring of "non-functional" and "working" of "not working".
    list_functional2 = (
        "functional",
        "fair",
        "good",
        "ok",
        "operational",
        "working",
        "satisfaisant",
    )

    # Priority order of the checks: the first list with a hit decides.
    rules = (
        (list_partlydamaged, "Partially Functional with Damages"),
        (list_functional1, "Functional"),
        (list_notfunctional, "Not Functional"),
        (list_functional2, "Functional"),
    )

    for keywords, category in rules:
        # any() stops at the first matching keyword, and returning here skips
        # the remaining lists, so no unnecessary scanning is done.
        if any(keyword in x for keyword in keywords):
            return category

    return "Unknown"
21 changes: 20 additions & 1 deletion data_cleaning/clean_wpdx_sample_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import csv
import pandas

""" Import helper file for STATUS Col cleaning"""
import clean_col_status_helper

def clean_columns(input_file, output_file):
with open(input_file) as csvfile, open(output_file, 'wt') as writer:
Expand Down Expand Up @@ -143,6 +144,24 @@ def clean_col_management(input_data):
input_data = 'Direct Government Operation'
return input_data

def clean_col_status(input_data):
    """
    Clean values in column: "status"
    Trello card: https://trello.com/c/S4FjIDgo

    Args:
        input_data: raw cell value. Normally a string; a missing value
            arrives as a non-string (e.g. a float NaN).

    Returns:
        The category string produced by
        clean_col_status_helper.categorise_status for the lower-cased text.
    """
    # Preprocessing clean up
    # Replace missing values (NaN floats, None, any other non-string) with the
    # text "Not Available" so that .lower() below cannot fail.
    if not isinstance(input_data, str):
        input_data = "Not Available"

    # The helper's keywords are lower case, so convert to lower case first,
    # then apply the UDF from clean_col_status_helper.
    return clean_col_status_helper.categorise_status(input_data.lower())

if __name__ == '__main__':
    # Script entry point: read wpdx_sample_data.csv and write the cleaned
    # output to cleaned_wpdx_sample_data.csv.
    clean_columns('wpdx_sample_data.csv', 'cleaned_wpdx_sample_data.csv')
11 changes: 11 additions & 0 deletions data_cleaning/test_clean_wpdx_sample_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,14 @@ def test_clean_col_management():
assert cwsd.clean_col_management('Direct Government Operation?,') == 'Direct Government Operation'
assert cwsd.clean_col_management('management') == 'Direct Government Operation'

def test_clean_col_status():
    """
    Check that raw "status" values are mapped to the expected category.
    Trello card: https://trello.com/c/S4FjIDgo
    """
    # (raw value, expected category) pairs, checked in order
    cases = (
        ('Partly Damaged', 'Partially Functional with Damages'),
        ('absoLUte no daMAge', 'Functional'),
        ('fOUND in BAD state', 'Not Functional'),
        ('ok oPERatIOnal', 'Functional'),
        ("ok", "Functional"),
        ("Not Functional", "Not Functional"),
    )
    for raw_value, expected in cases:
        assert cwsd.clean_col_status(raw_value) == expected