# Reconstructed from a newline-collapsed git patch:
#   commit 04406c1e65aad2e5caf5b2f51046dcc885d859ed, author "ffoo",
#   subject "corrected for col status pytest".
# The patch adds code to two modules; each section below names its file.

# --- data_cleaning/clean_col_status_helper.py --------------------------------
"""
Keyword-based categorisation of free-text water point "status" values.

A status string is compared against four keyword lists, in this order:
partly damaged, primary functional, not functional, secondary functional.
The first list containing a keyword that occurs in the string decides the
category; if no list matches, the category is "Unknown".

The keyword lists were obtained by testing samples after filtering the top 10
countries, which together provide a cumulative 85% of the data.
"""

_PARTLY_DAMAGED = (
    "partly damaged",
    "yes but",
    "yes- but",
    "yes - but",
    "partially functional",
)

# Checked before the "not functional" list because these phrases contain
# words ("breakdown", "problems", "damage") that would otherwise match it.
_FUNCTIONAL_PRIMARY = (
    "not developed mechanical problems",
    "not breakdown",
    ":functional",
    "yes – functional",
    "no damages",
    "no damage",
)

# Misspelled entries are kept as-is; presumably they mirror spellings found
# in the raw data (verify against the source samples before "fixing" them).
# "partially functional" can never decide the result from this list because
# _PARTLY_DAMAGED is consulted first; it is kept for fidelity with the
# original keyword set.
_NOT_FUNCTIONAL = (
    'not function',
    'poor',
    'dry',
    'problem',
    'not delivering',
    'plugged',
    'abandoned',
    'non-operational',
    'partially functional',
    'non-functional',
    'broken',
    'with problems',
    'spoiled',
    'damaged ',
    'damage',
    'not giving',
    'defective',
    'not well dug',
    'malfunction',
    'stolen',
    'fault',
    'spoilt',
    'spoid',
    'spioled',
    'rusting',
    'weak',
    'limited',
    'breaking',
    'not completed',
    'not working',
    'not commissioned',
    'undeveloped',
    'incomplete',
    'lack of',
    'non- functional',
    'in bad shape',
    'technical breakdown',
    'faulty',
    'in bad state',
    'no operation',
    'dried',
    'no water',
    'fallen',
    'chocked',
    'choked',
    'no funds',
    'lack of funds',
    'no hundle',
    'not cpmplete',
    'stop flowing',
    'breakdown',
    'broking',
    'not complete',
    'serious',
    'no head',
    'contaminated',
    'yet to be completed',
    'broken down',
    'leakage',
    'disconnected',
    'stopped',
    'does not flow',
    'long time',
    'not well function',
    'disconnection',
    'no money',
    'dirty',
    'brakage',
    'worn out',
    'brokedown',
    'water stops',
    'no connection',
    'removed',
    'sunk',
    'did not work',
    'too oldbreackdown',
    'collapse',
    'break',
    'not installed',
    'under construction',
    'dysfunction',
    'desamorsage',
)

# Generic positive words; checked last so that any negative keyword above
# takes precedence over them.
_FUNCTIONAL_SECONDARY = (
    "functional",
    "fair",
    "good",
    "ok",
    "operational",
    "working",
    "satisfaisant",
)

# Order matters: the first rule whose keyword list matches wins.
_STATUS_RULES = (
    (_PARTLY_DAMAGED, "Partially Functional with Damages"),
    (_FUNCTIONAL_PRIMARY, "Functional"),
    (_NOT_FUNCTIONAL, "Not Functional"),
    (_FUNCTIONAL_SECONDARY, "Functional"),
)


def categorise_status(x):
    """
    Map a free-text status string to a functional-status category.

    Matching is a plain, case-sensitive substring test, and every keyword is
    lower case, so callers must lower-case ``x`` before calling.

    :param x: status text (string, already lower-cased)
    :return: "Partially Functional with Damages", "Functional",
             "Not Functional", or "Unknown" when no keyword occurs in ``x``
    """
    for keywords, category in _STATUS_RULES:
        if any(keyword in x for keyword in keywords):
            return category
    return "Unknown"


# --- data_cleaning/clean_wpdx_sample_data.py ---------------------------------
def clean_col_status(input_data):
    """
    Clean values in column: "status"
    Trello card: https://trello.com/c/S4FjIDgo

    :param input_data: raw cell value; a string, or a non-string such as the
                       float NaN used for missing cells
    :return: the category returned by
             clean_col_status_helper.categorise_status
    """
    # Helper module for the status column; the patch adds this same import at
    # the top of clean_wpdx_sample_data.py. Importing it here keeps the
    # function self-contained.
    import clean_col_status_helper

    # Missing cells arrive as float NaN. Any non-string value (NaN, None,
    # numbers) is replaced by a placeholder that matches no keyword, so it is
    # categorised as "Unknown" instead of failing on .lower().
    if not isinstance(input_data, str):
        input_data = "Not Available"

    # The keyword lists are lower case, so normalise before matching.
    return clean_col_status_helper.categorise_status(input_data.lower())
def test_clean_col_status():
    """
    Check the cleaned values produced for column: "status"
    Trello card: https://trello.com/c/S4FjIDgo
    """
    # (raw status text, expected category); mixed-case inputs confirm that
    # the cleaner ignores case.
    expectations = (
        ('Partly Damaged', 'Partially Functional with Damages'),
        ('absoLUte no daMAge', 'Functional'),
        ('fOUND in BAD state', 'Not Functional'),
        ('ok oPERatIOnal', 'Functional'),
        ("ok", "Functional"),
        ("Not Functional", "Not Functional"),
    )
    for raw_status, expected_category in expectations:
        assert cwsd.clean_col_status(raw_status) == expected_category