From 04406c1e65aad2e5caf5b2f51046dcc885d859ed Mon Sep 17 00:00:00 2001
From: ffoo <kfoofw@gmail.com>
Date: Sun, 17 Mar 2019 21:46:18 +0800
Subject: [PATCH] corrected for col status pytest

---
 data_cleaning/clean_col_status_helper.py     | 163 +++++++++++++++++++
 data_cleaning/clean_wpdx_sample_data.py      |  21 ++-
 data_cleaning/test_clean_wpdx_sample_data.py |  11 ++
 3 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 data_cleaning/clean_col_status_helper.py

diff --git a/data_cleaning/clean_col_status_helper.py b/data_cleaning/clean_col_status_helper.py
new file mode 100644
index 0000000..8f6a50e
--- /dev/null
+++ b/data_cleaning/clean_col_status_helper.py
@@ -0,0 +1,163 @@
+"""
+The UDF skims through categorised lists of words that belong to Partially Functional, Functional or Not Functional.
+It goes through the lists sequentially: list_partlydamaged, list_functional1, list_notfunctional, list_functional2.
+The lists of words were obtained by testing samples after filtering to the top 10 countries, which cumulatively
+provide 85% of the data.
+
+The UDF stops at the first occurrence of a word from a categorised list (partially functional, functional, not
+functional).
+This is to reduce processing time.
+"""
+
+
+def categorise_status(x):
+    """
+    Map a free-text status value onto one standard category.
+
+    The keyword groups are tried in priority order; the first group holding a
+    keyword that occurs as a substring of ``x`` decides the result:
+
+    1. list_partlydamaged -> "Partially Functional with Damages"
+    2. list_functional1   -> "Functional"
+    3. list_notfunctional -> "Not Functional"
+    4. list_functional2   -> "Functional"
+
+    Matching is a plain substring search, so a short keyword such as "ok"
+    also matches inside longer words.
+
+    :param x: status text. The keywords are lower case and matching is case
+        sensitive, so callers should pass lower-cased text.
+    :return: the matched category, or "Unknown" when no keyword occurs in
+        ``x`` or when ``x`` is not a string (e.g. NaN or None).
+    """
+    list_partlydamaged = (
+        "partly damaged",
+        "yes but",
+        "yes- but",
+        "yes - but",
+        "partially functional")
+
+    # Checked before list_notfunctional because some of these phrases contain
+    # one of its keywords (e.g. "no damage" contains "damage")
+    list_functional1 = (
+        "not developed mechanical problems",
+        "not breakdown",
+        ":functional",
+        "yes – functional",
+        "no damages",
+        "no damage")
+
+    # Misspelt entries (e.g. 'spioled') presumably mirror the raw samples the
+    # lists were built from; each keyword is listed once
+    list_notfunctional = (
+        'not function',
+        'poor',
+        'dry',
+        'problem',
+        'not delivering',
+        'plugged',
+        'abandoned',
+        'non-operational',
+        'non-functional',
+        'broken',
+        'with problems',
+        'spoiled',
+        'damaged ',
+        'damage',
+        'not giving',
+        'defective',
+        'not well dug',
+        'malfunction',
+        'stolen',
+        'fault',
+        'spoilt',
+        'spoid',
+        'spioled',
+        'rusting',
+        'weak',
+        'limited',
+        'breaking',
+        'not completed',
+        'not working',
+        'not commissioned',
+        'undeveloped',
+        'incomplete',
+        'lack of',
+        'non- functional',
+        'in bad shape',
+        'technical breakdown',
+        'faulty',
+        'in bad state',
+        'no operation',
+        'dried',
+        'no water',
+        'fallen',
+        'chocked',
+        'choked',
+        'no funds',
+        'lack of funds',
+        'no hundle',
+        'not cpmplete',
+        'stop flowing',
+        'breakdown',
+        'broking',
+        'not complete',
+        'serious',
+        'no head',
+        'contaminated',
+        'yet to be completed',
+        'broken down',
+        'leakage',
+        'disconnected',
+        'stopped',
+        'does not flow',
+        'long time',
+        'not well function',
+        'disconnection',
+        'no money',
+        'dirty',
+        'brakage',
+        'worn out',
+        'brokedown',
+        'water stops',
+        'no connection',
+        'removed',
+        'sunk',
+        'did not work',
+        'too oldbreackdown',
+        'collapse',
+        'break',
+        'not installed',
+        'under construction',
+        'dysfunction',
+        'desamorsage')
+
+    # Generic words, checked last so that e.g. "not working" is caught above
+    list_functional2 = (
+        "functional",
+        "fair",
+        "good",
+        "ok",
+        "operational",
+        "working",
+        "satisfaisant"
+        )
+
+    # Non-text values (e.g. NaN or None) cannot contain any keyword
+    if not isinstance(x, str):
+        return "Unknown"
+
+    # The order of the groups encodes their priority
+    ordered_groups = (
+        (list_partlydamaged, "Partially Functional with Damages"),
+        (list_functional1, "Functional"),
+        (list_notfunctional, "Not Functional"),
+        (list_functional2, "Functional"),
+    )
+
+    for keywords, category in ordered_groups:
+        # any() stops at the first keyword found in x
+        if any(keyword in x for keyword in keywords):
+            return category
+
+    return "Unknown"
diff --git a/data_cleaning/clean_wpdx_sample_data.py b/data_cleaning/clean_wpdx_sample_data.py
index d6c236b..87d672d 100644
--- a/data_cleaning/clean_wpdx_sample_data.py
+++ b/data_cleaning/clean_wpdx_sample_data.py
@@ -1,6 +1,7 @@
 import csv
 import pandas
-
+""" Import helper file for STATUS Col cleaning"""
+import clean_col_status_helper
 
 def clean_columns(input_file, output_file):
     with open(input_file) as csvfile, open(output_file, 'wt') as writer:
@@ -143,6 +144,24 @@ def clean_col_management(input_data):
         input_data = 'Direct Government Operation'
     return input_data
 
+def clean_col_status(input_data):
+    """
+    Clean values in column: "status"
+    Trello card: https://trello.com/c/S4FjIDgo
+
+    Returns the category that clean_col_status_helper.categorise_status
+    assigns to the lower-cased value.
+    """
+    # NaN cells are floats; None and other non-text values would also fail
+    # on .lower(), so all of them become the text "Not Available"
+    if not isinstance(input_data, str):
+        input_data = "Not Available"
+
+    # The helper's keywords are lower case, so lower-case before matching
+    status_text = input_data.lower()
+
+    # Apply UDF from clean_col_status_helper
+    return clean_col_status_helper.categorise_status(status_text)
 
 if __name__ == '__main__':
     clean_columns('wpdx_sample_data.csv', 'cleaned_wpdx_sample_data.csv')
diff --git a/data_cleaning/test_clean_wpdx_sample_data.py b/data_cleaning/test_clean_wpdx_sample_data.py
index f45d79a..1c3a9bd 100644
--- a/data_cleaning/test_clean_wpdx_sample_data.py
+++ b/data_cleaning/test_clean_wpdx_sample_data.py
@@ -66,3 +66,14 @@ def test_clean_col_management():
     assert cwsd.clean_col_management('Direct Government Operation?,') == 'Direct Government Operation'
     assert cwsd.clean_col_management('management') == 'Direct Government Operation'
 
+def test_clean_col_status():
+    """Test cleaning of column "status" (https://trello.com/c/S4FjIDgo)."""
+    for raw_status, expected_category in (
+        ('Partly Damaged', 'Partially Functional with Damages'),
+        ('absoLUte no daMAge', 'Functional'),
+        ('fOUND in BAD state', 'Not Functional'),
+        ('ok oPERatIOnal', 'Functional'),
+        ("ok", "Functional"),
+        ("Not Functional", "Not Functional"),
+    ):
+        assert cwsd.clean_col_status(raw_status) == expected_category