-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcheck_data.py
More file actions
136 lines (110 loc) · 3.22 KB
/
check_data.py
File metadata and controls
136 lines (110 loc) · 3.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import threading
import json
import redis
import re
import math
from dateutil.parser import parse
from rejson import Client, Path
import dedup_data
def print_data(data):
    """Print every entry of ``data`` on its own line as ``key value``.

    Args:
        data: Mapping whose ``items()`` are printed in iteration order.
    """
    for entry in data.items():
        # Unpacking the (key, value) pair lets print() add the separating space.
        print(*entry)
def validate_url(data):
    """Grade a URL field.

    Args:
        data: Raw field value; expected to be a string.

    Returns:
        'F' when the value is missing (falsy), 'A' when it is a string that
        matches the URL pattern below, and 'D' otherwise (including truthy
        values that are not strings).
    """
    if not data:
        return 'F'
    # A truthy non-string (e.g. a number decoded from JSON) cannot be a URL.
    # Grade it as malformed instead of letting the regex raise TypeError.
    if not isinstance(data, str):
        return 'D'
    regex = re.compile(
        r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return 'A' if regex.match(data) is not None else 'D'
def validate_date_time(data):
    """Grade a date/time field by attempting to parse it with dateutil.

    Args:
        data: Raw field value; expected to be a date/time string.

    Returns:
        'F' when the value is missing (falsy), 'A' when ``parse`` accepts it,
        and 'C' when ``parse`` rejects it.
    """
    if not data:
        return 'F'
    try:
        parse(data)
    except (ValueError, OverflowError, TypeError):
        # ValueError: the string is not a recognisable date.
        # OverflowError: the parsed date exceeds the platform's integer range.
        # TypeError: the value is not a string or character stream.
        return 'C'
    return 'A'
def validate_category(data):
    """Grade a category field.

    Args:
        data: Raw field value.

    Returns:
        'A' when the value is a non-empty string, 'B' for anything else
        (missing, empty, or not a string).
    """
    is_named_category = bool(data) and isinstance(data, str)
    return 'A' if is_named_category else 'B'
def validate_postcode(data):
    """Grade a postcode field.

    A valid postcode is four digits followed by two letters, with optional
    spaces between any of the characters, and nothing else.

    Args:
        data: Raw field value; expected to be a string.

    Returns:
        'F' when the value is missing (falsy), 'A' when the whole string is a
        valid postcode, and 'D' otherwise (including truthy non-strings).
    """
    if not data:
        return 'F'
    # A truthy non-string would make the regex raise TypeError; it is
    # graded as malformed instead.
    if not isinstance(data, str):
        return 'D'
    regex = re.compile('[0-9][ ]*[0-9][ ]*[0-9][ ]*[0-9][ ]*[A-Za-z][ ]*[A-Za-z]')
    # fullmatch rather than match: match only anchors the start, so values
    # with trailing garbage such as '1234ABxyz' used to be accepted.
    return 'A' if regex.fullmatch(data) is not None else 'D'
def validate_string(data):
    """Grade a free-text field by type and length.

    Args:
        data: Raw field value.

    Returns:
        'A' when the value is a string of 5 to 50 characters inclusive,
        'B' for anything else (missing, wrong type, too short or too long).
    """
    if data and isinstance(data, str) and len(data) in range(5, 51):
        return 'A'
    return 'B'
def validate_integer(data):
    """Grade an integer field.

    Args:
        data: Raw field value; expected to be an int, or a float holding a
            whole number.

    Returns:
        'A' when the value is an int (including 0) or a finite float with no
        fractional part, 'B' for anything else (missing, strings, booleans,
        fractional floats, NaN and infinity).
    """
    # bool is a subclass of int, but true/false are not integer values.
    if isinstance(data, bool):
        return 'B'
    # The type is checked before any arithmetic: the previous math.modf call
    # raised TypeError on strings and OverflowError on very large ints, and
    # the falsy check rejected the valid integer 0.
    if isinstance(data, int):
        return 'A'
    # is_integer() is False for NaN and infinity as well as for fractions.
    if isinstance(data, float) and data.is_integer():
        return 'A'
    return 'B'
def compute_grade(grades):
    """Collapse a collection of letter grades into a single overall grade.

    Args:
        grades: Container of letter grades, tested with ``in``.

    Returns:
        The first of 'F', 'D', 'C', 'B' (checked in that order) that occurs
        in ``grades``, or 'A' when none of them occurs.
    """
    for letter in ('F', 'D', 'C', 'B'):
        if letter in grades:
            return letter
    return 'A'
def grade_data(data):
    """Grade every field of a record and attach the per-field grades.

    Args:
        data: Dict holding the fields 'String0' to 'String5'. A field that
            is absent is passed to its validator as None, so it is graded
            like a missing value instead of raising KeyError.

    Returns:
        The same ``data`` object, mutated in place: ``data['grades']`` is
        set to the list of six letter grades in field order.
    """
    # Each field is paired with the validator that grades it, in output order.
    field_validators = (
        ('String0', validate_url),
        ('String1', validate_date_time),
        ('String2', validate_category),
        ('String3', validate_postcode),
        ('String4', validate_string),
        ('String5', validate_integer),
    )
    # Only the per-field list is stored. The overall grade was computed here
    # before but never used, so that dead call has been removed.
    data['grades'] = [check(data.get(field)) for field, check in field_validators]
    return data
def retrieve_and_check():
    """Grade one random item from the Redis queue and forward it.

    Each call schedules the next call ``n`` seconds later. It then picks a
    random key from the Redis instance on localhost:6379, grades the JSON
    document stored there, and sets ``status`` to 1 when every grade is 'A'
    and 0 otherwise. Finally it deletes the key and passes the item to
    ``dedup_data.deduplicate_data``.
    """
    # n is the interval between runs, in seconds.
    n = 5
    # The timer is started before any work is done, so the next run is
    # queued even if the code below raises.
    threading.Timer(n, retrieve_and_check).start()

    # Retrieve a RANDOM data point from the redis queue.
    grading_queue = redis.StrictRedis('localhost', 6379)
    key = grading_queue.randomkey()
    if key is None:
        print('Pre-processing queue is empty.')
        return

    raw_item = grading_queue.execute_command('JSON.GET', key)
    if raw_item is None:
        # The key was removed between RANDOMKEY and JSON.GET; json.loads(None)
        # would raise TypeError, so this run is skipped.
        print('Item disappeared from the pre-processing queue before it could be read.')
        return

    item = grade_data(json.loads(raw_item))
    item['status'] = 1 if all(grade == 'A' for grade in item['grades']) else 0

    # Delete the item from the redis queue.
    grading_queue.execute_command('JSON.DEL', key)
    # Forward the item to the de-duplication queue.
    dedup_data.deduplicate_data(key, item)
def main():
    """Script entry point: run the first grading pass.

    ``retrieve_and_check`` schedules its own follow-up runs, so a single
    call is enough to start the periodic processing.
    """
    retrieve_and_check()
# Start processing only when executed as a script, not when imported.
if __name__ == '__main__':
    main()