-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdiff_for_ds.py
More file actions
123 lines (90 loc) · 3.52 KB
/
diff_for_ds.py
File metadata and controls
123 lines (90 loc) · 3.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Usage examples:
./process.py ownername_diff --state id --county franklin --ver_a county --ver_b rep --field owner_name
"""
import logging
import difflib
import string
import pandas as pd
import os
import time
from psycopg2 import sql
from settings import EXPORT_PATH
# from sqlalchemy import create_engine
# import io
log = logging.getLogger(__name__)
def description():
return "Compares owner_name between 2 parcel datasets and gets percent " \
"difference"
def df_to_csv(df, file_name):
path = EXPORT_PATH + '/field_comparison_results'
if not os.path.exists(path):
os.makedirs(path)
output = path + '/' + file_name
df.to_csv(output, index=False)
return output
# def csv_to_table(csv):
# f = open(r'C:\Users\n\Desktop\data.csv', 'r')
# cur.copy_from(f, temp_unicommerce_status, sep=',')
def args(subparser):
subparser.add_argument('--state',
help='Enter state (ex. mt)')
subparser.add_argument('--county',
help='Enter county (ex. gallatin)')
subparser.add_argument('--ver_a',
help='Enter version of first table')
subparser.add_argument('--ver_b',
help='Enter version of first table')
subparser.add_argument('--field',
help='Enter field to compare differences')
subparser.add_argument('--output',
help='Enter output table name')
# removes punctuation
def remove_punctuation(sentence):
cleaned = sentence.translate(None, string.punctuation)
nopunct = sorted_sentence(cleaned)
return nopunct.upper()
# sorts sentence
def sorted_sentence(sentence):
words = sentence.split(' ')
words.sort()
newSentence = ' '.join(words)
return newSentence.strip()
def process(database_connection, args):
"""
Usage examples:
./process.py ownername_diff --state id --county franklin --ver_a county --ver_b rep --field owner_name
"""
state = args.state
county = args.county
table = 'parcel_{state}_{county}'.format(state=state, county=county)
table_a = table + '$' + args.ver_a
table_b = table + '$' + args.ver_b
field = args.field
output_table = args.output
with database_connection.cursor as cursor:
sql_query = """SELECT a.apn AS a_apn, a.{field} AS a_{field}, b.apn AS b_apn, b.{field} AS b_{field}
FROM {table_a} a
INNER JOIN {table_b} b
ON a.apn = b.apn
""".format(table_a=table_a, table_b=table_b,
field=field, output=output_table)
cursor.execute(sql.SQL(sql_query))
df = pd.DataFrame(cursor.fetchall())
df.sort_values(by=['a_apn'])
df = df.fillna(value='NO VALUE')
field_a = 'a_{}'.format(field)
field_b = 'b_{}'.format(field)
df[field_a] = df[field_a].apply(remove_punctuation)
df[field_b] = df[field_b].apply(remove_punctuation)
df['percent similar'] = df.apply(lambda x: difflib.SequenceMatcher(None, x[field_a], x[field_b]).ratio(), axis=1)
# print(df)
# timestamp = str(int(round(time.time() * 1000)))
timestamp = str(time.strftime("%Y%m%d-%H%M"))
file_name = '{state}_{county}_{field}_compare_{timestamp}.csv'\
.format(state=state, county=county, field=field, timestamp=timestamp)
output = df_to_csv(df, file_name)
# with database_connection.cursor as cursor:
# f = open(output, 'r')
# cursor.copy_from(f, 'temp_comparison', sep=',')
database_connection.commit()