-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNB.py
More file actions
56 lines (51 loc) · 2.36 KB
/
NB.py
File metadata and controls
56 lines (51 loc) · 2.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import json
import io
import re
Products = []  # full products as a list of dictionaries
Result = []  # final result objects
Product_H_Space = []  # product names (split) with hyphens converted to spaces
Product_No_H = []  # product names (split) with hyphens removed

# Read in the products: one JSON object per line. The four lists above are
# filled in lockstep, so index i refers to the same product in each of them.
# The encoding is passed explicitly (matching how listings.txt is opened) so
# that reading does not depend on the platform's default locale encoding.
with open('products.txt', encoding="utf8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        product = json.loads(line)
        Products.append(product)
        Result.append({"product_name": product["product_name"], "listings": []})
        name_upper = product["product_name"].upper()
        # Word set where both '_' and '-' act as separators.
        Product_H_Space.append(set(re.split('_|-', name_upper)))
        # Word set where '-' is dropped first, so hyphenated parts are joined.
        Product_No_H.append(set(re.split('_', re.sub('-', '', name_upper))))
# Reads in the listings file. For each listing, it goes through all products
# and adds the listing to the first product it "matches". A "match" for a
# listing is defined as containing at least |name of product|-1 of the words
# in the name of the product, and somewhere containing the same manufacturer,
# family (if available) and model.
def _covers_name(name_words, listing_words):
    """Return True when at most one word of the product name is missing.

    Both arguments are sets of upper-cased words; the test is that the
    intersection holds at least ``len(name_words) - 1`` words.
    """
    return len(name_words & listing_words) >= len(name_words) - 1


def _is_match(product, name_words, name_words_no_hyphen, listing, listing_words):
    """Decide whether *listing* matches *product*.

    A match requires all of the following, checked in this order with
    early exit (so later keys are only read when earlier checks pass):

    1. The listing words cover the product name (in either its
       hyphen-split or hyphen-removed form) with at most one word missing.
    2. The product's manufacturer is one of the space-separated words of
       the listing's manufacturer field.
    3. If the product has a "family", that family (hyphens and spaces
       removed) is one of the listing words.
    4. The product's model (hyphens and spaces removed) is one of the
       listing words.
    """
    if not (_covers_name(name_words, listing_words)
            or _covers_name(name_words_no_hyphen, listing_words)):
        return False
    listing_makers = listing["manufacturer"].upper().split(' ')
    if product["manufacturer"].upper() not in listing_makers:
        return False
    if ("family" in product
            and re.sub('-| ', '', product["family"].upper()) not in listing_words):
        return False
    return re.sub('-| ', '', product["model"].upper()) in listing_words


num_matches = 0
with open('listings.txt', encoding="utf8") as f:
    for line in f:
        if not line.strip():
            # Tolerate blank lines instead of failing inside json.loads.
            continue
        listing = json.loads(line)
        title_upper = listing["title"].upper()
        # Title words with '-' treated as a separator.
        listing_h_space = set(re.split(' |-', title_upper))
        # Title words with '-' removed, so hyphenated parts are joined.
        # NOTE(review): the "()" in this pattern is an empty group, so only
        # hyphens are removed. If parentheses were meant to be stripped as
        # well, the pattern would need to be a character class - confirm.
        listing_no_h = set(re.split(' ', re.sub('-()', '', title_upper)))
        listing_both_h = listing_h_space | listing_no_h
        # A listing is attached to the first matching product only.
        for index, product in enumerate(Products):
            if _is_match(product, Product_H_Space[index], Product_No_H[index],
                         listing, listing_both_h):
                Result[index]["listings"].append(listing)
                num_matches += 1
                break
# Print out the results: the number of matched listings goes to stdout, and
# each result object is written to Results.txt as one JSON document per line.
print(num_matches)
# The context manager guarantees the output file is flushed and closed even
# if serialisation of a result fails part-way through.
with io.open("Results.txt", 'w', encoding='utf8') as results_file:
    for r in Result:
        results_file.write(json.dumps(r))
        results_file.write('\n')