-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter.py
More file actions
70 lines (66 loc) · 1.91 KB
/
filter.py
File metadata and controls
70 lines (66 loc) · 1.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
# Text-normalisation rules, each a two-item list: [regex pattern, replacement].
#
# All patterns that contain a backslash are written as raw strings so that the
# regex engine, not the Python string parser, interprets the escapes.  This
# matters in particular for ``\b``: in a plain string it is a backspace
# character (U+0008), not a regex word boundary.
#
# NOTE(review): the code that consumes this list is not visible here.  The
# comments below assume the rules are applied one after another, in list
# order, with something like ``re.sub(pattern, replacement, text)`` -- confirm
# against the caller.
bag_of_words = [
    # --- Whitespace, question marks, bracketed spans, dotted-quad numbers ---
    ['\n', " "],
    [r'\?', " "],
    [r"\[.*\]", " "],  # greedy: spans from the first "[" to the last "]" on a line
    [r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " "],
    # --- Expand English contractions ---
    ["don't", "do not"],
    ["doesn't", "does not"],
    ["didn't", "did not"],
    ["hasn't", "has not"],
    ["haven't", "have not"],
    ["hadn't", "had not"],
    ["won't", "will not"],
    ["wouldn't", "would not"],
    ["can't", "can not"],
    ["cannot", "can not"],
    ["i'm", "i am"],
    ["i'll", "i will"],
    # NOTE(review): no word boundaries here, so this also rewrites "its"
    # inside longer words (e.g. "bits") and the possessive "its" -- confirm
    # this is intended.
    ["its", "it is"],
    ["it's", "it is"],
    ["that's", "that is"],
    ["weren't", "were not"],
    ["i'd", "i would"],
    ["i've", "i have"],
    ["she'd", "she would"],
    ["they'll", "they will"],
    ["they're", "they are"],
    ["we'd", "we would"],
    ["we'll", "we will"],
    ["we've", "we have"],
    ["it'll", "it will"],
    ["there's", "there is"],
    ["where's", "where is"],
    # NOTE(review): duplicate of the "they're" rule above; kept so the list
    # length and entry positions stay unchanged.
    ["they're", "they are"],
    ["let's", "let us"],
    ["couldn't", "could not"],
    ["shouldn't", "should not"],
    ["wasn't", "was not"],
    ["could've", "could have"],
    ["might've", "might have"],
    ["must've", "must have"],
    ["should've", "should have"],
    ["would've", "would have"],
    ["who's", "who is"],
    # Standalone "im" (apostrophe omitted).  Must be a raw string: "\b" in a
    # plain string is a backspace character and the rule would never match.
    [r"\bim\b", "i am"],
    # --- Strip punctuation, then digits ---
    [r'[^\w\s]', ''],
    [r"\d+", ""],
    # NOTE(review): if rules run in list order, every rule below sees text
    # that has already lost its punctuation and digits, so patterns that need
    # an apostrophe, digit, ".", ":", "#", "[", "{" or "|" can no longer
    # match.  Consider moving them above the two stripping rules -- confirm
    # with the consumer before reordering.
    ["what's", "what is "],
    [r" e g ", " eg "],
    [r" b g ", " bg "],
    [r" 9 11 ", "911"],
    [r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", ""],
    [r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", ""],
    # Drop image
    [r"image:[a-zA-Z0-9]*\.jpg", " "],
    [r"image:[a-zA-Z0-9]*\.png", " "],
    [r"image:[a-zA-Z0-9]*\.gif", " "],
    [r"image:[a-zA-Z0-9]*\.bmp", " "],
    # Drop css
    [r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " "],
    [r"\{\|[^\}]*\|\}", " "],
    # Clean templates
    [r"\[?\[user:.*\|", " "],
    [r"\[?\[wikipedia:.*\]", " "],
    [r"\[?\[special:.*\]", " "],
    [r"\[?\[category:.*\]", " "],
]