-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtext_utils.py
More file actions
35 lines (24 loc) · 782 Bytes
/
text_utils.py
File metadata and controls
35 lines (24 loc) · 782 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# Preprocessing functions for text data
import re
def striphtml(text):
    """
    Remove HTML tags (e.g. <b>, </b>, <a href="...">) from a string.

    Only the tags themselves are deleted: the text between tags, including
    the whitespace that surrounded each tag, is kept unchanged. A tag may
    span several lines. Anything that is not inside angle brackets (for
    example character entities) is left untouched.

    text: string from which to remove the tags
    returns: a new string with every <...> span removed

    >>> striphtml('Hi! <b> Important </b> click here <a> link </a>')
    'Hi!  Important  click here  link '
    >>> striphtml('no tags here')
    'no tags here'
    """
    # Use [^>]* rather than .*? so that a tag broken across lines is still
    # matched ('.' does not match a newline unless re.DOTALL is set).
    tag_pattern = re.compile(r'<[^>]*>')
    return tag_pattern.sub('', text)
def remove_words(words_to_remove, list_of_strings, *, whole_words=False):
    """
    Remove every occurrence of the given words from each string.

    Words are removed one after another, in the order given. The input
    collection is never modified and a new list is always returned, even
    when there is nothing to remove.

    words_to_remove: list of words to remove
    list_of_strings: list (or any iterable) of strings from which to remove
        the words
    whole_words: when False (the default) a word is removed wherever it
        occurs, even inside a longer word; when True it is removed only
        where it is not directly attached to another letter, digit or
        underscore
    returns: new list holding the cleaned strings, in the original order

    >>> remove_words(['bye'], ['hello, bye', 'bye, hi'])
    ['hello, ', ', hi']
    >>> remove_words(['he'], ['he said hello'])
    [' said llo']
    >>> remove_words(['he'], ['he said hello'], whole_words=True)
    [' said hello']
    """
    # Copy up front so the caller's own object is never handed back (the
    # loop below does not run when words_to_remove is empty) and so that
    # any iterable, not only a list, is accepted.
    cleaned = list(list_of_strings)
    for word in words_to_remove:
        if whole_words:
            # Lookarounds instead of a plain word-boundary escape so that
            # words starting or ending with punctuation (e.g. 'c++') still
            # match; re.escape keeps the word from being read as a regex.
            pattern = re.compile(r'(?<!\w)' + re.escape(word) + r'(?!\w)')
            cleaned = [pattern.sub('', s) for s in cleaned]
        else:
            cleaned = [s.replace(word, '') for s in cleaned]
    return cleaned
# When this file is executed as a script, run the doctest examples embedded
# in the docstrings of this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()