-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextprocessing.py
More file actions
32 lines (26 loc) · 796 Bytes
/
textprocessing.py
File metadata and controls
32 lines (26 loc) · 796 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import re
import os
# Read stop words from the file
def read_stopwords(stopwords):
    """Load stop words from a text file, one entry per line.

    Args:
        stopwords: Path to the stop-word file.

    Returns:
        A list with one item per line of the file, each stripped of
        leading/trailing whitespace (blank lines yield empty strings).
        An empty list is returned when the path does not exist.
    """
    if not os.path.exists(stopwords):
        return []
    # The context manager guarantees the handle is closed even if reading
    # raises; the previous version left the file open.
    with open(stopwords) as handle:
        return [line.strip() for line in handle]
# Replace every character that is not a lowercase letter or a space with a space.
def remove_nonwords(text):
    """Blank out everything except lowercase ASCII letters and spaces.

    Each character outside ``a-z`` and the space character is replaced by a
    single space, then leading and trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return re.compile(r"[^a-z ]").sub(' ', text).strip()
# Function to remove stopwords from the text
def remove_stopwords(text, stopwords):
    """Split text on whitespace and drop the tokens found in ``stopwords``.

    Args:
        text: The string to tokenize.
        stopwords: Container tested with ``in`` for each token.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for token in text.split():
        if token in stopwords:
            continue
        kept.append(token)
    return kept
# Function to pre-process the text
def preprocess_text(text, stopwords):
    """Lowercase the text, clean it, and filter out stop words.

    The text is lowercased, passed through ``remove_nonwords``, and the
    result is handed to ``remove_stopwords`` together with ``stopwords``.

    Args:
        text: The raw input string.
        stopwords: Stop-word container forwarded to ``remove_stopwords``.

    Returns:
        Whatever ``remove_stopwords`` returns for the cleaned text.
    """
    return remove_stopwords(remove_nonwords(text.lower()), stopwords)