-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathread_data.py
More file actions
43 lines (37 loc) · 1.23 KB
/
read_data.py
File metadata and controls
43 lines (37 loc) · 1.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# The first step is to walk through all the directories and files
import os
def readEmail(filename):
    """Read an e-mail text file and return its words.

    The file is decoded as latin1. Lines that consist only of a newline
    are skipped; every other line has its trailing newline removed and is
    split on single spaces, so consecutive spaces produce empty strings
    in the result.

    input: name of the txt file
    output: list of words of the message in the email; an empty list
            (after printing a message) when the file cannot be opened
    """
    try:
        fh = open(filename, 'r', encoding='latin1')
    except IOError:
        # Nothing was opened, so there is nothing to close: report the
        # problem and hand back an empty message.
        print('cannot open', filename)
        return []

    data = []
    # The context manager closes the file even if reading fails midway.
    with fh:
        for line in fh:
            if line != '\n':
                # Strip only a real newline so that a final line without
                # one keeps its last character.
                data.extend(line.rstrip('\n').split(' '))
    return data
def readDirectory(directory, class_name):
    '''Collect one labelled sample per file found under a directory.

    The directory tree is walked with os.walk; each file is read with
    readEmail and paired with class_name.

    input: directory to walk and the class label to attach
    output: list of [words, class_name] pairs, one per file
    '''
    # Lazily produce the full path of every file in the tree.
    paths = (
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(directory)
        for filename in filenames
    )
    return [[readEmail(path), class_name] for path in paths]
# Directories holding the ham and spam e-mails of the train and test splits.
train_ham_file = '/home/pili/T2/dataset_1/train/ham'
train_spam_file = '/home/pili/T2/dataset_1/train/spam'
test_ham_file = '/home/pili/T2/dataset_1/test/ham'
test_spam_file = '/home/pili/T2/dataset_1/test/spam'

# Training samples: spam (class 1) first, followed by ham (class 0).
data = [
    *readDirectory(train_spam_file, 1),
    *readDirectory(train_ham_file, 0),
]
print(len(data))