forked from K10ForTheWin/Text_Analysis_Technobabble
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathFilterFiles.py
More file actions
125 lines (107 loc) · 3.02 KB
/
FilterFiles.py
File metadata and controls
125 lines (107 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Select the files whose size exceeds a threshold (the "top 100" set),
# then preview the lexicon (word list) of each selected file.
import string
from shutil import copyfile
import os
import operator
from nltk.corpus import stopwords
## for m in more:
## for mm in m:
## statinfo = os.stat(mm)
## size = statinfo.st_size
## sz = file_size(mm)
## print(file, sz)
def convert_bytes(num):
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (TB) is reached, then rendered with one decimal place.

    Args:
        num: Size in bytes.

    Returns:
        A string such as ``"9.0 KB"``. Values of 1024 TB or more are still
        reported in TB instead of falling through and returning ``None``.
    """
    units = ('bytes', 'KB', 'MB', 'GB', 'TB')
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Nothing larger than TB is defined, so report whatever is left in TB.
    return "%3.1f %s" % (num, units[-1])
def file_size(file_path):
    """
    Return the size of ``file_path`` as a human-readable string.

    The byte count reported by ``os.stat`` is formatted by ``convert_bytes``.
    When ``file_path`` is not an existing regular file, ``None`` is returned.
    """
    if not os.path.isfile(file_path):
        return None
    size_in_bytes = os.stat(file_path).st_size
    return convert_bytes(size_in_bytes)
def dir_dir(dirname):
    """Ensure that the directory ``dirname`` exists.

    Missing parent directories are created as well, and an already existing
    directory is left untouched.

    Args:
        dirname: Path of the directory to create.

    Raises:
        FileExistsError: If ``dirname`` exists but is not a directory.
    """
    # exist_ok=True replaces the isdir()-then-mkdir() sequence, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(dirname, exist_ok=True)
# Copy every file above the size threshold into root_keep and print
# per-directory statistics.
#
# The input tree used to be hard-coded as C:\Python3\data_char_lines; it is
# now resolved relative to the location of this script.
my_path = os.path.abspath(os.path.dirname(__file__))
root = os.path.join(my_path, "data_char_lines")
path = root  # older name for the same location, kept for compatibility
contents = os.listdir(root)
# Raw string: "\P" and "\d" are not valid escape sequences in a normal
# string literal. The resulting value is unchanged.
root_keep = r"C:\Python3\data_char_lines_top_100"
dir_dir(root_keep)

# Files must be strictly larger than this many bytes to be kept. 9228 is the
# st_size recorded for Voyager/GEGEN.txt in the notes at the end of this file.
MIN_SIZE_BYTES = 9228

mydict = {}      # sub-directory name -> sizes (bytes) of the files kept from it
keep_files = []  # full paths of every file copied into root_keep
for c in contents:
    r = os.path.join(root, c)
    if not os.path.isdir(r):
        # Only sub-directories are scanned; a stray file here would make
        # os.listdir() raise.
        continue
    file_sizes = []
    for file in os.listdir(r):
        f = os.path.join(r, file)
        size = os.stat(f).st_size
        if size > MIN_SIZE_BYTES:
            keep_files.append(f)
            # NOTE: only the base name is used for the destination, so files
            # with the same name in different sub-directories overwrite
            # each other in root_keep.
            copyfile(f, os.path.join(root_keep, file))
            # NOTE(review): sizes are recorded for kept files only; confirm
            # the statistics are not meant to cover every scanned file.
            file_sizes.append(size)
    mydict[c] = file_sizes
    if file_sizes:
        print(c, len(file_sizes), min(file_sizes), max(file_sizes),
              sum(file_sizes) / len(file_sizes))
    else:
        # min()/max() and the average are undefined for an empty list.
        print(c, 0)
print(len(keep_files))
# Build a word list ("lexicon") for every file in root_keep and print the
# files ordered by lexicon size.
mydict2 = {}  # file name without ".txt" -> list of words in the file
d = {}        # file name without ".txt" -> number of words in the file
for file in os.listdir(root_keep):
    f = os.path.join(root_keep, file)
    lex = []
    with open(f) as infile:
        for l in infile:
            # split() with no argument splits on any whitespace, so the
            # trailing newline no longer sticks to the last word of a line
            # (which used to shield its punctuation from being stripped) and
            # runs of spaces no longer produce empty tokens.
            for w in l.split():
                w = w.strip(string.punctuation)
                if w:  # skip tokens that consisted only of punctuation
                    lex.append(w)
    # str.strip(".txt") removes any of the characters '.', 't' and 'x' from
    # both ends (e.g. "text.txt" -> "e"), so cut the suffix explicitly.
    name = file[:-len(".txt")] if file.endswith(".txt") else file
    mydict2[name] = lex
    d[name] = len(lex)

# Print (name, word count) pairs, smallest lexicon first.
sorted_x = sorted(d.items(), key=operator.itemgetter(1))
for x in sorted_x:
    print(x)
def filter_words(wordlist):
    """Lower-case every word and strip its surrounding punctuation.

    Args:
        wordlist: Iterable of word strings.

    Returns:
        A new list with one entry per input word, lower-cased and with all
        leading and trailing ``string.punctuation`` characters removed.
        Punctuation inside a word (e.g. the apostrophe in "it's") is kept,
        and a word made up only of punctuation becomes an empty string.
    """
    # strip() handles both ends and repeated characters; the earlier
    # if/elif removed a single character from one end only.
    return [w.lower().strip(string.punctuation) for w in wordlist]


# Collect the normalised words of PICARD.txt.
# filter_words is defined above this point because a module-level call can
# only use names that have already been bound.
# Raw string: "\P" and "\d" are not valid escape sequences in a normal
# string literal. The resulting value is unchanged.
root_keep = r"C:\Python3\data_char_lines_top_100"
os.chdir(root_keep)
picard_words = {}  # not populated anywhere in this file
words = []
with open("PICARD.txt") as infile:
    for line in infile:
        # split() with no argument also discards the trailing newline and
        # the empty tokens produced by consecutive spaces.
        words.extend(line.split())
words = filter_words(words)
# NOTE: the size shown below is 9228 bytes (about 9 KB); the original note here read "10 GB".
##>>> os.stat("C:\Python3\data_char_lines\Voyager\GEGEN.txt")
##os.stat_result(st_mode=33206, st_ino=281474977348190, st_dev=606102944, st_nlink=1, st_uid=0, st_gid=0, st_size=9228, st_atime=1542864677, st_mtime=1542864677, st_ctime=1542864677)
##>>> x=_
##>>> x.st_size
##9228
## print(f)
## sz = file_size(f)
## if not sz.endswith("bytes"):
## print(file, sz)