Text_Analysis_Technobabble/FilterFiles.py at master · dorehaus/Text_Analysis_Technobabble · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# Get files of a certain size threshold, top 100
#preview file lexicon
import string
from shutil import copyfile
import os
import operator
from nltk.corpus import stopwords
##  for m in more:
##  for mm in m:
##    statinfo = os.stat(mm)
##    size = statinfo.st_size
##    sz = file_size(mm)
##    print(file, sz)

def convert_bytes(num):
    """
    this function will convert bytes to MB.... GB... etc
    """
    for x in ['bytes', 'KB', 'MB', 'GB', 'TB']:
        if num < 1024.0:
            return "%3.1f %s" % (num, x)
        num /= 1024.0

def file_size(file_path):
    """
    this function will return the file size
    """
    if os.path.isfile(file_path):
        file_info = os.stat(file_path)
        return convert_bytes(file_info.st_size)

def dir_dir(dirname):
  if os.path.isdir(dirname):
    pass
  else:
    os.mkdir(dirname)

##root = "C:\Python3\data_char_lines"
my_path = os.path.abspath(os.path.dirname(__file__))
path = os.path.join(my_path, "data_char_lines")

contents = os.listdir(root)

root_keep = "C:\Python3\data_char_lines_top_100"
dir_dir(root_keep)

mydict = {}
keep_files=[]
for c in contents:
  file_sizes = []
  r = os.path.join(root, c)
  for file in os.listdir(r):
    f = os.path.join(r, file)
    statinfo = os.stat(f)
    size = statinfo.st_size
    if size > 9228:
      keep_files.append(f)
      copyfile(f, os.path.join(root_keep, file))
    file_sizes.append(size)

  mydict[c] = file_sizes
  print(c, len(file_sizes), min(file_sizes), max(file_sizes), sum(file_sizes)/len(file_sizes))

print(len(keep_files))

mydict2 = {}
d={}
for file in os.listdir(root_keep):
  f = os.path.join(root_keep, file)
  with open(f) as infile:
    lex = []
    lines = infile.readlines()
  ##      print(len(lines))
    for l in lines:
      words = l.split(" ")
      words = [w.strip(string.punctuation) for w in words]
      for w in words:
        lex.append(w)
##      print(len(lex))
      k=file.strip(".txt").title()
      mydict2[file.strip(".txt")]=lex
      d[file.strip(".txt")]=len(lex)


##for k, v in mydict2.items():
##  print("%s\t%d" % (k.title(), len(v)))

sorted_x = sorted(d.items(), key=operator.itemgetter(1))
for x in sorted_x:
  print(x)


root_keep = "C:\Python3\data_char_lines_top_100"
os.chdir(root_keep)
picard_words = {}
words = []
with open("PICARD.txt") as infile:
	w = infile.readlines()
	for line in w:
		lexs = line.split(" ")
		words.extend(lexs)
words = filter_words(words)

def filter_words(wordlist):
  filteredwords = []
  for w in wordlist:
    w = w.lower()
    if w[-1:] in string.punctuation:
      w = w[:-1]
    elif w[0] in string.punctuation:
      w = w[1:]
    filteredwords.append(w)
  return filteredwords

# 10 GB
##>>> os.stat("C:\Python3\data_char_lines\Voyager\GEGEN.txt")
##os.stat_result(st_mode=33206, st_ino=281474977348190, st_dev=606102944, st_nlink=1, st_uid=0, st_gid=0, st_size=9228, st_atime=1542864677, st_mtime=1542864677, st_ctime=1542864677)
##>>> x=_
##>>> x.st_size
##9228

##    print(f)
##    sz = file_size(f)
##    if not sz.endswith("bytes"):
##      print(file, sz)