-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathduplicate.py
More file actions
executable file
·201 lines (164 loc) · 6.01 KB
/
duplicate.py
File metadata and controls
executable file
·201 lines (164 loc) · 6.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/usr/bin/env python
"""
Python duplicate picture finder. This script mainly exercises
the pieces involved in duplicate finding, such as
file/directory access, imghdr (image identification) and so forth.
"""
import os
import sys
import time
import Image
import imghdr
from optparse import OptionParser
################################################################################
import generate_sim
import compare_sim
import fhash
import generate_svd
################################################################################
def generate_img_list(rootdir, exclude):
    """
    Walk rootdir and collect every file that imghdr identifies as an image.

    rootdir -- directory to recurse into
    exclude -- name of a sub-directory to prune from the walk wherever it
               appears (None prunes nothing)

    Returns a list of (root, filename) tuples, where root is the directory
    as reported by os.walk and filename is the entry name inside it.

    Entries that cannot be opened for identification (broken symlinks,
    permission problems, ...) are skipped instead of aborting the scan.
    """
    img = []
    for root, dirs, files in os.walk(rootdir):
        # Removing the entry from dirs in place keeps os.walk out of it
        if exclude in dirs:
            dirs.remove(exclude)
        for filename in files:
            path = os.path.abspath(os.path.join(root, filename))
            try:
                what = imghdr.what(path)
            except (IOError, OSError):
                # imghdr has to open the file; an unreadable entry cannot
                # be identified, so it is not an image we can work with
                continue
            if what is not None:
                img.append((root, filename))
    return img
################################################################################
def calc_image_stats(img_list):
    """
    Tally the images in img_list by (mode, format) and print a summary.

    img_list -- iterable of (root, filename) tuples

    Images for which Image.open raises IOError are left out of the tally.
    Only the modes and formats listed in the tables below are printed; the
    returned total still covers every image that could be opened.

    Returns the number of images that were opened successfully.
    """
    tally = {}
    opened = 0
    for folder, name in img_list:
        try:
            picture = Image.open(os.path.join(folder, name))
            key = (picture.mode, picture.format)
            tally[key] = tally.get(key, 0) + 1
            opened += 1
        except IOError:
            # Ignore bad images
            continue
    # Print results
    known_modes = ('1', 'P', 'L', 'LA', 'RGB', 'RGBA', 'RGBX', 'CMYK')
    known_formats = ('BMP', 'PNG', 'GIF', 'JPEG', 'TIFF')
    for mode_name in known_modes:
        print(mode_name)
        for format_name in known_formats:
            hits = tally.get((mode_name, format_name))
            if hits is None:
                # Combination never seen, nothing to report
                continue
            share = 100.0 * (float(hits) / float(opened))
            print("\t%s\t - %.2f%%\t - %s" % (format_name, share, hits))
    print("---")
    print("TOTAL\t\t - 100.00%\t - " + str(opened))
    return opened
################################################################################
def dup(comp, name):
    """
    Print a one-line duplicate summary for the comparison list comp.

    comp -- comparison results, handed unchanged to calc_dup
    name -- label printed right after "len"
    """
    redundant = calc_dup(comp)
    entries = len(comp)
    print("".join([
        "len ", name, ": ", str(entries),
        " dups: ", str(redundant),
        " nodup: ", str(entries - redundant),
    ]))
################################################################################
def calc_dup(comp):
    """
    Count the redundant entries in a list of comparison results.

    comp -- sequence of (file1, file2, data) tuples; every element must be
            hashable

    An entry is redundant when it is one of:
      * a self comparison (file1 == file2),
      * an exact repeat of an earlier entry,
      * the inverse of an earlier entry, i.e. (file2, file1, data) with
        the same data.

    Each entry is counted at most once, so the result is independent of
    the order of comp and never exceeds len(comp).

    Returns the number of redundant entries as an int.
    """
    unique = set()
    for patha, pathb, data in comp:
        if patha == pathb:
            # Comparing a file against itself never contributes a result
            continue
        # frozenset makes the key order-insensitive: (a, b) and (b, a)
        # collapse into the same comparison when their data matches
        unique.add((frozenset((patha, pathb)), data))
    return len(comp) - len(unique)
################################################################################
if __name__ == '__main__':
    # Command line: one positional root directory plus the options below
    parser = OptionParser("usage: %prog [options] rootdir")
    parser.add_option("-e", "--exclude", dest="exclude",
                      help="Exclude this directory")
    # Compare options, each a plain on/off flag
    for short_flag, long_flag, target, description in (
            ("-s", "--sim", "imgsim",
             "Compare with image similarity - abs(a-b)"),
            ("-f", "--filehash", "filehash",
             "Compare with File hash"),
            ("-v", "--svd", "svd",
             "Compare with Singular Value Decomposition")):
        parser.add_option(short_flag, long_flag, dest=target, default=False,
                          action='store_true', help=description)

    options, args = parser.parse_args()
    if len(args) != 1:
        parser.error("Need a root directory to recurse into")

    rootdir = args[0]
    exclude = options.exclude
    imgsim = options.imgsim
    filehash = options.filehash
    svd = options.svd
    if not (imgsim or filehash or svd):
        parser.error("Need to pick one compare option")

    # Collect the candidate images under rootdir
    print("Generating image listing...")
    began = time.time()
    img = generate_img_list(rootdir, exclude)
    print("Timing: %s s" % (time.time() - began))

    print("\nGenerating image stats...")
    total = calc_image_stats(img)

    # Only the first selected mode runs, checked in the order
    # sim, filehash, svd
    comp4 = []
    if imgsim:
        # Generate SIM
        print("\nGenerating image sim...")
        began = time.time()
        sim = generate_sim.generate_sim_data(img)
        print("Timing: %s s" % (time.time() - began))

        # Diag output
        processed = len(sim)
        print("\nImage stats...")
        print("Total images: %d" % total)
        print("Processed images: %d" % processed)
        print("Omitted images: %d" % (total - processed))

        # Compare step, delegated to the compare_sim module
        print("\nProcessing with c n-way compare...")
        began = time.time()
        comp4 = compare_sim.compare(sim)
        print("Timing: %s s" % (time.time() - began))
    elif filehash:
        # Generate file based hash & so forth for verification
        print("\nGenerating & Processing file based hashes...")
        began = time.time()
        comp4 = fhash.file_hashes(img)
        print("Timing: %s s" % (time.time() - began))
    elif svd:
        # NOTE(review): the SVD data is generated but never compared, so
        # comp4 stays empty on this path
        print("\nGenerating image svd...")
        began = time.time()
        sim = generate_svd.generate_svd_data(img)
        print("Timing: %s s" % (time.time() - began))

    # Detect duplicate.... duplicates
    print("\nDetecting duplicate duplicates...")
    dup(comp4, "c")
# for idx in xrange(0,len(comp1)):
# fpa, pathaa, pathba = comp1[idx]
# fpb, pathab, pathbb = comp2[idx]
#
# if (fpa != fpb) or (pathaa != pathab) or (pathba != pathbb):
# print fpa, fpb, " - ", pathaa, pathab, " - ", pathba, pathbb
#
# # Flush out the list
# with open('comp1', 'w') as f:
# for val, path1, path2 in comp1:
# print >>f, str(val) + " - " + path1 + " - " + path2
#
# # Flush out the list
# with open('comp2', 'w') as f:
# for val, path1, path2 in comp2:
# print >>f, str(val) + " - " + path1 + " - " + path2