-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathfhash.py
More file actions
101 lines (82 loc) · 3.1 KB
/
fhash.py
File metadata and controls
101 lines (82 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import sys
import hashlib
BLOCKSIZE = 1024 * 8
################################################################################
def _group_paths(paths, key_func):
    """Group *paths* by ``key_func(path)``, keeping only groups of 2+ paths."""
    groups = {}
    for path in paths:
        groups.setdefault(key_func(path), []).append(path)
    # A group with a single member cannot contain a duplicate, so drop it.
    # A new dict is built instead of deleting entries while iterating, which
    # is an error on Python 3.
    return {key: members for key, members in groups.items()
            if len(members) >= 2}


def _md5_digest(path):
    """Return the hex MD5 digest of the file at *path*."""
    h = hashlib.md5()  # Should look into other hashes, this will do and its quick
    with open(path, 'rb') as f:
        # The sentinel has to be bytes: a binary read never returns the text
        # string '' on Python 3, so comparing against '' would never stop.
        for chunk in iter(lambda: f.read(BLOCKSIZE), b''):
            h.update(chunk)
    return h.hexdigest()


def _same_content(path_a, path_b):
    """Return True when both files hold exactly the same bytes."""
    # ``with`` closes both handles on every exit path, including the case
    # where the second open() fails after the first one succeeded.
    with open(path_a, 'rb') as fa, open(path_b, 'rb') as fb:
        while True:
            buf_a = fa.read(BLOCKSIZE)
            buf_b = fb.read(BLOCKSIZE)
            if buf_a != buf_b:
                # Covers differing data and one file ending before the other.
                return False
            if not buf_a:
                # Both files reached EOF together without a mismatch.
                return True


def file_hashes(img_list):
    """Find pairs of files whose contents are byte-for-byte identical.

    The search narrows the candidates in three passes so that as little data
    as possible is read from disk: files are first grouped by size, files
    sharing a size are grouped by MD5 digest, and files sharing a digest are
    finally compared byte by byte.

    Progress messages and the final counts are printed to stdout.

    Args:
        img_list: iterable of ``(root, filename)`` pairs; each pair is joined
            with ``os.path.join`` to form the path of a file to examine.

    Returns:
        A list of ``(path_a, path_b, None)`` tuples, one per pair of identical
        files. The third element is always ``None``.

    Raises:
        Any error raised while reading file sizes or contents (for example
        ``OSError``/``IOError``) propagates to the caller. Errors raised
        during the byte-by-byte comparison are reported on stdout first.
    """
    print("Chunking it into sizes...")
    # Break it up into "sizes" so we can save on disk reads later for
    # the hashes
    paths = [os.path.join(root, filename) for root, filename in img_list]
    sizedict = _group_paths(paths, os.path.getsize)

    print("Hashing same size files...")
    # Only files that share their size with another file are worth hashing.
    candidates = [path for group in sizedict.values() for path in group]
    hashdict = _group_paths(candidates, _md5_digest)

    print("Comparing same hash files, byte by byte...")
    # Do byte to byte compare here to be 100% certain that two files that
    # hash the same are really the same.
    img = []
    for group in hashdict.values():
        # Visit every unordered pair of the group exactly once.
        for idx, fa in enumerate(group):
            for fb in group[idx + 1:]:
                try:
                    identical = _same_content(fa, fb)
                except Exception:
                    print("Unexpected error: %s" % (sys.exc_info()[0],))
                    raise
                if identical:
                    img.append((fa, fb, None))

    print("hash dup - " + str(len(hashdict)))
    print("byte by byte dup - " + str(len(img)))
    return img