Skip to content

Commit 2b1cb8f

Browse files
committed
feat: add script
1 parent a47dee9 commit 2b1cb8f

1 file changed

Lines changed: 32 additions & 0 deletions

File tree

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
from pathlib import Path
3+
import hashlib
4+
import ipyplot
5+
import numpy as np
6+
from PIL import Image
7+
8+
9+
# Root directory that is scanned (recursively) for duplicate files.
path = './data_test'


def file_md5(file_path, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is read in blocks of *chunk_size* bytes so that large files
    are never held in memory all at once, and the handle is closed as soon
    as hashing finishes. MD5 is used here only as a content fingerprint
    for grouping files, not for anything security-related.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as handle:
        # iter(callable, sentinel) keeps calling read() until it returns b''.
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def group_files_by_hash(top):
    """Map each content digest to the list of files under *top* having it.

    Walks *top* recursively with ``os.walk``. The returned dict maps an
    MD5 hex digest to a list of ``Path`` objects, in the order the walk
    produced them.
    """
    groups = {}
    for root, _folders, files in os.walk(top):
        for name in files:
            file_path = Path(os.path.join(root, name))
            groups.setdefault(file_md5(file_path), []).append(file_path)
    return groups


def find_duplicates(groups):
    """Return the path lists from *groups* that contain more than one file."""
    return [paths for paths in groups.values() if len(paths) > 1]


def get_len(x):
    """Return the length of the file name of *x* without its suffix."""
    return len(x.stem)


def remove_duplicates(duplicate_groups):
    """Delete all but one file from every group in *duplicate_groups*.

    Within each group the file with the shortest stem is kept; on a tie the
    first such file in the group wins. Every other file is removed from
    disk and reported on stdout. Returns the list of removed paths.

    NOTE(review): files are considered identical purely because their MD5
    digests match; no byte-for-byte comparison is done before deleting.
    """
    removed = []
    for duplicates in duplicate_groups:
        if len(duplicates) < 2:
            # Nothing to remove when a group holds no actual duplicate.
            continue
        # min() returns the first index with the smallest stem length.
        keep_index = min(
            range(len(duplicates)),
            key=lambda i: get_len(duplicates[i]),
        )
        for i, duplicate in enumerate(duplicates):
            if i == keep_index:
                continue
            # Remove first so the message is only printed on success.
            os.remove(duplicate)
            print(f"removed file {duplicate}")
            removed.append(duplicate)
    return removed


file_hashes = group_files_by_hash(path)
duplicate_file_paths = find_duplicates(file_hashes)
remove_duplicates(duplicate_file_paths)

0 commit comments

Comments
 (0)