File tree Expand file tree Collapse file tree
python_tiny_projects/remove_duplicates Expand file tree Collapse file tree Original file line number Diff line number Diff line change 1+ import os
2+ from pathlib import Path
3+ import hashlib
4+ import ipyplot
5+ import numpy as np
6+ from PIL import Image
7+
8+
# Directory tree that is scanned for byte-identical files.
path = './data_test'


def file_md5(file_path, chunk_size=65536):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is read in blocks of *chunk_size* bytes so that large files are
    never held in memory all at once, and the handle is closed as soon as
    hashing finishes. MD5 serves only as a content fingerprint here.
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as handle:
        # iter() with a sentinel stops at the empty bytes object returned at EOF.
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def group_files_by_hash(root_dir):
    """Map each content digest to the list of files below *root_dir* having it.

    Paths appear in ``os.walk`` order, both across digests and inside each
    list.
    """
    hashes = {}
    for root, _folders, files in os.walk(root_dir):
        for name in files:
            file_path = Path(root, name)
            hashes.setdefault(file_md5(file_path), []).append(file_path)
    return hashes


def select_redundant(duplicates):
    """Return every path in *duplicates* except the one that is kept.

    The kept path is the one with the shortest stem (file name without its
    final suffix); when several stems share the minimum length, the earliest
    entry in *duplicates* is kept.
    """
    # min() returns the first minimal index, so ties resolve to the earliest.
    keep_index = min(
        range(len(duplicates)),
        key=lambda index: len(duplicates[index].stem),
    )
    return [
        candidate
        for index, candidate in enumerate(duplicates)
        if index != keep_index
    ]


file_hashes = group_files_by_hash(path)

# Only digests shared by two or more files describe duplicates. Files with the
# same digest are treated as identical; no byte-by-byte comparison is done.
duplicate_file_paths = [
    paths for paths in file_hashes.values() if len(paths) > 1
]

for duplicates in duplicate_file_paths:
    for duplicate in select_redundant(duplicates):
        # Delete first so the message is only printed for files really removed.
        os.remove(duplicate)
        print(f"removed file {duplicate} ")
You can’t perform that action at this time.
0 commit comments