import os
from collections import deque

import imagehash
from PIL import Image
from tqdm import tqdm


def find_similar_images(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Hash every image in base_dir and flag files whose hash matches,
    or is close to, a recently seen one."""
    snapshots_files = sorted(os.listdir(base_dir))
    hash_dict = {}
    hash_queue = deque([], maxlen=queue_len)
    duplicates = []
    num_duplicates = 0
    print("---" * 5, "Finding similar files", "---" * 5)
    with tqdm(snapshots_files) as t:
        for file in t:
            with Image.open(os.path.join(base_dir, file)) as image:
                comp_hash = hashfunc(image, hash_size=hash_size)
            duplicate = False
            if comp_hash not in hash_dict:
                hash_dict[comp_hash] = file
                # Compare with the last `queue_len` hashes to find potential duplicates
                for img_hash in hash_queue:
                    if img_hash - comp_hash <= threshold:
                        duplicate = True
                        break
                if not duplicate:
                    hash_queue.append(comp_hash)
            else:
                # Exact hash match: duplicate of an already-seen file
                duplicate = True
            if duplicate:
                duplicates.append(file)
                num_duplicates += 1
                t.set_postfix_str(f"Duplicate files: {num_duplicates}")
    return hash_dict, duplicates


def remove_duplicates(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Find near-duplicate images in base_dir and delete them from disk."""
    _, duplicates = find_similar_images(
        base_dir,
        hash_size=hash_size,
        hashfunc=hashfunc,
        queue_len=queue_len,
        threshold=threshold,
    )
    if not duplicates:
        print("No duplicates found!")
    else:
        print("Removing duplicates...")
        for dup_file in duplicates:
            file_path = os.path.join(base_dir, dup_file)
            if os.path.exists(file_path):
                os.remove(file_path)
            else:
                print("Filepath:", file_path, "does not exist.")
        print("All duplicates removed!")
    print("***" * 10, "\n")


if __name__ == "__main__":
    remove_duplicates("sample_1")