import os
from collections import deque

import imagehash
from PIL import Image
from tqdm import tqdm


def find_similar_images(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Scan *base_dir* and flag images that look like recently seen ones.

    Directory entries are processed in sorted name order. A file is flagged
    as a duplicate when either:

    * its hash is identical to the hash of any earlier file, or
    * its hash differs by at most ``threshold`` from one of the last
      ``queue_len`` *non-duplicate* hashes (for ``imagehash`` hashes the
      ``-`` operator yields the Hamming distance).

    Args:
        base_dir: Directory whose entries are scanned (not recursive).
        hash_size: Forwarded to ``hashfunc`` as ``hash_size=``.
        hashfunc: Callable ``(image, hash_size=...)`` returning a hashable
            object that supports ``-`` against another hash.
        queue_len: Size of the sliding window of recent unique hashes.
        threshold: Maximum hash difference still treated as "similar".

    Returns:
        A ``(hash_dict, duplicates)`` tuple. ``hash_dict`` maps every
        distinct hash to the first file name that produced it (including
        files that were flagged as near-duplicates via the window);
        ``duplicates`` is the list of flagged file names, in scan order.
    """
    snapshots_files = sorted(os.listdir(base_dir))
    hash_dict = {}
    hash_queue = deque([], maxlen=queue_len)
    duplicates = []

    print("---" * 5, "Finding similar files", "---" * 5)
    with tqdm(snapshots_files) as t:
        for file in t:
            file_path = os.path.join(base_dir, file)

            # Sub-directories (and other non-regular entries) cannot be
            # opened as images; skip them instead of aborting the scan.
            if not os.path.isfile(file_path):
                continue

            try:
                # Pillow opens lazily and keeps the file handle; the context
                # manager guarantees it is released before any later removal.
                with Image.open(file_path) as read_file:
                    comp_hash = hashfunc(read_file, hash_size=hash_size)
            except OSError as err:
                # Covers non-image / truncated files (Pillow's
                # UnidentifiedImageError derives from OSError). Such files
                # are never reported as duplicates, so they are never deleted.
                t.write(f"Skipping unreadable file: {file_path} ({err})")
                continue

            if comp_hash in hash_dict:
                # Exact hash match with any earlier file.
                duplicate = True
            else:
                hash_dict[comp_hash] = file
                # Compare with hash queue to find out potential duplicates.
                duplicate = any(
                    img_hash - comp_hash <= threshold for img_hash in hash_queue
                )
                if not duplicate:
                    # Only unique images enter the comparison window.
                    hash_queue.append(comp_hash)

            if duplicate:
                duplicates.append(file)
                t.set_postfix_str(f"Duplicate files: {len(duplicates)}")

    return hash_dict, duplicates


def remove_duplicates(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Delete every file in *base_dir* flagged by ``find_similar_images``.

    All keyword arguments are forwarded unchanged to
    ``find_similar_images``. Files that have disappeared between the scan
    and the removal are reported and skipped. Progress messages are printed
    to stdout; nothing is returned.
    """
    _, duplicates = find_similar_images(
        base_dir,
        hash_size=hash_size,
        hashfunc=hashfunc,
        queue_len=queue_len,
        threshold=threshold,
    )

    if not duplicates:
        print("No duplicates found!")
    else:
        print("Removing duplicates...")
        for dup_file in duplicates:
            file_path = os.path.join(base_dir, dup_file)
            # EAFP: attempt the removal directly so there is no window
            # between an existence check and the delete.
            try:
                os.remove(file_path)
            except FileNotFoundError:
                print("Filepath: ", file_path, "does not exists.")
        print("All duplicates removed!")
    print("***" * 10, "\n")


if __name__ == "__main__":
    remove_duplicates("sample_1")