video2slide / post_process.py
dragonSwing's picture
Use tqdm for processing
e0cedf5
raw
history blame contribute delete
No virus
2.12 kB
import imagehash
import os
from collections import deque
from PIL import Image
from tqdm import tqdm
def find_similar_images(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Scan ``base_dir`` for images that duplicate an earlier image.

    Entries of ``base_dir`` are visited in sorted name order and each one is
    opened as an image. A file is reported as a duplicate when either:

    * its hash equals the hash of any earlier file, or
    * its hash differs by at most ``threshold`` from one of the last
      ``queue_len`` hashes that were kept as non-duplicates.

    Args:
        base_dir: Directory to scan; every entry is passed to ``Image.open``.
        hash_size: Forwarded to ``hashfunc`` as the ``hash_size`` keyword.
        hashfunc: Callable ``(image, hash_size=...)`` returning a hashable
            value whose subtraction yields a distance (for ``imagehash``
            hashes this is the Hamming distance).
        queue_len: How many of the most recent non-duplicate hashes each new
            hash is compared against.
        threshold: Largest distance at which two hashes count as similar.

    Returns:
        A tuple ``(hash_dict, duplicates)``. ``hash_dict`` maps every distinct
        hash to the first file name that produced it (this includes files
        that were subsequently flagged as near-duplicates). ``duplicates`` is
        the list of duplicate file names in visiting order.
    """
    snapshots_files = sorted(os.listdir(base_dir))

    hash_dict = {}
    # Bounded window: only the most recent kept hashes are compared, so the
    # oldest entry is dropped automatically once ``queue_len`` is reached.
    hash_queue = deque([], maxlen=queue_len)
    duplicates = []

    print("---" * 5, "Finding similar files", "---" * 5)

    with tqdm(snapshots_files) as t:
        for file in t:
            # Hash inside a context manager so the underlying file handle is
            # released immediately instead of lingering until garbage
            # collection (which can exhaust descriptors on large folders).
            with Image.open(os.path.join(base_dir, file)) as read_file:
                comp_hash = hashfunc(read_file, hash_size=hash_size)

            if comp_hash in hash_dict:
                # Exact hash already seen earlier in the scan.
                duplicate = True
            else:
                # Record the hash before the similarity check so that later
                # files with the identical hash are caught by the branch above.
                hash_dict[comp_hash] = file
                # Compare with hash queue to find out potential duplicates
                duplicate = any(
                    img_hash - comp_hash <= threshold for img_hash in hash_queue
                )
                if not duplicate:
                    hash_queue.append(comp_hash)

            if duplicate:
                duplicates.append(file)
                t.set_postfix_str(f"Duplicate files: {len(duplicates)}")

    return hash_dict, duplicates
def remove_duplicates(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Delete from ``base_dir`` every file reported as a duplicate image.

    The duplicate list comes from ``find_similar_images`` called with the same
    arguments. Progress and a closing separator line are printed to stdout.

    Args:
        base_dir: Directory containing the images to de-duplicate.
        hash_size: Forwarded to ``find_similar_images``.
        hashfunc: Forwarded to ``find_similar_images``.
        queue_len: Forwarded to ``find_similar_images``.
        threshold: Forwarded to ``find_similar_images``.

    Returns:
        None. Files are removed from disk as a side effect.
    """
    _, duplicates = find_similar_images(
        base_dir,
        hash_size=hash_size,
        hashfunc=hashfunc,
        queue_len=queue_len,
        threshold=threshold,
    )

    if not duplicates:
        print("No duplicates found!")
    else:
        print("Removing duplicates...")

        for dup_file in duplicates:
            file_path = os.path.join(base_dir, dup_file)

            # Attempt the removal directly rather than checking existence
            # first: the file could vanish between the check and the delete.
            try:
                os.remove(file_path)
            except FileNotFoundError:
                print("Filepath: ", file_path, "does not exists.")
        print("All duplicates removed!")

    print("***" * 10, "\n")
if __name__ == "__main__":
    # When run as a script, de-duplicate the "sample_1" directory (resolved
    # relative to the current working directory) with the default settings.
    sample_dir = "sample_1"
    remove_duplicates(sample_dir)