File size: 2,122 Bytes
e086001
 
 
 
e0cedf5
e086001
 
 
 
 
 
 
 
 
 
 
 
 
 
e0cedf5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e086001
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import imagehash
import os
from collections import deque
from PIL import Image
from tqdm import tqdm


def find_similar_images(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Scan ``base_dir`` in sorted filename order and flag (near-)duplicate images.

    Each file is hashed with ``hashfunc``. A file is reported as a duplicate
    when either:

    * its hash is identical to one already seen anywhere earlier in the
      scan, or
    * its hash differs by at most ``threshold`` from one of the last
      ``queue_len`` hashes that were accepted as unique (a sliding window,
      so only images close together in sort order are compared fuzzily).

    Args:
        base_dir: Directory whose entries are scanned (non-recursive).
        hash_size: Forwarded to ``hashfunc`` as the ``hash_size`` keyword.
        hashfunc: Callable ``hashfunc(image, hash_size=...)`` returning a
            hashable value that supports ``-`` between two hashes (for
            ``imagehash`` hashes this is the Hamming distance).
        queue_len: Maximum number of recent unique hashes kept for the
            fuzzy comparison.
        threshold: Largest hash difference still treated as "similar".

    Returns:
        A ``(hash_dict, duplicates)`` tuple. ``hash_dict`` maps every
        distinct hash to the first file name that produced it (this includes
        files that were then flagged as near-duplicates via the window).
        ``duplicates`` is the list of file names flagged as duplicates, in
        scan order.
    """
    snapshots_files = sorted(os.listdir(base_dir))

    hash_dict = {}
    hash_queue = deque([], maxlen=queue_len)
    duplicates = []

    print("---" * 5, "Finding similar files", "---" * 5)

    with tqdm(snapshots_files) as t:
        for file in t:
            file_path = os.path.join(base_dir, file)

            # os.listdir also returns sub-directories; they cannot be opened
            # as images, so skip them instead of aborting the whole scan.
            if not os.path.isfile(file_path):
                continue

            # Close the image as soon as it is hashed so no file handle stays
            # open (an open handle can block a later delete on some systems).
            with Image.open(file_path) as read_file:
                comp_hash = hashfunc(read_file, hash_size=hash_size)

            if comp_hash in hash_dict:
                # Exact hash match with any earlier file.
                duplicate = True
            else:
                hash_dict[comp_hash] = file
                # Fuzzy match against the recent unique hashes only.
                duplicate = any(
                    img_hash - comp_hash <= threshold for img_hash in hash_queue
                )
                if not duplicate:
                    hash_queue.append(comp_hash)

            if duplicate:
                duplicates.append(file)
                t.set_postfix_str(f"Duplicate files: {len(duplicates)}")

    return hash_dict, duplicates


def remove_duplicates(
    base_dir, hash_size=8, hashfunc=imagehash.dhash, queue_len=5, threshold=4
):
    """Delete every file in ``base_dir`` that ``find_similar_images`` flags.

    All arguments are forwarded unchanged to ``find_similar_images``; see
    that function for their meaning. Files reported as duplicates are
    permanently removed with ``os.remove``. Progress and a closing separator
    line are printed to stdout.

    Returns:
        None.
    """
    _, duplicates = find_similar_images(
        base_dir,
        hash_size=hash_size,
        hashfunc=hashfunc,
        queue_len=queue_len,
        threshold=threshold,
    )

    if not duplicates:
        print("No duplicates found!")
    else:
        print("Removing duplicates...")

        for dup_file in duplicates:
            file_path = os.path.join(base_dir, dup_file)

            # Attempt the delete directly rather than checking existence
            # first: the file may vanish between a check and the removal.
            try:
                os.remove(file_path)
            except FileNotFoundError:
                print("Filepath: ", file_path, "does not exists.")

        print("All duplicates removed!")

    print("***" * 10, "\n")


# Script entry point: de-duplicate the "sample_1" directory using the default
# hashing settings. NOTE: this permanently deletes the files flagged as
# duplicates.
if __name__ == "__main__":
    remove_duplicates("sample_1")