matheus-erthal commited on
Commit
daa1246
1 Parent(s): 064bf02

Initial commit

Browse files
Files changed (3) hide show
  1. app.py +11 -0
  2. image_similarity.py +57 -0
  3. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

from image_similarity import ImageSimilarity


def greet(name):
    """Return a greeting string for *name*."""
    return "Hello " + name + "!!"


def image_similarity(images):
    """Run duplicate / near-duplicate detection over *images*.

    Parameters
    ----------
    images : sequence of str
        Paths of the image files to compare.

    Returns
    -------
    Whatever ``ImageSimilarity.check`` produces for the given images.
    (Bug fix: the original assigned the result to a local variable and
    returned ``None``, so callers could never see the output.)
    """
    return ImageSimilarity(1).check(images)


# NOTE(review): the interface is wired to `greet`, not `image_similarity` —
# presumably a placeholder while the similarity UI is built; confirm intent.
iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()
image_similarity.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer, util
2
+ from PIL import Image
3
+ # import glob
4
+
5
class ImageSimilarity:
    """Find duplicate and near-duplicate images via CLIP embeddings.

    Images are encoded with the OpenAI CLIP model (through
    sentence-transformers) and compared pairwise by cosine similarity.
    """

    def __init__(self, minimum_commutative_image_diff):
        # NOTE(review): this value is currently unused by check();
        # kept for backward compatibility — confirm its intended use.
        self.minimum_commutative_image_diff = minimum_commutative_image_diff

    def check(self, image_names):
        """Compare the given image files pairwise and report duplicates.

        Parameters
        ----------
        image_names : sequence of str
            Paths of the image files to compare.

        Returns
        -------
        dict
            ``{'duplicates': [...], 'near_duplicates': [...]}`` where each
            entry is a ``(score, image_id1, image_id2)`` triplet sorted by
            decreasing score; ids index into *image_names*.
            (Bug fix: the original printed results but returned ``None``,
            so the caller in app.py could not use them.)
        """
        # Load the OpenAI CLIP model (downloaded/cached by sentence-transformers).
        print('Loading CLIP Model...')
        model = SentenceTransformer('clip-ViT-B-32')

        # Encode all images in one batched call.
        print("Images:", len(image_names))
        encoded_image = model.encode(
            [Image.open(filepath) for filepath in image_names],
            batch_size=128,
            convert_to_tensor=True,
            show_progress_bar=True,
        )

        # Pairwise cosine similarity over all embeddings; returns triplets
        # (score, image_id1, image_id2) sorted in decreasing order of score.
        processed_images = util.paraphrase_mining_embeddings(encoded_image)
        NUM_SIMILAR_IMAGES = 10  # how many pairs to print per category

        # =================
        # DUPLICATES
        # =================
        print('Finding duplicate images...')
        # A true duplicate scores 1.00; >= 0.999 tolerates lossy (.jpg) compression.
        duplicates = [image for image in processed_images if image[0] >= 0.999]

        # Print the top X duplicate pairs.
        for score, image_id1, image_id2 in duplicates[0:NUM_SIMILAR_IMAGES]:
            print("\nScore: {:.3f}%".format(score * 100))
            print(image_names[image_id1])
            print(image_names[image_id2])

        # =================
        # NEAR DUPLICATES
        # =================
        print('Finding near duplicate images...')
        # Threshold in (0, 1); lower values admit less-similar pairs.
        # NOTE(review): pairs scoring in [0.99, 0.999) land in neither bucket —
        # confirm whether that gap is intentional.
        threshold = 0.99
        near_duplicates = [image for image in processed_images if image[0] < threshold]

        for score, image_id1, image_id2 in near_duplicates[0:NUM_SIMILAR_IMAGES]:
            print("\nScore: {:.3f}%".format(score * 100))
            print(image_names[image_id1])
            print(image_names[image_id2])

        # Bug fix: return the computed results instead of None so callers
        # (e.g. app.py's image_similarity) can consume them.
        return {
            'duplicates': duplicates,
            'near_duplicates': near_duplicates,
        }
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ sentence_transformers==2.2.2
+ gradio
+ Pillow