kawa committed on
Commit
b357b2b
1 Parent(s): cb7704a
app.py ADDED
@@ -0,0 +1,87 @@
+ from PIL import Image
+ import gradio as gr
+
+ import torch
+ from transformers import CLIPProcessor, CLIPModel
+
+ device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device=device)
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
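+ # Zero-shot classification helper: scores the image against each comma-separated
+ # label and turns the image-text logits into probabilities with a softmax.
+ # Note: this function is defined but not wired into the demo UI below.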
+ def predict(text, image):
+
+     classes = text2classes(text)
+     inputs = processor(text=classes, images=image, return_tensors="pt", padding=True)
+
+     outputs = model(**inputs)
+     logits_per_image = outputs.logits_per_image  # image-text similarity score
+     probs = logits_per_image.softmax(dim=1)
+
+     results = {}
+     for i, label in enumerate(classes):
+         results[label] = float(probs.detach().numpy()[0, i])
+
+     return results
+
+
+ def text2classes(text: str):
+     # Split a comma-separated prompt into individual, whitespace-trimmed class labels.
+     classes = [cls.strip() for cls in text.lower().split(',')]
+     return classes
+
+
+ # Currently unused helper: keeps a running, de-duplicated list of class labels.
+ classes = []
+
+ def addClass(text: str):
+     global classes
+     if len(text) > 0:
+         classes.append(text)
+         classes = list(set(classes))
+
+     overview = ''
+     for cls in classes:
+         overview += cls + '; '
+
+     return overview
+
+
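+ # Image-to-image comparison: embed both images with CLIP's image encoder and
+ # report the cosine similarity of the two embeddings.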
+ def similarity(image1, image2):
+
+     inputs = processor(images=image1, return_tensors="pt")
+     features1 = model.get_image_features(**inputs)
+
+     inputs = processor(images=image2, return_tensors="pt")
+     features2 = model.get_image_features(**inputs)
+
+     similarity_measure = torch.nn.functional.cosine_similarity(features1, features2, dim=-1).detach().numpy()
+
+     result = {}
+     result['Similarity'] = float(similarity_measure)
+
+     return result
+
+
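+ # Illustrative sketch of the text-to-text comparison mentioned in the Markdown below
+ # (not wired into the UI): embed both prompts with CLIP's text encoder and compare
+ # them with cosine similarity, mirroring what similarity() does for images. The
+ # helper name text_similarity is only a suggestion.
+ def text_similarity(text1, text2):
+     inputs = processor(text=[text1, text2], return_tensors="pt", padding=True)
+     features = model.get_text_features(**inputs)
+     return float(torch.nn.functional.cosine_similarity(features[0:1], features[1:2], dim=-1))
+
+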
+ with gr.Blocks() as clip_demo:
+
+     gr.Markdown('# Similarity Clip')
+     gr.Markdown("""This is a demo to show the potential of image embeddings with CLIP. Three takeaways:
+     * CLIP combines a text encoder and an image encoder in a unified embedding space.
+     * CLIP embeddings can be used to compare two images, two text prompts, or a text prompt to an image.
+     * This enables, for example, zero-shot classification and image retrieval.
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             limage = gr.Image(type='pil')
+
+         with gr.Column():
+             rimage = gr.Image(type='pil')
+
+     predictionButton = gr.Button('Predict')
+     labels = gr.Label()
+
+     gr.Examples([['./examples/elephants.jpg', './examples/bush_elephant.jpg'],
+                  ['./examples/elephants.jpg', './examples/elephant_skeleton.jpg'],
+                  # ['./examples/elephants.jpg', './examples/rembrandt.jpg']
+                  ], inputs=[limage, rimage])
+
+     # event handler: the button reports the cosine similarity of the two images
+     predictionButton.click(fn=similarity, inputs=[limage, rimage], outputs=labels)
+
+ clip_demo.launch()
examples/bush_elephant.jpg ADDED
examples/elephant_skeleton.jpg ADDED
examples/elephants.jpg ADDED