kawa committed on
Commit
b357b2b
1 Parent(s): cb7704a
app.py ADDED
@@ -0,0 +1,87 @@
+ from PIL import Image
+ import gradio as gr
+
+ import torch
+ from transformers import CLIPProcessor, CLIPModel
+
+ device = "cpu"  # "cuda" if torch.cuda.is_available() else "cpu"
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device=device)
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
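+ # Zero-shot classification helper: scores the image against each comma-separated
+ # label and turns the image-text logits into probabilities with a softmax.
+ # Note: this function is defined but not wired into the demo UI below.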
+ def predict(text, image):
+
+     classes = text2classes(text)
+     inputs = processor(text=classes, images=image, return_tensors="pt", padding=True)
+
+     outputs = model(**inputs)
+     logits_per_image = outputs.logits_per_image  # image-text similarity score
+     probs = logits_per_image.softmax(dim=1)
+
+     results = {}
+     for i, label in enumerate(classes):
+         results[label] = float(probs.detach().numpy()[0, i])
+
+     return results
+
+
+ def text2classes(text: str):
+     # Split a comma-separated prompt into individual, whitespace-trimmed class labels.
+     classes = [cls.strip() for cls in text.lower().split(',')]
+     return classes
+
+
+ # Currently unused helper: keeps a running, de-duplicated list of class labels.
+ classes = []
+
+ def addClass(text: str):
+     global classes
+     if len(text) > 0:
+         classes.append(text)
+         classes = list(set(classes))
+
+     overview = ''
+     for cls in classes:
+         overview += cls + '; '
+
+     return overview
+
+
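+ # Image-to-image comparison: embed both images with CLIP's image encoder and
+ # report the cosine similarity of the two embeddings.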
+ def similarity(image1, image2):
+
+     inputs = processor(images=image1, return_tensors="pt")
+     features1 = model.get_image_features(**inputs)
+
+     inputs = processor(images=image2, return_tensors="pt")
+     features2 = model.get_image_features(**inputs)
+
+     similarity_measure = torch.nn.functional.cosine_similarity(features1, features2, dim=-1).detach().numpy()
+
+     result = {}
+     result['Similarity'] = float(similarity_measure)
+
+     return result
+
+
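+ # Illustrative sketch of the text-to-text comparison mentioned in the Markdown below
+ # (not wired into the UI): embed both prompts with CLIP's text encoder and compare
+ # them with cosine similarity, mirroring what similarity() does for images. The
+ # helper name text_similarity is only a suggestion.
+ def text_similarity(text1, text2):
+     inputs = processor(text=[text1, text2], return_tensors="pt", padding=True)
+     features = model.get_text_features(**inputs)
+     return float(torch.nn.functional.cosine_similarity(features[0:1], features[1:2], dim=-1))
+
+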
+ with gr.Blocks() as clip_demo:
+
+     gr.Markdown('# Similarity Clip')
+     gr.Markdown("""This is a demo to show the potential of image embeddings with CLIP. Three takeaways:
+     * CLIP combines a text encoder and an image encoder in a unified embedding space.
+     * CLIP embeddings can be used to compare two images, two text prompts, or a text prompt to an image.
+     * This enables, for example, zero-shot classification and image retrieval.
+     """)
+
+     with gr.Row():
+         with gr.Column():
+             limage = gr.Image(type='pil')
+
+         with gr.Column():
+             rimage = gr.Image(type='pil')
+
+     predictionButton = gr.Button('Predict')
+     labels = gr.Label()
+
+     gr.Examples([['./examples/elephants.jpg', './examples/bush_elephant.jpg'],
+                  ['./examples/elephants.jpg', './examples/elephant_skeleton.jpg'],
+                  # ['./examples/elephants.jpg', './examples/rembrandt.jpg']
+                  ], inputs=[limage, rimage])
+
+     # event handler: the button reports the cosine similarity of the two images
+     predictionButton.click(fn=similarity, inputs=[limage, rimage], outputs=labels)
+
+ clip_demo.launch()
examples/bush_elephant.jpg ADDED
examples/elephant_skeleton.jpg ADDED
examples/elephants.jpg ADDED