# clip-variants / example.py

from operator import itemgetter
import numpy as np
from tabulate import tabulate
from cliponnx.models import TextualModel, VisualModel
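

# Cosine similarity: dot product of the two vectors divided by the product of their norms.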
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# With GPU (slower startup, faster inference with supported cards)
# providers = ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
# CPU only (faster startup, slower inference)
providers = ['CPUExecutionProvider']
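
# Example inputs: two local image files and a list of text prompts to rank against each image.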
images = [
"flowers.jpg",
"heavy-industry.jpg",
]
texts = [
"a close up photo of a cherry blossom",
"cherry blossom",
"flowers",
"plant",
"processing plant",
"a large industrial plant with many pipes, walkways and railings",
"ruhrgebiet",
"industry",
"a photo taken on a bright and sunny day",
"a photo taken on a dark and cloudy day",
"a photo taken at midnight",
"bees",
"cars",
"dogs and cats",
]
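
# Load the ONNX visual encoder, preprocess the images into a batch tensor,
# and compute one embedding per image.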
visual = VisualModel("models/clip-vit-base-patch32-visual-float16.onnx", providers=providers)
images_input = visual.preprocess_images(images)
print(f"Images shape: {images_input.shape}")
image_embeddings = visual.encode(images_input)
print(f"Embeddings shape: {image_embeddings.shape}")
print()
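
# Load the ONNX textual encoder, tokenize the prompts, and compute one embedding per prompt.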
textual = TextualModel("models/clip-vit-base-patch32-textual-float16.onnx", providers=providers)
texts_input = textual.tokenize(texts)
print(f"Texts shape: {texts_input.shape}")
text_embeddings = textual.encode(texts_input)
print(f"Embeddings shape: {text_embeddings.shape}")
print()
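
# For each image, rank all prompts by cosine similarity and print them as a table
# with a simple ">" bar for quick visual comparison.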
for ii, image in enumerate(images):
    image_embedding = image_embeddings[ii]
    similarities = []
    for ti, text in enumerate(texts):
        text_embedding = text_embeddings[ti]
        similarity = cosine_similarity(image_embedding, text_embedding)
        similarities.append([similarity, ">" * int(similarity * 30), text])
    similarities.sort(reverse=True, key=itemgetter(0))
    print(image)
    print(tabulate(similarities, headers=["similarity", "bar chart", "text"]))
    print()