from operator import itemgetter

import numpy as np
from tabulate import tabulate

from cliponnx.models import TextualModel, VisualModel

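# Cosine similarity of two embedding vectors: their dot product divided by
# the product of their Euclidean norms.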
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

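# ONNX Runtime execution providers, tried in order. CPUExecutionProvider works
# everywhere; with a GPU build of onnxruntime, CUDAExecutionProvider could be
# listed first instead.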
providers = ['CPUExecutionProvider']

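# Image files to embed (paths relative to the working directory).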
images = [
    "flowers.jpg",
    "heavy-industry.jpg",
]

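# Candidate captions to score against each image; "plant" vs. "processing
# plant" probes how CLIP disambiguates the two senses.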
texts = [
    "a close up photo of a cherry blossom",
    "cherry blossom",
    "flowers",
    "plant",
    "processing plant",
    "a large industrial plant with many pipes, walkways and railings",
    "ruhrgebiet",
    "industry",
    "a photo taken on a bright and sunny day",
    "a photo taken on a dark and cloudy day",
    "a photo taken at midnight",
    "bees",
    "cars",
    "dogs and cats",
]

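# Load the visual model, preprocess the images into one input batch, and
# encode the batch into one embedding vector per image.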
visual = VisualModel("models/clip-vit-base-patch32-visual-float16.onnx", providers=providers)
images_input = visual.preprocess_images(images)
print(f"Images shape: {images_input.shape}")
image_embeddings = visual.encode(images_input)
print(f"Embeddings shape: {image_embeddings.shape}")
print()

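# Same for the text side: tokenize the prompts into one batch and encode them
# into the same embedding space as the images.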
textual = TextualModel("models/clip-vit-base-patch32-textual-float16.onnx", providers=providers)
texts_input = textual.tokenize(texts)
print(f"Texts shape: {texts_input.shape}")
text_embeddings = textual.encode(texts_input)
print(f"Embeddings shape: {text_embeddings.shape}")
print()

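# For each image, score every caption, then print the captions ranked by
# descending cosine similarity, with a crude ASCII bar per row.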
for ii, image in enumerate(images):
    image_embedding = image_embeddings[ii]

    similarities = []
    for ti, text in enumerate(texts):
        text_embedding = text_embeddings[ti]
        similarity = cosine_similarity(image_embedding, text_embedding)
        similarities.append([similarity, ">" * int(similarity * 30), text])

    similarities.sort(reverse=True, key=itemgetter(0))
    print(image)
    print(tabulate(similarities, headers=["similarity", "bar chart", "text"]))
    print()