File size: 2,573 Bytes
f457390
7f0b913
 
 
 
 
 
f457390
7f0b913
 
 
 
de33293
f457390
 
7f53b0a
f457390
 
 
 
624ee8e
de33293
 
fa2cb47
a96ef2d
de33293
 
 
 
 
 
 
 
 
 
 
 
18cc9d1
de33293
 
 
 
 
a96ef2d
4ca3402
a96ef2d
 
 
952a07a
f2dda80
1e90462
 
 
 
 
 
 
 
931456e
f2dda80
 
2f24ca3
 
 
f2dda80
 
 
 
a96ef2d
 
4ca3402
a96ef2d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import gradio as gr
import os
import skimage
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from collections import OrderedDict
import torch
from imagebind import data
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
import torch.nn as nn
import pickle


# Inference device. Pinned to CPU; the CUDA auto-selection is kept for reference:
#   "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"

# Load the pretrained ImageBind model once at import time and switch it to
# evaluation mode so every request reuses the same instance.
model = imagebind_model.imagebind_huge(pretrained=True)
model.eval()
model.to(device)

# Precomputed gallery data shipped with the app.
# NOTE(security): pickle.load can execute arbitrary code; these files must only
# ever be trusted, locally bundled assets.
# `image_features` is used by generate_image via `.cpu().numpy()`, so it is
# presumably a torch tensor of already-normalised image embeddings (the file
# name suggests so) -- TODO confirm. `image_paths` is indexed by the position
# of the best-matching embedding.
with open("./assets/image_features_norm_2.pkl", "rb") as features_file:
    image_features = pickle.load(features_file)
with open("./assets/image_paths.pkl", "rb") as paths_file:
    image_paths = pickle.load(paths_file)


def generate_image(text):
    """Return the gallery image whose stored embedding best matches *text*.

    The text is embedded with the module-level ImageBind ``model``, scaled to
    unit length along its last dimension, and scored against the module-level
    ``image_features`` with a matrix product. The highest-scoring entry of
    ``image_paths`` selects the file opened from ``./assets/images``.

    Args:
        text: The query string typed by the user.

    Returns:
        The best-matching image as an RGB ``PIL.Image.Image``.
    """
    text_batch = data.load_and_transform_text([text], device)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        outputs = model({ModalityType.TEXT: text_batch})

    query = outputs[ModalityType.TEXT]
    # In-place division by the L2 norm. The product below is a cosine
    # similarity only if the stored image features are unit-length as well
    # (presumably so, given the asset's file name -- confirm).
    query.div_(query.norm(dim=-1, keepdim=True))

    scores = np.matmul(query.cpu().numpy(), image_features.cpu().numpy().T)

    # argmax works on the flattened score array; with a single query row this
    # is assumed to coincide with the image index -- verify the feature shape.
    best = scores.argmax()
    file_name = os.path.basename(image_paths[best])
    matched = Image.open(f"./assets/images/{file_name}")

    return matched.convert("RGB")


# Gradio UI: a single text box in, a single image out, backed by generate_image.
iface = gr.Interface(
    fn=generate_image,
    inputs="text",
    outputs="image",
    # Each example row is [prompt, image path]. Only one input component is
    # declared, so the second column has no matching input; it looks like the
    # expected result for that prompt -- confirm how Gradio treats it.
    examples=[
        ["a page of text about segmentation", "assets/images/page.png"],
        ["a facial photo of a tabby cat", "assets/images/chelsea.png"],
        ["a portrait of an astronaut with the American flag", "assets/images/astronaut.png"],
        ["a rocket standing on a launchpad", "assets/images/rocket.png"],
        ["a red motorcycle standing in a garage", "assets/images/motorcycle_right.png"],
        ["a person looking at a camera on a tripod", "assets/images/camera.png"],
        ["a black-and-white silhouette of a horse", "assets/images/horse.png"],
        ["a cup of coffee on a saucer", "assets/images/coffee.png"],
    ],
    title="Find the image most similar to the given text",
    # The paragraph is now closed with </p>; the original ended with a second
    # opening <p>, leaving the markup unbalanced.
    description='''<p>
    Welcome to a straightforward demonstration of ImageBind. 
    This simple demo is designed to find the image most similar to a given text 
    using cosine similarity. For a comprehensive 
    understanding of its capabilities, we encourage you to explore the original research <a href='https://arxiv.org/abs/2305.05665' target='_blank'>paper</a> 
    and visit the <a href='https://github.com/facebookresearch/ImageBind' target='_blank'>repository</a>  
    for more in-depth information.</p>
    '''
)


# Start the web server when the script runs.
iface.launch()