from sentence_transformers import SentenceTransformer, util
from PIL import Image
import glob
import torch
import pickle
import zipfile
import os
from tqdm.autonotebook import tqdm
import gradio as gr

# Here we load the multilingual CLIP model. Note: this model can only encode text.
# If you need embeddings for images, you must load the 'clip-ViT-B-32' model.
model = SentenceTransformer('clip-ViT-B-32-multilingual-v1')

# Next, we get about 25k images from Unsplash
img_folder = 'photos/'
if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
    os.makedirs(img_folder, exist_ok=True)

    photo_filename = 'unsplash-25k-photos.zip'
    if not os.path.exists(photo_filename):  # Download the dataset if it does not exist
        util.http_get('http://sbert.net/datasets/' + photo_filename, photo_filename)

    # Extract all images
    with zipfile.ZipFile(photo_filename, 'r') as zf:
        for member in tqdm(zf.infolist(), desc='Extracting'):
            zf.extract(member, img_folder)

# Now we need to compute the embeddings.
# To speed things up, we distribute pre-computed embeddings.
# Otherwise you can also encode the images yourself.
# To encode an image with the (non-multilingual) CLIP model, you can use:
# from PIL import Image
# img_emb = img_model.encode(Image.open(filepath))
use_precomputed_embeddings = True

if use_precomputed_embeddings:
    emb_filename = 'unsplash-25k-photos-embeddings.pkl'
    if not os.path.exists(emb_filename):  # Download the embeddings if they do not exist
        util.http_get('http://sbert.net/datasets/' + emb_filename, emb_filename)

    with open(emb_filename, 'rb') as fIn:
        img_names, img_emb = pickle.load(fIn)
    print("Images:", len(img_names))
else:
    # For embedding images, we need the non-multilingual CLIP model
    img_model = SentenceTransformer('clip-ViT-B-32')

    img_names = list(glob.glob('photos/*.jpg'))
    print("Images:", len(img_names))
    img_emb = img_model.encode([Image.open(filepath) for filepath in img_names],
                               batch_size=128, convert_to_tensor=True, show_progress_bar=True)

# Sanity check: re-encode the first image with the image CLIP model.
# one_emb is the embedding of the first image taken from img_emb,
# comb_emb is a freshly computed embedding of the same image.
filepath = 'photos/' + img_names[0]
one_emb = torch.tensor(img_emb[0])
img_model = SentenceTransformer('clip-ViT-B-32')
comb_emb = img_model.encode(Image.open(filepath), convert_to_tensor=True).cpu()

# Next, we define a search function.
def search(query):
    # First, we encode the query. The multilingual model can only encode text,
    # so the query is expected to be a text string.
    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

    # Then, we use the util.semantic_search function, which computes the cosine
    # similarity between the query embedding and all image embeddings.
    # It returns the top_k highest ranked images; each hit is a dict with
    # 'corpus_id' (the index into img_names) and 'score'.
    hits = util.semantic_search(query_emb, img_emb, top_k=1)[0]

    # Output the path of the best matching image
    return os.path.join(img_folder, img_names[hits[0]['corpus_id']])
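
# The semantic_search call above retrieves only the single best image (top_k=1),
# but it supports larger top_k values as well. Below is a minimal sketch of a
# multi-result variant; search_top_k is just an illustrative name and is not
# wired into the interface defined further down.
def search_top_k(query, k=3):
    # Encode the text query with the multilingual model
    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)
    # Retrieve the k best matching images by cosine similarity
    hits = util.semantic_search(query_emb, img_emb, top_k=k)[0]
    return [os.path.join(img_folder, img_names[hit['corpus_id']]) for hit in hits]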

title = "Image Search"
description = "Demo for multilingual text-to-image search in 50+ languages. To use it, simply enter your text or click one of the examples to load it. Read more at the links below."
article = "[SentenceTransformers Documentation](https://www.sbert.net) | [Github Repo](https://github.com/UKPLab/sentence-transformers)"

gr.Interface(
    search,
    gr.inputs.Textbox(label="Input"),
    gr.outputs.Image(type="file", label="Output"),
    title=title,
    description=description,
    article=article,
    examples=[
        ['Two dogs playing in the snow'],
        ['Eine Katze auf einem Stuhl'],  # German: "A cat on a chair"
        ['Muchos peces'],                # Spanish: "Many fish"
        ['棕榈树的沙滩'],                 # Chinese: "A beach with palm trees"
        ['Закат на пляже'],              # Russian: "Sunset on the beach"
        ['Parkta bir köpek'],            # Turkish: "A dog in the park"
        ['夜のニューヨーク'],             # Japanese: "New York at night"
    ]
).launch()