# Acknowledgments:
# This project is inspired by:
# 1. https://github.com/haltakov/natural-language-image-search by Vladimir Haltakov
# 2. OpenAI's CLIP

# Import all the necessary libraries
import torch
import requests
import numpy as np
import pandas as pd
import gradio as gr
from io import BytesIO
from PIL import Image as PILIMAGE
from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer

# Select the device based on GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define the model, processor and tokenizer
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# Load the data: photo metadata, precomputed image features and their photo ids
photos = pd.read_csv("./photos_debug.tsv000", sep="\t", header=0)
photo_features = np.load("./features_debug.npy")
photo_ids = pd.read_csv("./photo_ids_debug.csv")
photo_ids = list(photo_ids["photo_id"])


def find_best_matches(text):
    # Inference
    with torch.no_grad():
        # Encode and normalize the text query using CLIP
        inputs = tokenizer([text], padding=True, return_tensors="pt").to(device)
        text_encoded = model.get_text_features(**inputs)
        text_encoded = text_encoded / text_encoded.norm(dim=-1, keepdim=True)
        text_encoded = text_encoded.cpu().numpy()

    # Cosine similarity between the text features and every precomputed image feature
    similarities = list((text_encoded @ photo_features.T).squeeze(0))

    # Retrieve and download the top 3 best-matching images
    ranked = sorted(zip(similarities, range(photo_features.shape[0])),
                    key=lambda x: x[0], reverse=True)
    matched_images = []
    for i in range(3):
        idx = ranked[i][1]
        photo_id = photo_ids[idx]
        photo_data = photos[photos["photo_id"] == photo_id].iloc[0]
        response = requests.get(photo_data["photo_image_url"] + "?w=640")
        img = PILIMAGE.open(BytesIO(response.content))
        matched_images.append(img)
    return matched_images


# Gradio app
iface = gr.Interface(
    fn=find_best_matches,
    inputs=[gr.inputs.Textbox(lines=1, label="Text query", placeholder="Enter the search text...")],
    examples=[
        ["Dog sticking its tongue out"],
        ["Traffic light on the right"],
        ["Honey bee eating honey"],
        ["Leaves of Bryophyllum fallen on the ground"],
        ["Cute Kangaroo"],
        ["Athlete holding a bike in his hands"],
        ["Happy puppy"],
        ["Sad puppy"],
        ["Leopard hiding in the bushes"],
    ],
    theme="grass",
    outputs=gr.outputs.Carousel([gr.outputs.Image(type="pil")]),
    enable_queue=True,
    title="Text to Image search using CLIP",
    description="This application displays the TOP THREE images from the Unsplash dataset that best match the natural language search query provided by the user.",
).launch()
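
# Note: photo_features above is a file of precomputed CLIP image embeddings loaded
# from disk. A minimal sketch of how such a file could be generated is given below.
# It is not called by the app; the function name, batch size and image-path argument
# are illustrative assumptions, not part of the original pipeline.
def precompute_photo_features(image_paths, batch_size=32):
    """Hypothetical helper: encode local images with CLIP and return an
    (N, feature_dim) numpy array of L2-normalized image features."""
    all_features = []
    for start in range(0, len(image_paths), batch_size):
        batch = [PILIMAGE.open(p).convert("RGB") for p in image_paths[start:start + batch_size]]
        batch_inputs = processor(images=batch, return_tensors="pt").to(device)
        with torch.no_grad():
            features = model.get_image_features(**batch_inputs)
        features = features / features.norm(dim=-1, keepdim=True)
        all_features.append(features.cpu().numpy())
    return np.concatenate(all_features)
    # Example (assumed paths): np.save("./features_debug.npy", precompute_photo_features(paths))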