# Acknowledgments:
# This project is inspired by:
# 1. https://github.com/haltakov/natural-language-image-search by Vladimir Haltakov
# 2. OpenAI's CLIP
# Import all the necessary libraries
import torch
import requests
import numpy as np
import pandas as pd
import gradio as gr
from io import BytesIO
from PIL import Image as PILIMAGE
from transformers import CLIPModel, CLIPTokenizer
# Selecting device based on availability of GPUs
device = "cuda" if torch.cuda.is_available() else "cpu"
# Defining the model and tokenizer (the processor is not needed for text-only queries)
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
# Loading the data
photos = pd.read_csv("./photos_debug.tsv000", sep='\t', header=0)
photo_features = np.load("./features_debug.npy")
photo_ids = pd.read_csv("./photo_ids_debug.csv")
photo_ids = list(photo_ids['photo_id'])
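# Note: photo_features is assumed to hold one pre-computed, L2-normalized CLIP
# embedding per photo, row-aligned with photo_ids (as in the
# natural-language-image-search repo this project is based on).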
def find_best_matches(text):
    """Return the top three Unsplash images whose CLIP features best match the text query."""
    # Inference
    with torch.no_grad():
        # Encode the description using CLIP, on the same device as the model
        inputs = tokenizer([text], padding=True, return_tensors="pt").to(device)
        text_encoded = model.get_text_features(**inputs).detach().cpu().numpy()
    # Normalize the text embedding so the dot product below is a cosine similarity
    text_encoded = text_encoded / np.linalg.norm(text_encoded, axis=1, keepdims=True)
    # Cosine similarity between the query and every photo feature
    similarities = list((text_encoded @ photo_features.T).squeeze(0))
    # Block of code for displaying the top 3 best matches (images)
    matched_images = []
    best_photos = sorted(zip(similarities, range(photo_features.shape[0])), key=lambda x: x[0], reverse=True)[:3]
    for _, idx in best_photos:
        photo_id = photo_ids[idx]
        photo_data = photos[photos["photo_id"] == photo_id].iloc[0]
        # Download a 640px-wide version of the photo from Unsplash
        response = requests.get(photo_data["photo_image_url"] + "?w=640")
        img = PILIMAGE.open(BytesIO(response.content))
        matched_images.append(img)
    return matched_images
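# Hypothetical local sanity check (kept commented out so the Space only launches the UI;
# it would fetch three images from Unsplash and print their sizes):
# for img in find_best_matches("Happy puppy"):
#     print(img.size)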
# Gradio app
# Note: gr.inputs/gr.outputs (including Carousel), enable_queue, and the "grass"
# theme string are legacy Gradio APIs; the equivalents below assume Gradio >= 3,
# where a Gallery output replaces the removed Carousel.
iface = gr.Interface(
    fn=find_best_matches,
    inputs=gr.Textbox(lines=1, label="Text query", placeholder="Introduce the search text..."),
    outputs=gr.Gallery(label="Best matches"),
    examples=[["Dog sticking its tongue out"], ["Traffic light on the right"],
              ["Honey bee eating honey"], ["Leaves of Bryophyllum fallen on the ground"],
              ["Cute Kangaroo"], ["Athlete holding a bike in his hands"],
              ["Happy puppy"], ["Sad puppy"], ["Leopard hiding in the bushes"]],
    title="Text to Image search using CLIP",
    description="This application displays the TOP THREE images from the Unsplash dataset that best match the natural language search query provided by the user.")
iface.queue().launch()