import glob
import tempfile

import clip
import gradio as gr
import torch
from PIL import Image

# Load both models once at startup instead of on every request.
device = "cuda" if torch.cuda.is_available() else "cpu"
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
clip_model, preprocess = clip.load("ViT-B/32", device=device)


def predict(img, text):
    # Detect objects with YOLOv5 and save the cropped detections to a temp dir.
    results = yolo_model(img)
    dirpath = tempfile.mkdtemp()
    results.crop(save_dir=dirpath)

    # Collect the crops (saved under <dirpath>/crops/<class_name>/*.jpg).
    crops = []
    for filename in glob.glob(dirpath + '/crops/**/*.jpg', recursive=True):
        crops.append(Image.open(filename).convert('RGB'))
    if not crops:
        # Nothing detected: return the input image unchanged.
        return img

    # Encode the crops with CLIP. `preprocess` already resizes and normalizes,
    # so no extra mean/std normalization is needed here.
    image_input = torch.stack([preprocess(im) for im in crops]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

    def get_top_N_semantic_similarity(similarity_list, N):
        # Return the N crops whose CLIP embeddings best match the text query.
        ranked = sorted(enumerate(similarity_list), key=lambda x: x[1], reverse=True)
        scores, top_N_images = [], []
        for index, score in ranked[:N]:
            scores.append(score)
            top_N_images.append(crops[index])
        return scores, top_N_images

    # Encode and normalize the text query, then rank crops by cosine similarity.
    with torch.no_grad():
        text_encoded = clip_model.encode_text(clip.tokenize(text).to(device)).float()
        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
    similarity = (text_encoded.cpu().numpy() @ image_features.cpu().numpy().T)[0]
    scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
    return imgs[0]


gr.Interface(
    predict,
    ["image", gr.inputs.Textbox(lines=1, label="Text query", placeholder="Type here...")],
    outputs="image",
    title="Crop-CLIP",
    description="Search subjects/objects in an image using a simple text description and get cropped results. "
                "This is done by combining YOLOv5 object detection with OpenAI's CLIP model.",
).launch()