"""Search inside an image: crop objects with YOLOv5, then rank the crops
against a text query with CLIP and return the best-matching crop."""

import glob
import tempfile

import clip
import gradio as gr
import numpy as np
import torch
from PIL import Image

# Load the models once at startup instead of on every request.
device = "cuda" if torch.cuda.is_available() else "cpu"
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
clip_model, preprocess = clip.load("ViT-B/32", device=device)


def predict(text, img):
    """Return the detected object crop that best matches the text query."""
    img1 = img.convert("RGB")

    # Run YOLOv5 detection and save one cropped image per detected object.
    results = yolo_model(img1)
    dirpath = tempfile.mkdtemp()
    results.crop(save_dir=dirpath)

    # Collect the crops (saved under <dirpath>/crops/<class>/*.jpg).
    crops = []
    for filename in glob.glob(dirpath + '/crops/**/*.jpg', recursive=True):
        crops.append(Image.open(filename).convert('RGB'))

    # If YOLO detected nothing, fall back to the full image.
    if not crops:
        return img1

    # Encode and normalize the crops with CLIP.
    images = torch.stack([preprocess(im) for im in crops]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(images).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

    # Encode and normalize the text query with CLIP.
    with torch.no_grad():
        text_encoded = clip_model.encode_text(clip.tokenize([text]).to(device)).float()
        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)

    # Cosine similarity between the query and every crop.
    similarity = (text_encoded.cpu().numpy() @ image_features.cpu().numpy().T)[0]

    def get_top_N_semantic_similarity(similarity_list, N):
        ranked = sorted(enumerate(similarity_list), key=lambda x: x[1], reverse=True)
        scores = [score for _, score in ranked[:N]]
        top_N_images = [crops[index] for index, _ in ranked[:N]]
        return scores, top_N_images

    scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
    return imgs[0]


text = gr.inputs.Textbox(lines=1, label="Text query", placeholder="Introduce the search text...")
img = gr.inputs.Image(type="pil", label="Image")

gr.Interface(predict, [text, img], outputs="image", title='Search inside image').launch()