import glob
import tempfile

import clip
import gradio as gr
import torch
from PIL import Image


# Model
def predict(img, text):
    # Run YOLOv5 on the input image and save every detection as a cropped image
    yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    results = yolo(img)  # Gradio's "image" input is an RGB numpy array, which YOLOv5 accepts directly
    dirpath = tempfile.mkdtemp()
    results.crop(save_dir=dirpath)

    # Load the saved crops from <tmpdir>/crops/<class>/*.jpg
    path = dirpath + '/crops/**/*.jpg'
    crops = []
    for filename in glob.glob(path):
        crops.append(Image.open(filename).convert('RGB'))

    # Encode every crop with CLIP and L2-normalize the embeddings
    device = "cuda" if torch.cuda.is_available() else "cpu"
    clip_model, preprocess = clip.load("ViT-B/32", device=device)
    image_input = torch.stack([preprocess(im) for im in crops]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

    def get_top_N_semantic_similarity(similarity_list, N):
        # Return the scores and crops with the N highest similarities to the query
        results = zip(range(len(similarity_list)), similarity_list)
        results = sorted(results, key=lambda x: x[1], reverse=True)
        top_N_images = []
        scores = []
        for index, score in results[:N]:
            scores.append(score)
            top_N_images.append(crops[index])
        return scores, top_N_images

    search_query = text
    with torch.no_grad():
        # Encode and normalize the description using CLIP
        text_encoded = clip_model.encode_text(clip.tokenize(search_query).to(device)).float()
        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)

    # Cosine similarity between the text query and every crop
    similarity = text_encoded.cpu().numpy() @ image_features.cpu().numpy().T
    similarity = similarity[0]
    scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
    return imgs[0]


text = gr.inputs.Textbox(lines=5, label="Context")
gr.Interface(predict, ["image", text], outputs="image", title='Search inside image').launch()
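
# A minimal sketch of exercising predict() outside the Gradio UI, assuming a local
# test image on disk; "street.jpg" and the query below are hypothetical examples,
# not part of the original app. Uncomment to run as a quick smoke test:
# import cv2
# best_crop = predict(cv2.imread("street.jpg")[:, :, ::-1], "white car")  # BGR -> RGB
# best_crop.save("best_crop.jpg")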