import glob
import tempfile

import clip
import gradio as gr
import numpy as np
import torch
from PIL import Image


# Model
def predict(img, text):
    # Detect objects with YOLOv5 and save each detection as a cropped image.
    yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    results = yolo(img)
    dirpath = tempfile.mkdtemp()
    results.crop(save_dir=dirpath)

    # Collect the crops (YOLOv5 saves them under <dirpath>/crops/<class>/*.jpg).
    crops = []
    for filename in glob.glob(dirpath + '/crops/**/*.jpg', recursive=True):
        crops.append(Image.open(filename).convert('RGB'))
    if not crops:
        # Nothing was detected: return the input image unchanged.
        return img

    # Encode every crop with CLIP. `preprocess` already resizes and normalizes
    # with CLIP's mean/std, so no extra manual normalization is needed.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    image_input = torch.stack([preprocess(im) for im in crops]).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image_input).float()
    image_features /= image_features.norm(dim=-1, keepdim=True)

    def get_top_N_semantic_similarity(similarity_list, N):
        # Rank the crops by similarity score and keep the top N.
        ranked = sorted(enumerate(similarity_list), key=lambda x: x[1], reverse=True)
        scores = [score for _, score in ranked[:N]]
        top_N_images = [crops[index] for index, _ in ranked[:N]]
        return scores, top_N_images

    # Encode and normalize the text query, then score it against every crop.
    with torch.no_grad():
        text_encoded = model.encode_text(clip.tokenize(text).to(device)).float()
    text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
    similarity = (text_encoded.cpu().numpy() @ image_features.cpu().numpy().T)[0]
    scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
    return imgs[0]


gr.Interface(
    predict,
    ["image", gr.inputs.Textbox(lines=1, label="Text query", placeholder="Type here...")],
    outputs="image",
    title="Crop-CLIP",
    description="Search subjects/objects in an image using a simple text description and get cropped results. This is done by combining the YOLOv5 object detector with OpenAI's CLIP model.",
).launch()
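
# A quick way to exercise `predict` without the Gradio UI (a minimal sketch,
# kept commented out because launch() above blocks; the sample path and the
# query string below are hypothetical, not part of the app):
#
#     from PIL import Image
#     import numpy as np
#
#     img = np.array(Image.open("sample.jpg").convert("RGB"))  # hypothetical test image
#     crop = predict(img, "a dog")                             # hypothetical text query
#     crop.save("result.jpg")                                  # best-matching crop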