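"""Crop-CLIP: search for subjects/objects in an image with a short text query.

YOLOv5 detects objects and saves them as crops; OpenAI's CLIP ranks the crops
against the text query, and the best-matching crop is returned.
"""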
import glob
import tempfile

import clip
import gradio as gr
import torch
from PIL import Image
# Model
def predict(img, text):
    # Detect objects with YOLOv5 and save each detection as a cropped image.
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    results = model(img)
    dirpath = tempfile.mkdtemp()
    results.crop(save_dir=dirpath)

    # Load every saved crop as an RGB PIL image.
    path = dirpath + '/crops/**/*.jpg'
    crops = [Image.open(filename).convert('RGB') for filename in glob.glob(path)]
    # Encode the crops with CLIP and normalize the image features.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    images = torch.stack([preprocess(im) for im in crops]).to(device)
    with torch.no_grad():
        image_features = model.encode_image(images).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)
    def get_top_N_semantic_similarity(similarity_list, N):
        # Return the N highest-scoring crops together with their scores.
        results = zip(range(len(similarity_list)), similarity_list)
        results = sorted(results, key=lambda x: x[1], reverse=True)
        top_N_images = []
        scores = []
        for index, score in results[:N]:
            scores.append(score)
            top_N_images.append(crops[index])
        return scores, top_N_images
    # Encode and normalize the text query with CLIP, then rank the crops by
    # cosine similarity between the text and image features.
    with torch.no_grad():
        text_encoded = model.encode_text(clip.tokenize(text).to(device))
        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
    similarity = text_encoded.cpu().numpy() @ image_features.cpu().numpy().T
    similarity = similarity[0]
    scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
    return imgs[0]
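
# Note: both YOLOv5 and CLIP are loaded inside predict(), so they are reloaded
# on every request; caching them at module level would be a straightforward
# optimization.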
gr.Interface(
    predict,
    inputs=["image", gr.Textbox(lines=1, label="Text query", placeholder="Type here...")],
    outputs="image",
    title="Crop-CLIP",
    description="Search subjects/objects in an image using a simple text description and get cropped results. This is done by combining YOLOv5 object detection with OpenAI's CLIP model.",
).launch()
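
# A minimal usage sketch, bypassing the Gradio UI (the file name "street.jpg"
# and the query below are hypothetical examples):
#
#     crop = predict(Image.open("street.jpg"), "a red car")
#     crop.save("red_car_crop.jpg")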