# Crop-CLIP / app.py
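"""Crop-CLIP Gradio app.

Search for subjects/objects in an image with a simple text description and get the
cropped result: YOLOv5 detects and crops candidate objects, CLIP encodes the crops
and the query, and the crop most similar to the text is returned.
"""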
import glob
import tempfile

import clip
import gradio as gr
import torch
from PIL import Image
# Prediction: crop objects with YOLOv5, then pick the crop that best matches the text query with CLIP.
def predict(img, text):
    # Detect objects in the input image and save one cropped image per detection.
    yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    results = yolo(img)
    dirpath = tempfile.mkdtemp()
    results.crop(save_dir=dirpath)

    # YOLOv5 writes crops to <save_dir>/crops/<class name>/*.jpg; load each one as an RGB PIL image.
    crops = [Image.open(f).convert('RGB') for f in glob.glob(dirpath + '/crops/**/*.jpg', recursive=True)]
    # Encode every crop with CLIP; preprocess already resizes and normalises the images.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    clip_model, preprocess = clip.load("ViT-B/32", device=device)
    images = torch.stack([preprocess(im) for im in crops]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(images).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)
    def get_top_N_semantic_similarity(similarity_list, N):
        # Rank the crops by similarity score and return the top N with their scores.
        ranked = sorted(enumerate(similarity_list), key=lambda x: x[1], reverse=True)
        scores, top_N_images = [], []
        for index, score in ranked[:N]:
            scores.append(score)
            top_N_images.append(crops[index])
        return scores, top_N_images
    # Encode and normalise the text query, then compare it to every crop by cosine similarity.
    with torch.no_grad():
        text_encoded = clip_model.encode_text(clip.tokenize(text).to(device)).float()
        text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
    similarity = (text_encoded.cpu().numpy() @ image_features.cpu().numpy().T)[0]

    # Return the single best-matching crop.
    scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
    return imgs[0]
gr.Interface(
    predict,
    inputs=["image", gr.inputs.Textbox(lines=1, label="Text query", placeholder="Type here...")],
    outputs="image",
    title="Crop-CLIP",
    description="Search subjects/objects in an image using a simple text description and get cropped results. "
                "This is done by combining object detection (YOLOv5) and OpenAI's CLIP model.",
).launch()
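# Hypothetical direct use of predict() outside the Gradio UI (file name is illustrative):
#   best_crop = predict(Image.open("street.jpg"), "a red car")
#   best_crop.show()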