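# Environment sketch (assumed setup, not stated in this file): CLIP installs
# via `pip install git+https://github.com/openai/CLIP.git`, Gradio and PyTorch
# via pip, and YOLOv5 is fetched at runtime through torch.hub.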
import glob
import tempfile

import clip
import gradio as gr
import torch
from PIL import Image
# Model

def predict(img, text):
  # Run YOLOv5 on the input image and save each detected object as a crop.
  yolo = torch.hub.load('ultralytics/yolov5', 'yolov5s')
  results = yolo(img)
  dirpath = tempfile.mkdtemp()
  results.crop(save_dir=dirpath)

  # Load every cropped detection as an RGB PIL image.
  # Note: torch.stack below will fail if YOLOv5 found no objects.
  crops = [Image.open(f).convert('RGB')
           for f in glob.glob(dirpath + '/crops/**/*.jpg')]

  device = "cuda" if torch.cuda.is_available() else "cpu"
  model, preprocess = clip.load("ViT-B/32", device=device)

  # Encode and L2-normalize the crops with CLIP. The preprocess transform
  # already applies CLIP's mean/std normalization, so no extra step is needed.
  images = torch.stack([preprocess(im) for im in crops]).to(device)
  with torch.no_grad():
    image_features = model.encode_image(images).float()
    image_features /= image_features.norm(dim=-1, keepdim=True)

  def get_top_N_semantic_similarity(similarity_list, N):
    # Rank crops by similarity score, highest first, and keep the top N.
    ranked = sorted(enumerate(similarity_list), key=lambda x: x[1], reverse=True)
    scores, top_N_images = [], []
    for index, score in ranked[:N]:
      scores.append(score)
      top_N_images.append(crops[index])
    return scores, top_N_images

  with torch.no_grad():
    # Encode and L2-normalize the text query with CLIP.
    text_encoded = model.encode_text(clip.tokenize(text).to(device))
    text_encoded /= text_encoded.norm(dim=-1, keepdim=True)

  # Cosine similarity between the query and every crop (features are unit-norm).
  similarity = (text_encoded.cpu().numpy() @ image_features.cpu().numpy().T)[0]
  scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
  return imgs[0]

gr.Interface(
    predict,
    inputs=["image", gr.inputs.Textbox(lines=1, label="Text query", placeholder="Type here...")],
    outputs="image",
    title="Crop-CLIP",
    description="Search for subjects/objects in an image using a simple text description and get cropped results. This is done by combining YOLOv5 object detection with OpenAI's CLIP model.",
).launch()
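
# A minimal local smoke test (a sketch; "street.jpg" and the query below are
# illustrative, not part of the original app). Since launch() blocks, run this
# instead of the interface when checking the pipeline end to end:
#
#   from PIL import Image
#   best_crop = predict(Image.open("street.jpg"), "a red car")
#   best_crop.save("best_crop.jpg")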