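# Crop-CLIP demo: detect objects in an input image with YOLOv5, rank the
# cropped detections against a text query with OpenAI's CLIP, and return the
# best-matching crop through a Gradio interface.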
import glob
import tempfile

import clip
import gradio as gr
import numpy as np
import torch
from PIL import Image

# Model

def predict(img, text):
  # Run YOLOv5 object detection and save each detection as a cropped image.
  model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
  results = model(img)
  dirpath = tempfile.mkdtemp()
  results.crop(save_dir=dirpath)

  # Load every crop (saved under crops/<class name>/) as an RGB PIL image.
  path = dirpath + '/crops/**/*.jpg'
  l = [Image.open(filename).convert('RGB') for filename in glob.glob(path)]

  device = "cuda" if torch.cuda.is_available() else "cpu"
  model, preprocess = clip.load("ViT-B/32", device=device)

  images = torch.stack([preprocess(im) for im in l]).to(device)
  with torch.no_grad():
    image_features = model.encode_image(images)
    image_features /= image_features.norm(dim=-1, keepdim=True)

  image_features.cpu().numpy()

  image_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073])
  image_std = torch.tensor([0.26862954, 0.26130258, 0.27577711])

  images = [preprocess(im) for im in l]
  image_input = torch.tensor(np.stack(images))
  image_input -= image_mean[:, None, None]
  image_input /= image_std[:, None, None]
  with torch.no_grad():
      image_features = model.encode_image(image_input).float()
  image_features /= image_features.norm(dim=-1, keepdim=True)

  def get_top_N_semantic_similarity(similarity_list, N):
    # Return the scores and crops for the N highest text-to-image similarities.
    results = sorted(enumerate(similarity_list), key=lambda x: x[1], reverse=True)
    top_N_images = []
    scores = []
    for index, score in results[:N]:
      scores.append(score)
      top_N_images.append(l[index])
    return scores, top_N_images

  # Encode and normalize the text query using CLIP.
  with torch.no_grad():
    text_encoded = model.encode_text(clip.tokenize(text).to(device))
    text_encoded /= text_encoded.norm(dim=-1, keepdim=True)

  # Cosine similarity between the query and each crop; return the best match.
  similarity = (text_encoded.cpu().numpy() @ image_features.cpu().numpy().T)[0]
  scores, imgs = get_top_N_semantic_similarity(similarity, N=1)
  return imgs[0]

gr.Interface(
    predict,
    ["image", gr.inputs.Textbox(lines=1, label="Text query", placeholder="Type here...")],
    outputs="image",
    title="Crop-CLIP",
    description="Search for subjects/objects in an image using a simple text description and get cropped results. This is done by combining YOLOv5 object detection with OpenAI's CLIP model.",
).launch()
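
# A minimal local-usage sketch (hypothetical, not part of the original app):
# it assumes a file named "example.jpg" exists in the working directory and
# calls predict() directly instead of going through the Gradio UI.
#
#   result = predict(np.array(Image.open("example.jpg").convert("RGB")), "a dog")
#   result.save("best_crop.jpg")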