File size: 2,842 Bytes
008bbdb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415d5da
 
008bbdb
 
415d5da
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import csv
import gradio as gr
import glob
import pprint as pp
from sys import excepthook
from re import T
from urllib.parse import parse_qs, urlparse
import clip
import numpy as np
import requests
import torch
from sklearn.utils.extmath import softmax
import io


from IPython.display import Image, display
from PIL import Image, ImageFont
import os
import cv2
import torch
import glob

# Model

# Models are expensive to load (weights are fetched/deserialised), so they are
# kept here between calls, keyed by the device they were loaded for.
_MODEL_CACHE = {}


def _load_models(device):
    """Return ``(detector, clip_model, preprocess)`` for *device*, loading once."""
    if device not in _MODEL_CACHE:
        detector = torch.hub.load('ultralytics/yolov5', 'yolov5s')
        clip_model, preprocess = clip.load("ViT-B/32", device=device)
        _MODEL_CACHE[device] = (detector, clip_model, preprocess)
    return _MODEL_CACHE[device]


def predict(text, img):
    """Find the detected object in *img* that best matches the query *text*.

    The image is run through YOLOv5, every detection is cropped, and the
    crops are ranked against the text with CLIP (cosine similarity of the
    normalised embeddings).

    Args:
        text: Free-text description of the object to look for.  When empty
            or not a string, the original default query ``"White car"`` is
            used.
        img: The image to search.  It is handed to the YOLOv5 hub model
            unchanged, so it can be anything that model accepts (file path,
            PIL image, RGB numpy array - see the YOLOv5 PyTorch Hub docs).

    Returns:
        The best-matching crop as an RGB ``PIL.Image``, or ``None`` when the
        detector found no objects.

    Raises:
        ValueError: If *img* is ``None`` (e.g. nothing was uploaded).
    """
    # ``tempfile`` is not imported at file level, so bring it in locally.
    import tempfile

    if img is None:
        raise ValueError("predict() needs an image to search in")

    query = text.strip() if isinstance(text, str) else ""
    if not query:
        query = "White car"

    from PIL import Image

    device = "cuda" if torch.cuda.is_available() else "cpu"
    detector, clip_model, preprocess = _load_models(device)

    results = detector(img)

    with tempfile.TemporaryDirectory() as tmp_root:
        # Point YOLOv5 at a sub-directory that does not exist yet so the
        # crops land exactly here (some YOLOv5 releases rename the target
        # when it already exists, which would leave the glob below empty).
        save_dir = os.path.join(tmp_root, 'detections')
        results.crop(save_dir=save_dir)
        pattern = os.path.join(save_dir, 'crops', '**', '*.jpg')
        crops = []
        for filename in sorted(glob.glob(pattern, recursive=True)):
            # convert() returns an in-memory copy, so the crops stay valid
            # after the file is closed and the temp directory is removed.
            with Image.open(filename) as crop_file:
                crops.append(crop_file.convert('RGB'))

    if not crops:
        return None

    with torch.no_grad():
        # CLIP's ``preprocess`` already applies the mean/std normalisation,
        # so no second manual normalisation pass is needed.
        image_input = torch.stack([preprocess(crop) for crop in crops]).to(device)
        image_features = clip_model.encode_image(image_input).float()
        image_features /= image_features.norm(dim=-1, keepdim=True)

        text_features = clip_model.encode_text(clip.tokenize(query).to(device)).float()
        text_features /= text_features.norm(dim=-1, keepdim=True)

        # One similarity score per crop (row 0 = the single query).
        similarity = (text_features @ image_features.T)[0]

    return crops[int(similarity.argmax())]

#text = gr.inputs.Textbox(lines=5, label="Context")
#img = gr.inputs.Image()


# Build and start the demo UI.  The inputs are listed in the same order as
# predict's parameters (text first, then the image); they are passed by
# keyword because a positional argument after ``fn=`` is a syntax error.
gr_interface = gr.Interface(
    fn=predict,
    inputs=[gr.inputs.Textbox(lines=1, label="Context"), "image"],
    outputs="image",
    title='Search inside image',
)
gr_interface.launch()