simonza committed on
Commit
28379d5
1 Parent(s): 505c35a

Create app.py

Files changed (1)
  1. app.py +200 -0
app.py ADDED
@@ -0,0 +1,200 @@
+ import pinecone
+ from datasets import load_dataset
+ import requests
+ from transformers import BertTokenizerFast
+ from sentence_transformers import SentenceTransformer
+ import transformers.models.clip.image_processing_clip
+ import torch
+ import gradio as gr
+ from deep_translator import GoogleTranslator
+ import shutil
+ from PIL import Image
+ import os
+
+ # download the pinecone_text helper module (provides the BM25 sparse encoder)
+ with open('pinecone_text.py', 'w') as fb:
+     fb.write(requests.get('https://storage.googleapis.com/gareth-pinecone-datasets/pinecone_text.py').text)
+ import pinecone_text
+
+
+ # init connection to pinecone
+ pinecone.init(
+     api_key="0898750a-ee05-44f1-ac8a-98c5fef92f4a",  # app.pinecone.io
+     environment="asia-southeast1-gcp-free"  # find next to api key
+ )
+
+ index_name = "hybrid-image-search"
+ index = pinecone.GRPCIndex(index_name)
+
+ # load the dataset from the huggingface datasets hub
+ fashion = load_dataset(
+     "ashraq/fashion-product-images-small",
+     split='train[:1000]'
+ )
+
+ images = fashion["image"]
+ metadata = fashion.remove_columns("image")
+
+ # load bert tokenizer from huggingface
+ tokenizer = BertTokenizerFast.from_pretrained(
+     'bert-base-uncased'
+ )
+
+ def tokenize_func(text):
+     token_ids = tokenizer(
+         text,
+         add_special_tokens=False
+     )['input_ids']
+     return tokenizer.convert_ids_to_tokens(token_ids)
+
+ # fit the BM25 sparse encoder on the product display names
+ bm25 = pinecone_text.BM25(tokenize_func)
+ bm25.fit(metadata['productDisplayName'])
+
+
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+ # load a CLIP model from huggingface
+ model = SentenceTransformer(
+     'sentence-transformers/clip-ViT-B-32',
+     device=device
+ )
+
+ # running counter used to number the directory created for each search
+ counter = {"dir_num": 0}
+
+
+ def hybrid_scale(dense, sparse, alpha: float):
+     if alpha < 0 or alpha > 1:
+         raise ValueError("Alpha must be between 0 and 1")
+     # scale sparse and dense vectors to create hybrid search vecs
+     hsparse = {
+         'indices': sparse['indices'],
+         'values': [v * (1 - alpha) for v in sparse['values']]
+     }
+     hdense = [v * alpha for v in dense]
+     return hdense, hsparse
+
+
+ def text_to_image(query, alpha, k_results):
+     sparse = bm25.transform_query(query)
+     dense = model.encode(query).tolist()
+
+     # scale sparse and dense vectors
+     hdense, hsparse = hybrid_scale(dense, sparse, alpha=alpha)
+
+     # search
+     result = index.query(
+         top_k=k_results,
+         vector=hdense,
+         sparse_vector=hsparse,
+         include_metadata=True
+     )
+     # use the returned product ids to get the images
+     imgs = [images[int(r["id"])] for r in result["matches"]]
+
+     description = []
+     for x in result["matches"]:
+         description.append(x["metadata"]['productDisplayName'])
+
+     return imgs, description
+
+
+ def img_to_file_list(imgs):
+     path = "searches"
+     sub_path = './' + path + '/' + 'search' + '_' + str(counter["dir_num"])
+
+     # Check whether the top-level searches directory exists
+     isExist = os.path.exists('./' + path)
+
+     if not isExist:
+         print("Directory does not exist")
+         # Create a new directory because it does not exist
+         os.makedirs('./' + path, exist_ok=True)
+         print("The new directory is created!")
+
+     # Remove any previous results stored under the same search directory
+     isExist = os.path.exists(sub_path)
+
+     if isExist:
+         shutil.rmtree(sub_path)
+
+     os.makedirs(sub_path, exist_ok=True)
+
+     # save each image to disk and collect the file paths
+     img_files = {'search' + str(counter["dir_num"]): []}
+     i = 0
+
+     for img in imgs:
+         img.save(sub_path + "/img_" + str(i) + ".png", "PNG")
+         img_files['search' + str(counter["dir_num"])].append(sub_path + '/' + 'img_' + str(i) + ".png")
+         i += 1
+
+     counter["dir_num"] += 1
+
+     return img_files['search' + str(counter["dir_num"] - 1)]
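Note: app.py imports gradio, but the portion of the file added in this commit never builds an interface. As a rough, hedged sketch only (not the author's actual UI), the functions above could be wired into a minimal Gradio demo along these lines; the `search` wrapper, slider ranges, and Gallery/Textbox layout are assumptions:

# hypothetical wiring for the functions defined in app.py above
import gradio as gr

def search(query, alpha, k_results):
    # run the hybrid query, save the result images to disk, return paths + descriptions
    imgs, descriptions = text_to_image(query, alpha, int(k_results))
    paths = img_to_file_list(imgs)
    return paths, "\n".join(descriptions)

demo = gr.Interface(
    fn=search,
    inputs=[
        gr.Textbox(label="Query"),
        gr.Slider(0, 1, value=0.5, label="alpha (0 = sparse only, 1 = dense only)"),
        gr.Slider(1, 12, value=6, step=1, label="number of results"),
    ],
    outputs=[gr.Gallery(label="Matches"), gr.Textbox(label="Descriptions")],
)

if __name__ == "__main__":
    demo.launch()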