EdwardXu committed on
Commit
7216d39
1 Parent(s): 6308713

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +454 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # %load miniproject1_part4-2-1.py
4
+ import streamlit as st
5
+ import numpy as np
6
+ import numpy.linalg as la
7
+ import pickle
8
+ import os
9
+ import gdown
10
+ from sentence_transformers import SentenceTransformer
11
+ import matplotlib.pyplot as plt
12
+ import math
13
+
14
+
15
+ # Compute Cosine Similarity
16
def cosine_similarity(x, y):
    """
    Return the exponentiated cosine similarity of two vectors.

    Computes cos(x, y) = (x . y) / (||x|| * ||y||) and returns exp(cos(x, y)).

    If either vector has zero norm the cosine is undefined; it is treated as
    0 (no similarity), so the function returns exp(0) == 1.0 instead of
    dividing by zero and producing NaN.

    Args:
        x: 1-D array-like vector.
        y: 1-D array-like vector of the same length as ``x``.

    Returns:
        exp(cosine similarity) as a NumPy float.
    """
    denominator = np.linalg.norm(x) * np.linalg.norm(y)
    if denominator == 0:
        # An all-zero vector has no direction; avoid 0/0 -> NaN, which would
        # poison any later sorting or plotting of the scores.
        cosine_sim = 0.0
    else:
        cosine_sim = np.dot(x, y) / denominator

    # Exponentiate so every score is strictly positive.
    return np.exp(cosine_sim)
35
+
36
+
37
+ # Function to Load Glove Embeddings
38
def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
    """
    Unpickle and return the object stored at ``glove_path``.

    Args:
        glove_path: path of the pickle file to read
            (default ``"Data/embeddings.pkl"``).

    Returns:
        The unpickled object (presumably a word -> embedding dictionary;
        confirm against the file that is actually shipped).
    """
    # encoding="latin1" is passed through to pickle.load unchanged.
    with open(glove_path, "rb") as pickle_file:
        return pickle.load(pickle_file, encoding="latin1")
43
+
44
+
45
def get_model_id_gdrive(model_type):
    """
    Look up the Google Drive file ids for a GloVe model.

    Args:
        model_type: one of ``"25d"``, ``"50d"`` or ``"100d"``.

    Returns:
        Tuple ``(word_index_id, embeddings_id)`` of Google Drive file ids.

    Raises:
        ValueError: if ``model_type`` is not one of the supported values
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # model_type -> (word_index_id, embeddings_id)
    gdrive_ids = {
        "25d": (
            "13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8",
            "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2",
        ),
        "50d": (
            "1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9",
            "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ",
        ),
        "100d": (
            "1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq",
            "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp",
        ),
    }

    try:
        word_index_id, embeddings_id = gdrive_ids[model_type]
    except KeyError:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {sorted(gdrive_ids)}"
        ) from None

    return word_index_id, embeddings_id
57
+
58
+
59
def download_glove_embeddings_gdrive(model_type):
    """
    Download the GloVe word-index pickle and embeddings array from Google Drive.

    The files are written to the current working directory as
    ``word_index_dict_<model_type>_temp.pkl`` and
    ``embeddings_<model_type>_temp.npy`` -- the same names that
    ``load_glove_embeddings_gdrive`` reads.

    Args:
        model_type: model key understood by ``get_model_id_gdrive``
            (e.g. ``"25d"``, ``"50d"``).
    """
    # Get glove embeddings from google drive
    word_index_id, embeddings_id = get_model_id_gdrive(model_type)

    # Name the outputs after the requested model. The file names were
    # previously hard-coded to the 50d variants and the downloads disabled,
    # so a missing model could never actually be fetched.
    embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
    word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"

    # Use gdown to get files from google drive
    # Download word_index pickle file
    print("Downloading word index dictionary....\n")
    gdown.download(id=word_index_id, output=word_index_temp, quiet=False)

    # Download embeddings numpy file
    print("Donwloading embedings...\n\n")
    gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
80
+
81
+
82
+ # @st.cache_data()
83
def load_glove_embeddings_gdrive(model_type):
    """
    Load the GloVe word-index dictionary and embeddings array from disk.

    Reads ``word_index_dict_<model_type>_temp.pkl`` and
    ``embeddings_<model_type>_temp.npy`` from the current working directory.

    Args:
        model_type: model key used in the file names (e.g. ``"50d"``).

    Returns:
        Tuple ``(word_index_dict, embeddings)``: the unpickled word-index
        object and the array returned by ``np.load``.

    Raises:
        FileNotFoundError: if either file does not exist.
    """
    word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
    embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"

    # Load word index dictionary. The context manager closes the handle,
    # which the previous pickle.load(open(...)) form leaked.
    # NOTE: pickle can execute arbitrary code; only load trusted files.
    with open(word_index_temp, "rb") as word_index_file:
        word_index_dict = pickle.load(word_index_file, encoding="latin")

    # Load embeddings numpy
    embeddings = np.load(embeddings_temp)

    return word_index_dict, embeddings
94
+
95
+
96
@st.cache_resource()
def load_sentence_transformer_model(model_name):
    """
    Construct the ``SentenceTransformer`` for ``model_name``.

    Wrapped in ``st.cache_resource()`` so that, presumably, the loaded model
    is reused instead of being rebuilt on every call.

    Args:
        model_name: model identifier passed straight to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name)
100
+
101
+
102
def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
    """
    Get sentence transformer embeddings for a sentence.

    Args:
        sentence: text to encode.
        model_name: sentence-transformers model to use. The default is
            https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2,
            which produces 384-dimensional embeddings.

    Returns:
        The result of the model's ``encode(sentence)``. If encoding fails,
        a zero vector is returned instead: 384 entries for the default
        model, 512 for any other model name.
    """
    sentenceTransformer = load_sentence_transformer_model(model_name)

    try:
        return sentenceTransformer.encode(sentence)
    except Exception as err:
        # Best-effort fallback, but only for ordinary errors: a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        print("Sentence transformer encoding failed:", err)
        fallback_dim = 384 if model_name == "all-MiniLM-L6-v2" else 512
        return np.zeros(fallback_dim)
118
+
119
+
120
def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
    """
    Return the GloVe vector for one word.

    The word is lower-cased before the lookup.

    Args:
        word: the word to look up.
        word_index_dict: mapping from lower-cased word to an index into
            ``embeddings``.
        embeddings: indexable collection of vectors.
        model_type: string such as ``"50d"``; the number before the ``"d"``
            is the length of the zero vector returned for unknown words.

    Returns:
        ``embeddings[index]`` for a known word, otherwise a zero vector.
    """
    key = word.lower()
    if key not in word_index_dict:
        dimension = int(model_type.split("d")[0])
        return np.zeros(dimension)
    return embeddings[word_index_dict[key]]
128
+
129
+
130
def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type=50):
    """
    Get the averaged GloVe embedding for a sentence.

    The sentence is split on whitespace, each word is looked up with
    ``get_glove_embeddings``, and the non-zero word vectors are averaged.
    Words whose vector is all zeros (including unknown words) are ignored.

    Args:
        sentence: input text.
        word_index_dict: mapping from word to index into ``embeddings``.
        embeddings: indexable collection of word vectors.
        model_type: embedding size, either as an int (``50``) or as a string
            such as ``"50d"``. The int form previously raised
            ``AttributeError`` because ``.split`` was called on it.

    Returns:
        1-D NumPy array of length ``dimension``: the mean of the valid word
        vectors, or all zeros when no word has a non-zero vector.
    """
    # Normalise model_type so both 50 and "50d" work; the helper expects "<N>d".
    dimension = int(str(model_type).split("d")[0])
    model_type = str(dimension) + "d"

    embedding_sum = np.zeros(dimension)
    valid_word_count = 0

    for word in sentence.split():
        word_embedding = get_glove_embeddings(word, word_index_dict, embeddings, model_type)
        if np.any(word_embedding):  # Only consider valid (non-zero) embeddings
            embedding_sum += word_embedding
            valid_word_count += 1

    if valid_word_count == 0:
        return np.zeros(dimension)

    return embedding_sum / valid_word_count
156
+
157
def get_category_embeddings(embeddings_metadata):
    """
    Rebuild the per-model category embedding cache in ``st.session_state``.

    The cache lives under the key ``"cat_embed_" + model_name`` and is reset
    to an empty dict on every call. Each space-separated entry of
    ``st.session_state.categories`` is then embedded once with
    ``get_sentence_transformer_embeddings``.

    Args:
        embeddings_metadata: dict providing ``"model_name"``; an empty name
            means the helper's default model is used.
    """
    model_name = embeddings_metadata["model_name"]
    cache_key = "cat_embed_" + model_name
    st.session_state[cache_key] = {}

    for category in st.session_state.categories.split(" "):
        if category in st.session_state[cache_key]:
            # Repeated category in the input: already embedded.
            continue
        if model_name:
            vector = get_sentence_transformer_embeddings(category, model_name=model_name)
        else:
            vector = get_sentence_transformer_embeddings(category)
        st.session_state[cache_key][category] = vector
172
+
173
+
174
def update_category_embeddings(embedings_metadata):
    """
    Update embeddings for each category.

    Thin wrapper that forwards its argument to ``get_category_embeddings``.

    Args:
        embedings_metadata: metadata dict passed on to
            ``get_category_embeddings``. The misspelled parameter name is
            kept so that existing keyword callers keep working.
    """
    # Use the argument itself. The body used to read ``embeddings_metadata``,
    # which is not this parameter and only resolved via a module-level global.
    get_category_embeddings(embedings_metadata)
179
+
180
+
181
def get_sorted_cosine_similarity(embeddings_metadata):
    """
    Score every category against the search text and rank the results.

    The search text is ``st.session_state.text_search`` and the categories
    are the space-separated entries of ``st.session_state.categories``.

    * When ``embeddings_metadata["embedding_model"] == "glove"``, both sides
      are embedded with ``averaged_glove_embeddings_gdrive``.
    * Otherwise sentence-transformer embeddings are used; category vectors
      come from the ``"cat_embed_" + model_name`` cache in
      ``st.session_state`` and are rebuilt when a category is missing.

    Args:
        embeddings_metadata: dict with ``"embedding_model"`` plus either
            ``"word_index_dict"``, ``"embeddings"`` and ``"model_type"``
            (glove) or ``"model_name"`` (sentence transformer).

    Returns:
        Dict mapping category -> ``cosine_similarity`` score, ordered from
        highest to lowest score.
    """
    categories = st.session_state.categories.split(" ")
    scores = {}

    if embeddings_metadata["embedding_model"] == "glove":
        word_index_dict = embeddings_metadata["word_index_dict"]
        embeddings = embeddings_metadata["embeddings"]
        model_type = embeddings_metadata["model_type"]

        query_vector = averaged_glove_embeddings_gdrive(
            st.session_state.text_search, word_index_dict, embeddings, model_type
        )

        for category in categories:
            category_vector = averaged_glove_embeddings_gdrive(
                category, word_index_dict, embeddings, model_type
            )
            scores[category] = cosine_similarity(query_vector, category_vector)
    else:
        model_name = embeddings_metadata["model_name"]
        cache_key = "cat_embed_" + model_name

        # Build the category cache the first time this model is used.
        if cache_key not in st.session_state:
            get_category_embeddings(embeddings_metadata)

        category_embeddings = st.session_state[cache_key]

        print("text_search = ", st.session_state.text_search)
        if model_name:
            query_vector = get_sentence_transformer_embeddings(
                st.session_state.text_search, model_name=model_name
            )
        else:
            query_vector = get_sentence_transformer_embeddings(st.session_state.text_search)

        for category in categories:
            # A category missing from the cache triggers a rebuild.
            if category not in category_embeddings:
                update_category_embeddings(embeddings_metadata)
                category_embeddings = st.session_state[cache_key]

            scores[category] = cosine_similarity(query_vector, category_embeddings[category])

    # Highest score first.
    ranked_pairs = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked_pairs)
238
+
239
+ #
240
+ # def plot_piechart(sorted_cosine_scores_items):
241
+ # sorted_cosine_scores = np.array([
242
+ # sorted_cosine_scores_items[index][1]
243
+ # for index in range(len(sorted_cosine_scores_items))
244
+ # ]
245
+ # )
246
+ # categories = st.session_state.categories.split(" ")
247
+ # categories_sorted = [
248
+ # categories[sorted_cosine_scores_items[index][0]]
249
+ # for index in range(len(sorted_cosine_scores_items))
250
+ # ]
251
+ # fig, ax = plt.subplots()
252
+ # ax.pie(sorted_cosine_scores, labels=categories_sorted, autopct="%1.1f%%")
253
+ # st.pyplot(fig) # Figure
254
+
255
+
256
def plot_piechart_helper(sorted_cosine_scores_items):
    """
    Draw a pie chart of category scores and return the figure.

    Args:
        sorted_cosine_scores_items: dict mapping category label -> score, in
            the order the slices should be drawn.

    Returns:
        The Matplotlib figure (3x3 inches) containing the pie chart.
    """
    labels = list(sorted_cosine_scores_items.keys())
    values = np.array(list(sorted_cosine_scores_items.values()))

    fig, ax = plt.subplots(figsize=(3, 3))

    # Pull the first slice out the most; with exactly three slices the second
    # is offset too, and with more than three the third gets a small offset.
    slice_count = len(labels)
    explode = np.zeros(slice_count)
    explode[0] = 0.2
    if slice_count == 3:
        explode[1] = 0.1
    elif slice_count > 3:
        explode[2] = 0.05

    ax.pie(values, labels=labels, autopct="%1.1f%%", explode=explode)

    return fig
276
+
277
+
278
def plot_piecharts(sorted_cosine_scores_models):
    """
    Render two stacked pie charts, one per model, in the Streamlit page.

    Nothing is drawn unless exactly two models are supplied.

    Args:
        sorted_cosine_scores_models: dict mapping model name -> scores.
            Scores may be either a ``{category: score}`` dict (the format
            ``get_sorted_cosine_similarity`` returns) or a sequence of
            ``(category_index, score)`` pairs, where the index refers to the
            space-separated entries of ``st.session_state.categories``.
    """
    categories = st.session_state.categories.split(" ")
    scores_list = [sorted_cosine_scores_models[model] for model in sorted_cosine_scores_models]

    # Only the two-model comparison has a layout; returning early also avoids
    # touching a figure that was never created.
    if len(scores_list) != 2:
        return

    fig, axes = plt.subplots(2)
    for ax, scores in zip(axes, scores_list):
        if isinstance(scores, dict):
            # Current format: labels are the keys themselves.
            labels = list(scores.keys())
            values = np.array(list(scores.values()))
        else:
            # Older format: (category_index, score) pairs.
            labels = [categories[item[0]] for item in scores]
            values = np.array([item[1] for item in scores])
        ax.pie(values, labels=labels, autopct="%1.1f%%")

    st.pyplot(fig)
    # Release the figure so repeated script reruns do not accumulate figures.
    plt.close(fig)
307
+
308
+
309
def plot_alatirchart(sorted_cosine_scores_models):
    """
    Show one pie chart per model, each in its own Streamlit tab.

    Args:
        sorted_cosine_scores_models: dict mapping model name (used as the tab
            label) -> ``{category: score}`` dict accepted by
            ``plot_piechart_helper``.
    """
    models = list(sorted_cosine_scores_models.keys())
    tabs = st.tabs(models)

    for tab, model in zip(tabs, models):
        fig = plot_piechart_helper(sorted_cosine_scores_models[model])
        with tab:
            st.pyplot(fig)
        # Close the figure once rendered; otherwise every script rerun leaves
        # more open Matplotlib figures behind.
        plt.close(fig)
322
+
323
+
324
+ ### Text Search ###
325
# ---------------------------------------------------------------------------
# Streamlit page script: runs top to bottom on every rerun.
# Flow: sidebar info -> session defaults -> model/category/text inputs ->
# make sure the GloVe files exist -> load them -> score categories with GloVe
# and with a sentence transformer -> show the results.
# ---------------------------------------------------------------------------

# Sidebar: short description of the GloVe Twitter embeddings.
st.sidebar.title("GloVe Twitter")
st.sidebar.markdown(
    """
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Pretrained on
2 billion tweets with vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).

Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
"""
)


# Seed the session state with defaults the first time the script runs.
if 'categories' not in st.session_state:
    st.session_state['categories'] = "Flowers Colors Cars Weather Food"
if 'text_search' not in st.session_state:
    st.session_state['text_search'] = "Roses are red, trucks are blue, and Seattle is grey right now"


# GloVe model selection; index=1 preselects "50d".
model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1)

st.title("Search Based Retrieval Demo")
st.subheader(
    "Pass in space separated categories you want this search demo to be about."
)
# st.selectbox(label="Pick the categories you want this search demo to be about...",
# options=("Flowers Colors Cars Weather Food", "Chocolate Milk", "Anger Joy Sad Frustration Worry Happiness", "Positive Negative"),
# key="categories"
# )


# Categories entered by the user (space separated), written back to the
# session state so the helper functions above can read them.
user_categories = st.text_input(
    label="Categories", value=st.session_state.categories
)

st.session_state.categories = user_categories

# st.text_input(
# label="Categories", key="categories", value="Flowers Colors Cars Weather Food"
# )

# Categories = st.session_state.get('categories', "Flowers Colors Cars Weather Food")


# Debug output of the current categories value and its type.
print(st.session_state.get("categories"))
# print(st.session_state["categories"])

print(type(st.session_state.get("categories")))
# print(type(st.session_state["categories"]))

# print("Categories = ", categories)
# st.session_state.categories = categories

# Search text entered by the user, also written back to the session state.
st.subheader("Pass in an input word or even a sentence")
user_text_search = st.text_input(
    label="Input your sentence",
    value=st.session_state.text_search,

)

st.session_state.text_search = user_text_search
# st.session_state.text_search = text_search

# Download glove embeddings if it doesn't exist
embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
word_index_dict_path = "word_index_dict_" + str(model_type) + "_temp.pkl"
if not os.path.isfile(embeddings_path) or not os.path.isfile(word_index_dict_path):
    print("Model type = ", model_type)
    # glove_path is only printed here; nothing below reads it.
    glove_path = "Data/glove_" + str(model_type) + ".pkl"
    print("glove_path = ", glove_path)

    # Download embeddings from google drive
    with st.spinner("Downloading glove embeddings..."):
        download_glove_embeddings_gdrive(model_type)


# Load glove embeddings
word_index_dict, embeddings = load_glove_embeddings_gdrive(model_type)


# Find closest word to an input word
if st.session_state.text_search:
    # Glove embeddings
    print("Glove Embedding")
    embeddings_metadata = {
        "embedding_model": "glove",
        "word_index_dict": word_index_dict,
        "embeddings": embeddings,
        "model_type": model_type,
        "text_search": st.session_state.text_search
    }
    with st.spinner("Obtaining Cosine similarity for Glove..."):
        sorted_cosine_sim_glove = get_sorted_cosine_similarity(
            embeddings_metadata
        )

    # Sentence transformer embeddings
    # An empty model_name makes the helpers fall back to their default model.
    print("Sentence Transformer Embedding")
    embeddings_metadata = {"embedding_model": "transformers", "model_name": "",
                           "text_search": st.session_state.text_search }
    with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
        sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
            embeddings_metadata
        )

    # Results and Plot Pie Chart for Glove
    print("Categories are: ", st.session_state.categories)
    st.subheader(
        "Closest word I have between: "
        + st.session_state.categories
        + " as per different Embeddings"
    )

    print(sorted_cosine_sim_glove)
    print(sorted_cosine_sim_transformer)

    # The result dicts are sorted best-first, so the first key is the winner.
    st.write(f"Closest category using GloVe embeddings : {list(sorted_cosine_sim_glove.keys())[0]}")
    st.write(
        f"Closest category using Sentence Transformer embeddings : {list(sorted_cosine_sim_transformer.keys())[0]}")

    # One tab per embedding model, each showing a pie chart of the scores.
    plot_alatirchart(
        {
            "glove_" + str(model_type): sorted_cosine_sim_glove,
            "sentence_transformer_384": sorted_cosine_sim_transformer,
        }
    )

    st.write("")
    st.write(
        "Demo developed by [V50](https://huggingface.co/spaces/ericlkc/V50)"
    )
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ numpy
3
+ pickleshare
4
+ gdown
5
+ sentence-transformers
6
+ matplotlib