long1104 committed on
Commit
33cb156
1 Parent(s): cad8ee2

Upload app.py

Files changed (1)
  1. app.py +444 -0
app.py ADDED
@@ -0,0 +1,444 @@
+ """miniproject1_part4.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1019NliGG7hWr87uyV6I748EbERk7Jt0p
+ """
+
+ import streamlit as st
+ import numpy as np
+ import numpy.linalg as la
+ import pickle
+ import os
+ import gdown
+ from sentence_transformers import SentenceTransformer
+ import matplotlib.pyplot as plt
+ import math
+
+ def cosine_similarity(x, y):
+     """
+     Exponentiated cosine similarity
+     1. Compute cosine similarity
+     2. Exponentiate cosine similarity
+     3. Return exponentiated cosine similarity
+     (20 pts)
+     """
+     ##################################
+     ### TODO: Add code here ##########
+     ##################################
+     x_arr = np.atleast_2d(np.array(x))
+     y_arr = np.atleast_2d(np.array(y))
+
+     x_arr_mod = la.norm(x_arr, axis=1, keepdims=True)
+     y_arr_mod = la.norm(y_arr, axis=1, keepdims=True)
+
+     return np.exp((x_arr / x_arr_mod) @ (y_arr / y_arr_mod).T)
+
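+ # Note: since cosine similarity lies in [-1, 1], the exponentiated score returned
+ # above always lies in [exp(-1), exp(1)] ≈ [0.37, 2.72]; e.g. orthogonal inputs
+ # such as [1, 0] and [0, 1] score exp(0) = 1.0.
+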
+ # Function to load GloVe embeddings from a local pickle file
+ def load_glove_embeddings(glove_path="Data/embeddings.pkl"):
+     with open(glove_path, "rb") as f:
+         embeddings_dict = pickle.load(f, encoding="latin1")
+
+     return embeddings_dict
+
+ def get_model_id_gdrive(model_type):
+     word_index_id = None
+     embeddings_id = None
+
+     if model_type == "25d":
+         word_index_id = "13qMXs3-oB9C6kfSRMwbAtzda9xuAUtt8"
+         embeddings_id = "1-RXcfBvWyE-Av3ZHLcyJVsps0RYRRr_2"
+     elif model_type == "50d":
+         embeddings_id = "1DBaVpJsitQ1qxtUvV1Kz7ThDc3az16kZ"
+         word_index_id = "1rB4ksHyHZ9skes-fJHMa2Z8J1Qa7awQ9"
+     elif model_type == "100d":
+         word_index_id = "1-oWV0LqG3fmrozRZ7WB1jzeTJHRUI3mq"
+         embeddings_id = "1SRHfX130_6Znz7zbdfqboKosz-PfNvNp"
+
+     return word_index_id, embeddings_id
+
+ def download_glove_embeddings_gdrive(model_type):
+     # Get GloVe embeddings from Google Drive
+     word_index_id, embeddings_id = get_model_id_gdrive(model_type)
+
+     # Use gdown to fetch the files from Google Drive
+     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
+     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
+
+     # Download word_index pickle file
+     print("Downloading word index dictionary....\n")
+     gdown.download(id=word_index_id, output=word_index_temp, quiet=False)
+
+     # Download embeddings numpy file
+     print("Downloading embeddings...\n\n")
+     gdown.download(id=embeddings_id, output=embeddings_temp, quiet=False)
+
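+ # Note: gdown.download(id=..., output=..., quiet=False) pulls the file with that
+ # Google Drive ID to the given (relative) path, so the temp .pkl/.npy files above
+ # land in the working directory the app runs from.
+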
+ # @st.cache_data()
+ def load_glove_embeddings_gdrive(model_type):
+     word_index_temp = "word_index_dict_" + str(model_type) + "_temp.pkl"
+     embeddings_temp = "embeddings_" + str(model_type) + "_temp.npy"
+
+     # Load word index dictionary
+     with open(word_index_temp, "rb") as f:
+         word_index_dict = pickle.load(f, encoding="latin")
+
+     # Load embeddings numpy array
+     embeddings = np.load(embeddings_temp)
+
+     return word_index_dict, embeddings
+
+ # download_glove_embeddings_gdrive("50d")
+
+ # word_index_dict, embeddings = load_glove_embeddings_gdrive("50d")
+
+ # embeddings.shape
+
+ @st.cache_resource()
+ def load_sentence_transformer_model(model_name):
+     sentenceTransformer = SentenceTransformer(model_name)
+     return sentenceTransformer
+
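+ # Note: st.cache_resource keeps the loaded SentenceTransformer in memory across
+ # Streamlit reruns, so the model is downloaded and instantiated only once per process.
+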
+ def get_sentence_transformer_embeddings(sentence, model_name="all-MiniLM-L6-v2"):
+     """
+     Get sentence transformer embeddings for a sentence
+     """
+     # 384 dimensional embedding
+     # Default model: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+
+     sentenceTransformer = load_sentence_transformer_model(model_name)
+
+     try:
+         return sentenceTransformer.encode(sentence)
+     except Exception:
+         # Fall back to a zero vector of the model's embedding dimension
+         if model_name == "all-MiniLM-L6-v2":
+             return np.zeros(384)
+         else:
+             return np.zeros(512)
+
+ def get_glove_embeddings(word, word_index_dict, embeddings, model_type):
+     """
+     Get glove embedding for a single word
+     """
+     if word.lower() in word_index_dict:
+         return embeddings[word_index_dict[word.lower()]]
+     else:
+         return np.zeros(int(model_type.split("d")[0]))
+
+ def averaged_glove_embeddings_gdrive(sentence, word_index_dict, embeddings, model_type="50d"):
+     """
+     Get averaged glove embeddings for a sentence
+     1. Split sentence into words
+     2. Get embeddings for each word
+     3. Add embeddings for each word
+     4. Divide by number of words
+     5. Return averaged embeddings
+     (20 pts)
+     """
+     embedding = np.zeros(int(model_type.split("d")[0]))
+     ##################################
+     ##### TODO: Add code here ########
+     ##################################
+     words = sentence.split()
+
+     # print(word_index_dict)
+     # print(embeddings)
+
+     for word in words:
+         embedding += get_glove_embeddings(word, word_index_dict, embeddings, model_type)
+
+     return embedding / max(len(words), 1.)
+
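+ # Example (for illustration): with model_type="50d", a two-word sentence returns the
+ # mean of two 50-d GloVe vectors; out-of-vocabulary words contribute zero vectors via
+ # get_glove_embeddings but still count in the denominator, which shrinks the average.
+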
+ def get_category_embeddings(embeddings_metadata):
+     """
+     Get embeddings for each category
+     1. Split categories into words
+     2. Get embeddings for each word
+     """
+     model_name = embeddings_metadata["model_name"]
+     st.session_state["cat_embed_" + model_name] = {}
+     for category in st.session_state.categories.split(" "):
+         if model_name:
+             if category not in st.session_state["cat_embed_" + model_name]:
+                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category, model_name=model_name)
+         else:
+             if category not in st.session_state["cat_embed_" + model_name]:
+                 st.session_state["cat_embed_" + model_name][category] = get_sentence_transformer_embeddings(category)
+
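+ # Note: after this call, st.session_state["cat_embed_" + model_name] maps each
+ # space-separated category to its sentence-transformer embedding, e.g.
+ # {"Flowers": array([...]), "Colors": array([...]), ...}.
+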
+ def update_category_embeddings(embeddings_metadata):
+     """
+     Update embeddings for each category
+     """
+     get_category_embeddings(embeddings_metadata)
+
+ def sorting_similarity(categories, input_embedding, categories_embeddings):
+     similarity_matrix = cosine_similarity(input_embedding, categories_embeddings)
+
+     ranking_indices = np.argsort(-similarity_matrix, axis=1)
+
+     sorted_indices = ranking_indices[0]
+     categories_sorting = np.array(categories)[list(sorted_indices)]
+     ranked_similarity_matrix = np.take_along_axis(similarity_matrix, ranking_indices, axis=1)
+
+     # Pair each category with its score (row 0 corresponds to the single input embedding)
+     return tuple(zip(categories_sorting, ranked_similarity_matrix[0]))
+
+ def get_sorted_cosine_similarity(sentence, embeddings_metadata):
+     """
+     Get sorted cosine similarity between input sentence and categories
+     Steps:
+     1. Get embeddings for input sentence
+     2. Get embeddings for categories (if not found, update category embeddings)
+     3. Compute cosine similarity between input sentence and categories
+     4. Sort cosine similarity
+     5. Return sorted cosine similarity
+     (50 pts)
+     """
+     categories = st.session_state.categories.split(" ")
+     cosine_sim = {}
+     if embeddings_metadata["embedding_model"] == "glove":
+         word_index_dict = embeddings_metadata["word_index_dict"]
+         embeddings = embeddings_metadata["embeddings"]
+         model_type = embeddings_metadata["model_type"]
+
+         input_embedding = averaged_glove_embeddings_gdrive(st.session_state.text_search,
+                                                            word_index_dict,
+                                                            embeddings, model_type)
+
+         ##########################################
+         ## TODO: Get embeddings for categories ###
+         ##########################################
+         for index in range(len(categories)):
+             cat = categories[index]
+             cat_embedding = get_glove_embeddings(cat,
+                                                  word_index_dict,
+                                                  embeddings,
+                                                  model_type)
+
+             cosine_score = cosine_similarity(input_embedding, cat_embedding)
+             cosine_sim[index] = cosine_score[0][0]
+
+     else:
+         model_name = embeddings_metadata["model_name"]
+         if "cat_embed_" + model_name not in st.session_state:
+             get_category_embeddings(embeddings_metadata)
+
+         category_embeddings = st.session_state["cat_embed_" + model_name]
+
+         print("text_search = ", st.session_state.text_search)
+         if model_name:
+             input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search, model_name=model_name)
+         else:
+             input_embedding = get_sentence_transformer_embeddings(st.session_state.text_search)
+         for index in range(len(categories)):
+             ##########################################
+             # TODO: Compute cosine similarity between input sentence and categories
+             # TODO: Update category embeddings if category not found
+             ##########################################
+             cat = categories[index]
+
+             if cat not in category_embeddings:
+                 update_category_embeddings(embeddings_metadata)
+                 # Re-read the freshly rebuilt cache from session state
+                 category_embeddings = st.session_state["cat_embed_" + model_name]
+
+             cosine_score = cosine_similarity(input_embedding, category_embeddings[cat])
+             cosine_sim[index] = cosine_score[0][0]
+
+     cosine_sim = sorted(cosine_sim.items(), key=lambda x: x[1], reverse=True)
+
+     print(type(cosine_sim))
+
+     return list(cosine_sim)
+
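+ # Note: the returned value is a list of (category_index, score) pairs sorted by
+ # descending exponentiated cosine similarity; the plotting helpers below use the
+ # index to look the category name back up in st.session_state.categories.
+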
+ def plot_piechart(sorted_cosine_scores_items):
+     sorted_cosine_scores = np.array([
+         sorted_cosine_scores_items[index][1]
+         for index in range(len(sorted_cosine_scores_items))
+     ])
+     categories = st.session_state.categories.split(" ")
+     categories_sorted = [
+         categories[sorted_cosine_scores_items[index][0]]
+         for index in range(len(sorted_cosine_scores_items))
+     ]
+     fig, ax = plt.subplots()
+     ax.pie(sorted_cosine_scores, labels=categories_sorted, autopct="%1.1f%%")
+     st.pyplot(fig)  # Figure
+
+ def plot_piechart_helper(sorted_cosine_scores_items):
+     sorted_cosine_scores = np.array(
+         [
+             sorted_cosine_scores_items[index][1]
+             for index in range(len(sorted_cosine_scores_items))
+         ]
+     )
+     categories = st.session_state.categories.split(" ")
+     categories_sorted = [
+         categories[sorted_cosine_scores_items[index][0]]
+         for index in range(len(sorted_cosine_scores_items))
+     ]
+     fig, ax = plt.subplots(figsize=(3, 3))
+     my_explode = np.zeros(len(categories_sorted))
+     my_explode[0] = 0.2
+     if len(categories_sorted) == 3:
+         my_explode[1] = 0.1  # explode the runner-up slice slightly
+     elif len(categories_sorted) > 3:
+         my_explode[2] = 0.05
+     ax.pie(
+         sorted_cosine_scores,
+         labels=categories_sorted,
+         autopct="%1.1f%%",
+         explode=my_explode,
+     )
+
+     return fig
+
+ def plot_piecharts(sorted_cosine_scores_models):
+     scores_list = []
+     categories = st.session_state.categories.split(" ")
+     index = 0
+     for model in sorted_cosine_scores_models:
+         scores_list.append(sorted_cosine_scores_models[model])
+         # scores_list[index] = np.array([scores_list[index][ind2][1] for ind2 in range(len(scores_list[index]))])
+         index += 1
+
+     if len(sorted_cosine_scores_models) == 2:
+         fig, (ax1, ax2) = plt.subplots(2)
+
+         categories_sorted = [
+             categories[scores_list[0][index][0]] for index in range(len(scores_list[0]))
+         ]
+         sorted_scores = np.array(
+             [scores_list[0][index][1] for index in range(len(scores_list[0]))]
+         )
+         ax1.pie(sorted_scores, labels=categories_sorted, autopct="%1.1f%%")
+
+         categories_sorted = [
+             categories[scores_list[1][index][0]] for index in range(len(scores_list[1]))
+         ]
+         sorted_scores = np.array(
+             [scores_list[1][index][1] for index in range(len(scores_list[1]))]
+         )
+         ax2.pie(sorted_scores, labels=categories_sorted, autopct="%1.1f%%")
+
+         st.pyplot(fig)
+
+ def plot_alatirchart(sorted_cosine_scores_models):
+     models = list(sorted_cosine_scores_models.keys())
+     tabs = st.tabs(models)
+     figs = {}
+     for model in models:
+         figs[model] = plot_piechart_helper(sorted_cosine_scores_models[model])
+
+     for index in range(len(tabs)):
+         with tabs[index]:
+             st.pyplot(figs[models[index]])
+
+ ### Text Search ###
+ st.sidebar.title("GloVe Twitter")
+ st.sidebar.markdown(
+     """
+     GloVe is an unsupervised learning algorithm for obtaining vector representations for words. The Twitter model is pretrained on
+     2 billion tweets with a vocabulary size of 1.2 million. Download from [Stanford NLP](http://nlp.stanford.edu/data/glove.twitter.27B.zip).
+
+     Jeffrey Pennington, Richard Socher, and Christopher D. Manning. 2014. *GloVe: Global Vectors for Word Representation*.
+     """
+ )
+
+ model_type = st.sidebar.selectbox("Choose the model", ("25d", "50d"), index=1)
+
+
+ st.title("Search Based Retrieval Demo")
+ st.subheader(
+     "Pass in space-separated categories you want this search demo to be about."
+ )
+ # st.selectbox(label="Pick the categories you want this search demo to be about...",
+ #              options=("Flowers Colors Cars Weather Food", "Chocolate Milk", "Anger Joy Sad Frustration Worry Happiness", "Positive Negative"),
+ #              key="categories"
+ #              )
+ st.text_input(
+     label="Categories", key="categories", value="Flowers Colors Cars Weather Food"
+ )
+ print(st.session_state["categories"])
+ print(type(st.session_state["categories"]))
+ # print("Categories = ", categories)
+ # st.session_state.categories = categories
+
+ st.subheader("Pass in an input word or even a sentence")
+ text_search = st.text_input(
+     label="Input your sentence",
+     key="text_search",
+     value="Roses are red, trucks are blue, and Seattle is grey right now",
+ )
+ # st.session_state.text_search = text_search
+
+ # Download glove embeddings if they don't already exist locally
+ embeddings_path = "embeddings_" + str(model_type) + "_temp.npy"
+ word_index_dict_path = "word_index_dict_" + str(model_type) + "_temp.pkl"
+ if not os.path.isfile(embeddings_path) or not os.path.isfile(word_index_dict_path):
+     print("Model type = ", model_type)
+     glove_path = "Data/glove_" + str(model_type) + ".pkl"
+     print("glove_path = ", glove_path)
+
+     # Download embeddings from google drive
+     with st.spinner("Downloading glove embeddings..."):
+         download_glove_embeddings_gdrive(model_type)
+
+
+ # Load glove embeddings
+ word_index_dict, embeddings = load_glove_embeddings_gdrive(model_type)
+
+
+ # Find the closest category to the input sentence
+ if st.session_state.text_search:
+     # Glove embeddings
+     print("Glove Embedding")
+     embeddings_metadata = {
+         "embedding_model": "glove",
+         "word_index_dict": word_index_dict,
+         "embeddings": embeddings,
+         "model_type": model_type,
+     }
+     with st.spinner("Obtaining Cosine similarity for Glove..."):
+         sorted_cosine_sim_glove = get_sorted_cosine_similarity(
+             st.session_state.text_search, embeddings_metadata
+         )
+
+     # Sentence transformer embeddings
+     print("Sentence Transformer Embedding")
+     embeddings_metadata = {"embedding_model": "transformers", "model_name": ""}
+     with st.spinner("Obtaining Cosine similarity for 384d sentence transformer..."):
+         sorted_cosine_sim_transformer = get_sorted_cosine_similarity(
+             st.session_state.text_search, embeddings_metadata
+         )
+
+     # Results and pie charts for both embedding models
+     print("Categories are: ", st.session_state.categories)
+     st.subheader(
+         "Closest category among: "
+         + st.session_state.categories
+         + " as per different embeddings"
+     )
+
+     print(sorted_cosine_sim_glove)
+     print(sorted_cosine_sim_transformer)
+     # print(sorted_distilbert)
+     # One pie chart per model, shown in tabs
+     plot_alatirchart(
+         {
+             "glove_" + str(model_type): sorted_cosine_sim_glove,
+             "sentence_transformer_384": sorted_cosine_sim_transformer,
+         }
+     )
+     # "distilbert_512": sorted_distilbert})
+
+     st.write("")
+     st.write(
+         "Demo developed by [Dr. Karthik Mohan](https://www.linkedin.com/in/karthik-mohan-72a4b323/)"
+     )