taskswithcode committed on
Commit
aea620e
1 Parent(s): 0f0b570
app.py ADDED
@@ -0,0 +1,292 @@
+ import time
+ import sys
+ import streamlit as st
+ import string
+ from io import StringIO
+ import pdb
+ import json
+ from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
+ from twc_clustering import TWCClustering
+ import torch
+ import requests
+ import socket
+
+
+ MAX_INPUT = 100
+
+ SEM_SIMILARITY="1"
+ DOC_RETRIEVAL="2"
+ CLUSTERING="3"
+
+
+ use_case = {"1":"Finding similar phrases/sentences","2":"Retrieving semantically matching information to a query. It may not be a factual match","3":"Clustering"}
+ use_case_url = {"1":"https://huggingface.co/spaces/taskswithcode/semantic_similarity","2":"https://huggingface.co/spaces/taskswithcode/semantic_search","3":""}
+
+
+
+ from transformers import BertTokenizer, BertForMaskedLM
+
+
+ APP_NAME = "hf/semantic_clustering"
+ INFO_URL = "http://www.taskswithcode.com/stats/"
+
+
+
+
+
+ def get_views(action):
+     ret_val = 0
+     hostname = socket.gethostname()
+     ip_address = socket.gethostbyname(hostname)
+     if ("view_count" not in st.session_state):
+         try:
+             app_info = {'name': APP_NAME,"action":action,"host":hostname,"ip":ip_address}
+             res = requests.post(INFO_URL, json = app_info).json()
+             print(res)
+             data = res["count"]
+         except:
+             data = 0
+         ret_val = data
+         st.session_state["view_count"] = data
+     else:
+         ret_val = st.session_state["view_count"]
+         if (action != "init"):
+             app_info = {'name': APP_NAME,"action":action,"host":hostname,"ip":ip_address}
+             res = requests.post(INFO_URL, json = app_info).json()
+     return "{:,}".format(ret_val)
+
+
+
+
+ def construct_model_info_for_display(model_names):
+     options_arr = []
+     markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b><br/><i>These are either state-of-the-art or the most downloaded models on Huggingface</i></div>"
+     markdown_str += f"<div style=\"font-size:2px; color: #2f2f2f; text-align: left\"><br/></div>"
+     for node in model_names:
+         options_arr.append(node["name"])
+         if (node["mark"] == "True"):
+             markdown_str += f"<div style=\"font-size:16px; color: #5f5f5f; text-align: left\">&nbsp;•&nbsp;Model:&nbsp;<a href=\'{node['paper_url']}\' target='_blank'>{node['name']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Code released by:&nbsp;<a href=\'{node['orig_author_url']}\' target='_blank'>{node['orig_author']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Model info:&nbsp;<a href=\'{node['sota_info']['sota_link']}\' target='_blank'>{node['sota_info']['task']}</a></div>"
+             if ("Note" in node):
+                 markdown_str += f"<div style=\"font-size:16px; color: #a91212; text-align: left\">&nbsp;&nbsp;&nbsp;&nbsp;{node['Note']}<a href=\'{node['alt_url']}\' target='_blank'>link</a></div>"
+             markdown_str += "<div style=\"font-size:16px; color: #5f5f5f; text-align: left\"><br/></div>"
+
+     markdown_str += "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><b>Note:</b><br/>•&nbsp;Uploaded files are loaded into non-persistent memory for the duration of the computation. They are not cached</div>"
+     limit = "{:,}".format(MAX_INPUT)
+     markdown_str += f"<div style=\"font-size:12px; color: #9f9f9f; text-align: left\">•&nbsp;User uploaded file has a maximum limit of {limit} sentences.</div>"
+     return options_arr,markdown_str
+
+
+ st.set_page_config(page_title='TWC - Compare popular/state-of-the-art models for tasks using sentence embeddings', page_icon="logo.jpg", layout='centered', initial_sidebar_state='auto',
+             menu_items={
+                 'About': 'This app was created by taskswithcode. http://taskswithcode.com'
+
+             })
+ col,pad = st.columns([85,15])
+
+ with col:
+     st.image("long_form_logo_with_icon.png")
+
+
+ @st.experimental_memo
+ def load_model(model_name,model_class,load_model_name):
+     try:
+         ret_model = None
+         obj_class = globals()[model_class]
+         ret_model = obj_class()
+         ret_model.init_model(load_model_name)
+         assert(ret_model is not None)
+     except Exception as e:
+         st.error("Unable to load model:" + model_name + " " + load_model_name + " " + str(e))
+         pass
+     return ret_model
+
+
+
+ @st.experimental_memo
+ def cached_compute_similarity(sentences,_model,model_name,threshold,_cluster):
+     texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
+     results = _cluster.cluster(None,texts,embeddings,threshold)
+     return results
+
+
+ def uncached_compute_similarity(sentences,_model,model_name,threshold,cluster):
+     with st.spinner('Computing vectors for sentences'):
+         texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
+         results = cluster.cluster(None,texts,embeddings,threshold)
+         #st.success("Similarity computation complete")
+     return results
+
+ DEFAULT_HF_MODEL = "sentence-transformers/paraphrase-MiniLM-L6-v2"
+ def get_model_info(model_names,model_name):
+     for node in model_names:
+         if (model_name == node["name"]):
+             return node,model_name
+     return get_model_info(model_names,DEFAULT_HF_MODEL)
+
+
+ def run_test(model_names,model_name,sentences,display_area,threshold,user_uploaded,custom_model):
+     display_area.text("Loading model:" + model_name)
+     #Note. model_name may get mapped to a new name in the call below for custom models
+     orig_model_name = model_name
+     model_info,model_name = get_model_info(model_names,model_name)
+     if (model_name != orig_model_name):
+         load_model_name = orig_model_name
+     else:
+         load_model_name = model_info["model"]
+     if ("Note" in model_info):
+         fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
+         display_area.write(fail_link)
+     model = load_model(model_name,model_info["class"],load_model_name)
+     display_area.text("Model " + model_name + " load complete")
+     try:
+         if (user_uploaded):
+             results = uncached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"])
+         else:
+             display_area.text("Computing vectors for sentences")
+             results = cached_compute_similarity(sentences,model,model_name,threshold,st.session_state["cluster"])
+         display_area.text("Similarity computation complete")
+         return results
+
+     except Exception as e:
+         st.error("Some error occurred during prediction: " + str(e))
+         st.stop()
+         return {}
+
+
+
+
+
+ def display_results(orig_sentences,results,response_info,app_mode,model_name):
+     main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
+     main_sent += f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">Showing results for model:&nbsp;<b>{model_name}</b></div>"
+     score_text = "cosine distance"
+     main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Clustering by {score_text}.&nbsp;<b>{len(results['clusters'])} clusters</b>.&nbsp;&nbsp;mean:{results['info']['mean']:.2f}&nbsp;std:{results['info']['std']:.2f}&nbsp;threshold hints:{str(results['info']['zscores'])}</div>"
+     body_sent = []
+     download_data = {}
+     for i in range(len(results["clusters"])):
+         pivot_index = results["clusters"][i]["pivot_index"]
+         pivot_sent = orig_sentences[pivot_index]
+         pivot_index += 1
+         d_cluster = {}
+         download_data[i + 1] = d_cluster
+         d_cluster["pivot"] = {"pivot_index":pivot_index,"sent":pivot_sent,"children":{}}
+         body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{pivot_index}]&nbsp;{pivot_sent}&nbsp;<b><i>(Cluster {i+1})</i></b>&nbsp;&nbsp;</div>")
+         neighs_dict = results["clusters"][i]["neighs"]
+         for key in neighs_dict:
+             cosine_dist = neighs_dict[key]
+             child_index = key
+             sentence = orig_sentences[child_index]
+             child_index += 1
+             body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{child_index}]&nbsp;{sentence}&nbsp;&nbsp;&nbsp;<b>{cosine_dist:.2f}</b></div>")
+             d_cluster["pivot"]["children"][sentence] = f"{cosine_dist:.2f}"
+         body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">&nbsp;</div>")
+     main_sent = main_sent + "\n" + '\n'.join(body_sent)
+     st.markdown(main_sent,unsafe_allow_html=True)
+     st.session_state["download_ready"] = json.dumps(download_data,indent=4)
+     get_views("submit")
+
+
+ def init_session():
+     if ("model_name" not in st.session_state):
+         st.session_state["model_name"] = "ss_test"
+         st.session_state["download_ready"] = None
+         st.session_state["model_name"] = "ss_test"
+         st.session_state["threshold"] = 1.5
+         st.session_state["file_name"] = "default"
+         st.session_state["cluster"] = TWCClustering()
+     else:
+         print("Skipping init session")
+
+ def app_main(app_mode,example_files,model_name_files):
+     init_session()
+     with open(example_files) as fp:
+         example_file_names = json.load(fp)
+     with open(model_name_files) as fp:
+         model_names = json.load(fp)
+     curr_use_case = use_case[app_mode].split(".")[0]
+     st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for tasks using sentence embeddings</h5>", unsafe_allow_html=True)
+     st.markdown(f"<p style='font-size:14px; color: #4f4f4f; text-align: center'><i>Or compare your own model with state-of-the-art/popular models</p>", unsafe_allow_html=True)
+     st.markdown(f"<div style='color: #4f4f4f; text-align: left'>Use cases for sentence embeddings<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;<a href=\'{use_case_url['1']}\' target='_blank'>{use_case['1']}</a><br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;<a href=\'{use_case_url['2']}\' target='_blank'>{use_case['2']}</a><br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['3']}<br/><i>This app illustrates <b>'{curr_use_case}'</b> use case</i></div>", unsafe_allow_html=True)
+     st.markdown(f"<div style='color: #9f9f9f; text-align: right'>views:&nbsp;{get_views('init')}</div>", unsafe_allow_html=True)
+
+
+     try:
+
+
+         with st.form('twc_form'):
+
+             step1_line = "Step 1. Upload a text file (one sentence per line) or choose an example text file below"
+             if (app_mode == DOC_RETRIEVAL):
+                 step1_line += ". The first line is treated as the query"
+             uploaded_file = st.file_uploader(step1_line, type=".txt")
+
+             selected_file_index = st.selectbox(label=f'Example files ({len(example_file_names)})',
+                 options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
+             st.write("")
+             options_arr,markdown_str = construct_model_info_for_display(model_names)
+             selection_label = 'Step 2. Select Model'
+             selected_model = st.selectbox(label=selection_label,
+                 options = options_arr, index=0, key = "twc_model")
+             st.write("")
+             custom_model_selection = st.text_input("Model not listed above? Type any Huggingface semantic search model name ", "",key="custom_model")
+             hf_link_str = "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><a href='https://huggingface.co/models?pipeline_tag=sentence-similarity' target = '_blank'>List of Huggingface semantic search models</a><br/><br/><br/></div>"
+             st.markdown(hf_link_str, unsafe_allow_html=True)
+             threshold = st.number_input('Step 3. Choose a zscore threshold (number of std devs from mean)',value=st.session_state["threshold"],min_value = 0.0,step=.01)
+             st.write("")
+             submit_button = st.form_submit_button('Run')
+
+
+         input_status_area = st.empty()
+         display_area = st.empty()
+         if submit_button:
+             start = time.time()
+             if uploaded_file is not None:
+                 st.session_state["file_name"] = uploaded_file.name
+                 sentences = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
+             else:
+                 st.session_state["file_name"] = example_file_names[selected_file_index]["name"]
+                 sentences = open(example_file_names[selected_file_index]["name"]).read()
+             sentences = sentences.split("\n")[:-1]
+             if (len(sentences) > MAX_INPUT):
+                 st.info(f"Input sentence count exceeds maximum sentence limit. First {MAX_INPUT} out of {len(sentences)} sentences chosen")
+                 sentences = sentences[:MAX_INPUT]
+             if (len(custom_model_selection) != 0):
+                 run_model = custom_model_selection
+             else:
+                 run_model = selected_model
+             st.session_state["model_name"] = selected_model
+             st.session_state["threshold"] = threshold
+             results = run_test(model_names,run_model,sentences,display_area,threshold,(uploaded_file is not None),(len(custom_model_selection) != 0))
+             display_area.empty()
+             with display_area.container():
+                 device = 'GPU' if torch.cuda.is_available() else 'CPU'
+                 response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
+                 if (len(custom_model_selection) != 0):
+                     st.info("Custom model overrides model selection in step 2 above. So please clear the custom model text box to choose models from step 2")
+                 display_results(sentences,results,response_info,app_mode,run_model)
+                 #st.json(results)
+                 st.download_button(
+                     label="Download results as json",
+                     data= st.session_state["download_ready"] if st.session_state["download_ready"] != None else "",
+                     disabled = False if st.session_state["download_ready"] != None else True,
+                     file_name= (st.session_state["model_name"] + "_" + str(st.session_state["threshold"]) + "_" + '_'.join(st.session_state["file_name"].split(".")[:-1]) + ".json").replace("/","_"),
+                     mime='text/json',
+                     key ="download"
+                 )
+
+
+
+     except Exception as e:
+         st.error("Some error occurred during loading: " + str(e))
+         st.stop()
+
+     st.markdown(markdown_str, unsafe_allow_html=True)
+
+
+
+ if __name__ == "__main__":
+     #print("command line input:",len(sys.argv),str(sys.argv))
+     #app_main(sys.argv[1],sys.argv[2],sys.argv[3])
+     #app_main("1","sim_app_examples.json","sim_app_models.json")
+     app_main("3","clus_app_examples.json","clus_app_models.json")
+
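
For reference, the pipeline that run_test drives can also be exercised outside Streamlit. A minimal sketch for illustration only (it assumes the modules in this commit are importable and reuses the default model name and threshold defined above):

    from twc_embeddings import HFModel
    from twc_clustering import TWCClustering

    sentences = ["deep learning", "machine learning", "shallow waters"]
    model = HFModel()
    model.init_model("sentence-transformers/paraphrase-MiniLM-L6-v2")
    texts, embeddings = model.compute_embeddings(sentences, is_file=False)
    # threshold is a z-score: number of std devs above the mean pairwise similarity
    results = TWCClustering().cluster(None, texts, embeddings, threshold=1.5)
    print(len(results["clusters"]), results["info"])
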
clus_app_examples.json ADDED
@@ -0,0 +1,5 @@
+ {
+ "Machine learning terms (phrases test)": {"name":"small_test.txt"},
+ "Customer feedback mixed with noise":{"name":"larger_test.txt"},
+ "Movie reviews": {"name":"imdb_sent.txt"}
+ }
clus_app_models.json ADDED
@@ -0,0 +1,90 @@
+ [
+
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
+ "orig_author_url":"https://github.com/UKPLab",
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
+ "sota_info": {
+ "task":"Over 3.8 million downloads from Huggingface",
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
+ },
+ "paper_url":"https://arxiv.org/abs/1908.10084",
+ "mark":"True",
+ "class":"HFModel"},
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
+ "orig_author_url":"https://github.com/UKPLab",
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
+ "sota_info": {
+ "task":"Over 2 million downloads from Huggingface",
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
+ },
+ "paper_url":"https://arxiv.org/abs/1908.10084",
+ "mark":"True",
+ "class":"HFModel"},
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
+ "orig_author_url":"https://github.com/UKPLab",
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
+ "sota_info": {
+ "task":"Over 700,000 downloads from Huggingface",
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
+ },
+ "paper_url":"https://arxiv.org/abs/1908.10084",
+ "mark":"True",
+ "class":"HFModel"},
+ { "name":"sentence-transformers/all-mpnet-base-v2",
+ "model":"sentence-transformers/all-mpnet-base-v2",
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
+ "orig_author_url":"https://github.com/UKPLab",
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
+ "sota_info": {
+ "task":"Over 500,000 downloads from Huggingface",
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
+ },
+ "paper_url":"https://arxiv.org/abs/1908.10084",
+ "mark":"True",
+ "class":"HFModel"},
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
+ "orig_author_url":"https://github.com/UKPLab",
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
+ "sota_info": {
+ "task":"Over 500,000 downloads from Huggingface",
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
+ },
+ "paper_url":"https://arxiv.org/abs/1908.10084",
+ "mark":"True",
+ "class":"HFModel"},
+
+ { "name":"SGPT-125M",
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
+ "fork_url":"https://github.com/taskswithcode/sgpt",
+ "orig_author_url":"https://github.com/Muennighoff",
+ "orig_author":"Niklas Muennighoff",
+ "sota_info": {
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
+ },
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
+ "mark":"True",
+ "class":"SGPTModel"},
+ { "name":"SIMCSE-base" ,
+ "model":"princeton-nlp/sup-simcse-roberta-base",
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
+ "orig_author_url":"https://github.com/princeton-nlp",
+ "orig_author":"Princeton Natural Language Processing",
+ "sota_info": {
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
+ },
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
+ "mark":"True",
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
+
+
+ ]
imdb_sent.txt ADDED
@@ -0,0 +1,62 @@
1
+ "A rating of ""1"" does not begin to express how dull, depressing and relentlessly bad this movie is."
2
+ Hated it with all my being. Worst movie ever. Mentally- scarred. Help me. It was that bad.TRUST ME!!!
3
+ "Long, boring, blasphemous. Never have I been so glad to see ending credits roll."
4
+ This film made John Glover a star. Alan Raimy is one of the most compelling character that I have ever seen on film. And I mean that sport.
5
+ "Were I not with friends, and so cheap, I would have walked out. It failed miserably as satire and didn't even have the redemption of camp."
6
+ For pure gothic vampire cheese nothing can compare to the Subspecies films. I highly recommend each and every one of them.
7
+ "A great film in its genre, the direction, acting, most especially the casting of the film makes it even more powerful. A must see."
8
+ "This is a terrible movie, don't waste your money on it. Don't even watch it for free. That's all I have to say."
9
+ I wouldn't rent this one even on dollar rental night.
10
+ "More suspenseful, more subtle, much, much more disturbing...."
11
+ This is a good film. This is very funny. Yet after this film there were no good Ernest films!
12
+ A touching movie. It is full of emotions and wonderful acting. I could have sat through it a second time.
13
+ "Great movie - especially the music - Etta James - ""At Last"". This speaks volumes when you have finally found that special someone."
14
+ If you've ever had a mad week-end out with your mates then you'll appreciate this film. Excellent fun and a laugh a minute.
15
+ "I think it's one of the greatest movies which are ever made, and I've seen many... The book is better, but it's still a very good movie!"
16
+ Brilliant and moving performances by Tom Courtenay and Peter Finch.
17
+ The characters are unlikeable and the script is awful. It's a waste of the talents of Deneuve and Auteuil.
18
+ You've got to be kidding. This movie sucked for the sci-fi fans. I would only recommend watching this only if you think Armageddon was good.
19
+ Ten minutes of people spewing gallons of pink vomit. Recurring scenes of enormous piles of dog excrement - need one say more???
20
+ "As usual, Sean Connery does a great job. Lawrence Fishburn is good, but I have a hard time not seeing him as Ike Turner."
21
+ This movie is terrible but it has some good effects.
22
+ You'd better choose Paul Verhoeven's even if you have watched it.
23
+ "Brilliant. Ranks along with Citizen Kane, The Matrix and Godfathers. Must see, at least for basset in her early days. Watch it."
24
+ "I don't know why I like this movie so well, but I never get tired of watching it."
25
+ The one-liners fly so fast in this movie that you can watch it over and over and still catch new ones. By far one of the best of this genre.
26
+ "Don't waste your time and money on it. It's not quite as bad as ""Adrenalin"", by the same director but that's not saying much."
27
+ "Read the book, forget the movie!"
28
+ This is a great movie. Too bad it is not available on home video.
29
+ "Very intelligent language usage of Ali, which you musn't miss! In one word: (eeh sentence...) Wicked, so keep it real and pass it on!"
30
+ Primary plot!Primary direction!Poor interpretation.
31
+ "If you like Pauly Shore, you'll love Son in Law. If you hate Pauly Shore, then, well...I liked it!"
32
+ Just love the interplay between two great characters of stage & screen - Veidt & Barrymore
33
+ "This movie will always be a Broadway and Movie classic, as long as there are still people who sing, dance, and act."
34
+ This is the greatest movie ever. If you have written it off with out ever seeing it. You must give it a second try.
35
+ "What a script, what a story, what a mess!"
36
+ "I caught this film late at night on HBO. Talk about wooden acting, unbelievable plot, et al. Very little going in its favor. Skip it."
37
+ This is without a doubt the worst movie I have ever seen. It is not funny. It is not interesting and should not have been made.
38
+ Ming The Merciless does a little Bardwork and a movie most foul!
39
+ This is quite possibly the worst sequel ever made. The script is unfunny and the acting stinks. The exact opposite of the original.
40
+ "This is the definitive movie version of Hamlet. Branagh cuts nothing, but there are no wasted moments."
41
+ My favorite movie. What a great story this really was. I'd just like to be able to buy a copy of it but this does not seem possible.
42
+ "Comment this movie is impossible. Is terrible, very improbable, bad interpretation e direction. Not look!!!!!"
43
+ "Brilliant movie. The drawings were just amazing. Too bad it ended before it begun. I´ve waited 21 years for a sequel, but nooooo!!!"
44
+ a mesmerizing film that certainly keeps your attention... Ben Daniels is fascinating (and courageous) to watch.
45
+ "This is a very cool movie. The ending of the movie is a bit more defined than the play's ending, but either way it is still a good movie."
46
+ "Without a doubt, one of Tobe Hoppor's best! Epic storytellng, great special effects, and The Spacegirl (vamp me baby!)."
47
+ I hope this group of film-makers never re-unites.
48
+ Unwatchable. You can't even make it past the first three minutes. And this is coming from a huge Adam Sandler fan!!1
49
+ "One of the funniest movies made in recent years. Good characterization, plot and exceptional chemistry make this one a classic"
50
+ "Add this little gem to your list of holiday regulars. It is sweet, funny, and endearing"
51
+ "no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!"
52
+ "If you haven't seen this, it's terrible. It is pure trash. I saw this about 17 years ago, and I'm still screwed up from it."
53
+ Absolutely fantastic! Whatever I say wouldn't do this underrated movie the justice it deserves. Watch it now! FANTASTIC!
54
+ "As a big fan of Tiny Toon Adventures, I loved this movie!!! It was so funny!!! It really captured how cartoons spent their summers."
55
+ Widow hires a psychopath as a handyman. Sloppy film noir thriller which doesn't make much of its tension promising set-up. (3/10)
56
+ The Fiendish Plot of Dr. Fu Manchu (1980). This is hands down the worst film I've ever seen. What a sad way for a great comedian to go out.
57
+ "Obviously written for the stage. Lightweight but worthwhile. How can you go wrong with Ralph Richardson, Olivier and Merle Oberon."
58
+ This movie turned out to be better than I had expected it to be. Some parts were pretty funny. It was nice to have a movie with a new plot.
59
+ This movie is terrible. It's about some no brain surfin dude that inherits some company. Does Carrot Top have no shame?
60
+ Adrian Pasdar is excellent is this film. He makes a fascinating woman.
61
+ "An unfunny, unworthy picture which is an undeserving end to Peter Sellers' career. It is a pity this movie was ever made."
62
+ "The plot was really weak and confused. This is a true Oprah flick. (In Oprah's world, all men are evil and all women are victims.)"
larger_test.txt ADDED
@@ -0,0 +1,52 @@
1
+ do u really want me to unistall this app...whenever i open this app, u ask for review...how many times i have to give review-feedback...
2
+ I don't like how it asks to give a review everytime I open the app
3
+ Stop asking for review everytime I open the app..it's pathetic..the updated version sucks
4
+ If i already provided the review for this application but why this application is asking for reviews every time when I am opening the application so improve this feature. This feature is very irritating. Apart from that overall experience is very good.
5
+ as you guys bother me so much for the review even i gave my opinion already but every time i open the app it ask for review so i gave it 1 star , previsiousally it was 4 star.
6
+ repeatedly asking to rate the app...
7
+ Very irritating. Everytime i open app it asked for review hence giving 2 instead of 4
8
+ stop asking for ratings every time when open the app. I had rated this app 5 star but now every time app asking for give rating, its disgusting. so I'll give only one star
9
+ I swear if i see that feedback ad one more time im gonna uninstall this app and start using another one else
10
+ I'm am downgrading my rating because the app is good and I also gave it 5 satar but why I am getting unnecessary pop up to give it review please fix it
11
+ No rating ... worsted app ... please playstore delete this app
12
+ Much bad experience . when I used to open the app it requires feedback every time.
13
+ I already rated it then why always it pop up... Its irritate me a lot everytime when I open this app... Plz fix this
14
+ This app any time ask me for rating i hate this
15
+ Very Good app but asks to rate it all the time....all these popups are annoying when you are in hurry
16
+ I'm already rated this app. And now from one week and adove this app is asking for rating please solve the problem as soon as possible. Thank you
17
+ The app is too good but it send me notification again and again to rate it that's why am I giving one star to it
18
+ Constantly asks me to rate the app! So annoying.
19
+ Today again i am go to rating this app due to its ad less and best interface with good features again more
20
+ App is very disturbing .. very bad app
21
+ If I don't want to rate it's my personal choice, so why this app gives notification every single time,, it's quite frustrating therefore 1star Other wise app is best for it's work
22
+ For frustrating me every time to rate your app
23
+ Super exalent app can you pls reply to my comment how is my review so thanks to provide this app thanks
24
+ Vey bad app so disturbance ... All time get notification about rating... That allready done
25
+ Earlier I had given 5 stars to this app but even after giving review in this app, it speaks to rate now, so I removed 5 stars in edit review and put 1 star, now this app will be happy.
26
+ Every time I open the app it asking rating. I rated 4 before now I de rate to 1."
27
+ I love this app. So damn good
28
+ This app rocks!!!!!!
29
+ This app totally sucks
30
+ Wow what a useful app
31
+ I cant live without this app!
32
+ Shit! This app rocks. I can never imagine going out without using this app. So damn useful!
33
+ Elon musk is the founder of SpaceX
34
+ Parasites suck blood out of deer
35
+ A review of his conduct revealed he violated the rules everytime he downloaded movies
36
+ My god. If only I could rate this app 100 stars for its excellence
37
+ The board conducted a review and determined electons were fair
38
+ WTF!
39
+ Crossing the chasm is a great book review that is often quoted by readers
40
+ Why am I seeing everything in double like I m drink - is my vision going bad???
41
+ Expolanets keep going round and round their stars many times a day
42
+ I have recommended this app to so many friends and they love it too
43
+ The sale of electric cars has gone up since the increse in gas prices
44
+ Stable diffusion app is the rage on the internet with multiple people either downloading on the laptop and trying it or useing the web interface
45
+ OpenAI is trying to make money by exposin their NLP apps through an API
46
+ Co:here, Ai21 and other are trying to emulate OpenAIs business model and exposing NLP apps that depend on LLMs through metered APIs
47
+ Serverless GPUs have emerged as a new business model catering to end users who want to host apps
48
+ Cerberas, Sambanova are betting on models to grow larger and harder to train on traditional GPUS
49
+ Nvidia released Hopper series as a successor to A100 series
50
+ Oh my god! I am done with this app!
51
+ Oh my god! I love this sweet puppy. He rounds around the chair so many times
52
+ I plan to write a nasty review for this shitty movie
long_form_logo_with_icon.png ADDED
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers
+ scipy
+ torch
+ sentencepiece
run.sh ADDED
@@ -0,0 +1,2 @@
+ streamlit run app.py --server.port 80 "1" "sim_app_examples.json" "sim_app_models.json"
+
small_test.txt ADDED
@@ -0,0 +1,30 @@
+ machine learning
+ Transformers have become the staple architecture for deep learning models
+ NLP
+ Diffusion models
+ natural language processing
+ deep learning
+ Deep Learning
+ Support vector machines
+ random forests
+ probability distribution
+ Cross entropy loss
+ Kullback leibler divergence
+ Shannon entropy
+ Activation functions
+ ATM
+ deep fakes
+ AGI
+ AI
+ deep trouble
+ artificial intelligence
+ deep diving
+ artificial snow
+ shallow waters
+ deep end
+ RELU
+ sigmoid
+ GELU
+ RNN
+ CNN
+ Gaussian
twc_clustering.py ADDED
@@ -0,0 +1,93 @@
+ from scipy.spatial.distance import cosine
+ import argparse
+ import json
+ import pdb
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ import time
+ from collections import OrderedDict
+
+
+ class TWCClustering:
+     def __init__(self):
+         print("In Zscore Clustering")
+
+     def compute_matrix(self,embeddings):
+         print("Computing similarity matrix ...")
+         embeddings = np.array(embeddings)
+         start = time.time()
+         vec_a = embeddings.T #vec_a shape (embedding_dim, num_sentences)
+         vec_a = vec_a/np.linalg.norm(vec_a,axis=0) #Norm is along axis 0 - rows
+         vec_a = vec_a.T #vec_a shape becomes (num_sentences, embedding_dim)
+         similarity_matrix = np.inner(vec_a,vec_a)
+         end = time.time()
+         time_val = (end-start)*1000
+         print(f"Similarity matrix computation complete. Time taken:{(time_val/(1000*60)):.2f} minutes")
+         return similarity_matrix
+
+     def get_terms_above_threshold(self,matrix,embeddings,pivot_index,threshold):
+         run_index = pivot_index
+         picked_arr = []
+         while (run_index < len(embeddings)):
+             if (matrix[pivot_index][run_index] >= threshold):
+                 #picked_arr.append({"index":run_index,"val":matrix[pivot_index][run_index]})
+                 picked_arr.append({"index":run_index})
+             run_index += 1
+         return picked_arr
+
+     def update_picked_dict(self,picked_dict,in_dict):
+         for key in in_dict:
+             picked_dict[key] = 1
+
+     def find_pivot_subgraph(self,pivot_index,arr,matrix,threshold):
+         center_index = pivot_index
+         center_score = 0
+         center_dict = {}
+         for i in range(len(arr)):
+             node_i_index = arr[i]["index"]
+             running_score = 0
+             temp_dict = {}
+             for j in range(len(arr)):
+                 node_j_index = arr[j]["index"]
+                 cosine_dist = matrix[node_i_index][node_j_index]
+                 if (cosine_dist < threshold):
+                     continue
+                 running_score += cosine_dist
+                 temp_dict[node_j_index] = cosine_dist
+             if (running_score > center_score):
+                 center_index = node_i_index
+                 center_dict = temp_dict
+                 center_score = running_score
+         sorted_d = OrderedDict(sorted(center_dict.items(), key=lambda kv: kv[1], reverse=True))
+         return {"pivot_index":center_index,"orig_index":pivot_index,"neighs":sorted_d}
+
+
+     def cluster(self,output_file,texts,embeddings,threshold = 1.5):
+         matrix = self.compute_matrix(embeddings)
+         mean = np.mean(matrix)
+         std = np.std(matrix)
+         zscores = []
+         inc = 0
+         value = mean
+         while (value < 1):
+             zscores.append(round(value,2))
+             inc += 1
+             value = mean + inc*std
+         print("In clustering:",round(std,2),zscores)
+         cluster_dict = {}
+         cluster_dict["clusters"] = []
+         picked_dict = {}
+
+         for i in range(len(embeddings)):
+             if (i in picked_dict):
+                 continue
+             zscore = mean + threshold*std
+             arr = self.get_terms_above_threshold(matrix,embeddings,i,zscore)
+             cluster_info = self.find_pivot_subgraph(i,arr,matrix,zscore)
+             self.update_picked_dict(picked_dict,cluster_info["neighs"])
+             cluster_dict["clusters"].append(cluster_info)
+         cluster_dict["info"] ={"mean":mean,"std":std,"zscores":zscores}
+         return cluster_dict
+
+
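
For reference, the dictionary returned by cluster() above has roughly the following shape (the values are illustrative placeholders, not real output):

    {
        "clusters": [
            {"pivot_index": 0,    # index of the cluster's center sentence
             "orig_index": 0,     # index the scan started from
             # member index -> similarity to the pivot (inner product of the
             # normalized embeddings; named cosine_dist in the code), sorted descending
             "neighs": OrderedDict([(3, 0.83), (7, 0.79)])}
        ],
        # similarity-matrix statistics and the suggested threshold hints shown in the app
        "info": {"mean": 0.42, "std": 0.11, "zscores": [0.42, 0.53, 0.64]}
    }
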
twc_embeddings.py ADDED
@@ -0,0 +1,407 @@
1
+ from transformers import AutoModel, AutoTokenizer
2
+ from transformers import AutoModelForCausalLM
3
+ from scipy.spatial.distance import cosine
4
+ import argparse
5
+ import json
6
+ import pdb
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ def read_text(input_file):
11
+ arr = open(input_file).read().split("\n")
12
+ return arr[:-1]
13
+
14
+
15
+ class CausalLMModel:
16
+ def __init__(self):
17
+ self.model = None
18
+ self.tokenizer = None
19
+ self.debug = False
20
+ print("In CausalLMModel Constructor")
21
+
22
+ def init_model(self,model_name = None):
23
+ # Get our models - The package will take care of downloading the models automatically
24
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
25
+ if (self.debug):
26
+ print("Init model",model_name)
27
+ # For best performance: EleutherAI/gpt-j-6B
28
+ if (model_name is None):
29
+ model_name = "EleutherAI/gpt-neo-125M"
30
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
31
+ self.model = AutoModelForCausalLM.from_pretrained(model_name)
32
+ self.model.eval()
33
+ self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'
34
+
35
+ def compute_embeddings(self,input_data,is_file):
36
+ if (self.debug):
37
+ print("Computing embeddings for:", input_data[:20])
38
+ model = self.model
39
+ tokenizer = self.tokenizer
40
+
41
+ texts = read_text(input_data) if is_file == True else input_data
42
+ query = texts[0]
43
+ docs = texts[1:]
44
+
45
+ # Tokenize input texts
46
+
47
+ #print(f"Query: {query}")
48
+ scores = []
49
+ for doc in docs:
50
+ context = self.prompt.format(doc)
51
+
52
+ context_enc = tokenizer.encode(context, add_special_tokens=False)
53
+ continuation_enc = tokenizer.encode(query, add_special_tokens=False)
54
+ # Slice off the last token, as we take its probability from the one before
55
+ model_input = torch.tensor(context_enc+continuation_enc[:-1])
56
+ continuation_len = len(continuation_enc)
57
+ input_len, = model_input.shape
58
+
59
+ # [seq_len] -> [seq_len, vocab]
60
+ logprobs = torch.nn.functional.log_softmax(model(model_input)[0], dim=-1).cpu()
61
+ # [seq_len, vocab] -> [continuation_len, vocab]
62
+ logprobs = logprobs[input_len-continuation_len:]
63
+ # Gather the log probabilities of the continuation tokens -> [continuation_len]
64
+ logprobs = torch.gather(logprobs, 1, torch.tensor(continuation_enc).unsqueeze(-1)).squeeze(-1)
65
+ score = torch.sum(logprobs)
66
+ scores.append(score.tolist())
67
+ return texts,scores
68
+
69
+ def output_results(self,output_file,texts,scores,main_index = 0):
70
+ cosine_dict = {}
71
+ docs = texts[1:]
72
+ if (self.debug):
73
+ print("Total sentences",len(texts))
74
+ assert(len(scores) == len(docs))
75
+ for i in range(len(docs)):
76
+ cosine_dict[docs[i]] = scores[i]
77
+
78
+ if (self.debug):
79
+ print("Input sentence:",texts[main_index])
80
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
81
+ if (self.debug):
82
+ for key in sorted_dict:
83
+ print("Document score for \"%s\" is: %.3f" % (key[:100], sorted_dict[key]))
84
+ if (output_file is not None):
85
+ with open(output_file,"w") as fp:
86
+ fp.write(json.dumps(sorted_dict,indent=0))
87
+ return sorted_dict
88
+
89
+
90
+ class SGPTQnAModel:
91
+ def __init__(self):
92
+ self.model = None
93
+ self.tokenizer = None
94
+ self.debug = False
95
+ print("In SGPT Q&A Constructor")
96
+
97
+
98
+ def init_model(self,model_name = None):
99
+ # Get our models - The package will take care of downloading the models automatically
100
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
101
+ if (self.debug):
102
+ print("Init model",model_name)
103
+ if (model_name is None):
104
+ model_name = "Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit"
105
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
106
+ self.model = AutoModel.from_pretrained(model_name)
107
+ self.model.eval()
108
+ self.SPECB_QUE_BOS = self.tokenizer.encode("[", add_special_tokens=False)[0]
109
+ self.SPECB_QUE_EOS = self.tokenizer.encode("]", add_special_tokens=False)[0]
110
+
111
+ self.SPECB_DOC_BOS = self.tokenizer.encode("{", add_special_tokens=False)[0]
112
+ self.SPECB_DOC_EOS = self.tokenizer.encode("}", add_special_tokens=False)[0]
113
+
114
+
115
+ def tokenize_with_specb(self,texts, is_query):
116
+ # Tokenize without padding
117
+ batch_tokens = self.tokenizer(texts, padding=False, truncation=True)
118
+ # Add special brackets & pay attention to them
119
+ for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
120
+ if is_query:
121
+ seq.insert(0, self.SPECB_QUE_BOS)
122
+ seq.append(self.SPECB_QUE_EOS)
123
+ else:
124
+ seq.insert(0, self.SPECB_DOC_BOS)
125
+ seq.append(self.SPECB_DOC_EOS)
126
+ att.insert(0, 1)
127
+ att.append(1)
128
+ # Add padding
129
+ batch_tokens = self.tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
130
+ return batch_tokens
131
+
132
+ def get_weightedmean_embedding(self,batch_tokens, model):
133
+ # Get the embeddings
134
+ with torch.no_grad():
135
+ # Get hidden state of shape [bs, seq_len, hid_dim]
136
+ last_hidden_state = self.model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
137
+
138
+ # Get weights of shape [bs, seq_len, hid_dim]
139
+ weights = (
140
+ torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
141
+ .unsqueeze(0)
142
+ .unsqueeze(-1)
143
+ .expand(last_hidden_state.size())
144
+ .float().to(last_hidden_state.device)
145
+ )
146
+
147
+ # Get attn mask of shape [bs, seq_len, hid_dim]
148
+ input_mask_expanded = (
149
+ batch_tokens["attention_mask"]
150
+ .unsqueeze(-1)
151
+ .expand(last_hidden_state.size())
152
+ .float()
153
+ )
154
+
155
+ # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
156
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
157
+ sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
158
+
159
+ embeddings = sum_embeddings / sum_mask
160
+
161
+ return embeddings
162
+
163
+ def compute_embeddings(self,input_data,is_file):
164
+ if (self.debug):
165
+ print("Computing embeddings for:", input_data[:20])
166
+ model = self.model
167
+ tokenizer = self.tokenizer
168
+
169
+ texts = read_text(input_data) if is_file == True else input_data
170
+
171
+ queries = [texts[0]]
172
+ docs = texts[1:]
173
+ query_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(queries, is_query=True), self.model)
174
+ doc_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(docs, is_query=False), self.model)
175
+ return texts,(query_embeddings,doc_embeddings)
176
+
177
+
178
+
179
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
180
+ # Calculate cosine similarities
181
+ # Cosine similarities are in [-1, 1]. Higher means more similar
182
+ query_embeddings = embeddings[0]
183
+ doc_embeddings = embeddings[1]
184
+ cosine_dict = {}
185
+ queries = [texts[0]]
186
+ docs = texts[1:]
187
+ if (self.debug):
188
+ print("Total sentences",len(texts))
189
+ for i in range(len(docs)):
190
+ cosine_dict[docs[i]] = 1 - cosine(query_embeddings[0], doc_embeddings[i])
191
+
192
+ if (self.debug):
193
+ print("Input sentence:",texts[main_index])
194
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
195
+ if (self.debug):
196
+ for key in sorted_dict:
197
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
198
+ if (output_file is not None):
199
+ with open(output_file,"w") as fp:
200
+ fp.write(json.dumps(sorted_dict,indent=0))
201
+ return sorted_dict
202
+
203
+
204
+ class SimCSEModel:
205
+ def __init__(self):
206
+ self.model = None
207
+ self.tokenizer = None
208
+ self.debug = False
209
+ print("In SimCSE constructor")
210
+
211
+ def init_model(self,model_name = None):
212
+ if (model_name == None):
213
+ model_name = "princeton-nlp/sup-simcse-roberta-large"
214
+ #self.model = SimCSE(model_name)
215
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
216
+ self.model = AutoModel.from_pretrained(model_name)
217
+
218
+ def compute_embeddings(self,input_data,is_file):
219
+ texts = read_text(input_data) if is_file == True else input_data
220
+ inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
221
+ with torch.no_grad():
222
+ embeddings = self.model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
223
+ return texts,embeddings
224
+
225
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
226
+ # Calculate cosine similarities
227
+ # Cosine similarities are in [-1, 1]. Higher means more similar
228
+ cosine_dict = {}
229
+ #print("Total sentences",len(texts))
230
+ for i in range(len(texts)):
231
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
232
+
233
+ #print("Input sentence:",texts[main_index])
234
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
235
+ if (self.debug):
236
+ for key in sorted_dict:
237
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
238
+ if (output_file is not None):
239
+ with open(output_file,"w") as fp:
240
+ fp.write(json.dumps(sorted_dict,indent=0))
241
+ return sorted_dict
242
+
243
+
244
+
245
+ class SGPTModel:
246
+ def __init__(self):
247
+ self.model = None
248
+ self.tokenizer = None
249
+ self.debug = False
250
+ print("In SGPT Constructor")
251
+
252
+
253
+ def init_model(self,model_name = None):
254
+ # Get our models - The package will take care of downloading the models automatically
255
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
256
+ if (self.debug):
257
+ print("Init model",model_name)
258
+ if (model_name is None):
259
+ model_name = "Muennighoff/SGPT-125M-weightedmean-nli-bitfit"
260
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
261
+ self.model = AutoModel.from_pretrained(model_name)
262
+ #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
263
+ #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
264
+ #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
265
+ #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
266
+ # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
267
+ self.model.eval()
268
+
269
+ def compute_embeddings(self,input_data,is_file):
270
+ if (self.debug):
271
+ print("Computing embeddings for:", input_data[:20])
272
+ model = self.model
273
+ tokenizer = self.tokenizer
274
+
275
+ texts = read_text(input_data) if is_file == True else input_data
276
+
277
+ # Tokenize input texts
278
+ batch_tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
279
+
280
+ # Get the embeddings
281
+ with torch.no_grad():
282
+ # Get hidden state of shape [bs, seq_len, hid_dim]
283
+ last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
284
+
285
+ # Get weights of shape [bs, seq_len, hid_dim]
286
+ weights = (
287
+ torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
288
+ .unsqueeze(0)
289
+ .unsqueeze(-1)
290
+ .expand(last_hidden_state.size())
291
+ .float().to(last_hidden_state.device)
292
+ )
293
+
294
+ # Get attn mask of shape [bs, seq_len, hid_dim]
295
+ input_mask_expanded = (
296
+ batch_tokens["attention_mask"]
297
+ .unsqueeze(-1)
298
+ .expand(last_hidden_state.size())
299
+ .float()
300
+ )
301
+
302
+ # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
303
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
304
+ sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
305
+
306
+ embeddings = sum_embeddings / sum_mask
307
+ return texts,embeddings
308
+
309
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
310
+ # Calculate cosine similarities
311
+ # Cosine similarities are in [-1, 1]. Higher means more similar
312
+ cosine_dict = {}
313
+ if (self.debug):
314
+ print("Total sentences",len(texts))
315
+ for i in range(len(texts)):
316
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
317
+
318
+ if (self.debug):
319
+ print("Input sentence:",texts[main_index])
320
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
321
+ if (self.debug):
322
+ for key in sorted_dict:
323
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
324
+ if (output_file is not None):
325
+ with open(output_file,"w") as fp:
326
+ fp.write(json.dumps(sorted_dict,indent=0))
327
+ return sorted_dict
328
+
329
+
330
+
331
+
332
+
333
+ class HFModel:
334
+ def __init__(self):
335
+ self.model = None
336
+ self.tokenizer = None
337
+ self.debug = False
338
+ print("In HF Constructor")
339
+
340
+
341
+ def init_model(self,model_name = None):
342
+ # Get our models - The package will take care of downloading the models automatically
343
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
344
+ #print("Init model",model_name)
345
+ if (model_name is None):
346
+ model_name = "sentence-transformers/all-MiniLM-L6-v2"
347
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
348
+ self.model = AutoModel.from_pretrained(model_name)
349
+ self.model.eval()
350
+
351
+ def mean_pooling(self,model_output, attention_mask):
352
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
353
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
354
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
355
+
356
+ def compute_embeddings(self,input_data,is_file):
357
+ #print("Computing embeddings for:", input_data[:20])
358
+ model = self.model
359
+ tokenizer = self.tokenizer
360
+
361
+ texts = read_text(input_data) if is_file == True else input_data
362
+
363
+ encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
364
+
365
+ # Compute token embeddings
366
+ with torch.no_grad():
367
+ model_output = model(**encoded_input)
368
+
369
+ # Perform pooling
370
+ sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
371
+
372
+ # Normalize embeddings
373
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
374
+
375
+ return texts,sentence_embeddings
376
+
377
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
378
+ # Calculate cosine similarities
379
+ # Cosine similarities are in [-1, 1]. Higher means more similar
380
+ cosine_dict = {}
381
+ #print("Total sentences",len(texts))
382
+ for i in range(len(texts)):
383
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
384
+
385
+ #print("Input sentence:",texts[main_index])
386
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
387
+ if (self.debug):
388
+ for key in sorted_dict:
389
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
390
+ if (output_file is not None):
391
+ with open(output_file,"w") as fp:
392
+ fp.write(json.dumps(sorted_dict,indent=0))
393
+ return sorted_dict
394
+
395
+
396
+
397
+ if __name__ == '__main__':
398
+ parser = argparse.ArgumentParser(description='SGPT model for sentence embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
399
+ parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
400
+ parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
401
+ parser.add_argument('-model', action="store", dest="model",default="sentence-transformers/all-MiniLM-L6-v2",help="model name")
402
+
403
+ results = parser.parse_args()
404
+ obj = HFModel()
405
+ obj.init_model(results.model)
406
+ texts, embeddings = obj.compute_embeddings(results.input,is_file = True)
407
+ results = obj.output_results(results.output,texts,embeddings)
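
Note that, unlike the embedding classes above, CausalLMModel.compute_embeddings returns per-document log-probability scores rather than vectors, so its output_results ranks documents by score instead of cosine similarity. A minimal illustrative sketch (the sentences are made up for the example):

    from twc_embeddings import CausalLMModel

    scorer = CausalLMModel()
    scorer.init_model("EleutherAI/gpt-neo-125M")
    # The first item is treated as the query, the rest as candidate documents
    texts, scores = scorer.compute_embeddings(["how do transformers work",
                                               "Transformers have become the staple architecture for deep learning models",
                                               "shallow waters"], is_file=False)
    ranked = scorer.output_results(None, texts, scores)   # dict of document -> score, best match first
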