taskswithcode committed on
Commit
ce6a2ba
1 Parent(s): fb189b3

Initial addition

app.py ADDED
@@ -0,0 +1,250 @@
1
+ import time
2
+ import sys
3
+ import streamlit as st
4
+ import string
5
+ from io import StringIO
6
+ import pdb
7
+ import json
8
+ from twc_embeddings import HFModel,SimCSEModel,SGPTModel,CausalLMModel,SGPTQnAModel
9
+ import torch
10
+
11
+
12
+ MAX_INPUT = 100
13
+
14
+ SEM_SIMILARITY="1"
15
+ DOC_RETRIEVAL="2"
16
+ CLUSTERING="3"
17
+
18
+
19
+ use_case = {"1":"Finding similar phrases/sentences","2":"Retrieving semantically matching information to a query. It may not be a factual match","3":"Clustering"}
20
+
21
+
22
+
23
+
24
+ from transformers import BertTokenizer, BertForMaskedLM
25
+
26
+
27
+
28
+ view_count_file = "view_count.txt"
29
+
30
+ def get_views():
31
+ ret_val = 0
32
+ if ("view_count" not in st.session_state):
33
+ try:
34
+ data = int(open(view_count_file).read().strip("\n"))
35
+ except Exception:
36
+ data = 0
37
+ data += 1
38
+ ret_val = data
39
+ st.session_state["view_count"] = data
40
+ with open(view_count_file,"w") as fp:
41
+ fp.write(str(data))
42
+ else:
43
+ ret_val = st.session_state["view_count"]
44
+ return "{:,}".format(ret_val)
45
+
46
+
47
+
48
+ def construct_model_info_for_display(model_names):
49
+ options_arr = []
50
+ markdown_str = f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><br/><b>Models evaluated ({len(model_names)})</b></div>"
51
+ for node in model_names:
52
+ options_arr.append(node["name"])
53
+ if (node["mark"] == "True"):
54
+ markdown_str += f"<div style=\"font-size:16px; color: #5f5f5f; text-align: left\">&nbsp;•&nbsp;Model:&nbsp;<a href=\'{node['paper_url']}\' target='_blank'>{node['name']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Code released by:&nbsp;<a href=\'{node['orig_author_url']}\' target='_blank'>{node['orig_author']}</a><br/>&nbsp;&nbsp;&nbsp;&nbsp;Model info:&nbsp;<a href=\'{node['sota_info']['sota_link']}\' target='_blank'>{node['sota_info']['task']}</a></div>"
55
+ if ("Note" in node):
56
+ markdown_str += f"<div style=\"font-size:16px; color: #a91212; text-align: left\">&nbsp;&nbsp;&nbsp;&nbsp;{node['Note']}<a href=\'{node['alt_url']}\' target='_blank'>link</a></div>"
57
+ markdown_str += "<div style=\"font-size:16px; color: #5f5f5f; text-align: left\"><br/></div>"
58
+
59
+ markdown_str += "<div style=\"font-size:12px; color: #9f9f9f; text-align: left\"><b>Note:</b><br/>•&nbsp;Uploaded files are loaded into non-persistent memory for the duration of the computation. They are not cached</div>"
60
+ limit = "{:,}".format(MAX_INPUT)
61
+ markdown_str += f"<div style=\"font-size:12px; color: #9f9f9f; text-align: left\">•&nbsp;User uploaded file has a maximum limit of {limit} sentences.</div>"
62
+ return options_arr,markdown_str
63
+
64
+
65
+ st.set_page_config(page_title='TWC - Compare popular/state-of-the-art models for tasks using sentence embeddings', page_icon="logo.jpg", layout='centered', initial_sidebar_state='auto',
66
+ menu_items={
67
+ 'About': 'This app was created by taskswithcode. http://taskswithcode.com'
68
+
69
+ })
70
+ col,pad = st.columns([85,15])
71
+
72
+ with col:
73
+ st.image("long_form_logo_with_icon.png")
74
+
75
+
76
+ @st.experimental_memo
77
+ def load_model(model_name,model_names):
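+ # Look up the selected display name in the model config list, instantiate the matching wrapper class and initialize it; st.experimental_memo caches the loaded model across reruns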
78
+ try:
79
+ ret_model = None
80
+ for node in model_names:
81
+ if (model_name.startswith(node["name"])):
82
+ obj_class = globals()[node["class"]]
83
+ ret_model = obj_class()
84
+ ret_model.init_model(node["model"])
85
+ assert(ret_model is not None)
86
+ except Exception as e:
87
+ st.error("Unable to load model:" + model_name + " " + str(e))
88
+ pass
89
+ return ret_model
90
+
91
+
92
+ @st.experimental_memo
93
+ def cached_compute_similarity(sentences,_model,model_name,main_index):
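+ # The leading underscore in _model excludes the unhashable model object from st.experimental_memo's cache key; results are memoized on the sentences, model name and main index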
94
+ texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
95
+ results = _model.output_results(None,texts,embeddings,main_index)
96
+ return results
97
+
98
+
99
+ def uncached_compute_similarity(sentences,_model,model_name,main_index):
100
+ with st.spinner('Computing vectors for sentences'):
101
+ texts,embeddings = _model.compute_embeddings(sentences,is_file=False)
102
+ results = _model.output_results(None,texts,embeddings,main_index)
103
+ #st.success("Similarity computation complete")
104
+ return results
105
+
106
+ def get_model_info(model_names,model_name):
107
+ for node in model_names:
108
+ if (model_name == node["name"]):
109
+ return node
110
+
111
+ def run_test(model_names,model_name,sentences,display_area,main_index,user_uploaded):
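+ # Run one model over the input sentences; results are cached for the bundled example files but always recomputed for user uploads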
112
+ display_area.text("Loading model:" + model_name)
113
+ model_info = get_model_info(model_names,model_name)
114
+ if ("Note" in model_info):
115
+ fail_link = f"{model_info['Note']} [link]({model_info['alt_url']})"
116
+ display_area.write(fail_link)
117
+ model = load_model(model_name,model_names)
118
+ display_area.text("Model " + model_name + " load complete")
119
+ try:
120
+ if (user_uploaded):
121
+ results = uncached_compute_similarity(sentences,model,model_name,main_index)
122
+ else:
123
+ display_area.text("Computing vectors for sentences")
124
+ results = cached_compute_similarity(sentences,model,model_name,main_index)
125
+ display_area.text("Similarity computation complete")
126
+ return results
127
+
128
+ except Exception as e:
129
+ st.error("Some error occurred during prediction" + str(e))
130
+ st.stop()
131
+ return {}
132
+
133
+
134
+
135
+
136
+
137
+ def display_results(orig_sentences,main_index,results,response_info,app_mode):
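+ # Render the ranked results as HTML and stash a JSON copy in session state for the download button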
138
+ main_sent = f"<div style=\"font-size:14px; color: #2f2f2f; text-align: left\">{response_info}<br/><br/></div>"
139
+ score_text = "cosine_distance" if app_mode == "similarity" else "cosine_distance/score"
140
+ pivot_name = "main sentence" if app_mode == "similarity" else "query"
141
+ main_sent += f"<div style=\"font-size:14px; color: #6f6f6f; text-align: left\">Results sorted by {score_text}. Closest to furthest away from {pivot_name}</div>"
142
+ pivot_name = pivot_name[0].upper() + pivot_name[1:]
143
+ main_sent += f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\"><b>{pivot_name}:</b>&nbsp;&nbsp;{orig_sentences[main_index]}</div>"
144
+ body_sent = []
145
+ download_data = {}
146
+ first = True
147
+ for key in results:
148
+ if (app_mode == DOC_RETRIEVAL and first):
149
+ first = False
150
+ continue
151
+ index = orig_sentences.index(key) + 1
152
+ body_sent.append(f"<div style=\"font-size:16px; color: #2f2f2f; text-align: left\">{index}]&nbsp;{key}&nbsp;&nbsp;&nbsp;<b>{results[key]:.2f}</b></div>")
153
+ download_data[key] = f"{results[key]:.2f}"
154
+ main_sent = main_sent + "\n" + '\n'.join(body_sent)
155
+ st.markdown(main_sent,unsafe_allow_html=True)
156
+ st.session_state["download_ready"] = json.dumps(download_data,indent=4)
157
+
158
+
159
+ def init_session():
160
+ st.session_state["download_ready"] = None
161
+ st.session_state["model_name"] = "ss_test"
162
+ st.session_state["main_index"] = 1
163
+ st.session_state["file_name"] = "default"
164
+
165
+ def app_main(app_mode,example_files,model_name_files):
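+ # app_mode selects the use case ("1" = semantic similarity, "2" = document retrieval); example_files and model_name_files are the JSON configs for that mode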
166
+ init_session()
167
+ with open(example_files) as fp:
168
+ example_file_names = json.load(fp)
169
+ with open(model_name_files) as fp:
170
+ model_names = json.load(fp)
171
+ curr_use_case = use_case[app_mode].split(".")[0]
172
+ st.markdown("<h5 style='text-align: center;'>Compare popular/state-of-the-art models for tasks using sentence embeddings</h5>", unsafe_allow_html=True)
173
+ st.markdown(f"<div style='color: #4f4f4f; text-align: left'>Use cases for sentence embeddings<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['1']}<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['2']}<br/>&nbsp;&nbsp;&nbsp;•&nbsp;&nbsp;{use_case['3']}<br/><i>This app illustrates <b>'{curr_use_case}'</b> use case</i></div>", unsafe_allow_html=True)
174
+ st.markdown(f"<div style='color: #9f9f9f; text-align: right'>views:&nbsp;{get_views()}</div>", unsafe_allow_html=True)
175
+
176
+
177
+ try:
178
+
179
+
180
+ with st.form('twc_form'):
181
+
182
+ uploaded_file = st.file_uploader("Step 1. Upload a text file (one sentence per line) or choose an example file below", type=".txt")
183
+
184
+ selected_file_index = st.selectbox(label=f'Example files ({len(example_file_names)})',
185
+ options = list(dict.keys(example_file_names)), index=0, key = "twc_file")
186
+ st.write("")
187
+ options_arr,markdown_str = construct_model_info_for_display(model_names)
188
+ selection_label = 'Step 2. Select Model'
189
+ selected_model = st.selectbox(label=selection_label,
190
+ options = options_arr, index=0, key = "twc_model")
191
+ st.write("")
192
+ if (app_mode == "similarity"):
193
+ main_index = st.number_input('Step 3. Enter the index of the sentence to use as the main sentence',value=1,min_value = 1)
194
+ else:
195
+ main_index = 1
196
+ st.write("")
197
+ submit_button = st.form_submit_button('Run')
198
+
199
+
200
+ input_status_area = st.empty()
201
+ display_area = st.empty()
202
+ if submit_button:
203
+ start = time.time()
204
+ if uploaded_file is not None:
205
+ st.session_state["file_name"] = uploaded_file.name
206
+ sentences = StringIO(uploaded_file.getvalue().decode("utf-8")).read()
207
+ else:
208
+ st.session_state["file_name"] = example_file_names[selected_file_index]["name"]
209
+ sentences = open(example_file_names[selected_file_index]["name"]).read()
210
+ sentences = sentences.split("\n")[:-1]
211
+ if (len(sentences) < main_index):
212
+ main_index = len(sentences)
213
+ st.info("Selected sentence index is larger than number of sentences in file. Truncating to " + str(main_index))
214
+ if (len(sentences) > MAX_INPUT):
215
+ st.info(f"Input sentence count exceeds maximum sentence limit. First {MAX_INPUT} out of {len(sentences)} sentences chosen")
216
+ sentences = sentences[:MAX_INPUT]
217
+ st.session_state["model_name"] = selected_model
218
+ st.session_state["main_index"] = main_index
219
+ results = run_test(model_names,selected_model,sentences,display_area,main_index - 1,(uploaded_file is not None))
220
+ display_area.empty()
221
+ with display_area.container():
222
+ device = 'GPU' if torch.cuda.is_available() else 'CPU'
223
+ response_info = f"Computation time on {device}: {time.time() - start:.2f} secs for {len(sentences)} sentences"
224
+ display_results(sentences,main_index - 1,results,response_info,app_mode)
225
+ #st.json(results)
226
+ st.download_button(
227
+ label="Download results as json",
228
+ data = st.session_state["download_ready"] if st.session_state["download_ready"] is not None else "",
229
+ disabled = st.session_state["download_ready"] is None,
230
+ file_name= (st.session_state["model_name"] + "_" + str(st.session_state["main_index"]) + "_" + '_'.join(st.session_state["file_name"].split(".")[:-1]) + ".json").replace("/","_"),
231
+ mime='text/json',
232
+ key ="download"
233
+ )
234
+
235
+
236
+
237
+ except Exception as e:
238
+ st.error("Some error occurred during loading" + str(e))
239
+ st.stop()
240
+
241
+ st.markdown(markdown_str, unsafe_allow_html=True)
242
+
243
+
244
+
245
+ if __name__ == "__main__":
246
+ #print("comand line input:",len(sys.argv),str(sys.argv))
247
+ #app_main(sys.argv[1],sys.argv[2],sys.argv[3])
248
+ #app_main("1","sim_app_examples.json","sim_app_models.json")
249
+ app_main("2","doc_app_examples.json","doc_app_models.json")
250
+
doc_app_examples.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "Querying about a planet": {"name":"planets_qna.txt"},
3
+ "Querying about a disease": {"name":"qna.txt"},
4
+ "Querying about a protein": {"name":"qna2.txt"}
5
+ }
doc_app_models.json ADDED
@@ -0,0 +1,114 @@
1
+ [
2
+ { "name":"SGPT-125M-Search",
3
+ "model":"Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit",
4
+ "fork_url":"https://github.com/taskswithcode/sgpt",
5
+ "orig_author_url":"https://github.com/Muennighoff",
6
+ "orig_author":"Niklas Muennighoff",
7
+ "sota_info": {
8
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
9
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
10
+ },
11
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
12
+ "mark":"True",
13
+ "class":"SGPTQnAModel"},
14
+ { "name":"GPT-Neo-125M",
15
+ "model":"EleutherAI/gpt-neo-125M",
16
+ "fork_url":"https://github.com/taskswithcode/sgpt",
17
+ "orig_author_url":"https://www.eleuther.ai/",
18
+ "orig_author":"EleuthorAI",
19
+ "sota_info": {
20
+ "task":"Top 20 in multiple NLP tasks (smaller variant)",
21
+ "sota_link":"https://paperswithcode.com/paper/gpt-neox-20b-an-open-source-autoregressive-1"
22
+ },
23
+ "paper_url":"https://zenodo.org/record/5551208#.YyV0k-zMLX0",
24
+ "mark":"True",
25
+ "class":"CausalLMModel"},
26
+
27
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
28
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
29
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
30
+ "orig_author_url":"https://github.com/UKPLab",
31
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
32
+ "sota_info": {
33
+ "task":"Over 3.8 million downloads from huggingface",
34
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
35
+ },
36
+ "paper_url":"https://arxiv.org/abs/1908.10084",
37
+ "mark":"True",
38
+ "class":"HFModel"},
39
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
40
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
41
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
42
+ "orig_author_url":"https://github.com/UKPLab",
43
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
44
+ "sota_info": {
45
+ "task":"Over 2 million downloads from huggingface",
46
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
47
+ },
48
+ "paper_url":"https://arxiv.org/abs/1908.10084",
49
+ "mark":"True",
50
+ "class":"HFModel"},
51
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
52
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
53
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
54
+ "orig_author_url":"https://github.com/UKPLab",
55
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
56
+ "sota_info": {
57
+ "task":"Over 700,000 downloads from huggingface",
58
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
59
+ },
60
+ "paper_url":"https://arxiv.org/abs/1908.10084",
61
+ "mark":"True",
62
+ "class":"HFModel"},
63
+ { "name":"sentence-transformers/all-mpnet-base-v2",
64
+ "model":"sentence-transformers/all-mpnet-base-v2",
65
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
66
+ "orig_author_url":"https://github.com/UKPLab",
67
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
68
+ "sota_info": {
69
+ "task":"Over 500,000 downloads from huggingface",
70
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
71
+ },
72
+ "paper_url":"https://arxiv.org/abs/1908.10084",
73
+ "mark":"True",
74
+ "class":"HFModel"},
75
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
76
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
77
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
78
+ "orig_author_url":"https://github.com/UKPLab",
79
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
80
+ "sota_info": {
81
+ "task":"Over 500,000 downloads from huggingface",
82
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
83
+ },
84
+ "paper_url":"https://arxiv.org/abs/1908.10084",
85
+ "mark":"True",
86
+ "class":"HFModel"},
87
+
88
+ { "name":"SGPT-125M",
89
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
90
+ "fork_url":"https://github.com/taskswithcode/sgpt",
91
+ "orig_author_url":"https://github.com/Muennighoff",
92
+ "orig_author":"Niklas Muennighoff",
93
+ "sota_info": {
94
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
95
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
96
+ },
97
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
98
+ "mark":"True",
99
+ "class":"SGPTModel"},
100
+ { "name":"SIMCSE-base" ,
101
+ "model":"princeton-nlp/sup-simcse-roberta-base",
102
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
103
+ "orig_author_url":"https://github.com/princeton-nlp",
104
+ "orig_author":"Princeton Natural Language Processing",
105
+ "sota_info": {
106
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
107
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
108
+ },
109
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
110
+ "mark":"True",
111
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
112
+
113
+
114
+ ]
imdb_sent.txt ADDED
@@ -0,0 +1,62 @@
1
+ "A rating of ""1"" does not begin to express how dull, depressing and relentlessly bad this movie is."
2
+ Hated it with all my being. Worst movie ever. Mentally- scarred. Help me. It was that bad.TRUST ME!!!
3
+ "Long, boring, blasphemous. Never have I been so glad to see ending credits roll."
4
+ This film made John Glover a star. Alan Raimy is one of the most compelling character that I have ever seen on film. And I mean that sport.
5
+ "Were I not with friends, and so cheap, I would have walked out. It failed miserably as satire and didn't even have the redemption of camp."
6
+ For pure gothic vampire cheese nothing can compare to the Subspecies films. I highly recommend each and every one of them.
7
+ "A great film in its genre, the direction, acting, most especially the casting of the film makes it even more powerful. A must see."
8
+ "This is a terrible movie, don't waste your money on it. Don't even watch it for free. That's all I have to say."
9
+ I wouldn't rent this one even on dollar rental night.
10
+ "More suspenseful, more subtle, much, much more disturbing...."
11
+ This is a good film. This is very funny. Yet after this film there were no good Ernest films!
12
+ A touching movie. It is full of emotions and wonderful acting. I could have sat through it a second time.
13
+ "Great movie - especially the music - Etta James - ""At Last"". This speaks volumes when you have finally found that special someone."
14
+ If you've ever had a mad week-end out with your mates then you'll appreciate this film. Excellent fun and a laugh a minute.
15
+ "I think it's one of the greatest movies which are ever made, and I've seen many... The book is better, but it's still a very good movie!"
16
+ Brilliant and moving performances by Tom Courtenay and Peter Finch.
17
+ The characters are unlikeable and the script is awful. It's a waste of the talents of Deneuve and Auteuil.
18
+ You've got to be kidding. This movie sucked for the sci-fi fans. I would only recommend watching this only if you think Armageddon was good.
19
+ Ten minutes of people spewing gallons of pink vomit. Recurring scenes of enormous piles of dog excrement - need one say more???
20
+ "As usual, Sean Connery does a great job. Lawrence Fishburn is good, but I have a hard time not seeing him as Ike Turner."
21
+ This movie is terrible but it has some good effects.
22
+ You'd better choose Paul Verhoeven's even if you have watched it.
23
+ "Brilliant. Ranks along with Citizen Kane, The Matrix and Godfathers. Must see, at least for basset in her early days. Watch it."
24
+ "I don't know why I like this movie so well, but I never get tired of watching it."
25
+ The one-liners fly so fast in this movie that you can watch it over and over and still catch new ones. By far one of the best of this genre.
26
+ "Don't waste your time and money on it. It's not quite as bad as ""Adrenalin"", by the same director but that's not saying much."
27
+ "Read the book, forget the movie!"
28
+ This is a great movie. Too bad it is not available on home video.
29
+ "Very intelligent language usage of Ali, which you musn't miss! In one word: (eeh sentence...) Wicked, so keep it real and pass it on!"
30
+ Primary plot!Primary direction!Poor interpretation.
31
+ "If you like Pauly Shore, you'll love Son in Law. If you hate Pauly Shore, then, well...I liked it!"
32
+ Just love the interplay between two great characters of stage & screen - Veidt & Barrymore
33
+ "This movie will always be a Broadway and Movie classic, as long as there are still people who sing, dance, and act."
34
+ This is the greatest movie ever. If you have written it off with out ever seeing it. You must give it a second try.
35
+ "What a script, what a story, what a mess!"
36
+ "I caught this film late at night on HBO. Talk about wooden acting, unbelievable plot, et al. Very little going in its favor. Skip it."
37
+ This is without a doubt the worst movie I have ever seen. It is not funny. It is not interesting and should not have been made.
38
+ Ming The Merciless does a little Bardwork and a movie most foul!
39
+ This is quite possibly the worst sequel ever made. The script is unfunny and the acting stinks. The exact opposite of the original.
40
+ "This is the definitive movie version of Hamlet. Branagh cuts nothing, but there are no wasted moments."
41
+ My favorite movie. What a great story this really was. I'd just like to be able to buy a copy of it but this does not seem possible.
42
+ "Comment this movie is impossible. Is terrible, very improbable, bad interpretation e direction. Not look!!!!!"
43
+ "Brilliant movie. The drawings were just amazing. Too bad it ended before it begun. I´ve waited 21 years for a sequel, but nooooo!!!"
44
+ a mesmerizing film that certainly keeps your attention... Ben Daniels is fascinating (and courageous) to watch.
45
+ "This is a very cool movie. The ending of the movie is a bit more defined than the play's ending, but either way it is still a good movie."
46
+ "Without a doubt, one of Tobe Hoppor's best! Epic storytellng, great special effects, and The Spacegirl (vamp me baby!)."
47
+ I hope this group of film-makers never re-unites.
48
+ Unwatchable. You can't even make it past the first three minutes. And this is coming from a huge Adam Sandler fan!!1
49
+ "One of the funniest movies made in recent years. Good characterization, plot and exceptional chemistry make this one a classic"
50
+ "Add this little gem to your list of holiday regulars. It is<br /><br />sweet, funny, and endearing"
51
+ "no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!"
52
+ "If you haven't seen this, it's terrible. It is pure trash. I saw this about 17 years ago, and I'm still screwed up from it."
53
+ Absolutely fantastic! Whatever I say wouldn't do this underrated movie the justice it deserves. Watch it now! FANTASTIC!
54
+ "As a big fan of Tiny Toon Adventures, I loved this movie!!! It was so funny!!! It really captured how cartoons spent their summers."
55
+ Widow hires a psychopath as a handyman. Sloppy film noir thriller which doesn't make much of its tension promising set-up. (3/10)
56
+ The Fiendish Plot of Dr. Fu Manchu (1980). This is hands down the worst film I've ever seen. What a sad way for a great comedian to go out.
57
+ "Obviously written for the stage. Lightweight but worthwhile. How can you go wrong with Ralph Richardson, Olivier and Merle Oberon."
58
+ This movie turned out to be better than I had expected it to be. Some parts were pretty funny. It was nice to have a movie with a new plot.
59
+ This movie is terrible. It's about some no brain surfin dude that inherits some company. Does Carrot Top have no shame?<br /><br />
60
+ Adrian Pasdar is excellent is this film. He makes a fascinating woman.
61
+ "An unfunny, unworthy picture which is an undeserving end to Peter Sellers' career. It is a pity this movie was ever made."
62
+ "The plot was really weak and confused. This is a true Oprah flick. (In Oprah's world, all men are evil and all women are victims.)"
planets_qna.txt ADDED
@@ -0,0 +1,20 @@
1
+ I'm searching for a planet not too far from Earth.
2
+ Mercury is closest to the sun
3
+ Venus and Mars are closest to earth
4
+ Pluto is not too far from neptune
5
+ Neptune is the eighth and farthest-known Solar planet from the Sun. In the Solar System, it is the fourth-largest planet by diameter, the third-most-massive planet, and the densest giant planet. It is 17 times the mass of Earth, slightly more massive than its near-twin Uranus.
6
+ TRAPPIST-1d, also designated as 2MASS J23062928-0502285 d, is a small exoplanet (about 30% the mass of the earth), which orbits on the inner edge of the habitable zone of the ultracool dwarf star TRAPPIST-1 approximately 40 light-years (12.1 parsecs, or nearly 3.7336×1014 km) away from Earth in the constellation of Aquarius.
7
+ A harsh desert world orbiting twin suns in the galaxy’s Outer Rim, Tatooine is a lawless place ruled by Hutt gangsters. Many settlers scratch out a living on moisture farms, while spaceport cities such as Mos Eisley and Mos Espa serve as home base for smugglers, criminals, and other rogues.
8
+ For some individual rights are very close to the heart and the reduction of it, regardless of the valid reasons, was unacceptable
9
+ A quasar is an extremely luminous active galactic nucleus, powered by a supermassive black hole, with mass ranging from millions to tens of billions of solar masses, surrounded by a gaseous accretion disc
10
+ Politics is the last resort of the scoundrel
11
+ Planets with water have been found in many places now
12
+ Some drugs have serious side effects
13
+ The dog ran all around the park like a planet orbiting a star
14
+ Quantum computing is yet to take off
15
+ His world revolved around his girl friend like a planet revolving around a star
16
+ The news cycle obsessively revolved around the newly weds with starry eyed fascination for the royalty
17
+ Starry nights with planets is a rare sight due to city lights
18
+ The dog ran all around the park in circles around the cat
19
+ Milky way Galaxy
20
+ Twin planet stars
qna.txt ADDED
@@ -0,0 +1,16 @@
1
+ Is Hirschsprung disease a mendelian or a multifactorial disorder?
2
+ Coding sequence mutations in RET, GDNF, EDNRB, EDN3, and SOX10 are involved in the development of Hirschsprung disease. The majority of these genes was shown to be related to Mendelian syndromic forms of Hirschsprung's disease, whereas the non-Mendelian inheritance of sporadic non-syndromic Hirschsprung disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model.
3
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes
4
+ In this study, we review the identification of genes and loci involved in the non-syndromic common form and syndromic Mendelian forms of Hirschsprung's disease. The majority of the identified genes are related to Mendelian syndromic forms of Hirschsprung's disease. The non-Mendelian inheritance of sporadic non-syndromic Hirschsprung's disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model
5
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes
6
+ For almost all of the identified HSCR genes incomplete penetrance of the HSCR phenotype has been reported, probably due to modifier loci. Therefore, HSCR has become a model for a complex oligo-/polygenic disorder in which the relationship between different genes creating a non-mendelian inheritance pattern still remains to be elucidated
7
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes.
8
+ The inheritance of Hirschsprung disease is generally consistent with sex-modified multifactorial inheritance with a lower threshold of expression in males.
9
+ Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes.
10
+ Differential contributions of rare and common, coding and noncoding Ret mutations to multifactorial Hirschsprung disease liability.
11
+ In the etiology of Hirschsprung disease various genes play a role; these are: RET, EDNRB, GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these genes may result in dominant, recessive or multifactorial patterns of inheritance.
12
+ The majority of the identified genes are related to Mendelian syndromic forms of Hirschsprung's disease
13
+ In the etiology of Hirschsprung disease various genes play a role; these are: RET, EDNRB, GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these genes may result in dominant, recessive or multifactorial patterns of inheritance
14
+ On the basis of a skewed sex-ratio (M/F = 4/1) and a risk to relatives much higher than the incidence in the general population, HSCR has long been regarded as a sex-modified multifactorial disorder
15
+ The inheritance of Hirschsprung disease is generally consistent with sex-modified multifactorial inheritance with a lower threshold of expression in males
16
+ The non-Mendelian inheritance of sporadic non-syndromic Hirschsprung's disease proved to be complex; involvement of multiple loci was demonstrated in a multiplicative model
qna2.txt ADDED
@@ -0,0 +1,13 @@
1
+ Is the protein Papilin secreted?
2
+ Yes, papilin is a secreted protein
3
+ Using expression analysis, we identify three genes that are transcriptionally regulated by HLH-2: the protocadherin cdh-3, and two genes encoding secreted extracellular matrix proteins, mig-6/papilin and him-4/hemicentin.
4
+ We found that mig-6 encodes long (MIG-6L) and short (MIG-6S) isoforms of the extracellular matrix protein papilin, each required for distinct aspects of DTC migration. Both MIG-6 isoforms have a predicted N-terminal papilin cassette
5
+ For almost all of the identified HSCR genes incomplete penetrance of the HSCR phenotype has been reported, probably due to modifier loci. Therefore, HSCR has become a model for a complex oligo-/polygenic disorder in which the relationship between different genes creating a non-mendelian inheritance pattern still remains to be elucidated
6
+ Papilins are homologous, secreted extracellular matrix proteins which share a common order of protein domains.
7
+ The TSR superfamily is a diverse family of extracellular matrix and transmembrane proteins, many of which have functions related to regulating matrix organization, cell-cell interactions and cell guidance. This review samples some of the contemporary literature regarding TSR superfamily members (e.g. F-spondin, UNC-5, ADAMTS, papilin, and TRAP) where specific functions are assigned to the TSR domains.
8
+ Papilin is an extracellular matrix glycoprotein
9
+ Collagen IV, laminin, glutactin, papilin, and other extracellular matrix proteins were made primarily by hemocytes and were secreted into the medium.
10
+ A sulfated glycoprotein was isolated from the culture media of Drosophila Kc cells and named papilin.
11
+ A sulfated glycoprotein was isolated from the culture media of Drosophila Kc cells and named papilin.
12
+ The majority of the identified genes are related to Mendelian syndromic forms of Hirschsprung's disease
13
+ In the etiology of Hirschsprung disease various genes play a role; these are: RET, EDNRB, GDNF, EDN3 and SOX10, NTN3, ECE1, Mutations in these genes may result in dominant, recessive or multifactorial patterns of inheritance
requirements.txt ADDED
@@ -0,0 +1,3 @@
1
+ transformers
2
+ scipy
3
+ torch
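+ # app.py also imports streamlit; it is assumed to be provided by the hosting runtime (e.g. a Hugging Face Space), otherwise it needs to be listed here as well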
sim_app_examples.json ADDED
@@ -0,0 +1,5 @@
1
+ {
2
+ "Machine learning terms (phrases test)": {"name":"small_test.txt"},
3
+ "Customer feedback mixed with noise":{"name":"larger_test.txt"},
4
+ "Movie reviews": {"name":"imdb_sent.txt"}
5
+ }
sim_app_models.json ADDED
@@ -0,0 +1,134 @@
1
+ [
2
+
3
+ { "name":"sentence-transformers/all-MiniLM-L6-v2",
4
+ "model":"sentence-transformers/all-MiniLM-L6-v2",
5
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
6
+ "orig_author_url":"https://github.com/UKPLab",
7
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
8
+ "sota_info": {
9
+ "task":"Over 3.8 million downloads from huggingface",
10
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2"
11
+ },
12
+ "paper_url":"https://arxiv.org/abs/1908.10084",
13
+ "mark":"True",
14
+ "class":"HFModel"},
15
+ { "name":"sentence-transformers/paraphrase-MiniLM-L6-v2",
16
+ "model":"sentence-transformers/paraphrase-MiniLM-L6-v2",
17
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
18
+ "orig_author_url":"https://github.com/UKPLab",
19
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
20
+ "sota_info": {
21
+ "task":"Over 2 million downloads from huggingface",
22
+ "sota_link":"https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2"
23
+ },
24
+ "paper_url":"https://arxiv.org/abs/1908.10084",
25
+ "mark":"True",
26
+ "class":"HFModel"},
27
+ { "name":"sentence-transformers/bert-base-nli-mean-tokens",
28
+ "model":"sentence-transformers/bert-base-nli-mean-tokens",
29
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
30
+ "orig_author_url":"https://github.com/UKPLab",
31
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
32
+ "sota_info": {
33
+ "task":"Over 700,000 downloads from huggingface",
34
+ "sota_link":"https://huggingface.co/sentence-transformers/bert-base-nli-mean-tokens"
35
+ },
36
+ "paper_url":"https://arxiv.org/abs/1908.10084",
37
+ "mark":"True",
38
+ "class":"HFModel"},
39
+ { "name":"sentence-transformers/all-mpnet-base-v2",
40
+ "model":"sentence-transformers/all-mpnet-base-v2",
41
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
42
+ "orig_author_url":"https://github.com/UKPLab",
43
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
44
+ "sota_info": {
45
+ "task":"Over 500,000 downloads from huggingface",
46
+ "sota_link":"https://huggingface.co/sentence-transformers/all-mpnet-base-v2"
47
+ },
48
+ "paper_url":"https://arxiv.org/abs/1908.10084",
49
+ "mark":"True",
50
+ "class":"HFModel"},
51
+ { "name":"sentence-transformers/all-MiniLM-L12-v2",
52
+ "model":"sentence-transformers/all-MiniLM-L12-v2",
53
+ "fork_url":"https://github.com/taskswithcode/sentence_similarity_hf_model",
54
+ "orig_author_url":"https://github.com/UKPLab",
55
+ "orig_author":"Ubiquitous Knowledge Processing Lab",
56
+ "sota_info": {
57
+ "task":"Over 500,000 downloads from huggingface",
58
+ "sota_link":"https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2"
59
+ },
60
+ "paper_url":"https://arxiv.org/abs/1908.10084",
61
+ "mark":"True",
62
+ "class":"HFModel"},
63
+
64
+ { "name":"SGPT-125M",
65
+ "model":"Muennighoff/SGPT-125M-weightedmean-nli-bitfit",
66
+ "fork_url":"https://github.com/taskswithcode/sgpt",
67
+ "orig_author_url":"https://github.com/Muennighoff",
68
+ "orig_author":"Niklas Muennighoff",
69
+ "sota_info": {
70
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
71
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
72
+ },
73
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
74
+ "mark":"True",
75
+ "class":"SGPTModel"},
76
+ { "name":"SGPT-1.3B",
77
+ "model": "Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit",
78
+ "fork_url":"https://github.com/taskswithcode/sgpt",
79
+ "orig_author_url":"https://github.com/Muennighoff",
80
+ "orig_author":"Niklas Muennighoff",
81
+ "sota_info": {
82
+ "task":"#1 in multiple information retrieval & search tasks(smaller variant)",
83
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
84
+ },
85
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
86
+ "Note":"If this large model takes too long or fails to load , try this ",
87
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
88
+ "mark":"True",
89
+ "class":"SGPTModel"},
90
+ { "name":"SGPT-5.8B",
91
+ "model": "Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit" ,
92
+ "fork_url":"https://github.com/taskswithcode/sgpt",
93
+ "orig_author_url":"https://github.com/Muennighoff",
94
+ "orig_author":"Niklas Muennighoff",
95
+ "Note":"If this large model takes too long or fails to load , try this ",
96
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
97
+ "sota_info": {
98
+ "task":"#1 in multiple information retrieval & search tasks",
99
+ "sota_link":"https://paperswithcode.com/paper/sgpt-gpt-sentence-embeddings-for-semantic"
100
+ },
101
+ "paper_url":"https://arxiv.org/abs/2202.08904v5",
102
+ "mark":"True",
103
+ "class":"SGPTModel"},
104
+
105
+ { "name":"SIMCSE-large" ,
106
+ "model":"princeton-nlp/sup-simcse-roberta-large",
107
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
108
+ "orig_author_url":"https://github.com/princeton-nlp",
109
+ "orig_author":"Princeton Natural Language Processing",
110
+ "Note":"If this large model takes too long or fails to load , try this ",
111
+ "alt_url":"http://www.taskswithcode.com/sentence_similarity/",
112
+ "sota_info": {
113
+ "task":"Within top 10 in multiple semantic textual similarity tasks",
114
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
115
+ },
116
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
117
+ "mark":"True",
118
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"},
119
+
120
+ { "name":"SIMCSE-base" ,
121
+ "model":"princeton-nlp/sup-simcse-roberta-base",
122
+ "fork_url":"https://github.com/taskswithcode/SimCSE",
123
+ "orig_author_url":"https://github.com/princeton-nlp",
124
+ "orig_author":"Princeton Natural Language Processing",
125
+ "sota_info": {
126
+ "task":"Within top 10 in multiple semantic textual similarity tasks(smaller variant)",
127
+ "sota_link":"https://paperswithcode.com/paper/simcse-simple-contrastive-learning-of"
128
+ },
129
+ "paper_url":"https://arxiv.org/abs/2104.08821v4",
130
+ "mark":"True",
131
+ "class":"SimCSEModel","sota_link":"https://paperswithcode.com/sota/semantic-textual-similarity-on-sick"}
132
+
133
+
134
+ ]
twc_embeddings.py ADDED
@@ -0,0 +1,407 @@
1
+ from transformers import AutoModel, AutoTokenizer
2
+ from transformers import AutoModelForCausalLM
3
+ from scipy.spatial.distance import cosine
4
+ import argparse
5
+ import json
6
+ import pdb
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ def read_text(input_file):
11
+ arr = open(input_file).read().split("\n")
12
+ return arr[:-1]
13
+
14
+
15
+ class CausalLMModel:
16
+ def __init__(self):
17
+ self.model = None
18
+ self.tokenizer = None
19
+ self.debug = False
20
+ print("In CausalLMModel Constructor")
21
+
22
+ def init_model(self,model_name = None):
23
+ # Get our models - The package will take care of downloading the models automatically
24
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
25
+ if (self.debug):
26
+ print("Init model",model_name)
27
+ # For best performance: EleutherAI/gpt-j-6B
28
+ if (model_name is None):
29
+ model_name = "EleutherAI/gpt-neo-125M"
30
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
31
+ self.model = AutoModelForCausalLM.from_pretrained(model_name)
32
+ self.model.eval()
33
+ self.prompt = 'Documents are searched to find matches with the same content.\nThe document "{}" is a good search result for "'
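+ # compute_embeddings below scores each document by the total log-probability the causal LM assigns to the query as a continuation of this prompt, so this class returns relevance scores rather than embeddings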
34
+
35
+ def compute_embeddings(self,input_data,is_file):
36
+ if (self.debug):
37
+ print("Computing embeddings for:", input_data[:20])
38
+ model = self.model
39
+ tokenizer = self.tokenizer
40
+
41
+ texts = read_text(input_data) if is_file else input_data
42
+ query = texts[0]
43
+ docs = texts[1:]
44
+
45
+ # Tokenize input texts
46
+
47
+ #print(f"Query: {query}")
48
+ scores = []
49
+ for doc in docs:
50
+ context = self.prompt.format(doc)
51
+
52
+ context_enc = tokenizer.encode(context, add_special_tokens=False)
53
+ continuation_enc = tokenizer.encode(query, add_special_tokens=False)
54
+ # Slice off the last token, as we take its probability from the one before
55
+ model_input = torch.tensor(context_enc+continuation_enc[:-1])
56
+ continuation_len = len(continuation_enc)
57
+ input_len, = model_input.shape
58
+
59
+ # [seq_len] -> [seq_len, vocab]
60
+ logprobs = torch.nn.functional.log_softmax(model(model_input)[0], dim=-1).cpu()
61
+ # [seq_len, vocab] -> [continuation_len, vocab]
62
+ logprobs = logprobs[input_len-continuation_len:]
63
+ # Gather the log probabilities of the continuation tokens -> [continuation_len]
64
+ logprobs = torch.gather(logprobs, 1, torch.tensor(continuation_enc).unsqueeze(-1)).squeeze(-1)
65
+ score = torch.sum(logprobs)
66
+ scores.append(score.tolist())
67
+ return texts,scores
68
+
69
+ def output_results(self,output_file,texts,scores,main_index = 0):
70
+ cosine_dict = {}
71
+ docs = texts[1:]
72
+ if (self.debug):
73
+ print("Total sentences",len(texts))
74
+ assert(len(scores) == len(docs))
75
+ for i in range(len(docs)):
76
+ cosine_dict[docs[i]] = scores[i]
77
+
78
+ if (self.debug):
79
+ print("Input sentence:",texts[main_index])
80
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
81
+ if (self.debug):
82
+ for key in sorted_dict:
83
+ print("Document score for \"%s\" is: %.3f" % (key[:100], sorted_dict[key]))
84
+ if (output_file is not None):
85
+ with open(output_file,"w") as fp:
86
+ fp.write(json.dumps(sorted_dict,indent=0))
87
+ return sorted_dict
88
+
89
+
90
+ class SGPTQnAModel:
91
+ def __init__(self):
92
+ self.model = None
93
+ self.tokenizer = None
94
+ self.debug = False
95
+ print("In SGPT Q&A Constructor")
96
+
97
+
98
+ def init_model(self,model_name = None):
99
+ # Get our models - The package will take care of downloading the models automatically
100
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
101
+ if (self.debug):
102
+ print("Init model",model_name)
103
+ if (model_name is None):
104
+ model_name = "Muennighoff/SGPT-125M-weightedmean-msmarco-specb-bitfit"
105
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
106
+ self.model = AutoModel.from_pretrained(model_name)
107
+ self.model.eval()
108
+ self.SPECB_QUE_BOS = self.tokenizer.encode("[", add_special_tokens=False)[0]
109
+ self.SPECB_QUE_EOS = self.tokenizer.encode("]", add_special_tokens=False)[0]
110
+
111
+ self.SPECB_DOC_BOS = self.tokenizer.encode("{", add_special_tokens=False)[0]
112
+ self.SPECB_DOC_EOS = self.tokenizer.encode("}", add_special_tokens=False)[0]
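+ # SGPT search ("specb") models wrap queries in [ ] and documents in { } special-bracket tokens so the encoder can distinguish the two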
113
+
114
+
115
+ def tokenize_with_specb(self,texts, is_query):
116
+ # Tokenize without padding
117
+ batch_tokens = self.tokenizer(texts, padding=False, truncation=True)
118
+ # Add special brackets & pay attention to them
119
+ for seq, att in zip(batch_tokens["input_ids"], batch_tokens["attention_mask"]):
120
+ if is_query:
121
+ seq.insert(0, self.SPECB_QUE_BOS)
122
+ seq.append(self.SPECB_QUE_EOS)
123
+ else:
124
+ seq.insert(0, self.SPECB_DOC_BOS)
125
+ seq.append(self.SPECB_DOC_EOS)
126
+ att.insert(0, 1)
127
+ att.append(1)
128
+ # Add padding
129
+ batch_tokens = self.tokenizer.pad(batch_tokens, padding=True, return_tensors="pt")
130
+ return batch_tokens
131
+
132
+ def get_weightedmean_embedding(self,batch_tokens, model):
133
+ # Get the embeddings
134
+ with torch.no_grad():
135
+ # Get hidden state of shape [bs, seq_len, hid_dim]
136
+ last_hidden_state = self.model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
137
+
138
+ # Get weights of shape [bs, seq_len, hid_dim]
139
+ weights = (
140
+ torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
141
+ .unsqueeze(0)
142
+ .unsqueeze(-1)
143
+ .expand(last_hidden_state.size())
144
+ .float().to(last_hidden_state.device)
145
+ )
146
+
147
+ # Get attn mask of shape [bs, seq_len, hid_dim]
148
+ input_mask_expanded = (
149
+ batch_tokens["attention_mask"]
150
+ .unsqueeze(-1)
151
+ .expand(last_hidden_state.size())
152
+ .float()
153
+ )
154
+
155
+ # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
156
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
157
+ sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
158
+
159
+ embeddings = sum_embeddings / sum_mask
160
+
161
+ return embeddings
162
+
163
+ def compute_embeddings(self,input_data,is_file):
164
+ if (self.debug):
165
+ print("Computing embeddings for:", input_data[:20])
166
+ model = self.model
167
+ tokenizer = self.tokenizer
168
+
169
+ texts = read_text(input_data) if is_file else input_data
170
+
171
+ queries = [texts[0]]
172
+ docs = texts[1:]
173
+ query_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(queries, is_query=True), self.model)
174
+ doc_embeddings = self.get_weightedmean_embedding(self.tokenize_with_specb(docs, is_query=False), self.model)
175
+ return texts,(query_embeddings,doc_embeddings)
176
+
177
+
178
+
179
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
180
+ # Calculate cosine similarities
181
+ # Cosine similarities are in [-1, 1]. Higher means more similar
182
+ query_embeddings = embeddings[0]
183
+ doc_embeddings = embeddings[1]
184
+ cosine_dict = {}
185
+ queries = [texts[0]]
186
+ docs = texts[1:]
187
+ if (self.debug):
188
+ print("Total sentences",len(texts))
189
+ for i in range(len(docs)):
190
+ cosine_dict[docs[i]] = 1 - cosine(query_embeddings[0], doc_embeddings[i])
191
+
192
+ if (self.debug):
193
+ print("Input sentence:",texts[main_index])
194
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
195
+ if (self.debug):
196
+ for key in sorted_dict:
197
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
198
+ if (output_file is not None):
199
+ with open(output_file,"w") as fp:
200
+ fp.write(json.dumps(sorted_dict,indent=0))
201
+ return sorted_dict
202
+
203
+
204
+ class SimCSEModel:
205
+ def __init__(self):
206
+ self.model = None
207
+ self.tokenizer = None
208
+ self.debug = False
209
+ print("In SimCSE constructor")
210
+
211
+ def init_model(self,model_name = None):
212
+ if (model_name is None):
213
+ model_name = "princeton-nlp/sup-simcse-roberta-large"
214
+ #self.model = SimCSE(model_name)
215
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
216
+ self.model = AutoModel.from_pretrained(model_name)
217
+
218
+ def compute_embeddings(self,input_data,is_file):
219
+ texts = read_text(input_data) if is_file else input_data
220
+ inputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
221
+ with torch.no_grad():
222
+ embeddings = self.model(**inputs, output_hidden_states=True, return_dict=True).pooler_output
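+ # The pooled [CLS] output serves as the SimCSE sentence embedding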
223
+ return texts,embeddings
224
+
225
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
226
+ # Calculate cosine similarities
227
+ # Cosine similarities are in [-1, 1]. Higher means more similar
228
+ cosine_dict = {}
229
+ #print("Total sentences",len(texts))
230
+ for i in range(len(texts)):
231
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
232
+
233
+ #print("Input sentence:",texts[main_index])
234
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
235
+ if (self.debug):
236
+ for key in sorted_dict:
237
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
238
+ if (output_file is not None):
239
+ with open(output_file,"w") as fp:
240
+ fp.write(json.dumps(sorted_dict,indent=0))
241
+ return sorted_dict
242
+
243
+
244
+
245
+ class SGPTModel:
246
+ def __init__(self):
247
+ self.model = None
248
+ self.tokenizer = None
249
+ self.debug = False
250
+ print("In SGPT Constructor")
251
+
252
+
253
+ def init_model(self,model_name = None):
254
+ # Get our models - The package will take care of downloading the models automatically
255
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
256
+ if (self.debug):
257
+ print("Init model",model_name)
258
+ if (model_name is None):
259
+ model_name = "Muennighoff/SGPT-125M-weightedmean-nli-bitfit"
260
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
261
+ self.model = AutoModel.from_pretrained(model_name)
262
+ #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
263
+ #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-1.3B-weightedmean-msmarco-specb-bitfit")
264
+ #self.tokenizer = AutoTokenizer.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
265
+ #self.model = AutoModel.from_pretrained("Muennighoff/SGPT-5.8B-weightedmean-msmarco-specb-bitfit")
266
+ # Deactivate Dropout (There is no dropout in the above models so it makes no difference here but other SGPT models may have dropout)
267
+ self.model.eval()
268
+
269
+ def compute_embeddings(self,input_data,is_file):
270
+ if (self.debug):
271
+ print("Computing embeddings for:", input_data[:20])
272
+ model = self.model
273
+ tokenizer = self.tokenizer
274
+
275
+ texts = read_text(input_data) if is_file else input_data
276
+
277
+ # Tokenize input texts
278
+ batch_tokens = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
279
+
280
+ # Get the embeddings
281
+ with torch.no_grad():
282
+ # Get hidden state of shape [bs, seq_len, hid_dim]
283
+ last_hidden_state = model(**batch_tokens, output_hidden_states=True, return_dict=True).last_hidden_state
284
+
285
+ # Get weights of shape [bs, seq_len, hid_dim]
286
+ weights = (
287
+ torch.arange(start=1, end=last_hidden_state.shape[1] + 1)
288
+ .unsqueeze(0)
289
+ .unsqueeze(-1)
290
+ .expand(last_hidden_state.size())
291
+ .float().to(last_hidden_state.device)
292
+ )
293
+
294
+ # Get attn mask of shape [bs, seq_len, hid_dim]
295
+ input_mask_expanded = (
296
+ batch_tokens["attention_mask"]
297
+ .unsqueeze(-1)
298
+ .expand(last_hidden_state.size())
299
+ .float()
300
+ )
301
+
302
+ # Perform weighted mean pooling across seq_len: bs, seq_len, hidden_dim -> bs, hidden_dim
303
+ sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded * weights, dim=1)
304
+ sum_mask = torch.sum(input_mask_expanded * weights, dim=1)
305
+
306
+ embeddings = sum_embeddings / sum_mask
307
+ return texts,embeddings
308
+
309
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
310
+ # Calculate cosine similarities
311
+ # Cosine similarities are in [-1, 1]. Higher means more similar
312
+ cosine_dict = {}
313
+ if (self.debug):
314
+ print("Total sentences",len(texts))
315
+ for i in range(len(texts)):
316
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
317
+
318
+ if (self.debug):
319
+ print("Input sentence:",texts[main_index])
320
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
321
+ if (self.debug):
322
+ for key in sorted_dict:
323
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
324
+ if (output_file is not None):
325
+ with open(output_file,"w") as fp:
326
+ fp.write(json.dumps(sorted_dict,indent=0))
327
+ return sorted_dict
328
+
329
+
330
+
331
+
332
+
333
+ class HFModel:
334
+ def __init__(self):
335
+ self.model = None
336
+ self.tokenizer = None
337
+ self.debug = False
338
+ print("In HF Constructor")
339
+
340
+
341
+ def init_model(self,model_name = None):
342
+ # Get our models - The package will take care of downloading the models automatically
343
+ # For best performance: Muennighoff/SGPT-5.8B-weightedmean-nli-bitfit
344
+ #print("Init model",model_name)
345
+ if (model_name is None):
346
+ model_name = "sentence-transformers/all-MiniLM-L6-v2"
347
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
348
+ self.model = AutoModel.from_pretrained(model_name)
349
+ self.model.eval()
350
+
351
+ def mean_pooling(self,model_output, attention_mask):
352
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
353
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
354
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
355
+
356
+ def compute_embeddings(self,input_data,is_file):
357
+ #print("Computing embeddings for:", input_data[:20])
358
+ model = self.model
359
+ tokenizer = self.tokenizer
360
+
361
+ texts = read_text(input_data) if is_file else input_data
362
+
363
+ encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
364
+
365
+ # Compute token embeddings
366
+ with torch.no_grad():
367
+ model_output = model(**encoded_input)
368
+
369
+ # Perform pooling
370
+ sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
371
+
372
+ # Normalize embeddings
373
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
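+ # After L2 normalization, the cosine similarity computed downstream is equivalent to a dot product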
374
+
375
+ return texts,sentence_embeddings
376
+
377
+ def output_results(self,output_file,texts,embeddings,main_index = 0):
378
+ # Calculate cosine similarities
379
+ # Cosine similarities are in [-1, 1]. Higher means more similar
380
+ cosine_dict = {}
381
+ #print("Total sentences",len(texts))
382
+ for i in range(len(texts)):
383
+ cosine_dict[texts[i]] = 1 - cosine(embeddings[main_index], embeddings[i])
384
+
385
+ #print("Input sentence:",texts[main_index])
386
+ sorted_dict = dict(sorted(cosine_dict.items(), key=lambda item: item[1],reverse = True))
387
+ if (self.debug):
388
+ for key in sorted_dict:
389
+ print("Cosine similarity with \"%s\" is: %.3f" % (key, sorted_dict[key]))
390
+ if (output_file is not None):
391
+ with open(output_file,"w") as fp:
392
+ fp.write(json.dumps(sorted_dict,indent=0))
393
+ return sorted_dict
394
+
395
+
396
+
397
+ if __name__ == '__main__':
398
+ parser = argparse.ArgumentParser(description='SGPT model for sentence embeddings ',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
399
+ parser.add_argument('-input', action="store", dest="input",required=True,help="Input file with sentences")
400
+ parser.add_argument('-output', action="store", dest="output",default="output.txt",help="Output file with results")
401
+ parser.add_argument('-model', action="store", dest="model",default="sentence-transformers/all-MiniLM-L6-v2",help="model name")
402
+
403
+ results = parser.parse_args()
404
+ obj = HFModel()
405
+ obj.init_model(results.model)
406
+ texts, embeddings = obj.compute_embeddings(results.input,is_file = True)
407
+ results = obj.output_results(results.output,texts,embeddings)
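+ # Example invocation (assumed; input file and model are illustrative): python twc_embeddings.py -input imdb_sent.txt -model sentence-transformers/all-MiniLM-L6-v2 -output output.txt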
view_count.txt ADDED
@@ -0,0 +1 @@
1
+ 23