ola13 committed on
Commit
fa9f1ec
1 Parent(s): 2e2eeaa

exact search

Browse files
Files changed (1) hide show
  1. app.py +96 -15
app.py CHANGED
@@ -5,9 +5,13 @@ import requests
5
  from huggingface_hub import HfApi
6
 
7
  hf_api = HfApi()
8
- roots_datasets = {dset.id.split("/")[-1]:dset for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))}
 
 
 
 
 
9
 
10
- # def get_dataset_metadata():
11
 
12
  def get_docid_html(docid):
13
  data_org, dataset, docid = docid.split("/")
@@ -29,7 +33,7 @@ def get_docid_html(docid):
29
  f'style="color:#2D31FA;"'
30
  f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
31
  f'target="_blank"><b>{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
32
- )
33
  return docid_html
34
 
35
 
@@ -41,7 +45,9 @@ def process_pii(text):
41
  for tag in PII_TAGS:
42
  text = text.replace(
43
  PII_PREFIX + tag,
44
- """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
 
 
45
  )
46
  return text
47
 
@@ -68,7 +74,9 @@ def process_results(results, highlight_terms):
68
  <a href='{}' target='_blank'>{}</a></p>""".format(
69
  result["meta"]["url"], result["meta"]["url"]
70
  )
71
- if "meta" in result and result["meta"] is not None and "url" in result["meta"]
 
 
72
  else ""
73
  )
74
  docid_html = get_docid_html(result["docid"])
@@ -83,13 +91,13 @@ def process_results(results, highlight_terms):
83
  return results_html + "<hr>"
84
 
85
 
86
- def scisearch(query, language, num_results=10):
87
  try:
88
  query = " ".join(query.split())
89
  if query == "" or query is None:
90
  return ""
91
 
92
- post_data = {"query": query, "k": num_results}
93
  if language != "detect_language":
94
  post_data["lang"] = language
95
 
@@ -157,9 +165,57 @@ def scisearch(query, language, num_results=10):
157
  return results_html
158
 
159
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  def flag(query, language, num_results, issue_description):
161
  try:
162
- post_data = {"query": query, "k": num_results, "flag": True, "description": issue_description}
 
 
 
 
 
163
  if language != "detect_language":
164
  post_data["lang"] = language
165
 
@@ -194,7 +250,12 @@ if __name__ == "__main__":
194
  with gr.Row():
195
  gr.Markdown(value=description)
196
  with gr.Row():
197
- query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")
 
 
 
 
 
198
  with gr.Row():
199
  lang = gr.Dropdown(
200
  choices=[
@@ -220,7 +281,12 @@ if __name__ == "__main__":
220
  with gr.Row():
221
  k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
222
  with gr.Row():
223
- submit_btn = gr.Button("Submit")
 
 
 
 
 
224
  with gr.Row():
225
  results = gr.HTML(label="Results")
226
  flag_description = """
@@ -237,16 +303,31 @@ if __name__ == "__main__":
237
  flag_btn = gr.Button("Flag Results")
238
  flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])
239
 
240
- def submit(query, lang, k):
 
241
  query = query.strip()
242
  if query is None or query == "":
243
  return "", ""
 
 
 
 
 
 
244
  return {
245
- results: scisearch(query, lang, k),
246
  flagging_form: gr.update(visible=True),
247
  }
248
 
249
- query.submit(fn=submit, inputs=[query, lang, k], outputs=[results, flagging_form])
250
- submit_btn.click(submit, inputs=[query, lang, k], outputs=[results, flagging_form])
 
 
 
 
 
 
 
 
251
 
252
- demo.launch(enable_queue=True, debug=True)
 
5
  from huggingface_hub import HfApi
6
 
7
  hf_api = HfApi()
8
+ roots_datasets = {
9
+ dset.id.split("/")[-1]: dset
10
+ for dset in hf_api.list_datasets(
11
+ author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token")
12
+ )
13
+ }
14
 
 
15
 
16
  def get_docid_html(docid):
17
  data_org, dataset, docid = docid.split("/")
 
33
  f'style="color:#2D31FA;"'
34
  f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
35
  f'target="_blank"><b>{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
36
+ )
37
  return docid_html
38
 
39
 
 
45
  for tag in PII_TAGS:
46
  text = text.replace(
47
  PII_PREFIX + tag,
48
+ """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(
49
+ tag
50
+ ),
51
  )
52
  return text
53
 
 
74
  <a href='{}' target='_blank'>{}</a></p>""".format(
75
  result["meta"]["url"], result["meta"]["url"]
76
  )
77
+ if "meta" in result
78
+ and result["meta"] is not None
79
+ and "url" in result["meta"]
80
  else ""
81
  )
82
  docid_html = get_docid_html(result["docid"])
 
91
  return results_html + "<hr>"
92
 
93
 
94
+ def scisearch(query, language, num_results=10, exact_search=False):
95
  try:
96
  query = " ".join(query.split())
97
  if query == "" or query is None:
98
  return ""
99
 
100
+ post_data = {"query": query, "k": num_results, "exact_search": exact_search}
101
  if language != "detect_language":
102
  post_data["lang"] = language
103
 
 
165
  return results_html
166
 
167
 
168
def perform_exact_search(query, num_results=10):
    """Run an exact-match search against the backend and render the hits as HTML.

    The query is whitespace-normalised, posted as JSON with
    ``"exact_search": True``, and each entry of the response's ``"results"``
    list is rendered with the first occurrence of the query in bold.

    Args:
        query: Raw query text from the UI. ``None`` or blank text yields "".
        num_results: Value sent to the backend as ``"k"``.

    Returns:
        An HTML string: the rendered results followed by ``<hr>``, an empty
        string for an empty query, or an error notice naming the exception
        type if anything goes wrong (request failure, bad payload, ...).
    """
    try:
        print("perform_exact_search")
        # Check for None *before* calling .split(); the original tested it
        # afterwards, where it could never be reached.
        if query is None:
            return ""
        query = " ".join(query.split())
        if query == "":
            return ""

        post_data = {"query": query, "k": num_results, "exact_search": True}

        print("post_data", post_data)

        output = requests.post(
            "http://34.105.160.81:8080",
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )

        payload = json.loads(output.text)
        results = payload["results"]

        # NOTE(review): result text is inserted into the page unescaped, as in
        # the original. Presumably the backend returns plain text -- confirm
        # whether it should go through html.escape before rendering.
        pieces = []
        for result in results:
            print(result)
            pieces.append("""<br><hr><br>""")
            query_start = result.find(query)
            if query_start == -1:
                # str.find signals "not found" with -1; slicing with it would
                # bold the wrong characters, so show the text unhighlighted.
                pieces.append(result)
                continue
            query_end = query_start + len(query)
            pieces.append(result[:query_start])
            pieces.append("<b>{}</b>".format(result[query_start:query_end]))
            pieces.append(result[query_end:])
        return "".join(pieces) + "<hr>"

    except Exception as e:
        # Top-level UI boundary: show the failure to the user instead of
        # silently returning None (the original never returned this HTML).
        results_html = f"""
        <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
        Raised {type(e).__name__}</p>
        <p style='font-size:14px; font-family: Arial; '>
        Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
        </p>
        """
        return results_html
209
+
210
+
211
  def flag(query, language, num_results, issue_description):
212
  try:
213
+ post_data = {
214
+ "query": query,
215
+ "k": num_results,
216
+ "flag": True,
217
+ "description": issue_description,
218
+ }
219
  if language != "detect_language":
220
  post_data["lang"] = language
221
 
 
250
  with gr.Row():
251
  gr.Markdown(value=description)
252
  with gr.Row():
253
+ query = gr.Textbox(
254
+ lines=1,
255
+ max_lines=1,
256
+ placeholder="Type your query here...",
257
+ label="Query",
258
+ )
259
  with gr.Row():
260
  lang = gr.Dropdown(
261
  choices=[
 
281
  with gr.Row():
282
  k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
283
  with gr.Row():
284
+ with gr.Column(scale=1):
285
+ exact_search = gr.Checkbox(
286
+ value=False, label="Exact Search", variant="compact"
287
+ )
288
+ with gr.Column(scale=4):
289
+ submit_btn = gr.Button("Submit")
290
  with gr.Row():
291
  results = gr.HTML(label="Results")
292
  flag_description = """
 
303
  flag_btn = gr.Button("Flag Results")
304
  flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])
305
 
306
def submit(query, lang, k, exact_search):
    """Handle a search submission from the query box or the Submit button.

    Strips the query and returns a pair of empty strings when nothing is
    left. Otherwise runs either the exact search or the regular search
    (depending on the checkbox value) and returns a mapping that puts the
    resulting HTML into ``results`` and makes ``flagging_form`` visible.
    """
    print("submitting", query, lang, k, exact_search)
    query = query.strip()
    if not query:
        return "", ""

    # Pick the backend first, then build the single update mapping.
    if exact_search:
        rendered = perform_exact_search(query, k)
    else:
        rendered = scisearch(query, lang, k, exact_search)

    return {
        results: rendered,
        flagging_form: gr.update(visible=True),
    }
321
 
322
+ query.submit(
323
+ fn=submit,
324
+ inputs=[query, lang, k, exact_search],
325
+ outputs=[results, flagging_form],
326
+ )
327
+ submit_btn.click(
328
+ submit,
329
+ inputs=[query, lang, k, exact_search],
330
+ outputs=[results, flagging_form],
331
+ )
332
 
333
+ demo.launch(enable_queue=False, debug=True)