lvwerra HF staff committed on
Commit
655c971
•
1 Parent(s): 3abd585

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -165
app.py CHANGED
@@ -7,47 +7,8 @@ import string
7
 
8
  import gradio as gr
9
  import requests
10
- from huggingface_hub import HfApi
11
-
12
- hf_api = HfApi()
13
- roots_datasets = {dset.id.split("/")[-1]:dset for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))}
14
-
15
- def get_docid_html(docid):
16
- data_org, dataset, docid = docid.split("/")
17
- metadata = roots_datasets[dataset]
18
- if metadata.private:
19
- docid_html = (
20
- f"<a "
21
- f'class="underline-on-hover"'
22
- f'title="This dataset is private. See the introductory text for more information"'
23
- f'style="color:#AA4A44;"'
24
- f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
25
- f'target="_blank"><b>🔒{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
26
- )
27
- else:
28
- docid_html = (
29
- f"<a "
30
- f'class="underline-on-hover"'
31
- f'title="This dataset is licensed {metadata.tags[0].split(":")[-1]}"'
32
- f'style="color:#2D31FA;"'
33
- f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
34
- f'target="_blank"><b>{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
35
- )
36
- return docid_html
37
-
38
-
39
- PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
40
- PII_PREFIX = "PI:"
41
-
42
-
43
- def process_pii(text):
44
- for tag in PII_TAGS:
45
- text = text.replace(
46
- PII_PREFIX + tag,
47
- """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
48
- )
49
- return text
50
 
 
51
 
52
  def process_results(results, highlight_terms):
53
  if len(results) == 0:
@@ -64,7 +25,6 @@ def process_results(results, highlight_terms):
64
  else:
65
  tokens_html.append(token)
66
  tokens_html = " ".join(tokens_html)
67
- tokens_html = process_pii(tokens_html)
68
  meta_html = (
69
  """
70
  <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
@@ -74,7 +34,7 @@ def process_results(results, highlight_terms):
74
  if "meta" in result and result["meta"] is not None and "url" in result["meta"]
75
  else ""
76
  )
77
- docid_html = get_docid_html(result["docid"])
78
  results_html += """{}
79
  <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
80
  <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
@@ -87,96 +47,25 @@ def process_results(results, highlight_terms):
87
 
88
 
89
  def scisearch(query, language, num_results=10):
90
- try:
91
- query = " ".join(query.split())
92
- if query == "" or query is None:
93
- return ""
94
-
95
- post_data = {"query": query, "k": num_results}
96
- if language != "detect_language":
97
- post_data["lang"] = language
98
-
99
- output = requests.post(
100
- os.environ.get("address"),
101
- headers={"Content-type": "application/json"},
102
- data=json.dumps(post_data),
103
- timeout=60,
104
- )
105
-
106
- payload = json.loads(output.text)
107
-
108
- if "err" in payload:
109
- if payload["err"]["type"] == "unsupported_lang":
110
- detected_lang = payload["err"]["meta"]["detected_lang"]
111
- return f"""
112
- <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
113
- Detected language <b>{detected_lang}</b> is not supported.<br>
114
- Please choose a language from the dropdown or type another query.
115
- </p><br><hr><br>"""
116
-
117
- results = payload["results"]
118
- highlight_terms = payload["highlight_terms"]
119
-
120
- if language == "detect_language":
121
- return (
122
- (
123
- f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
124
- Detected language: <b>{results[0]["lang"]}</b></p><br><hr><br>"""
125
- if len(results) > 0 and language == "detect_language"
126
- else ""
127
- )
128
- + process_results(results, highlight_terms)
129
- )
130
-
131
- if language == "all":
132
- results_html = ""
133
- for lang, results_for_lang in results.items():
134
- if len(results_for_lang) == 0:
135
- results_html += f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
136
- No results for language: <b>{lang}</b><hr></p>"""
137
- continue
138
-
139
- collapsible_results = f"""
140
- <details>
141
- <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
142
- Results for language: <b>{lang}</b><hr>
143
- </summary>
144
- {process_results(results_for_lang, highlight_terms)}
145
- </details>"""
146
- results_html += collapsible_results
147
- return results_html
148
-
149
- return process_results(results, highlight_terms)
150
-
151
- except Exception as e:
152
- results_html = f"""
153
- <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
154
- Raised {type(e).__name__}</p>
155
- <p style='font-size:14px; font-family: Arial; '>
156
- Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
157
- </p>
158
- """
159
 
160
- return results_html
 
 
161
 
 
162
 
163
- def flag(query, language, num_results, issue_description):
164
- try:
165
- post_data = {"query": query, "k": num_results, "flag": True, "description": issue_description}
166
- if language != "detect_language":
167
- post_data["lang"] = language
 
168
 
169
- output = requests.post(
170
- os.environ.get("address"),
171
- headers={"Content-type": "application/json"},
172
- data=json.dumps(post_data),
173
- timeout=120,
174
- )
175
 
176
- results = json.loads(output.text)
177
- except:
178
- print("Error flagging")
179
- return ""
180
 
181
 
182
  description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
@@ -198,47 +87,14 @@ if __name__ == "__main__":
198
  gr.Markdown(value=description)
199
  with gr.Row():
200
  query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")
201
- with gr.Row():
202
- lang = gr.Dropdown(
203
- choices=[
204
- "ar",
205
- "ca",
206
- "code",
207
- "en",
208
- "es",
209
- "eu",
210
- "fr",
211
- "id",
212
- "indic",
213
- "nigercongo",
214
- "pt",
215
- "vi",
216
- "zh",
217
- "detect_language",
218
- "all",
219
- ],
220
- value="en",
221
- label="Language",
222
- )
223
  with gr.Row():
224
  k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
225
  with gr.Row():
226
  submit_btn = gr.Button("Submit")
227
  with gr.Row():
228
  results = gr.HTML(label="Results")
229
- flag_description = """
230
- <p class='flagging'>
231
- If you choose to flag your search, we will save the query, language and the number of results you requested.
232
- Please consider adding any additional context in the box on the right.</p>"""
233
- with gr.Column(visible=False) as flagging_form:
234
- flag_txt = gr.Textbox(
235
- lines=1,
236
- placeholder="Type here...",
237
- label="""If you choose to flag your search, we will save the query, language and the number of results
238
- you requested. Please consider adding relevant additional context below:""",
239
- )
240
- flag_btn = gr.Button("Flag Results")
241
- flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])
242
 
243
  def submit(query, lang, k):
244
  query = query.strip()
@@ -246,10 +102,9 @@ if __name__ == "__main__":
246
  return "", ""
247
  return {
248
  results: scisearch(query, lang, k),
249
- flagging_form: gr.update(visible=True),
250
  }
251
 
252
- query.submit(fn=submit, inputs=[query, lang, k], outputs=[results, flagging_form])
253
- submit_btn.click(submit, inputs=[query, lang, k], outputs=[results, flagging_form])
254
 
255
  demo.launch(enable_queue=True, debug=True)
 
7
 
8
  import gradio as gr
9
  import requests
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ lang = "en"
12
 
13
  def process_results(results, highlight_terms):
14
  if len(results) == 0:
 
25
  else:
26
  tokens_html.append(token)
27
  tokens_html = " ".join(tokens_html)
 
28
  meta_html = (
29
  """
30
  <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
 
34
  if "meta" in result and result["meta"] is not None and "url" in result["meta"]
35
  else ""
36
  )
37
+ docid_html = str(result["docid"])
38
  results_html += """{}
39
  <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
40
  <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
 
47
 
48
 
49
  def scisearch(query, language, num_results=10):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
+ query = " ".join(query.split())
52
+ if query == "" or query is None:
53
+ return ""
54
 
55
+ post_data = {"query": query, "k": num_results}
56
 
57
+ output = requests.post(
58
+ os.environ.get("address"),
59
+ headers={"Content-type": "application/json"},
60
+ data=json.dumps(post_data),
61
+ timeout=60,
62
+ )
63
 
64
+ payload = json.loads(output.text)
 
 
 
 
 
65
 
66
+ results = payload["results"]
67
+ highlight_terms = payload["highlight_terms"]
68
+ return process_results(results, highlight_terms)
 
69
 
70
 
71
  description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
 
87
  gr.Markdown(value=description)
88
  with gr.Row():
89
  query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")
90
+
91
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  with gr.Row():
93
  k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
94
  with gr.Row():
95
  submit_btn = gr.Button("Submit")
96
  with gr.Row():
97
  results = gr.HTML(label="Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
  def submit(query, lang, k):
100
  query = query.strip()
 
102
  return "", ""
103
  return {
104
  results: scisearch(query, lang, k),
 
105
  }
106
 
107
+ query.submit(fn=submit, inputs=[query, lang, k], outputs=[results])
108
+ submit_btn.click(submit, inputs=[query, lang, k], outputs=[results])
109
 
110
  demo.launch(enable_queue=True, debug=True)