ola13 committed on
Commit
5d0f7ed
1 Parent(s): 7a7bd96

bootstrap the app

Browse files
Files changed (3) hide show
  1. Makefile +7 -0
  2. README.md +6 -5
  3. app.py +240 -0
Makefile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
.PHONY: style

# Format source code automatically: run black over the current directory
# (line length 119, target version py36), then run isort over the same tree.

style:
	black --line-length 119 --target-version py36 .
	isort .
README.md CHANGED
@@ -1,12 +1,13 @@
1
  ---
2
- title: GÆA / gaia / gæa
3
- emoji: 🌏🌖
4
- colorFrom: blue
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 3.9
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Roots Search Tool
3
+ emoji: 🌸 🔎
4
+ colorFrom: green
5
+ colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 3.7
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client as http_client
2
+ import json
3
+ import logging
4
+ import os
5
+ import re
6
+ import string
7
+
8
+ import gradio as gr
9
+ import requests
10
+
11
+
12
def get_docid_html(docid):
    """Render a document id as an HTML snippet linking to its dataset page.

    Args:
        docid: Identifier of the form ``"<data_org>/<dataset>/<document>"``.

    Returns:
        An HTML string: an anchor to
        ``https://huggingface.co/datasets/bigscience-data/<dataset>`` labelled
        ``<data_org>/<dataset>``, followed by a span holding ``/<document>``.

    Raises:
        ValueError: If ``docid`` has fewer than three ``/``-separated parts.
    """
    import html  # local import keeps this block self-contained

    # maxsplit=2 keeps any further "/" inside the trailing document part
    # instead of failing the three-way unpacking.
    data_org, dataset, document = docid.split("/", 2)

    # Every interpolated value is escaped: the id comes from the search
    # backend and ends up inside an attribute and inside element content.
    return """<a
        class="underline-on-hover"
        title="I am hovering over the text"
        style="color:#2D31FA;"
        href="https://huggingface.co/datasets/bigscience-data/{}"
        target="_blank">{}</a><span style="color: #7978FF;">/{}</span>""".format(
        html.escape(dataset),
        html.escape(data_org + "/" + dataset),
        html.escape(document),
    )
24
+
25
+
26
# Tag names that, when preceded by PII_PREFIX, are rewritten by process_pii.
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"

# One alternation over every "<prefix><tag>" marker. Longer tags come first so
# that a tag can never be shadowed by a shorter tag that is its prefix, which
# makes the result independent of set iteration order.
_PII_PATTERN = re.compile(
    "|".join(re.escape(PII_PREFIX + tag) for tag in sorted(PII_TAGS, key=lambda t: (-len(t), t)))
)


def process_pii(text):
    """Replace every ``PI:<TAG>`` marker in ``text`` with a highlighted label.

    Args:
        text: String (already HTML) that may contain markers built from
            ``PII_PREFIX`` followed by one of ``PII_TAGS``.

    Returns:
        ``text`` with each marker replaced by a bold ``<mark>`` element
        reading ``REDACTED <TAG>``. Text without markers is returned unchanged.
    """

    def _redacted(match):
        tag = match.group(0)[len(PII_PREFIX):]
        return """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag)

    # Single pass over the text instead of one full scan per tag.
    return _PII_PATTERN.sub(_redacted, text)
37
+
38
+
39
def process_results(results, highlight_terms):
    """Render search hits as an HTML fragment.

    Args:
        results: Sequence of mappings, each with ``"text"``, ``"docid"`` and
            ``"lang"`` keys and optionally a ``"meta"`` mapping with a
            ``"url"`` key.
        highlight_terms: Collection of tokens to render in bold; compared
            against the whitespace-separated tokens of each result's text.

    Returns:
        An HTML string: a "No results retrieved." notice when ``results`` is
        empty, otherwise one section per result followed by ``<hr>``.
    """
    import html  # local import keeps this block self-contained

    if not results:
        return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
                No results retrieved.</p><br><hr>"""

    # Set membership is O(1); the original scanned the list for every token.
    highlight = set(highlight_terms)

    sections = []
    for result in results:
        tokens_html = []
        for token in result["text"].split():
            # Corpus text is untrusted: escape it so markup inside a document
            # is displayed rather than interpreted. The highlight comparison
            # still uses the raw token.
            safe_token = html.escape(token, quote=False)
            tokens_html.append("<b>{}</b>".format(safe_token) if token in highlight else safe_token)
        text_html = process_pii(" ".join(tokens_html))

        meta = result.get("meta")
        if meta is not None and "url" in meta:
            # quote=True (the default) also escapes the single quotes that
            # delimit the href attribute below.
            url = html.escape(str(meta["url"]))
            meta_html = """
                <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
                <a href='{}' target='_blank'>{}</a></p>""".format(
                url, url
            )
        else:
            meta_html = ""

        docid_html = get_docid_html(result["docid"])
        sections.append(
            """{}
            <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
            <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
            <p style='font-family: Arial;'>{}</p>
            <br>
            """.format(
                meta_html, docid_html, html.escape(str(result["lang"])), text_html
            )
        )

    return "".join(sections) + "<hr>"
74
+
75
+
76
def scisearch(query, language, num_results=10):
    """Query the search backend and render its answer as HTML.

    The backend URL is read from the ``address`` environment variable. The
    request is a JSON POST carrying ``query``, ``k`` and, unless ``language``
    is ``"detect_language"``, ``lang``.

    Args:
        query: Search string; surrounding whitespace is ignored.
        language: A language code, ``"detect_language"`` or ``"all"``. With
            ``"all"`` the backend's ``results`` are iterated as a mapping of
            language to result list; otherwise as a single result list.
        num_results: Number of results requested (sent as ``k``).

    Returns:
        ``None`` when ``query`` is ``None`` or blank; otherwise an HTML
        string with the results or an error notice. Exceptions are caught,
        logged and rendered as an error notice, so the function never raises.
    """
    import html  # local import keeps this block self-contained

    try:
        # The None check must come before .strip(); in the original order it
        # was unreachable and None surfaced as "Raised AttributeError".
        if query is None:
            return None
        query = query.strip()
        if not query:
            return None

        post_data = {"query": query, "k": num_results}
        if language != "detect_language":
            post_data["lang"] = language

        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )

        payload = json.loads(output.text)

        if "err" in payload:
            err_type = payload["err"]["type"]
            if err_type == "unsupported_lang":
                detected_lang = html.escape(str(payload["err"]["meta"]["detected_lang"]))
                return f"""
                <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                Detected language <b>{detected_lang}</b> is not supported.<br>
                Please choose a language from the dropdown or type another query.
                </p><br><hr><br>"""
            if "results" not in payload:
                # Other backend errors used to fall through to
                # payload["results"] and show up as "Raised KeyError".
                return f"""
                <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                The search backend reported an error: <b>{html.escape(str(err_type))}</b>
                </p><br><hr><br>"""

        results = payload["results"]
        highlight_terms = payload["highlight_terms"]

        if language == "detect_language":
            header = ""
            if results:
                detected = html.escape(str(results[0]["lang"]))
                header = f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
                Detected language: <b>{detected}</b></p><br><hr><br>"""
            return header + process_results(results, highlight_terms)

        if language == "all":
            sections = []
            for lang, results_for_lang in results.items():
                lang_html = html.escape(str(lang))
                if not results_for_lang:
                    sections.append(
                        f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
                        No results for language: <b>{lang_html}</b><hr></p>"""
                    )
                    continue

                sections.append(
                    f"""
                    <details>
                    <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
                    Results for language: <b>{lang_html}</b><hr>
                    </summary>
                    {process_results(results_for_lang, highlight_terms)}
                    </details>"""
                )
            return "".join(sections)

        return process_results(results, highlight_terms)

    except Exception as e:
        # UI boundary: never propagate, but keep the traceback in the logs so
        # the failure can be diagnosed.
        logging.getLogger(__name__).exception("Search request failed")
        results_html = f"""
        <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
        Raised {type(e).__name__}</p>
        <p style='font-size:14px; font-family: Arial; '>
        Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
        </p>
        """

    return results_html
148
+
149
+
150
def flag(query, language, num_results, issue_description):
    """Report a search to the backend as flagged (best effort).

    Posts ``query``, ``k``, ``flag=True`` and ``description`` as JSON to the
    URL in the ``address`` environment variable; ``lang`` is included unless
    ``language`` is ``"detect_language"``.

    Args:
        query: The search string being flagged.
        language: Language selection used for the search.
        num_results: Number of results that were requested.
        issue_description: Free-text context supplied by the user.

    Returns:
        Always the empty string, whether or not the request succeeded.
    """
    post_data = {"query": query, "k": num_results, "flag": True, "description": issue_description}
    if language != "detect_language":
        post_data["lang"] = language

    try:
        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=120,
        )
        # Parsing only checks that the backend answered with JSON; the
        # decoded payload itself is not used.
        json.loads(output.text)
    except Exception:
        # Flagging must never break the UI, but a bare ``except:`` would also
        # swallow KeyboardInterrupt/SystemExit; log the traceback instead of
        # printing a context-free message.
        logging.getLogger(__name__).exception("Error flagging")
    return ""
167
+
168
+
169
# Markdown shown at the top of the page.
description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
ROOTS. You can read more about the details of the tool design
[here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""


if __name__ == "__main__":
    # Build the Gradio UI: query box, language dropdown, result-count slider,
    # submit button, results pane and a flagging form that starts hidden.
    demo = gr.Blocks(
        css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
    )

    with demo:
        with gr.Row():
            gr.Markdown(value=description)
        with gr.Row():
            query = gr.Textbox(lines=2, placeholder="Type your query here...", label="Query")
        with gr.Row():
            lang = gr.Dropdown(
                choices=[
                    "ar",
                    "ca",
                    "code",
                    "en",
                    "es",
                    "eu",
                    "fr",
                    "id",
                    "indic",
                    "nigercongo",
                    "pt",
                    "vi",
                    "zh",
                    "detect_language",
                    "all",
                ],
                value="en",
                label="Language",
            )
        with gr.Row():
            k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
        with gr.Row():
            submit_btn = gr.Button("Submit")
        with gr.Row():
            results = gr.HTML(label="Results")
        # NOTE(review): not referenced anywhere in this script; the same
        # wording is used as the label of ``flag_txt`` below.
        flag_description = """
        <p class='flagging'>
        If you choose to flag your search, we will save the query, language and the number of results you requested.
        Please consider adding any additional context in the box on the right.</p>"""
        with gr.Column(visible=False) as flagging_form:
            flag_txt = gr.Textbox(
                lines=1,
                placeholder="Type here...",
                label="""If you choose to flag your search, we will save the query, language and the number of results
                you requested. Please consider adding relevant additional context below:""",
            )
            flag_btn = gr.Button("Flag Results")
            # ``flag`` returns "", which clears the description box afterwards.
            flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])

        def submit(query, lang, k):
            """Run the search and reveal the flagging form once there is a query."""
            if not query or not query.strip():
                # Blank query: clear the results and leave the flagging form
                # as it is. The original returned "" for the Column output,
                # which is not a valid value for a layout block.
                return {results: "", flagging_form: gr.update()}
            return {
                results: scisearch(query, lang, k),
                flagging_form: gr.update(visible=True),
            }

        submit_btn.click(submit, inputs=[query, lang, k], outputs=[results, flagging_form])

    demo.launch(enable_queue=True, debug=True)