cakiki ola13 committed on
Commit
cb35e87
•
0 Parent(s):

Duplicate from bigscience-data/roots-search

Browse files

Co-authored-by: Aleksandra Piktus <ola13@users.noreply.huggingface.co>

Files changed (6) hide show
  1. .gitattributes +34 -0
  2. Makefile +7 -0
  3. README.md +14 -0
  4. app.py +255 -0
  5. roots_search_tool_specs.pdf +3 -0
  6. spaces.code-workspace +8 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.npy filter=lfs diff=lfs merge=lfs -text
14
+ *.npz filter=lfs diff=lfs merge=lfs -text
15
+ *.onnx filter=lfs diff=lfs merge=lfs -text
16
+ *.ot filter=lfs diff=lfs merge=lfs -text
17
+ *.parquet filter=lfs diff=lfs merge=lfs -text
18
+ *.pb filter=lfs diff=lfs merge=lfs -text
19
+ *.pickle filter=lfs diff=lfs merge=lfs -text
20
+ *.pkl filter=lfs diff=lfs merge=lfs -text
21
+ *.pt filter=lfs diff=lfs merge=lfs -text
22
+ *.pth filter=lfs diff=lfs merge=lfs -text
23
+ *.rar filter=lfs diff=lfs merge=lfs -text
24
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
25
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
26
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
27
+ *.tflite filter=lfs diff=lfs merge=lfs -text
28
+ *.tgz filter=lfs diff=lfs merge=lfs -text
29
+ *.wasm filter=lfs diff=lfs merge=lfs -text
30
+ *.xz filter=lfs diff=lfs merge=lfs -text
31
+ *.zip filter=lfs diff=lfs merge=lfs -text
32
+ *.zst filter=lfs diff=lfs merge=lfs -text
33
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ *.pdf filter=lfs diff=lfs merge=lfs -text
Makefile ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
1
+ .PHONY: style
2
+
3
+ # Format source code automatically
4
+
5
+ style:
6
+ black --line-length 119 --target-version py36 .
7
+ isort .
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Roots Search Tool
3
+ emoji: 🌸 🔎
4
+ colorFrom: green
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 3.7
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: bigscience-data/roots-search
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import http.client as http_client
2
+ import json
3
+ import logging
4
+ import os
5
+ import re
6
+ import string
7
+
8
+ import gradio as gr
9
+ import requests
10
+ from huggingface_hub import HfApi
11
+
12
+ hf_api = HfApi()
13
+ roots_datasets = {dset.id.split("/")[-1]:dset for dset in hf_api.list_datasets(author="bigscience-data", use_auth_token=os.environ.get("bigscience_data_token"))}
14
+
15
+ def get_docid_html(docid):
16
+ data_org, dataset, docid = docid.split("/")
17
+ metadata = roots_datasets[dataset]
18
+ if metadata.private:
19
+ docid_html = (
20
+ f"<a "
21
+ f'class="underline-on-hover"'
22
+ f'title="This dataset is private. See the introductory text for more information"'
23
+ f'style="color:#AA4A44;"'
24
+ f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
25
+ f'target="_blank"><b>🔒{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
26
+ )
27
+ else:
28
+ docid_html = (
29
+ f"<a "
30
+ f'class="underline-on-hover"'
31
+ f'title="This dataset is licensed {metadata.tags[0].split(":")[-1]}"'
32
+ f'style="color:#2D31FA;"'
33
+ f'href="https://huggingface.co/datasets/bigscience-data/{dataset}"'
34
+ f'target="_blank"><b>{dataset}</b></a><span style="color: #7978FF;">/{docid}</span>'
35
+ )
36
+ return docid_html
37
+
38
+
39
+ PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
40
+ PII_PREFIX = "PI:"
41
+
42
+
43
+ def process_pii(text):
44
+ for tag in PII_TAGS:
45
+ text = text.replace(
46
+ PII_PREFIX + tag,
47
+ """<b><mark style="background: Fuchsia; color: Lime;">REDACTED {}</mark></b>""".format(tag),
48
+ )
49
+ return text
50
+
51
+
52
+ def process_results(results, highlight_terms):
53
+ if len(results) == 0:
54
+ return """<br><p style='font-family: Arial; color:Silver; text-align: center;'>
55
+ No results retrieved.</p><br><hr>"""
56
+
57
+ results_html = ""
58
+ for result in results:
59
+ tokens = result["text"].split()
60
+ tokens_html = []
61
+ for token in tokens:
62
+ if token in highlight_terms:
63
+ tokens_html.append("<b>{}</b>".format(token))
64
+ else:
65
+ tokens_html.append(token)
66
+ tokens_html = " ".join(tokens_html)
67
+ tokens_html = process_pii(tokens_html)
68
+ meta_html = (
69
+ """
70
+ <p class='underline-on-hover' style='font-size:12px; font-family: Arial; color:#585858; text-align: left;'>
71
+ <a href='{}' target='_blank'>{}</a></p>""".format(
72
+ result["meta"]["url"], result["meta"]["url"]
73
+ )
74
+ if "meta" in result and result["meta"] is not None and "url" in result["meta"]
75
+ else ""
76
+ )
77
+ docid_html = get_docid_html(result["docid"])
78
+ results_html += """{}
79
+ <p style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {}</p>
80
+ <p style='font-size:12px; font-family: Arial; color:MediumAquaMarine'>Language: {}</p>
81
+ <p style='font-family: Arial;'>{}</p>
82
+ <br>
83
+ """.format(
84
+ meta_html, docid_html, result["lang"], tokens_html
85
+ )
86
+ return results_html + "<hr>"
87
+
88
+
89
+ def scisearch(query, language, num_results=10):
90
+ try:
91
+ query = " ".join(query.split())
92
+ if query == "" or query is None:
93
+ return ""
94
+
95
+ post_data = {"query": query, "k": num_results}
96
+ if language != "detect_language":
97
+ post_data["lang"] = language
98
+
99
+ output = requests.post(
100
+ os.environ.get("address"),
101
+ headers={"Content-type": "application/json"},
102
+ data=json.dumps(post_data),
103
+ timeout=60,
104
+ )
105
+
106
+ payload = json.loads(output.text)
107
+
108
+ if "err" in payload:
109
+ if payload["err"]["type"] == "unsupported_lang":
110
+ detected_lang = payload["err"]["meta"]["detected_lang"]
111
+ return f"""
112
+ <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
113
+ Detected language <b>{detected_lang}</b> is not supported.<br>
114
+ Please choose a language from the dropdown or type another query.
115
+ </p><br><hr><br>"""
116
+
117
+ results = payload["results"]
118
+ highlight_terms = payload["highlight_terms"]
119
+
120
+ if language == "detect_language":
121
+ return (
122
+ (
123
+ f"""<p style='font-family: Arial; color:MediumAquaMarine; text-align: center; line-height: 3em'>
124
+ Detected language: <b>{results[0]["lang"]}</b></p><br><hr><br>"""
125
+ if len(results) > 0 and language == "detect_language"
126
+ else ""
127
+ )
128
+ + process_results(results, highlight_terms)
129
+ )
130
+
131
+ if language == "all":
132
+ results_html = ""
133
+ for lang, results_for_lang in results.items():
134
+ if len(results_for_lang) == 0:
135
+ results_html += f"""<p style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
136
+ No results for language: <b>{lang}</b><hr></p>"""
137
+ continue
138
+
139
+ collapsible_results = f"""
140
+ <details>
141
+ <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
142
+ Results for language: <b>{lang}</b><hr>
143
+ </summary>
144
+ {process_results(results_for_lang, highlight_terms)}
145
+ </details>"""
146
+ results_html += collapsible_results
147
+ return results_html
148
+
149
+ return process_results(results, highlight_terms)
150
+
151
+ except Exception as e:
152
+ results_html = f"""
153
+ <p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
154
+ Raised {type(e).__name__}</p>
155
+ <p style='font-size:14px; font-family: Arial; '>
156
+ Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.
157
+ </p>
158
+ """
159
+
160
+ return results_html
161
+
162
+
163
+ def flag(query, language, num_results, issue_description):
164
+ try:
165
+ post_data = {"query": query, "k": num_results, "flag": True, "description": issue_description}
166
+ if language != "detect_language":
167
+ post_data["lang"] = language
168
+
169
+ output = requests.post(
170
+ os.environ.get("address"),
171
+ headers={"Content-type": "application/json"},
172
+ data=json.dumps(post_data),
173
+ timeout=120,
174
+ )
175
+
176
+ results = json.loads(output.text)
177
+ except:
178
+ print("Error flagging")
179
+ return ""
180
+
181
+
182
+ description = """# <p style="text-align: center;"> 🌸 🔎 ROOTS search tool 🔍 🌸 </p>
183
+ The ROOTS corpus was developed during the [BigScience workshop](https://bigscience.huggingface.co/) for the purpose
184
+ of training the Multilingual Large Language Model [BLOOM](https://huggingface.co/bigscience/bloom). This tool allows
185
+ you to search through the ROOTS corpus. We serve a BM25 index for each language or group of languages included in
186
+ ROOTS. You can read more about the details of the tool design
187
+ [here](https://huggingface.co/spaces/bigscience-data/scisearch/blob/main/roots_search_tool_specs.pdf). For more
188
+ information and instructions on how to access the full corpus check [this form](https://forms.gle/qyYswbEL5kA23Wu99)."""
189
+
190
+
191
+ if __name__ == "__main__":
192
+ demo = gr.Blocks(
193
+ css=".underline-on-hover:hover { text-decoration: underline; } .flagging { font-size:12px; color:Silver; }"
194
+ )
195
+
196
+ with demo:
197
+ with gr.Row():
198
+ gr.Markdown(value=description)
199
+ with gr.Row():
200
+ query = gr.Textbox(lines=1, max_lines=1, placeholder="Type your query here...", label="Query")
201
+ with gr.Row():
202
+ lang = gr.Dropdown(
203
+ choices=[
204
+ "ar",
205
+ "ca",
206
+ "code",
207
+ "en",
208
+ "es",
209
+ "eu",
210
+ "fr",
211
+ "id",
212
+ "indic",
213
+ "nigercongo",
214
+ "pt",
215
+ "vi",
216
+ "zh",
217
+ "detect_language",
218
+ "all",
219
+ ],
220
+ value="en",
221
+ label="Language",
222
+ )
223
+ with gr.Row():
224
+ k = gr.Slider(1, 100, value=10, step=1, label="Max Results")
225
+ with gr.Row():
226
+ submit_btn = gr.Button("Submit")
227
+ with gr.Row():
228
+ results = gr.HTML(label="Results")
229
+ flag_description = """
230
+ <p class='flagging'>
231
+ If you choose to flag your search, we will save the query, language and the number of results you requested.
232
+ Please consider adding any additional context in the box on the right.</p>"""
233
+ with gr.Column(visible=False) as flagging_form:
234
+ flag_txt = gr.Textbox(
235
+ lines=1,
236
+ placeholder="Type here...",
237
+ label="""If you choose to flag your search, we will save the query, language and the number of results
238
+ you requested. Please consider adding relevant additional context below:""",
239
+ )
240
+ flag_btn = gr.Button("Flag Results")
241
+ flag_btn.click(flag, inputs=[query, lang, k, flag_txt], outputs=[flag_txt])
242
+
243
+ def submit(query, lang, k):
244
+ query = query.strip()
245
+ if query is None or query == "":
246
+ return "", ""
247
+ return {
248
+ results: scisearch(query, lang, k),
249
+ flagging_form: gr.update(visible=True),
250
+ }
251
+
252
+ query.submit(fn=submit, inputs=[query, lang, k], outputs=[results, flagging_form])
253
+ submit_btn.click(submit, inputs=[query, lang, k], outputs=[results, flagging_form])
254
+
255
+ demo.launch(enable_queue=True, debug=True)
roots_search_tool_specs.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44cf8caa3819e8abf036178c4d329363d0649def28ec4e0979a0e0b2b94362d9
3
+ size 2642911
spaces.code-workspace ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "folders": [
3
+ {
4
+ "path": ".."
5
+ }
6
+ ],
7
+ "settings": {}
8
+ }