Spaces:
Running
Running
add roots
Browse files
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🌏🌖
|
|
4 |
colorFrom: blue
|
5 |
colorTo: red
|
6 |
sdk: streamlit
|
7 |
-
sdk_version: 1.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
|
|
4 |
colorFrom: blue
|
5 |
colorTo: red
|
6 |
sdk: streamlit
|
7 |
+
sdk_version: 1.18.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
---
|
app.py
CHANGED
@@ -6,6 +6,7 @@ import streamlit as st
|
|
6 |
import streamlit.components.v1 as components
|
7 |
import requests
|
8 |
|
|
|
9 |
|
10 |
pp = pprint.PrettyPrinter(indent=2)
|
11 |
|
@@ -20,6 +21,7 @@ with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
|
|
20 |
|
21 |
corpus_name_map = {
|
22 |
"LAION": "laion",
|
|
|
23 |
"The Pile": "pile",
|
24 |
"C4": "c4",
|
25 |
}
|
@@ -64,11 +66,11 @@ st.sidebar.markdown(
|
|
64 |
# </p>
|
65 |
|
66 |
|
67 |
-
query = st.sidebar.text_input(label="Query",
|
68 |
corpus = st.sidebar.selectbox(
|
69 |
"Corpus",
|
70 |
tuple(corpus_name_map.keys()),
|
71 |
-
index=
|
72 |
)
|
73 |
max_results = st.sidebar.slider(
|
74 |
"Max Results",
|
@@ -127,10 +129,15 @@ def scisearch(query, corpus, num_results=10):
|
|
127 |
if query == "" or query is None:
|
128 |
return
|
129 |
|
130 |
-
post_data = {"query": query, "corpus": corpus, "k": num_results}
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
output = requests.post(
|
133 |
-
|
134 |
headers={"Content-type": "application/json"},
|
135 |
data=json.dumps(post_data),
|
136 |
timeout=60,
|
@@ -170,9 +177,54 @@ def highlight_string(paragraph: str, highlight_terms: list) -> str:
|
|
170 |
return process_pii(tokens_html)
|
171 |
|
172 |
|
173 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
hit_list = []
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
res_head = f"""
|
177 |
<p class="searchresult" style="color: #7978FF;">Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}</p>
|
178 |
"""
|
@@ -200,76 +252,86 @@ def process_results(corpus: str, hits: list, highlight_terms: list) -> str:
|
|
200 |
return " ".join(hit_list)
|
201 |
|
202 |
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
<p
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
"""
|
244 |
-
<
|
245 |
-
|
246 |
-
</
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
.
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
import streamlit.components.v1 as components
|
7 |
import requests
|
8 |
|
9 |
+
from typing import Union
|
10 |
|
11 |
pp = pprint.PrettyPrinter(indent=2)
|
12 |
|
|
|
21 |
|
22 |
corpus_name_map = {
|
23 |
"LAION": "laion",
|
24 |
+
"ROOTS": "roots",
|
25 |
"The Pile": "pile",
|
26 |
"C4": "c4",
|
27 |
}
|
|
|
66 |
# </p>
|
67 |
|
68 |
|
69 |
+
query = st.sidebar.text_input(label="Query", placeholder="Type your query here")
|
70 |
corpus = st.sidebar.selectbox(
|
71 |
"Corpus",
|
72 |
tuple(corpus_name_map.keys()),
|
73 |
+
index=2,
|
74 |
)
|
75 |
max_results = st.sidebar.slider(
|
76 |
"Max Results",
|
|
|
129 |
if query == "" or query is None:
|
130 |
return
|
131 |
|
132 |
+
post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
|
133 |
+
address = (
|
134 |
+
os.environ.get("address")
|
135 |
+
if corpus != "roots"
|
136 |
+
else "http://34.116.206.238:8080"
|
137 |
+
)
|
138 |
|
139 |
output = requests.post(
|
140 |
+
address,
|
141 |
headers={"Content-type": "application/json"},
|
142 |
data=json.dumps(post_data),
|
143 |
timeout=60,
|
|
|
177 |
return process_pii(tokens_html)
|
178 |
|
179 |
|
180 |
+
def extract_lang_from_docid(docid):
    """Return the language field embedded in a document id.

    The id is split on underscores and the second field is returned, e.g.
    ``"a_en_123"`` gives ``"en"``.
    NOTE(review): the ``<prefix>_<lang>_<rest>`` layout is inferred from this
    split only; confirm it against the index that produces the ids.

    Args:
        docid: Document id string.

    Returns:
        The second underscore-separated field, or ``"unknown"`` when the id
        contains no underscore (previously this raised ``IndexError``).
    """
    fields = docid.split("_")
    if len(fields) < 2:
        # A malformed id should not abort rendering of the whole result page.
        return "unknown"
    return fields[1]
|
182 |
+
|
183 |
+
|
184 |
+
def format_result(result, highlight_terms):
    """Render a single search hit as an HTML paragraph.

    Args:
        result: Mapping with at least the keys ``"text"`` and ``"docid"``.
        highlight_terms: Terms passed through to ``highlight_string`` together
            with the hit text.

    Returns:
        An HTML string wrapped in ``<p>...</p>`` showing the language derived
        from the docid, the docid itself, and the highlighted text.
    """
    # Function-local import so this block does not depend on the file header.
    from html import escape

    text = result["text"]
    docid = result["docid"]
    tokens_html = highlight_string(text, highlight_terms)
    language = extract_lang_from_docid(docid)

    # docid and language come from the search backend and are inserted into
    # markup, so escape them. tokens_html is already HTML produced by
    # highlight_string and must not be escaped again.
    result_html = f"""
    <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {escape(language)} | </span>
    <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {escape(docid)} | </span><br>
    <span style='font-family: Arial;'>{tokens_html}</span><br>
    <br>
    """
    return "<p>" + result_html + "</p>"
|
198 |
+
|
199 |
+
|
200 |
+
def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str:
|
201 |
hit_list = []
|
202 |
+
|
203 |
+
if corpus == "roots":
|
204 |
+
result_page_html = ""
|
205 |
+
for lang, results_for_lang in hits.items():
|
206 |
+
print("Processing language", lang)
|
207 |
+
if len(results_for_lang) == 0:
|
208 |
+
result_page_html += """<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
|
209 |
+
No results for language: <b>{}</b></div>""".format(
|
210 |
+
lang
|
211 |
+
)
|
212 |
+
continue
|
213 |
+
results_for_lang_html = ""
|
214 |
+
for result in results_for_lang:
|
215 |
+
result_html = format_result(result, highlight_terms)
|
216 |
+
results_for_lang_html += result_html
|
217 |
+
results_for_lang_html = f"""
|
218 |
+
<details>
|
219 |
+
<summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
|
220 |
+
Results for language: <b>{lang}</b>
|
221 |
+
</summary>
|
222 |
+
{results_for_lang_html}
|
223 |
+
</details>"""
|
224 |
+
result_page_html += results_for_lang_html
|
225 |
+
return result_page_html
|
226 |
+
|
227 |
+
for hit in hits:
|
228 |
res_head = f"""
|
229 |
<p class="searchresult" style="color: #7978FF;">Document ID: {hit['docid']} | Score: {round(hit['score'], 2)}</p>
|
230 |
"""
|
|
|
252 |
return " ".join(hit_list)
|
253 |
|
254 |
|
255 |
+
# Explicit search trigger in the sidebar. A search also runs whenever the
# query box already holds text (see the condition below).
submit_button = st.sidebar.button("Search", type="primary")

if submit_button or query:
    # Normalise before testing: a missing (None) or whitespace-only query is
    # routed to the "empty query" message instead of failing on `.strip()`.
    query = (query or "").strip()
    if not query:
        components.html(
            """<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
            Please provide a non-empty query.
            </p><br><hr><br>"""
        )
    else:
        # Map the display name chosen in the sidebar to the backend corpus id.
        backend_corpus = corpus_name_map[corpus]
        hits, highlight_terms = scisearch(query, backend_corpus, max_results)
        html_results = process_results(backend_corpus, hits, highlight_terms)
        # NOTE: the counter shows the requested maximum (slider value), not the
        # number of hits actually returned.
        rendered_results = f"""
        <div id="searchresultsarea">
            <br>
            <p id="searchresultsnumber">About {max_results} results</p>
            {html_results}
        </div>"""
        # Page font and result styling are prepended to the rendered results
        # and shown together in one scrollable component.
        components.html(
            """
            <head>
            <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
            </head>
            <style>
            #searchresultsarea {
                font-family: "Source Sans Pro", sans-serif;
            }
            #searchresultsnumber {
                font-size: 0.8rem;
                color: gray;
            }
            .searchresult h2 {
                font-size: 19px;
                line-height: 18px;
                font-weight: normal;
                color: rgb(7, 111, 222);
                margin-bottom: 0px;
                margin-top: 25px;
                color: #7978FF;
            }
            .searchresult a {
                font-size: 12px;
                line-height: 12px;
                color: green;
                margin-bottom: 0px;
            }
            </style>
            """
            + rendered_results,
            height=800,
            scrolling=True,
        )
|