|
import html
import json
import os
import pprint
from typing import Union

import requests
import streamlit as st
import streamlit.components.v1 as components
|
|
|
# Module-level pretty printer (indent of 2); not referenced in the visible code.
pp = pprint.PrettyPrinter(indent=2)

# Default search backend address. `setdefault` keeps a value already supplied
# by the environment instead of silently overwriting it.
os.environ.setdefault("address", "http://34.79.83.149:8080")

# NOTE(review): the emoji were mis-encoded in the source ("ππ"); restored to
# the most likely glyphs -- confirm against the intended title.
st.set_page_config(page_title="Gaia Search 🌖🌏", layout="wide")

# Write a Streamlit config file in the working directory that selects the
# light base theme.
config_dir = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(config_dir, exist_ok=True)
with open(os.path.join(config_dir, "config.toml"), "w", encoding="utf-8") as file:
    file.write('[theme]\nbase="light"')
|
|
|
|
|
# Maps the corpus label shown in the sidebar to the identifier sent to the
# search backend. Insertion order is the order shown in the select box.
corpus_name_map = {
    "LAION": "laion",
    "ROOTS": "roots",
    "The Pile": "pile",
    "C4": "c4",
}
|
|
|
# Sidebar header: app title plus a short description and terms-of-use notice.
# NOTE(review): the title emoji were mis-encoded in the source ("ππ");
# restored to the most likely glyphs -- confirm.
st.sidebar.markdown(
    """
    <style>
    .aligncenter {
        text-align: center;
        font-weight: bold;
        font-size: 36px;
    }
    </style>
    <p class="aligncenter">Gaia Search 🌖🌏</p>
    <p>A search engine for large scale textual
    corpora. Most of the datasets included in the tool are based on Common
    Crawl. By using the tool, you are also bound by the Common Crawl terms
    of use in respect of the content contained in the datasets.
    </p>
    """,
    unsafe_allow_html=True,
)
|
|
|
# Sidebar link row (GitHub / Project Report / Colab).
# NOTE(review): all three href attributes are empty in the source -- the
# target URLs still need to be filled in.
st.sidebar.markdown(
    """
    <style>
    .aligncenter {
        text-align: center;
    }
    </style>
    <p style='text-align: center'>
    <a href="" style="color:#7978FF;">GitHub</a> | <a href="" style="color:#7978FF;" >Project Report</a> | <a href="" style="color:#7978FF;" >Colab</a>
    </p>
    """,
    unsafe_allow_html=True,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sidebar inputs: free-text query, corpus selector and result-count slider.
query = st.sidebar.text_input(label="Query", placeholder="Type your query here")

corpus_options = tuple(corpus_name_map.keys())
corpus = st.sidebar.selectbox(
    "Corpus",
    corpus_options,
    # Pre-select "The Pile" by name rather than by a positional magic number.
    index=corpus_options.index("The Pile"),
)

max_results = st.sidebar.slider(
    "Max Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Max Number of Documents to return",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fixed-position footer with attribution links, rendered in the sidebar.
# NOTE(review): the emoji after "HuggingFace" and "Pyserini" were mis-encoded
# in the source ("π€", "π¦"); restored to the most likely glyphs -- confirm.
footer = """
<style>
.footer {
    position: fixed;
    left: 0;
    bottom: 0;
    width: 100%;
    background-color: white;
    color: black;
    text-align: center;
}
</style>
<div class="footer">
<p>Powered by <a href="https://huggingface.co/" >HuggingFace 🤗</a> and <a href="https://github.com/castorini/pyserini" >Pyserini 🦆</a></p>
</div>
"""
st.sidebar.markdown(footer, unsafe_allow_html=True)
|
|
|
|
|
def scisearch(query, corpus, num_results=10):
    """Send a search request to the backend and return its results.

    Args:
        query: Free-text query string; surrounding whitespace is ignored.
        corpus: Backend corpus identifier (a value of ``corpus_name_map``).
        num_results: Sent to the backend as ``k``.

    Returns:
        A ``(results, highlight_terms)`` tuple read from the JSON response,
        or ``None`` when the query is empty/not a string or when the request
        or response parsing fails (the error is printed).
    """
    print(query, corpus, num_results)

    # Validate before calling str methods, so None and blank queries are
    # rejected explicitly instead of via a swallowed AttributeError.
    if not isinstance(query, str) or not query.strip():
        return None
    query = query.strip()

    post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
    # The "roots" corpus uses its own hard-coded host; every other corpus
    # uses the address stored in the environment.
    address = (
        "http://34.116.206.238:8080"
        if corpus == "roots"
        else os.environ.get("address")
    )

    try:
        output = requests.post(
            address,
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        # Surface HTTP 4xx/5xx as errors rather than parsing an error body.
        output.raise_for_status()
        payload = json.loads(output.text)
        return payload["results"], payload["highlight_terms"]
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Best effort: report the failure and signal it with None.
        print(e)
        return None
|
|
|
|
|
# Tag names that, when preceded by PII_PREFIX, mark a redacted value in text.
PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace every ``PI:<TAG>`` marker in ``text`` with a highlighted label.

    Args:
        text: String (already HTML) that may contain markers such as
            ``PI:EMAIL``.

    Returns:
        ``text`` with each known marker replaced by a bold ``<mark>`` element
        reading ``REDACTED <TAG>``; text without markers is returned unchanged.
    """
    for tag in PII_TAGS:
        text = text.replace(
            PII_PREFIX + tag,
            f'<b><mark style="background: Fuchsia; color: Lime;">REDACTED {tag}</mark></b>',
        )
    return text
|
|
|
|
|
def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Render ``paragraph`` as HTML with query terms in bold.

    The paragraph is split on whitespace; a token is bolded only when it
    matches an entry of ``highlight_terms`` exactly. Tokens are re-joined
    with single spaces, so original whitespace runs are collapsed.

    Args:
        paragraph: Raw document text.
        highlight_terms: Terms to emphasise.

    Returns:
        An HTML fragment, with PII markers replaced via ``process_pii``.
    """
    tokens_html = []
    for token in paragraph.split():
        # The text is interpolated into an HTML page, so escape it; matching
        # is done on the raw token so highlighting is unaffected.
        safe_token = html.escape(token)
        if token in highlight_terms:
            tokens_html.append(f"<b>{safe_token}</b>")
        else:
            tokens_html.append(safe_token)
    return process_pii(" ".join(tokens_html))
|
|
|
|
|
def extract_lang_from_docid(docid):
    """Return the second underscore-separated field of ``docid``.

    The caller displays this field as the document's language.

    Raises:
        IndexError: If ``docid`` contains no underscore.
    """
    # Only the first two fields matter, so stop splitting after them.
    return docid.split("_", 2)[1]
|
|
|
|
|
def format_result(result, highlight_terms):
    """Build the HTML snippet for one search result.

    Args:
        result: Mapping with at least the keys ``"text"`` and ``"docid"``.
        highlight_terms: Terms to bold inside the result text.

    Returns:
        A ``<p>``-wrapped HTML fragment showing language, document id and the
        highlighted text.
    """
    docid = result["docid"]
    tokens_html = highlight_string(result["text"], highlight_terms)
    # Backend-provided values are interpolated into HTML, so escape them.
    language = html.escape(str(extract_lang_from_docid(docid)))
    safe_docid = html.escape(str(docid))
    result_html = f"""
        <span style='font-size:14px; font-family: Arial; color:MediumAquaMarine'>Language: {language} | </span>
        <span style='font-size:14px; font-family: Arial; color:#7978FF; text-align: left;'>Document ID: {safe_docid} | </span><br>
        <span style='font-family: Arial;'>{tokens_html}</span><br>
        <br>
        """
    return "<p>" + result_html + "</p>"
|
|
|
|
|
def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str:
    """Render backend hits as an HTML fragment.

    Args:
        corpus: Backend corpus identifier. ``"roots"`` selects the
            per-language layout; ``"laion"`` adds caption and image links.
        hits: For ``"roots"``, a mapping of language to a list of results
            (each with ``"text"`` and ``"docid"``). Otherwise an iterable of
            mappings with ``"docid"``, ``"score"`` and ``"text"``, and for
            LAION optionally ``"meta"`` -> ``"docs"`` entries with ``"URL"``.
        highlight_terms: Terms to bold inside result text.

    Returns:
        The concatenated HTML for all hits.
    """
    if corpus == "roots":
        sections = []
        for lang, results_for_lang in hits.items():
            print("Processing language", lang)
            safe_lang = html.escape(str(lang))
            if not results_for_lang:
                sections.append(
                    f"""<div style='font-family: Arial; color:Silver; text-align: left; line-height: 3em'>
                    No results for language: <b>{safe_lang}</b></div>"""
                )
                continue
            results_for_lang_html = "".join(
                format_result(result, highlight_terms) for result in results_for_lang
            )
            # One collapsible section per language.
            sections.append(
                f"""
                <details>
                    <summary style='font-family: Arial; color:MediumAquaMarine; text-align: left; line-height: 3em'>
                        Results for language: <b>{safe_lang}</b>
                    </summary>
                    {results_for_lang_html}
                </details>"""
            )
        return "".join(sections)

    hit_list = []
    for hit in hits:
        safe_docid = html.escape(str(hit["docid"]))
        text_html = highlight_string(hit["text"], highlight_terms)
        parts = [
            f"""
            <p class="searchresult" style="color: #7978FF;">Document ID: {safe_docid} | Score: {round(hit['score'], 2)}</p>
            """
        ]
        if corpus == "laion":
            parts.append(
                f"""
                <p style="color: #7978FF;">Caption:</p>
                <p>{text_html}</p>
                """
            )
            # "meta" may be absent or None; "docs" may be absent or empty.
            docs = (hit.get("meta") or {}).get("docs")
            if docs:
                parts.append("""<p style="color: #7978FF;"> Image links:</p><ul>""")
                for subhit in docs:
                    # Quote and escape the URL: an unquoted href breaks on
                    # spaces or '>' and lets the value inject markup.
                    url = html.escape(str(subhit["URL"]), quote=True)
                    parts.append(
                        f"""<li><a href="{url}" target="_blank" rel="noopener noreferrer" style="color:#ffcdf8; ">{url}</a></li>"""
                    )
                parts.append("</ul>")
            parts.append("<hr>")
        else:
            # The original closed a <div> here that was never opened.
            parts.append(f"""<p>{text_html}</p><hr>""")
        hit_list.append("".join(parts))
    return " ".join(hit_list)
|
|
|
|
|
# Main flow: run a search when the button is pressed or a query is present,
# then render either a message or the result list in an HTML component.
submit_button = st.sidebar.button("Search", type="primary")

if submit_button or query:
    query = query.strip()
    if not query:
        components.html(
            """<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
            Please provide a non-empty query.
            </p><br><hr><br>"""
        )
    else:
        corpus_id = corpus_name_map[corpus]
        search_output = scisearch(query, corpus_id, max_results)
        if search_output is None:
            # scisearch returns None on failure; unpacking that directly
            # would raise TypeError and break the page.
            components.html(
                """<p style='font-size:18px; font-family: Arial; color:MediumVioletRed; text-align: center;'>
                The search request failed. Please try again later.
                </p><br><hr><br>"""
            )
        else:
            hits, highlight_terms = search_output
            html_results = process_results(corpus_id, hits, highlight_terms)
            # NOTE(review): the count shown is the requested maximum, not the
            # number of hits actually returned.
            rendered_results = f"""
            <div id="searchresultsarea">
                <br>
                <p id="searchresultsnumber">About {max_results} results</p>
                {html_results}
            </div>"""

            components.html(
                """
                <head>
                <link href='https://fonts.googleapis.com/css?family=Source+Sans+Pro' rel='stylesheet' type='text/css'>
                </head>
                <style>
                #searchresultsarea {
                    font-family: "Source Sans Pro", sans-serif;
                }
                #searchresultsnumber {
                    font-size: 0.8rem;
                    color: gray;
                }
                .searchresult h2 {
                    font-size: 19px;
                    line-height: 18px;
                    font-weight: normal;
                    margin-bottom: 0px;
                    margin-top: 25px;
                    color: #7978FF;
                }
                .searchresult a {
                    font-size: 12px;
                    line-height: 12px;
                    color: green;
                    margin-bottom: 0px;
                }
                </style>
                """
                + rendered_results,
                height=800,
                scrolling=True,
            )
|
|