import json
import os
import pprint
from typing import Union

import requests
import streamlit as st
import streamlit.components.v1 as components

pp = pprint.PrettyPrinter(indent=2)

# Page title and wide layout for the whole app.
st.set_page_config(page_title="Gaia Search 🌖🌏", layout="wide")

# Write a `.streamlit/config.toml` in the current working directory that
# selects the "light" base theme. The file is rewritten on every run.
streamlit_dir = os.path.join(os.getcwd(), ".streamlit")
os.makedirs(streamlit_dir, exist_ok=True)
with open(os.path.join(streamlit_dir, "config.toml"), "w", encoding="utf-8") as file:
    file.write('[theme]\nbase="light"')

# Display name of each corpus -> identifier used when querying it.
corpus_name_map = {
    "LAION": "laion",
    "ROOTS": "roots",
    "The Pile": "pile",
    "C4": "c4",
}

# Sidebar header: app title and terms-of-use notice.
st.sidebar.markdown(
    """

Gaia Search 🌖🌏

A search engine for large scale textual corpora. Most of the datasets included in the tool are based on Common Crawl. By using the tool, you are also bound by the Common Crawl terms of use in respect of the content contained in the datasets.

""",
    unsafe_allow_html=True,
)

# Sidebar links row.
st.sidebar.markdown(
    """

GitHub | Paper | Colab

""",
    unsafe_allow_html=True,
)

# # # #

# --- Sidebar inputs ---------------------------------------------------------
query = st.sidebar.text_input(label="Query", placeholder="Type your query here")
corpus = st.sidebar.selectbox(
    "Corpus",
    tuple(corpus_name_map.keys()),
    index=2,
)
max_results = st.sidebar.slider(
    "Max Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Max Number of Documents to return",
)

footer = """ """
st.sidebar.markdown(footer, unsafe_allow_html=True)


def scisearch(query, corpus, num_results=10):
    """Send a search request to the backend and return its results.

    Args:
        query: Raw query text; surrounding whitespace is stripped.
        corpus: Corpus identifier sent to the backend (e.g. ``"roots"``).
        num_results: Number of documents to request (sent as ``k``).

    Returns:
        A ``(results, highlight_terms)`` tuple read from the JSON response,
        or ``None`` when the query is empty/``None`` or the request fails.
    """
    print(query, corpus, num_results)
    # Check for None before calling .strip(); the reverse order could never
    # reach the None test.
    if query is None:
        return None
    query = query.strip()
    if not query:
        return None

    try:
        post_data = {"query": query, "corpus": corpus, "k": num_results, "lang": "all"}
        # The "roots" corpus has its own endpoint; both URLs come from the
        # environment.
        address = os.environ.get("address_roots" if corpus == "roots" else "address")
        output = requests.post(
            address,
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )
        payload = json.loads(output.text)
        return payload["results"], payload["highlight_terms"]
    except Exception as e:
        # Best-effort boundary: log the problem and signal failure with None
        # so the caller can show a message instead of crashing.
        print(e)
        return None


PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace every ``PI:<TAG>`` marker in *text* with ``REDACTED <TAG>``."""
    for tag in PII_TAGS:
        text = text.replace(
            PII_PREFIX + tag,
            """REDACTED {}""".format(tag),
        )
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Rebuild *paragraph* token by token, then redact PII markers.

    The paragraph is split on whitespace and re-joined with single spaces.
    Tokens that appear in *highlight_terms* go through the highlight template.
    """
    # NOTE(review): the highlight template below contains only the "{}"
    # placeholder, so matched tokens are currently emitted unchanged -
    # confirm which markup is expected here.
    # NOTE(review): the text is inserted into rendered HTML without escaping;
    # consider html.escape() if corpus text may contain markup.
    tokens_html = [
        "{}".format(token) if token in highlight_terms else token
        for token in paragraph.split()
    ]
    return process_pii(" ".join(tokens_html))


def extract_lang_from_docid(docid):
    """Return the second ``_``-separated field of *docid*.

    Raises:
        IndexError: If *docid* contains no underscore.
    """
    return docid.split("_")[1]


def format_result(result, highlight_terms):
    """Format one hit (a mapping with ``"text"`` and ``"docid"``) for display."""
    docid = result["docid"]
    tokens_html = highlight_string(result["text"], highlight_terms)
    language = extract_lang_from_docid(docid)
    result_html = " Language: {} | Document ID: {} |\n{}\n\n".format(
        language, docid, tokens_html
    )
    return "\n\n" + result_html + "\n\n"


def process_results(corpus: str, hits: Union[list, dict], highlight_terms: list) -> str:
    """Turn backend hits into the markup shown on the results page.

    Args:
        corpus: Corpus identifier. ``"roots"`` expects *hits* to be a mapping
            of language -> list of hits; any other value expects a list.
        hits: Search results as returned by the backend.
        highlight_terms: Terms passed on to ``highlight_string``.

    Returns:
        The concatenated markup for all hits.
    """
    if corpus == "roots":
        sections = []
        for lang, results_for_lang in hits.items():
            print("Processing language", lang)
            if not results_for_lang:
                sections.append("\n\nNo results for language: {}\n\n".format(lang))
                continue
            results_for_lang_html = "".join(
                format_result(result, highlight_terms) for result in results_for_lang
            )
            sections.append(
                f"\n\nResults for language: {lang}\n\n{results_for_lang_html}\n\n"
            )
        return "".join(sections)

    hit_list = []
    for hit in hits:
        res_head = (
            f"\n\nDocument ID: {hit['docid']} | Score: {round(hit['score'], 2)}\n\n"
        )
        highlighted = highlight_string(hit["text"], highlight_terms)
        if corpus == "laion":
            res_head += f"\n\nCaption:\n\n{highlighted}\n\n"
            docs = (hit.get("meta") or {}).get("docs") or []
            if docs:
                # NOTE(review): this section was syntactically broken in the
                # original (unterminated strings, no loop binding `subhit`).
                # It is rebuilt as one entry per document URL - confirm the
                # intended markup and separators.
                res_head += "\n\nImage links:\n\n"
                res_head += "".join(f"{subhit['URL']}\n\n" for subhit in docs)
                res_head += "\n\n"
        else:
            res_head += f"\n\n{highlighted}\n\n"
        hit_list.append(res_head)
    return " ".join(hit_list)


# --- Main page --------------------------------------------------------------
submit_button = st.sidebar.button("Search", type="primary")

if submit_button or query:
    query = query.strip()
    if not query:
        components.html("\n\nPlease provide a non-empty query.\n\n")
    else:
        corpus_id = corpus_name_map[corpus]
        search_output = scisearch(query, corpus_id, max_results)
        if search_output is None:
            # scisearch signals failure with None; unpacking it directly
            # would raise TypeError and break the page.
            components.html("\n\nThe search request failed. Please try again.\n\n")
        else:
            hits, highlight_terms = search_output
            html_results = process_results(corpus_id, hits, highlight_terms)
            rendered_results = (
                f"\n\nAbout {max_results} results\n\n{html_results}\n\n"
            )
            components.html(
                """ """ + rendered_results,
                height=800,
                scrolling=True,
            )