"""Streamlit front end for Gaia Search.

Draws a sidebar search form, POSTs the query as JSON to the backend whose URL
is read from the ``address`` environment variable, and renders the returned
hits in an HTML component.
"""
import http.client as http_client
import json
import logging
import os
import pprint
import re
import string

import requests
import streamlit as st
import streamlit.components.v1 as components

pp = pprint.PrettyPrinter(indent=2)
st.set_page_config(page_title="Gaia Search", layout="wide")

# (Re)write the local Streamlit config on every start so the light theme is used.
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
    file.write('[theme]\nbase="light"')

# Display name shown in the sidebar -> language code sent to the backend.
# Insertion order is also the order of the options in the language selectbox.
LANG_MAPPING = {
    'Arabic': 'ar',
    'Catalan': 'ca',
    'Code': 'code',
    'English': 'en',
    'Spanish': 'es',
    'French': 'fr',
    'Indonesian': 'id',
    'Indic': 'indic',
    'Niger-Congo': 'nigercongo',
    'Portuguese': 'pt',
    'Vietnamese': 'vi',
    'Chinese': 'zh',
    'Detect Language': 'detect_language',
    'All': 'all',
}

st.sidebar.markdown(
    """

Gaia Search 🌖🌏

A search engine for the LAION large scale image caption corpora

""",
    unsafe_allow_html=True,
)

st.sidebar.markdown(
    """

GitHub | Project Report

""",
    unsafe_allow_html=True,
)

query = st.sidebar.text_input(label='Search query', value='')
language = st.sidebar.selectbox(
    'Language',
    # Derived from LANG_MAPPING so the two lists cannot drift apart;
    # index 3 is 'English'.
    tuple(LANG_MAPPING),
    index=3,
)
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=100,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)

footer = """ """
st.sidebar.markdown(footer, unsafe_allow_html=True)


def scisearch(query, language, num_results=10):
    """Send a search request to the backend and return its outcome.

    Args:
        query: Raw query text; surrounding whitespace is stripped.
        language: Language code; the value ``"detect_language"`` means no
            ``lang`` field is sent with the request.
        num_results: Number of results requested (sent as ``k``).

    Returns:
        ``None`` when the query is missing or blank;
        a ``str`` message when the backend reports an unsupported detected
        language or when any exception occurs while querying/parsing;
        otherwise a ``(results, highlight_terms)`` tuple taken from the
        response payload.
    """
    # Check for None *before* calling .strip(); the original order made the
    # None test unreachable.
    if query is None:
        return None
    query = query.strip()
    if query == "":
        return None

    try:
        post_data = {"query": query, "k": num_results}
        if language != "detect_language":
            post_data["lang"] = language

        output = requests.post(
            os.environ.get("address"),
            headers={"Content-type": "application/json"},
            data=json.dumps(post_data),
            timeout=60,
        )

        payload = json.loads(output.text)

        if "err" in payload:
            if payload["err"]["type"] == "unsupported_lang":
                detected_lang = payload["err"]["meta"]["detected_lang"]
                return f"""

Detected language {detected_lang} is not supported.
Please choose a language from the dropdown or type another query.




"""
            # Any other error type falls through; a payload without
            # "results" then raises KeyError and is reported below.

        results = payload["results"]
        highlight_terms = payload["highlight_terms"]
    except Exception as e:
        results_html = f"""

Raised {type(e).__name__}

Check if a relevant discussion already exists in the Community tab. If not, please open a discussion.

"""
        print(e)
        # Return the message instead of falling through to the tuple return,
        # where `results` / `highlight_terms` would be unbound.
        return results_html

    return results, highlight_terms


PII_TAGS = {"KEY", "EMAIL", "USER", "IP_ADDRESS", "ID", "IPv4", "IPv6"}
PII_PREFIX = "PI:"


def process_pii(text):
    """Replace every ``PI:<TAG>`` marker in *text* with ``REDACTED <TAG>``."""
    for tag in PII_TAGS:
        text = text.replace(
            PII_PREFIX + tag,
            """REDACTED {}""".format(tag),
        )
    return text


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Rewrite whole-word, case-insensitive matches of each term, then redact PII.

    Each term is matched literally: it is regex-escaped for the pattern, and
    the replacement is supplied through a callable so that backslashes in the
    term are not interpreted as replacement escapes.
    """
    for term in highlight_terms:
        replacement = f"{term}"
        paragraph = re.sub(
            rf"\b{re.escape(term)}\b",
            lambda _match: replacement,
            paragraph,
            flags=re.I,
        )
    paragraph = process_pii(paragraph)
    return paragraph


def process_results(hits: list, highlight_terms: list) -> str:
    """Format the hits as one string, joining the per-hit sections with a space.

    Each hit is read through the keys ``docid``, ``lang``, ``score``, ``text``
    and ``meta.docs`` (a sequence of items with ``URL`` and ``TEXT``).
    """
    hit_list = []
    for i, hit in enumerate(hits):
        res_head = f"""

{i+1}. Document ID: {hit['docid']}

Language: {hit['lang']}, Score: {round(hit['score'], 2)}

"""
        for subhit in hit['meta']['docs']:
            res_head += f"""

{subhit['URL']}

{highlight_string(subhit['TEXT'], highlight_terms)}

"""
        res_head += f"""

{highlight_string(hit['text'], highlight_terms)}


"""
        hit_list.append(res_head)
    return " ".join(hit_list)


if st.sidebar.button("Search"):
    outcome = scisearch(query, LANG_MAPPING[language], max_results)

    # scisearch has three outcomes; only the tuple can be unpacked.
    if outcome is None:
        # Blank query: there is nothing to show.
        rendered_results = None
    elif isinstance(outcome, str):
        # Unsupported-language or error message: show it as-is.
        rendered_results = outcome
    else:
        hits, highlight_terms = outcome
        html_results = process_results(hits, highlight_terms)
        rendered_results = f"""

About {max_results} results

{html_results}
"""

    if rendered_results is not None:
        st.markdown(""" """, unsafe_allow_html=True)
        st.markdown(
            """ """,
            unsafe_allow_html=True)
        st.markdown(
            """
Gaia Search 🌖🌏
""",
            unsafe_allow_html=True)
        components.html(
            """ """ + rendered_results,
            height=800,
            scrolling=True,
        )