"""Streamlit demo: BM25 search over the MIRACL Chinese corpus via Pyserini."""

import http.client as http_client
import json
import logging
import os
import pprint
import re
import string
import time
from typing import Callable, Optional, Tuple, Union

import streamlit as st
import streamlit.components.v1 as components
from pyserini import util
from pyserini.search import AutoQueryEncoder, FaissSearcher, LuceneSearcher

VERSION = '1.0'

st.set_page_config(page_title="Miracl Search - Chinese", layout="wide")

# Force a light theme by (re)writing Streamlit's config file before any widget renders.
os.makedirs(os.path.join(os.getcwd(), ".streamlit"), exist_ok=True)
with open(os.path.join(os.getcwd(), ".streamlit/config.toml"), "w") as file:
    file.write('[theme]\nbase="light"')

# Either retriever backend exposes the same `.search()` interface.
Searcher = Union[FaissSearcher, LuceneSearcher]

# Display name -> MIRACL two-letter language code.
LANG_MAPPING = {'Chinese': 'zh'}

st.sidebar.markdown(
    """

MIRACL Chinese Demo

🌍🙌🌏

MIRACL is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.

""",
    unsafe_allow_html=True,
)
st.sidebar.markdown(
    """

GitHub | Paper

""",
    unsafe_allow_html=True,
)

query = st.sidebar.text_input(label='Search query', value='')
language = 'Chinese'
max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=1000,
    step=1,
    value=10,
    help="Maximum Number of Documents to return",
)


def _load_sparse_searcher(language: str, k1: Optional[float] = None, b: Optional[float] = None) -> Searcher:
    """Load the prebuilt MIRACL Lucene index for *language* and configure BM25.

    Args:
        language: Two-letter MIRACL language code (e.g. ``'zh'``).
        k1: Optional BM25 term-frequency saturation parameter.
        b: Optional BM25 document-length normalisation parameter.

    Returns:
        A ``LuceneSearcher`` ready to answer queries. The index's default
        BM25 parameters are kept unless *both* ``k1`` and ``b`` are given.
    """
    searcher = LuceneSearcher(f'lucene-index.miracl-v1.0-{language}.20221004.2b2856')
    searcher.set_language(language)
    # Only override BM25 when both parameters are supplied; otherwise keep defaults.
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
    return searcher


def search(query, language, num_results=10):
    """Run a BM25 search and flatten the hits into plain Python lists.

    Args:
        query: Free-text query string.
        language: Display-language key into ``LANG_MAPPING`` (e.g. ``'Chinese'``).
        num_results: Maximum number of hits to retrieve.

    Returns:
        Tuple ``(results_dict, search_time)`` where ``results_dict`` has keys
        ``'docs'``, ``'doc_ids'``, ``'score'`` and ``'lang'``, and
        ``search_time`` is the wall-clock query latency in seconds.
    """
    searcher = _load_sparse_searcher(language=LANG_MAPPING[language])

    t_0 = time.time()
    search_results = searcher.search(query, k=num_results)
    search_time = time.time() - t_0

    results_dict = {"docs": [], "doc_ids": [], "score": [], "lang": language}
    for hit in search_results:
        # Each hit carries the stored Lucene document as a JSON string in `.raw`.
        payload = json.loads(hit.raw)
        results_dict["docs"].append(payload["text"])
        results_dict["doc_ids"].append(payload["docid"])
        results_dict["score"].append(hit.score)

    return results_dict, search_time


def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Rewrite each whole-word, case-insensitive occurrence of the given terms.

    Terms are regex-escaped so user queries containing metacharacters
    (e.g. ``C++`` or ``(zh)``) cannot break or inject into the pattern,
    and the replacement is supplied via a callable so backslashes in a
    term are not misread as group references by ``re.sub``.
    """
    for term in highlight_terms:
        pattern = rf"\b{re.escape(term)}\b"
        paragraph = re.sub(pattern, lambda _match: term, paragraph, flags=re.I)
    return paragraph


def process_results(hits: dict, highlight_terms: list) -> str:
    """Format search hits as one markdown/HTML fragment per document."""
    hit_list = []
    for i in range(len(hits['doc_ids'])):
        res_head = f"""

{i+1}. Document ID: {hits['doc_ids'][i]}

Language: {hits['lang']}, Score: {round(hits['score'][i], 2)}

{highlight_string(hits['docs'][i], highlight_terms)}


"""
        hit_list.append(res_head)
    return " ".join(hit_list)


if st.sidebar.button("Search"):
    hits, search_time = search(query, language, max_results)
    # Highlight terms are intentionally empty: highlighting is currently a no-op.
    html_results = process_results(hits, [])
    rendered_results = f"""

About {max_results} results

{html_results}
"""
    st.markdown(""" """, unsafe_allow_html=True)
    st.markdown(""" """, unsafe_allow_html=True)
    st.markdown(
        f"""

Search Results

""",
        unsafe_allow_html=True,
    )
    components.html(""" """ + rendered_results, height=800, scrolling=True)