Spaces:
Sleeping
Sleeping
import html
import http.client as http_client
import json
import logging
import os
import pprint
import re
import string
import time
from typing import Callable, Optional, Tuple, Union

import streamlit as st
import streamlit.components.v1 as components
from pyserini import util
from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder
# --- Module constants -------------------------------------------------------
VERSION = "1.0"

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(page_title="Miracl Search - Chinese", layout="wide")

# Write a Streamlit theme file into <cwd>/.streamlit so the app uses the
# light base theme. The file is rewritten each time the script runs.
_working_dir = os.getcwd()
os.makedirs(os.path.join(_working_dir, ".streamlit"), exist_ok=True)
_theme_toml = '[theme]\nbase="light"'
with open(os.path.join(_working_dir, ".streamlit/config.toml"), "w") as config_file:
    config_file.write(_theme_toml)

# Either searcher flavour may be returned by the loader helpers.
Searcher = Union[FaissSearcher, LuceneSearcher]

# Display name of a language -> language code used by the index.
LANG_MAPPING = {"Chinese": "zh"}

# --- Sidebar: title banner and project links --------------------------------
# NOTE(review): the second "aligncenter" paragraph ("πππ") looks like it may be
# mis-decoded emoji; left untouched here -- confirm against the upstream file.
_banner_html = """
<style>
.aligncenter {
text-align: center;
font-weight: bold;
font-size: 30px;
}
</style>
<p class="aligncenter">MIRACL Chinese Demo</p>
<p class="aligncenter">πππ</p>
<p style="text-align: center;"> MIRACL is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.</p>
"""
st.sidebar.markdown(_banner_html, unsafe_allow_html=True)

_links_html = """
<style>
.aligncenter {
text-align: center;
}
</style>
<p style='text-align: center'>
<a href="https://github.com/project-miracl" >GitHub</a> | <a href="https://arxiv.org/abs/2210.09984" >Paper</a>
</p>
"""
st.sidebar.markdown(_links_html, unsafe_allow_html=True)

# --- Sidebar: search controls -----------------------------------------------
query = st.sidebar.text_input(label="Search query", value="")

# This demo is fixed to a single language.
language = "Chinese"

max_results = st.sidebar.slider(
    "Maximum Number of Results",
    min_value=1,
    max_value=1000,
    value=10,
    step=1,
    help="Maximum Number of Documents to return",
)
def _load_sparse_searcher(language: str, k1: Optional[float] = None, b: Optional[float] = None) -> Searcher:
    """Create a ``LuceneSearcher`` over the MIRACL v1.0 index for ``language``.

    Args:
        language: Language code; it is interpolated into the index name and
            also passed to ``searcher.set_language``.
        k1: Optional BM25 ``k1`` parameter.
        b: Optional BM25 ``b`` parameter.

    Returns:
        The configured searcher.
    """
    # NOTE(review): the name is handed to the LuceneSearcher constructor
    # directly -- presumably an index directory relative to the working
    # directory; confirm against the deployment layout.
    searcher = LuceneSearcher(f'lucene-index.miracl-v1.0-{language}.20221004.2b2856')
    searcher.set_language(language)

    # Custom BM25 parameters are applied only when BOTH are supplied; if just
    # one is given the searcher keeps its defaults.
    if k1 is not None and b is not None:
        searcher.set_bm25(k1, b)
    return searcher
def search(query, language, num_results=10):
    """Run ``query`` against the sparse index for ``language``.

    Args:
        query: Query string passed to the searcher.
        language: Display name of the language; must be a key of
            ``LANG_MAPPING``.
        num_results: Maximum number of hits to request.

    Returns:
        A ``(results_dict, search_time)`` tuple. ``results_dict`` has the
        parallel lists ``"docs"``, ``"doc_ids"`` and ``"score"`` plus the
        ``"lang"`` the caller passed in; ``search_time`` is the elapsed time
        of the ``searcher.search`` call in seconds (searcher construction is
        not included).

    Raises:
        KeyError: If ``language`` is not present in ``LANG_MAPPING``.
    """
    searcher = _load_sparse_searcher(language=LANG_MAPPING[language])

    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()
    search_results = searcher.search(query, k=num_results)
    search_time = time.perf_counter() - started

    results_dict = {"docs": [], "doc_ids": [], "score": [], "lang": language}
    for hit in search_results:
        # Each hit carries its stored document as a JSON string in ``raw``.
        record = json.loads(hit.raw)
        results_dict["docs"].append(record["text"])
        results_dict["doc_ids"].append(record["docid"])
        results_dict["score"].append(hit.score)
    return results_dict, search_time
def highlight_string(paragraph: str, highlight_terms: list) -> str:
    """Wrap whole-word, case-insensitive occurrences of the terms in ``<b>`` tags.

    Args:
        paragraph: Text to highlight.
        highlight_terms: Literal terms to highlight. Empty terms are ignored.

    Returns:
        ``paragraph`` with each match wrapped as ``<b>match</b>``. The matched
        text keeps its original casing. If there are no usable terms the
        paragraph is returned unchanged.
    """
    # Drop empty terms (they would match at every word boundary) and
    # de-duplicate while preserving order.
    terms = [term for term in dict.fromkeys(highlight_terms) if term]
    if not terms:
        return paragraph

    # Longest first, so that with overlapping terms ("new", "new york") the
    # longer phrase wins at a given position.
    terms.sort(key=len, reverse=True)

    # Terms are escaped so they are matched literally, and all of them are
    # combined into one pattern: a single pass never re-scans the <b> tags
    # inserted for an earlier term.
    alternation = "|".join(re.escape(term) for term in terms)
    pattern = re.compile(rf"\b(?:{alternation})\b", flags=re.I)

    # A callable replacement re-emits the matched text itself, preserving the
    # paragraph's casing and avoiding backslash processing in the replacement.
    return pattern.sub(lambda match: f"<b>{match.group(0)}</b>", paragraph)
def process_results(hits: dict, highlight_terms: list) -> str:
    """Render search hits as an HTML fragment.

    Args:
        hits: Dict with the parallel lists ``'doc_ids'``, ``'docs'`` and
            ``'score'``, plus a ``'lang'`` entry shown on every hit.
        highlight_terms: Terms to emphasise in each document's text via
            ``highlight_string``.

    Returns:
        One ``<div class='searchresult'>`` block per hit (numbered from 1),
        each followed by ``<hr>``, joined with single spaces. An empty string
        when there are no hits.
    """
    # Document text and IDs come from the index, so they are escaped before
    # being placed into markup. The highlight terms are escaped the same way
    # so they are compared against the text in its escaped form.
    lang = html.escape(str(hits['lang']), quote=False)
    escaped_terms = [html.escape(term, quote=False) for term in highlight_terms]

    hit_list = []
    rows = zip(hits['doc_ids'], hits['docs'], hits['score'])
    for rank, (doc_id, text, score) in enumerate(rows, start=1):
        safe_id = html.escape(str(doc_id), quote=False)
        body = highlight_string(html.escape(text, quote=False), escaped_terms)
        # The language is wrapped in <strong>; the previous <string> tag is
        # not an HTML element.
        hit_list.append(f"""
<div class='searchresult'>
<h2>{rank}. Document ID: {safe_id}</h2>
<p>Language: <strong>{lang}</strong>, Score: {round(score, 2)}</p>
<p>{body}</p>
</div>
<hr>
""")
    return " ".join(hit_list)
# Run a search and render the results when the sidebar "Search" button is
# pressed. Everything below depends on the search output, so it all lives
# inside this branch.
if st.sidebar.button("Search"):
    hits, search_time = search(query, language, max_results)
    html_results = process_results(hits, [])

    # Report the number of hits actually returned (the index may yield fewer
    # than the requested maximum), together with the measured search time.
    num_hits = len(hits["doc_ids"])
    rendered_results = f"""
<div id="searchresultsarea">
<br>
<p id="searchresultsnumber">About {num_hits} results ({search_time:.2f} seconds)</p>
{html_results}
</div>
"""

    # External stylesheets (Bootstrap, Font Awesome) for the main page.
    st.markdown(
        """
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
""",
        unsafe_allow_html=True,
    )
    st.markdown(
        """
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
""",
        unsafe_allow_html=True,
    )

    # Section heading above the results frame.
    st.markdown(
        """
<div class="row no-gutters mt-3 align-items-center">
<h2> Search Results </h2>
</div>
""",
        unsafe_allow_html=True,
    )

    # The results are rendered through components.html together with their
    # own CSS and the small script behind the dark-mode toggle button.
    components.html(
        """
<style>
#searchresultsarea {
font-family: 'Arial';
}
#searchresultsnumber {
font-size: 0.8rem;
color: gray;
}
.searchresult h2 {
font-size: 19px;
line-height: 18px;
font-weight: normal;
color: rgb(7, 111, 222);
margin-bottom: 0px;
margin-top: 25px;
}
.searchresult a {
font-size: 12px;
line-height: 12px;
color: green;
margin-bottom: 0px;
}
.dark-mode {
color: white;
}
</style>
<script>
function load_image(id){
console.log(id)
var x = document.getElementById(id);
console.log(x)
if (x.style.display === "none") {
x.style.display = "block";
} else {
x.style.display = "none";
}
};
function myFunction() {
var element = document.body;
element.classList.toggle("dark-mode");
}
</script>
<button onclick="myFunction()">Toggle dark mode</button>
""" + rendered_results,
        height=800,
        scrolling=True,
    )