Spaces:

spacerini
/

miracl-chinese

Sleeping

App Files Files Community

miracl-chinese / app.py

ToluClassics

Upload 23 files

4eaec19 about 2 years ago

raw

history blame

6 kB

	import html
	import http.client as http_client
	import json
	import logging
	import os
	import pprint
	import re
	import string
	import time
	from typing import Callable, Optional, Tuple, Union

	import streamlit as st
	import streamlit.components.v1 as components
	from pyserini import util
	from pyserini.search import LuceneSearcher, FaissSearcher, AutoQueryEncoder


	# Version tag of this demo application.
	VERSION = '1.0'

	# Set the browser tab title and use the full-width page layout.
	st.set_page_config(page_title="Miracl Search - Chinese", layout="wide")

	# Write a Streamlit theme file into ./.streamlit (relative to the current
	# working directory) that selects the light base theme. The directory is
	# created on demand and the file is overwritten on every run.
	_workdir = os.getcwd()
	os.makedirs(os.path.join(_workdir, ".streamlit"), exist_ok=True)
	_theme_path = os.path.join(_workdir, ".streamlit/config.toml")
	with open(_theme_path, "w") as theme_file:
	    theme_file.write('[theme]\nbase="light"')

	# Either of the two pyserini searcher classes imported by this module.
	Searcher = Union[FaissSearcher, LuceneSearcher]

	# Display name of each supported language -> language code.
	LANG_MAPPING = {'Chinese': 'zh'}


	# Sidebar banner: demo title, emoji line and a one-sentence description of
	# the MIRACL dataset. The markup is raw HTML, hence unsafe_allow_html.
	_SIDEBAR_BANNER_HTML = """
	<style>
	.aligncenter {
	text-align: center;
	font-weight: bold;
	font-size: 30px;
	}
	</style>
	<p class="aligncenter">MIRACL Chinese Demo</p>
	<p class="aligncenter">🌍🙌🌏</p>
	<p style="text-align: center;"> MIRACL is a multilingual dataset for ad hoc retrieval that consists of 18 different languages, collectively encompassing over three billion native speakers around the world.</p>
	"""
	st.sidebar.markdown(_SIDEBAR_BANNER_HTML, unsafe_allow_html=True)

	# Sidebar footer: centred links to the project's GitHub organisation and
	# to the paper on arXiv.
	# The separator between the two links is a plain "|". The previous "\|"
	# was an invalid Python escape sequence (it triggers a warning at compile
	# time) and put a literal backslash into the emitted HTML.
	st.sidebar.markdown(
	    """
	<style>
	.aligncenter {
	text-align: center;
	}
	</style>
	<p style='text-align: center'>
	<a href="https://github.com/project-miracl" >GitHub</a> | <a href="https://arxiv.org/abs/2210.09984" >Paper</a>
	</p>
	""",
	    unsafe_allow_html=True,
	)

	# The demo serves a single language; this key is looked up in LANG_MAPPING.
	language = 'Chinese'

	# Free-text query box, empty by default.
	query = st.sidebar.text_input(label='Search query', value='')

	# Number of documents to request from the searcher (1..1000, default 10).
	max_results = st.sidebar.slider(
	    label="Maximum Number of Results",
	    min_value=1,
	    max_value=1000,
	    value=10,
	    step=1,
	    help="Maximum Number of Documents to return",
	)


	def _load_sparse_searcher(language: str, k1: Optional[float]=None, b: Optional[float]=None) -> Searcher:
	    """Build a Lucene searcher over the MIRACL v1.0 index for ``language``.

	    Args:
	        language: Language code; it is interpolated into the index name
	            ``lucene-index.miracl-v1.0-{language}.20221004.2b2856`` and
	            also passed to ``searcher.set_language``.
	        k1: BM25 ``k1`` parameter. Applied only when ``b`` is given too.
	        b: BM25 ``b`` parameter. Applied only when ``k1`` is given too.

	    Returns:
	        The configured ``LuceneSearcher``.
	    """
	    searcher = LuceneSearcher(f'lucene-index.miracl-v1.0-{language}.20221004.2b2856')
	    searcher.set_language(language)

	    # BM25 parameters are overridden only when both are supplied; if just
	    # one is given the searcher keeps its own defaults.
	    if k1 is not None and b is not None:
	        searcher.set_bm25(k1, b)

	    return searcher

	def search(query, language, num_results=10):
	    """Run ``query`` against the sparse index for ``language``.

	    Args:
	        query: Query text passed to the searcher.
	        language: Display name of the language; must be a key of
	            ``LANG_MAPPING``.
	        num_results: Maximum number of hits to request (``k``).

	    Returns:
	        A ``(results_dict, search_time)`` tuple. ``results_dict`` holds the
	        parallel lists ``"docs"`` (document text), ``"doc_ids"`` and
	        ``"score"``, plus ``"lang"`` (the ``language`` argument).
	        ``search_time`` is the elapsed time of the search call in seconds;
	        loading the searcher is not included.

	    Raises:
	        KeyError: If ``language`` is not in ``LANG_MAPPING``, or a stored
	            document lacks the ``"text"`` or ``"docid"`` field.
	    """
	    searcher = _load_sparse_searcher(language=LANG_MAPPING[language])

	    # perf_counter is monotonic, so the measured duration cannot be skewed
	    # by system clock adjustments the way time.time() differences can.
	    started = time.perf_counter()
	    hits = searcher.search(query, k=num_results)
	    search_time = time.perf_counter() - started

	    results_dict = {"docs": [], "doc_ids": [], "score": [], "lang": language}
	    for hit in hits:
	        # Each hit stores its document as a JSON string in ``raw``.
	        record = json.loads(hit.raw)
	        results_dict["docs"].append(record["text"])
	        results_dict["doc_ids"].append(record["docid"])
	        results_dict["score"].append(hit.score)

	    return results_dict, search_time



	def highlight_string(paragraph: str, highlight_terms: list) -> str:
	    """Wrap every whole-word occurrence of the given terms in ``<b>`` tags.

	    Matching is case-insensitive and the matched text keeps its original
	    casing. Terms are treated as literal text, not as regular expressions.

	    Args:
	        paragraph: Text to annotate.
	        highlight_terms: Terms to highlight. Empty terms are ignored.

	    Returns:
	        ``paragraph`` with each match replaced by ``<b>match</b>``; the
	        input is returned unchanged when there is nothing to highlight.
	    """
	    # Drop empty terms (they would match at every position) and duplicates.
	    # Longest terms come first so that "new york" wins over "new".
	    terms = sorted({term for term in highlight_terms if term},
	                   key=lambda term: (-len(term), term))
	    if not terms:
	        return paragraph

	    # A single pass over one combined pattern guarantees that a later term
	    # can never match inside the <b> markup inserted for an earlier one.
	    # The lookarounds behave like \b for terms that start and end with a
	    # word character, and still work for terms such as "c++".
	    alternation = "|".join(re.escape(term) for term in terms)
	    pattern = re.compile(rf"(?<!\w)(?:{alternation})(?!\w)", flags=re.I)
	    return pattern.sub(lambda match: f"<b>{match.group(0)}</b>", paragraph)

	def process_results(hits: dict, highlight_terms: list) -> str:
	    """Render search hits as an HTML fragment, one ``<div>`` per hit.

	    Args:
	        hits: Dict with the parallel lists ``'doc_ids'``, ``'score'`` and
	            ``'docs'`` and the scalar ``'lang'``.
	        highlight_terms: Terms to emphasise inside each document's text.

	    Returns:
	        The per-hit HTML blocks joined by a single space; an empty string
	        when there are no hits.
	    """
	    # Document text and ids come from the index and are embedded in HTML,
	    # so they are escaped first. The terms are escaped the same way so they
	    # still match the escaped text.
	    # NOTE(review): a term equal to an entity name (e.g. "amp") could match
	    # inside an escaped entity - confirm whether that matters to callers.
	    safe_terms = [html.escape(term) for term in highlight_terms]
	    lang = hits['lang']

	    hit_list = []
	    rows = zip(hits['doc_ids'], hits['score'], hits['docs'])
	    for rank, (doc_id, score, text) in enumerate(rows, start=1):
	        body = highlight_string(html.escape(text), safe_terms)
	        # "<strong>" replaces the former "<string>", which is not an HTML
	        # element and therefore had no effect on rendering.
	        hit_list.append(f"""
	<div class='searchresult'>
	<h2>{rank}. Document ID: {html.escape(str(doc_id))}</h2>
	<p>Language: <strong>{lang}</strong>, Score: {round(score, 2)}</p>
	<p>{body}</p>
	</div>
	<hr>
	""")
	    return " ".join(hit_list)



	# Run a search when the sidebar button is pressed and render the hits.
	if st.sidebar.button("Search"):
	    # search_time is returned by search() but is not displayed.
	    hits, search_time = search(query, language, max_results)
	    html_results = process_results(hits, [])

	    # Report the number of hits actually returned; the slider value
	    # max_results is only an upper bound on that number.
	    num_hits = len(hits["doc_ids"])
	    rendered_results = f"""
	<div id="searchresultsarea">
	<br>
	<p id="searchresultsnumber">About {num_hits} results</p>
	{html_results}
	</div>
	"""

	    # Stylesheets (Bootstrap, Font Awesome) and the section heading are
	    # injected into the main page as raw HTML.
	    st.markdown("""
	<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
	integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
	""",
	                unsafe_allow_html=True)
	    st.markdown(
	        """
	<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
	""",
	        unsafe_allow_html=True)
	    st.markdown(
	        """
	<div class="row no-gutters mt-3 align-items-center">
	<h2> Search Results </h2>
	</div>
	""",
	        unsafe_allow_html=True)

	    # The result list is rendered through components.html together with
	    # its own CSS and the dark-mode toggle script.
	    components.html(
	        """
	<style>
	#searchresultsarea {
	font-family: 'Arial';
	}

	#searchresultsnumber {
	font-size: 0.8rem;
	color: gray;
	}

	.searchresult h2 {
	font-size: 19px;
	line-height: 18px;
	font-weight: normal;
	color: rgb(7, 111, 222);
	margin-bottom: 0px;
	margin-top: 25px;
	}

	.searchresult a {
	font-size: 12px;
	line-height: 12px;
	color: green;
	margin-bottom: 0px;
	}

	.dark-mode {
	color: white;
	}
	</style>
	<script>
	function load_image(id){
	console.log(id)
	var x = document.getElementById(id);
	console.log(x)
	if (x.style.display === "none") {
	x.style.display = "block";
	} else {
	x.style.display = "none";
	}
	};
	function myFunction() {
	var element = document.body;
	element.classList.toggle("dark-mode");
	}
	</script>
	<button onclick="myFunction()">Toggle dark mode</button>
	""" + rendered_results, height=800, scrolling=True
	    )