# eduardofv's picture
# migrated from test space
# feaaa7e
import streamlit as st
import torch
import sentence_transformers as sent
import datasets as ds
@st.cache(allow_output_mutation=True)
def load_wikipedia_dataset():
    """Load the Simple English Wikipedia dump and its article titles once.

    Streamlit re-executes this script from the top on every widget
    interaction, so the dataset lookup and the extraction of the whole
    ``"title"`` column are cached here instead of being repeated on each
    rerun. ``allow_output_mutation=True`` keeps Streamlit from hashing the
    returned objects on every call.

    Returns:
        A ``(dataset, titles)`` tuple: the object returned by
        ``ds.load_dataset`` and the ``"title"`` column of its ``"train"``
        split.
    """
    dataset = ds.load_dataset("wikipedia", "20220301.simple")
    return dataset, dataset["train"]["title"]


# Module-level names are kept as before: ``d`` is the full dataset, ``t`` its
# "train" split and ``titles`` the title column of that split.
d, titles = load_wikipedia_dataset()
t = d["train"]
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the sentence-embedding model used to encode search queries.

    Decorated with ``st.cache`` so the model object is reused rather than
    constructed again on every call.
    """
    # Alternative checkpoint previously noted here: "all-MiniLM-L6-v2".
    checkpoint = "distiluse-base-multilingual-cased-v1"
    encoder = sent.SentenceTransformer(checkpoint)
    return encoder
@st.cache(allow_output_mutation=True)
def load_wikipedia_embeddings():
    """Load the stored embeddings from ``titles-simple-0.pt`` onto the CPU.

    The result is used elsewhere in this script as the corpus for
    ``semantic_search``.
    NOTE(review): presumably one embedding per article title, in the same
    order as the dataset rows -- confirm against the script that produced
    the file.

    ``allow_output_mutation=True`` matches ``load_model`` and stops Streamlit
    from hashing the returned tensor on every call to detect mutations.

    Returns:
        Whatever object was serialized in ``titles-simple-0.pt``, with its
        tensors mapped to the CPU.
    """
    # NOTE(review): torch.load unpickles the file, so it must only ever point
    # at a trusted file shipped with the app.
    return torch.load("titles-simple-0.pt", map_location=torch.device("cpu"))
# Page heading followed by the introductory help text, rendered as Markdown.
st.title("Multilingual Semantic Search for Wikipedia Simple English")
# Blank lines separate each bullet list from the surrounding paragraphs;
# without them Markdown folds the following paragraph into the last list item.
st.markdown("""
Use semantic search to find related articles in Wikipedia Simple English: using a language model (sentence-transformers/distiluse-base-multilingual-cased-v1) we can find the closest titles from Wikipedia Simple English (wikipedia) queried in any of the model's trained languages: Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Portuguese, Russian, Spanish, Turkish:

- colesterol
- développement humain
- Crise dos mísseis de Cuba

Also, "near natural language" queries are usually enough to bring up relevant results. Try:

- ¿cuál es el edificio más alto del mundo?
- comment préparer du poulet frit
- melhores películas de pixar

(note: search is done only on the article titles, not the content)
""")
# Both loaders are wrapped in st.cache, so script reruns reuse their results.
model = load_model()
embeddings = load_wikipedia_embeddings()

_TOP_K = 5  # number of results requested per query
_PREVIEW_CHARS = 500  # maximum number of article characters shown per result

# Sample queries for manual testing:
# "Aristoteles", "Autismo", "Mental", "crecimiento poblacional"
query = st.text_input("Query (es, fr, pt, ...)")

# Skip the search for empty or whitespace-only input.
cleaned_query = query.strip()
if cleaned_query:
    # Kept as a list so several queries could be encoded and searched at once.
    queries = [cleaned_query]
    queries_emb = model.encode(queries, convert_to_tensor=True)
    hits = sent.util.semantic_search(queries_emb, embeddings, top_k=_TOP_K)

    # Output goes through explicit st.write calls rather than bare
    # expressions, so it does not depend on Streamlit's "magic" being enabled.
    for q, query_hits in zip(queries, hits):
        st.write(f"----\n{q}:\n")
        for hit in query_hits:
            # Fetch the dataset row once instead of once per field.
            row = t[hit["corpus_id"]]

            # Append the ellipsis only when the text was actually truncated.
            text = row["text"]
            if len(text) > _PREVIEW_CHARS:
                text = text[:_PREVIEW_CHARS] + "..."

            st.header(row["title"])
            st.write(row["url"])
            st.write(text)
            # Raw hit dict (includes "corpus_id"), shown as returned by
            # semantic_search.
            st.write(hit)