import streamlit as st
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
from huggingface_hub import hf_hub_download
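# Two-stage retrieve-and-rerank pipeline:
#   1. a bi-encoder retrieves the top-k Wikipedia first paragraphs by embedding
#      similarity against a precomputed corpus embedding,
#   2. a cross-encoder re-scores each (query, passage) pair to produce the
#      final ranking shown to the user.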
embedding_path = "abokbot/wikipedia-embedding"
st.header("Wikipedia Search Engine app")
st_model_load = st.text('Loading wikipedia embedding...')
@st.cache_resource
def load_embedding():
    """Download the precomputed Wikipedia embedding and load it on CPU."""
    print("Loading embedding...")
    path = hf_hub_download(repo_id=embedding_path, filename="simple_wikipedia_embedding.pt")
    wikipedia_embedding = torch.load(path, map_location=torch.device('cpu'))
    print("Embedding loaded!")
    return wikipedia_embedding
wikipedia_embedding = load_embedding()
st.success('Embedding loaded!')
st_model_load.text("")
@st.cache_resource
def load_encoders():
    print("Loading encoders...")
    # Bi-encoder: embeds the query for fast nearest-neighbour retrieval
    bi_encoder = SentenceTransformer('msmarco-MiniLM-L-6-v3')
    bi_encoder.max_seq_length = 256  # Truncate long passages to 256 tokens
    # Cross-encoder: scores (query, passage) pairs for re-ranking
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2-v2')
    return bi_encoder, cross_encoder
bi_encoder, cross_encoder = load_encoders()
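# Note: the precomputed corpus embedding must have been produced by the same
# bi-encoder ('msmarco-MiniLM-L-6-v3' here); otherwise query and corpus vectors
# live in different spaces and the similarity scores are meaningless.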
st.success('Encoders loaded!')
st_model_load.text("")
@st.cache_resource
def load_wikipedia_dataset():
    print("Loading wikipedia dataset...")
    dataset = load_dataset("abokbot/wikipedia-first-paragraph")["train"]
    return dataset
dataset = load_wikipedia_dataset()
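# The corpus embedding is assumed to be row-aligned with this dataset:
# hit['corpus_id'] returned by semantic_search below is a row index into
# wikipedia_embedding and is used to look up title/text/url in the dataset.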
st.success('Dataset loaded!')
st_model_load.text("")
if 'text' not in st.session_state:
    st.session_state.text = ""
st_text_area = st.text_area(
    'Enter query (e.g. "What is the capital city of Kenya?" or "Number of deputies in the French parliament")',
    value=st.session_state.text,
    height=100
)
def search():
    st.session_state.text = st_text_area
    query = st_text_area

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and retrieve potentially relevant passages
    top_k = 32
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, wikipedia_embedding, top_k=top_k)
    hits = hits[0]  # semantic_search handles a batch of queries; keep the hits for our single query

    ##### Re-Ranking #####
    # Score all retrieved (query, passage) pairs with the cross-encoder
    cross_inp = [[query, dataset["text"][hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Attach the cross-encoder score to each hit
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Sort by cross-encoder score and render the top-3 hits in the app
    # (the original prints only reached the server console, not the UI)
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    st.write("Top-3 Cross-Encoder re-ranker hits")
    for hit in hits[0:3]:
        abstract = dataset["text"][hit['corpus_id']].replace("\n", " ")
        st.markdown(
            f"**score:** {round(float(hit['cross-score']), 3)}  \n"
            f"**title:** {dataset['title'][hit['corpus_id']]}  \n"
            f"**abstract:** {abstract}  \n"
            f"**link:** {dataset['url'][hit['corpus_id']]}"
        )
# search button
st_search_button = st.button('Search', on_click=search)
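# To try it locally (assuming this file is saved as app.py):
#   streamlit run app.py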