|
import faiss |
|
import pickle |
|
import pandas as pd |
|
import streamlit as st |
|
from sentence_transformers import SentenceTransformer |
|
from vector_engine.utils import vector_search |
|
|
|
@st.cache_data
def read_data(pibdata="pib2022_23_cleaned_abs.csv"):
    """Load the PIB CSV file into a DataFrame.

    The result is cached through ``st.cache_data``.

    Args:
        pibdata: Path of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` parsed from ``pibdata``.
    """
    records = pd.read_csv(pibdata)
    return records
|
|
|
|
|
@st.cache_resource
def load_bert_model(name="pushpdeep/sbertmsmarco-en_to_indic_ur-murilv1"):
    """Build the sentence-embedding model identified by ``name``.

    The instance is cached through ``st.cache_resource``.

    Args:
        name: Model name or path handed to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    encoder = SentenceTransformer(name)
    return encoder
|
|
|
|
|
@st.cache_resource
def load_faiss_index(path_to_faiss="models/faiss_index_ip.pickle"):
    """Load and deserialize the Faiss index stored at ``path_to_faiss``.

    The file is unpickled and the resulting object is passed to
    ``faiss.deserialize_index``.

    The index is cached with ``st.cache_resource`` (like the model) rather
    than ``st.cache_data``: ``cache_data`` pickles the return value and hands
    back a fresh copy on every call, which is meant for serializable data,
    not for a large native index object that should be created once and
    shared.

    Args:
        path_to_faiss: Path of the pickle file holding the serialized index.

    Returns:
        The index object produced by ``faiss.deserialize_index``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # this must only ever be pointed at a trusted, locally produced index.
    with open(path_to_faiss, "rb") as handle:
        serialized = pickle.load(handle)
    return faiss.deserialize_index(serialized)
|
|
|
def main():
    """Render the search page and list the hits for the current query.

    Loads the data, the embedding model and the Faiss index, draws the query
    box and the sidebar filters, then runs ``vector_search`` and writes one
    entry per returned id whose ``rid`` is present in the (optionally
    language-filtered) data. The displayed rank is the position of the hit
    in the unfiltered search result, so ranks may have gaps when a language
    filter is active.
    """
    data = read_data()
    model = load_bert_model()
    faiss_index = load_faiss_index()

    st.title("Vector-based search with Sentence Transformers and Faiss")

    user_input = st.text_area("Search box", "हिंद महासागर जलवायु परिवर्तन")

    st.sidebar.markdown("**Filters**")
    languages = [
        'English', 'Urdu', 'Hindi', 'Bengali', 'Marathi', 'Telugu', 'Tamil',
        'Gujarati', 'Kannada', 'Odia', 'Malayalam', 'Panjabi', 'Assamese',
    ]
    num_results = st.sidebar.slider("Number of search results", 10, 50, 10)
    selected_languages = st.sidebar.multiselect(
        'Select languages for search results', languages
    )

    # Nothing to search for when the box is empty or only whitespace.
    if not user_input or not user_input.strip():
        return

    # Only the second returned value (the ids) is used below.
    _, ids = vector_search([user_input], model, faiss_index, num_results)

    if selected_languages:
        frame = data[data['language'].isin(selected_languages)]
    else:
        frame = data

    # Build the id set once; rebuilding it inside the loop would rescan the
    # whole column for every search hit.
    known_ids = set(frame.rid)

    for rank, id_ in enumerate(ids.flatten().tolist(), start=1):
        if id_ not in known_ids:
            continue
        row = frame[frame.rid == id_].iloc[0]
        # Fields are joined with blank lines so each renders as its own
        # markdown paragraph, independent of this code's indentation.
        st.write(
            "\n\n".join(
                (
                    f"**Rank**: {rank}",
                    f"**Language**: {row.language}",
                    f"**Article**: {row.abstract} "
                    f"https://pib.gov.in/PressReleasePage.aspx?PRID={row.rid}",
                )
            )
        )
|
|
|
|
|
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()