"""Streamlit app: multilingual vector search over PIB press releases.

Embeds the user's query with a Sentence Transformers model and retrieves
nearest neighbours from a pre-built Faiss inner-product index, filtered by
the languages selected in the sidebar.
"""

import pickle

import faiss
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer

from vector_engine.utils import vector_search


@st.cache_data
def read_data(pibdata="pib2022_23_cleaned_abs.csv"):
    """Read the pib data."""
    return pd.read_csv(pibdata)


@st.cache_resource
def load_bert_model(name="pushpdeep/sbertmsmarco-en_to_indic_ur-murilv1"):
    """Instantiate a sentence-level DistilBERT model."""
    return SentenceTransformer(name)


@st.cache_data
def load_faiss_index(path_to_faiss="models/faiss_index_ip.pickle"):
    """Load and deserialize the Faiss index.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file —
    only load an index artifact produced by a trusted build step.
    """
    with open(path_to_faiss, "rb") as h:
        data = pickle.load(h)
    return faiss.deserialize_index(data)


def main():
    """Render the search UI, run the vector search, and display results."""
    # Load data and models (all three loaders are cached by Streamlit,
    # so reruns triggered by widget interaction are cheap).
    data = read_data()
    model = load_bert_model()
    faiss_index = load_faiss_index()

    st.title("Vector-based search with Sentence Transformers and Faiss")

    # User search
    user_input = st.text_area("Search box", "हिंद महासागर जलवायु परिवर्तन")

    # Filters
    st.sidebar.markdown("**Filters**")
    # List of available languages
    languages = [
        'English', 'Urdu', 'Hindi', 'Bengali', 'Marathi', 'Telugu', 'Tamil',
        'Gujarati', 'Kannada', 'Odia', 'Malayalam', 'Panjabi', 'Assamese',
    ]
    num_results = st.sidebar.slider("Number of search results", 10, 50, 10)
    # Multiselect for choosing languages
    selected_languages = st.sidebar.multiselect(
        'Select languages for search results', languages
    )

    # Nothing to search for yet — render the empty page and stop.
    if not user_input:
        return

    # Get result ids (I) and distances (D) from the vector index.
    D, I = vector_search([user_input], model, faiss_index, num_results)

    # Filter data by selected languages (if any)
    if selected_languages:
        frame = data[data['language'].isin(selected_languages)]
    else:
        frame = data

    # Hoisted out of the loop: the original rebuilt set(frame.rid) for
    # every result id, an accidental O(n) per iteration.
    known_ids = set(frame.rid)

    # Get individual results.  The rank counter deliberately advances even
    # for ids filtered out by language, matching the original behavior of
    # reporting the id's true position in the Faiss result list.
    rank = 0
    for id_ in I.flatten().tolist():
        rank += 1
        if id_ not in known_ids:
            continue
        f = frame[frame.rid == id_]
        st.write(
            f"""
**Rank**: {rank}

**Language**: {f.iloc[0].language}

**Article**: {f.iloc[0].abstract}

https://pib.gov.in/PressReleasePage.aspx?PRID={f.iloc[0].rid}
"""
        )


if __name__ == "__main__":
    main()