File size: 2,413 Bytes
70a4e1e f3ca77b 70a4e1e 0c81fea aebeb19 0c81fea aebeb19 0c81fea ea6b7df 0c81fea 70a4e1e ea6b7df 70a4e1e 2ba7cb6 70a4e1e 2ba7cb6 70a4e1e 2ba7cb6 70a4e1e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import faiss
import pickle
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from vector_engine.utils import vector_search
@st.cache_data
def read_data(pibdata="pib2022_23_cleaned_abs.csv"):
    """Load the PIB dataset from a CSV file.

    Args:
        pibdata: Path of the CSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``. The function is
        decorated with ``st.cache_data``, so Streamlit caches the result.
    """
    frame = pd.read_csv(pibdata)
    return frame
@st.cache_resource
def load_bert_model(name="pushpdeep/sbertmsmarco-en_to_indic_ur-murilv1"):
    """Create the sentence-embedding model.

    Args:
        name: Model identifier (or path) handed to ``SentenceTransformer``.

    Returns:
        The ``SentenceTransformer`` instance. The function is decorated with
        ``st.cache_resource``, so Streamlit caches the loaded model.
    """
    # NOTE(review): the previous docstring called this a DistilBERT model; the
    # default checkpoint name hints at a different (MuRIL-based?) model -- confirm.
    encoder = SentenceTransformer(name)
    return encoder
@st.cache_resource
def load_faiss_index(path_to_faiss="models/faiss_index_ip.pickle"):
    """Load the pickled Faiss index from disk and deserialize it.

    The index is cached with ``st.cache_resource`` rather than
    ``st.cache_data``: ``cache_data`` serializes the return value and hands
    out a fresh copy on every call, which is wasteful for an index object and
    depends on the index being picklable. ``cache_resource`` keeps a single
    shared instance, the same way the embedding model is cached.

    Args:
        path_to_faiss: Path of the pickle file that holds the serialized index.

    Returns:
        The index object returned by ``faiss.deserialize_index``.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # NOTE: pickle.load executes arbitrary code embedded in the file, so this
    # path must only ever point at a trusted, locally produced artifact.
    with open(path_to_faiss, "rb") as handle:
        serialized = pickle.load(handle)
    return faiss.deserialize_index(serialized)
def main():
    """Render the search page and display the matches for the current query.

    Loads the dataset, the embedding model and the Faiss index, draws the
    search box and the sidebar filters, runs ``vector_search`` on the query
    and writes one entry per hit that survives the language filter.

    The displayed rank is the hit's position in the full result list, so
    ranks may have gaps when some hits are filtered out by language. Hits
    whose id is not present in the (filtered) data are skipped.
    """
    # Load data and models
    data = read_data()
    model = load_bert_model()
    faiss_index = load_faiss_index()

    st.title("Vector-based search with Sentence Transformers and Faiss")

    # User search
    user_input = st.text_area("Search box", "हिंद महासागर जलवायु परिवर्तन")

    # Filters
    st.sidebar.markdown("**Filters**")
    # List of available languages
    languages = ['English', 'Urdu','Hindi', 'Bengali', 'Marathi', 'Telugu', 'Tamil', 'Gujarati', 'Kannada', 'Odia', 'Malayalam', 'Panjabi', 'Assamese']
    num_results = st.sidebar.slider("Number of search results", 10, 50, 10)
    # Multiselect for choosing languages
    selected_languages = st.sidebar.multiselect('Select languages for search results', languages)

    # Nothing to search for: leave the page with just the widgets.
    if not user_input:
        return

    # vector_search is assumed to return (distances, ids); only the ids are
    # used here -- TODO confirm against vector_engine.utils.
    _distances, ids = vector_search([user_input], model, faiss_index, num_results)

    # Filter data by selected languages (if any)
    if selected_languages:
        frame = data[data['language'].isin(selected_languages)]
    else:
        frame = data

    # Build the id lookup once instead of once per hit.
    known_ids = set(frame.rid)

    # Ranks start at 1 and count every returned hit, including skipped ones.
    for rank, id_ in enumerate(ids.flatten().tolist(), start=1):
        if id_ not in known_ids:
            continue
        # First row carrying this id.
        row = frame[frame.rid == id_].iloc[0]
        # The literal is kept flush-left so the emitted text is unchanged.
        st.write(
            f"""
**Rank**: {rank}
**Language**: {row.language}
**Article**: {row.abstract} https://pib.gov.in/PressReleasePage.aspx?PRID={row.rid}
"""
        )
# Run the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()