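"""Streamlit app for multilingual vector search over PIB press releases.

Queries are embedded with a Sentence Transformers model and matched against a
pre-built Faiss index; results can be filtered by language in the sidebar.

Run with: streamlit run <this file>
"""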
import faiss
import pickle
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer
from vector_engine.utils import vector_search

@st.cache_data
def read_data(pibdata="pib2022_23_cleaned_abs.csv"):
    """Read the pib data."""
    return pd.read_csv(pibdata)


@st.cache_resource
def load_bert_model(name="pushpdeep/sbertmsmarco-en_to_indic_ur-murilv1"):
    """Instantiate a sentence-level DistilBERT model."""
    return SentenceTransformer(name)


# Use cache_resource (not cache_data) so the Faiss index is kept as a shared
# resource instead of being copied/pickled on every rerun.
@st.cache_resource
def load_faiss_index(path_to_faiss="models/faiss_index_ip.pickle"):
    """Load and deserialize the Faiss index."""
    with open(path_to_faiss, "rb") as h:
        data = pickle.load(h)
    return faiss.deserialize_index(data)

def main():
    # Load data and models
    data = read_data()
    model = load_bert_model()
    faiss_index = load_faiss_index()

    st.title("Vector-based search with Sentence Transformers and Faiss")

    # User search
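    # Default query is Hindi for "Indian Ocean climate change"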
    user_input = st.text_area("Search box", "हिंद महासागर जलवायु परिवर्तन")

    # Filters
    st.sidebar.markdown("**Filters**")
    # List of available languages
    languages = [
        'English', 'Urdu', 'Hindi', 'Bengali', 'Marathi', 'Telugu', 'Tamil',
        'Gujarati', 'Kannada', 'Odia', 'Malayalam', 'Panjabi', 'Assamese',
    ]

    num_results = st.sidebar.slider("Number of search results", 10, 50, 10)

    # Multiselect for choosing languages
    selected_languages = st.sidebar.multiselect('Select languages for search results', languages)


    # Fetch results
    if user_input:
        # Get matching press-release IDs for the query
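        # vector_search (from vector_engine.utils) is assumed to encode the query
        # with the model and search the Faiss index, returning distances (D) and ids (I).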
        D, I = vector_search([user_input], model, faiss_index, num_results)

        # Filter data by selected languages (if any)
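        # Note: filtering is applied after retrieval, so fewer than num_results
        # hits may be displayed when languages are selected.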
        if selected_languages:
            frame = data[data['language'].isin(selected_languages)]
        else:
            frame = data

        # Get individual results
        valid_ids = set(frame.rid)
        for rank, id_ in enumerate(I.flatten().tolist(), start=1):
            if id_ not in valid_ids:
                continue
            f = frame[frame.rid == id_]
            st.write(
                f"""
            **Rank**: {rank}  
            **Language**: {f.iloc[0].language}  
            **Article**: {f.iloc[0].abstract} https://pib.gov.in/PressReleasePage.aspx?PRID={f.iloc[0].rid}
            """
            )


if __name__ == "__main__":
    main()