File size: 6,149 Bytes
ea72d75
 
 
74ce942
 
ea72d75
 
74ce942
d5f15cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b64c266
74ce942
ea72d75
74ce942
 
 
 
 
 
 
ea72d75
 
 
74ce942
ea72d75
 
 
 
 
 
74ce942
 
 
 
 
 
ea72d75
74ce942
 
 
 
 
 
 
 
 
 
 
 
 
ea72d75
74ce942
 
ea72d75
 
74ce942
 
 
 
 
 
356174d
 
176bc83
 
74ce942
ea72d75
 
74ce942
176bc83
ea72d75
74ce942
 
 
 
 
 
 
 
ea72d75
 
74ce942
 
 
 
 
 
 
 
 
 
 
d5f15cb
74ce942
 
 
 
d5f15cb
ea72d75
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from logging import getLogger
from pathlib import Path

import pandas as pd
import plotly.express as px
import streamlit as st
from st_aggrid import AgGrid, ColumnsAutoSizeMode, GridOptionsBuilder

from utilities import initialization

initialization()


# @st.cache(show_spinner=False)
# def initialize_state():
#     with st.spinner("Loading app..."):
#         if 'model' not in st.session_state:
#             model = Top2Vec.load('models/model.pkl')
#             model._check_model_status()
#             model.hierarchical_topic_reduction(num_topics=20)
#
#             st.session_state.model = model
#             st.session_state.umap_model = joblib.load(proj_dir / 'models' / 'umap.sav')
#             logger.info("loading data...")
#
#         if 'data' not in st.session_state:
#             logger.info("loading data...")
#             data = pd.read_csv(proj_dir / 'data' / 'data.csv')
#             data['topic_id'] = data['topic_id'].apply(lambda x: f'{x:02d}')
#             st.session_state.data = data
#             st.session_state.selected_data = data
#             st.session_state.all_topics = list(data.topic_id.unique())
#
#         if 'topics' not in st.session_state:
#             logger.info("loading topics...")
#             topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
#             topics['topic_id'] = topics['topic_id'].apply(lambda x: f'{x:02d}')
#             st.session_state.topics = topics
#
#             st.session_state.selected_points = []


def _topic_counts(documents: pd.DataFrame, topics: pd.DataFrame) -> pd.DataFrame:
    """Count documents per topic and attach the topic metadata.

    Args:
        documents: Frame with a ``topic_id`` column, one row per document.
        topics: Topic table with a ``topic_id`` column to join on.

    Returns:
        One row per topic present in both frames, ordered by descending
        document count. ``topic_id`` comes first, ``topic_count`` second and
        the remaining columns of ``topics`` follow.
    """
    # Name the index and the counts explicitly rather than relying on the
    # labels ``value_counts`` chooses: pandas 2.0 renamed the result to
    # "count", which broke merging on suffixed ``topic_id_x``/``topic_id_y``
    # columns.
    counts = (
        documents["topic_id"]
        .value_counts()
        .rename_axis("topic_id")
        .reset_index(name="topic_count")
    )
    return counts.merge(topics, on="topic_id")


def main():
    """Render the semantic-search page.

    Reads ``model``, ``umap_model``, ``data``, ``topics`` and
    ``topic_str_to_word`` from ``st.session_state``. The query typed by the
    user is embedded, projected with the UMAP model and used to retrieve the
    closest documents, which are then shown in a scatter plot and in two
    tables (documents and topic counts).

    Writes ``search_raw_df``, ``data_to_model``, ``data_to_model_with_point``
    and ``data_to_model_without_point`` back to ``st.session_state``.
    """
    state = st.session_state

    max_docs = st.sidebar.slider("# docs", 10, 100, value=50)
    to_search = st.text_input("Write your query here", "") or ""
    with st.spinner('Embedding Query...'):
        vector = state.model.embed([to_search])
    with st.spinner('Dimension Reduction...'):
        point = state.umap_model.transform(vector.reshape(1, -1))

    # The raw document texts returned by the search are not needed here; the
    # ids are joined against the already loaded data instead.
    _, document_scores, document_ids = state.model.search_documents_by_vector(
        vector.flatten(), num_docs=max_docs)
    state.search_raw_df = pd.DataFrame({'document_ids': document_ids, 'document_scores': document_scores})

    ranked = (
        state.data
        .merge(state.search_raw_df, left_on='id', right_on='document_ids')
        .drop(['document_ids'], axis=1)
        .sort_values(by='document_scores', ascending=False)
    )
    # Append the query itself as one extra row under a fresh index label.
    # NOTE(review): the values are positional and assume the column order
    # id, x, y, documents, topic_id, document_scores - confirm against data.csv.
    ranked.loc[len(ranked.index)] = ['Point', *point[0].tolist(), to_search, 'Query', 0]
    state.data_to_model = ranked
    state.data_to_model_with_point = ranked
    state.data_to_model_without_point = ranked.iloc[:-1]

    st.write(""" 
    # Semantic Search
    This shows a 2d representation of documents embedded in a semantic space. Each dot is a document
    and the dots close represent documents that are close in meaning. 

    Note that the distance metrics were computed at a higher dimension so take the representation with
    a grain of salt.

    The Query is shown with the documents in yellow.
            """
             )

    # Plot documents and query from their own frames so the result does not
    # depend on the 'Query' label happening to sort after every topic id.
    # Sorting by topic keeps the legend ordered, see
    # https://bioinformatics.stackexchange.com/a/18847
    docs_df = state.data_to_model_without_point.sort_values(by='topic_id', ascending=True)
    query_df = state.data_to_model_with_point.tail(1)
    fig = px.scatter(docs_df, x='x', y='y', color='topic_id', template='plotly_dark',
                     hover_data=['id', 'topic_id', 'x', 'y'])
    fig.add_traces(px.scatter(query_df, x="x", y="y").update_traces(marker_size=10, marker_color="yellow").data)
    st.plotly_chart(fig, use_container_width=True)
    tab1, tab2 = st.tabs(["Docs", "Topics"])

    with tab1:
        cols = ['id', 'document_scores', 'topic_id', 'documents']
        # Work on an explicit copy: the frame in session state is itself a
        # slice, and adding a column to a slice triggers SettingWithCopy.
        data = state.data_to_model_without_point.loc[:, cols].copy()
        data['topic_word'] = data.topic_id.replace(state.topic_str_to_word)
        ordered_cols = ['id', 'document_scores', 'topic_id', 'topic_word', 'documents']
        builder = GridOptionsBuilder.from_dataframe(data[ordered_cols])
        builder.configure_pagination()
        builder.configure_column('document_scores', type=["numericColumn", "numberColumnFilter", "customNumericFormat"],
                                 precision=2)
        grid_options = builder.build()
        AgGrid(data[ordered_cols], theme='streamlit', gridOptions=grid_options,
               columns_auto_size_mode=ColumnsAutoSizeMode.FIT_CONTENTS)

    with tab2:
        cols = ['topic_id', 'topic_count', 'topic_0']
        topic_counts = _topic_counts(state.data_to_model_without_point, state.topics)
        builder = GridOptionsBuilder.from_dataframe(topic_counts[cols])
        builder.configure_pagination()
        builder.configure_column('topic_0', header_name='Topic Word', wrap_text=True)
        grid_options = builder.build()
        AgGrid(topic_counts.loc[:, cols], theme='streamlit', gridOptions=grid_options,
               columns_auto_size_mode=ColumnsAutoSizeMode.FIT_ALL_COLUMNS_TO_VIEW)


if __name__ == "__main__":
    # Script entry point: set up module-level helpers, draw the page title in
    # the main area and the sidebar, then render the search page.

    # Setting up Logger and proj_dir
    logger = getLogger(__name__)
    proj_dir = Path(__file__).parents[2]

    # For max width tables
    pd.set_option('display.max_colwidth', 0)

    # Streamlit settings
    # st.set_page_config(layout="wide")
    # The magnifying-glass emoji is written as an escape so the title cannot
    # be corrupted by an encoding mix-up when the file is saved or copied.
    md_title = "# Semantic Search \U0001F50D"
    st.markdown(md_title)
    st.sidebar.markdown(md_title)

    # initialize_state()
    main()