File size: 1,664 Bytes
d5f15cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356174d
 
 
d5f15cb
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from logging import getLogger
from pathlib import Path

import joblib
import pandas as pd
import streamlit as st
from top2vec import Top2Vec

# Module-level logger, named after this module.
logger = getLogger(__name__)

# Parent of the directory that contains this file; the data/ and models/
# paths used below are resolved against it.
proj_dir = Path(__file__).parents[1]


def initialization():
    """Populate ``st.session_state`` with the resources the app needs.

    Every resource is loaded only when its key is missing from the session
    state, so repeated calls reuse whatever was loaded before. All files are
    resolved relative to ``proj_dir`` so the result does not depend on the
    current working directory.

    Keys written:

    * ``model`` -- the Top2Vec model from ``models/model.pkl``, after
      hierarchical topic reduction to ``num_topics`` topics.
    * ``umap_model`` -- the object joblib-loaded from ``models/umap.sav``.
    * ``data`` / ``selected_data`` -- the frame read from ``data/data.csv``
      with ``topic_id`` formatted as a zero-padded two-digit string (both
      keys refer to the same DataFrame object).
    * ``all_topics`` -- the unique ``topic_id`` values of ``data``.
    * ``topics`` -- the frame read from ``data/topics.csv`` with the same
      ``topic_id`` formatting.
    * ``topic_str_to_word`` -- mapping ``topic_id`` -> ``topic_0`` for the
      first ``num_topics`` rows of ``topics``.
    * ``selected_points`` -- an empty list.

    Errors raised while reading the files (e.g. a missing file) are not
    caught here and propagate to the caller.
    """
    # Single source of truth for the number of reduced topics; it is used both
    # for the model reduction and for the topic-id -> word lookup.
    num_topics = 20
    zero_pad = '{:02d}'.format

    with st.spinner("Loading app..."):
        state = st.session_state

        if 'model' not in state:
            logger.info("loading model...")
            models_dir = proj_dir / 'models'
            # Resolve against proj_dir (like every other artefact) instead of
            # relying on the process's current working directory.
            model = Top2Vec.load(str(models_dir / 'model.pkl'))
            # NOTE(review): private Top2Vec hook; presumably makes sure the
            # embedding model is available after unpickling -- confirm.
            model._check_model_status()
            model.hierarchical_topic_reduction(num_topics=num_topics)

            state.model = model
            state.umap_model = joblib.load(models_dir / 'umap.sav')

        if 'data' not in state:
            logger.info("loading data...")
            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
            data['topic_id'] = data['topic_id'].apply(zero_pad)
            state.data = data
            state.selected_data = data
            state.all_topics = list(data.topic_id.unique())

        if 'topics' not in state:
            logger.info("loading topics...")
            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
            topics['topic_id'] = topics['topic_id'].apply(zero_pad)
            state.topics = topics
            # Pair the columns row by row for at most num_topics rows; this
            # no longer fails when the file holds fewer rows than that.
            leading = topics.head(num_topics)
            state.topic_str_to_word = dict(
                zip(leading['topic_id'], leading['topic_0'])
            )

        if 'selected_points' not in state:
            state.selected_points = []