"""Load the Top2Vec model, UMAP model and CSV data into Streamlit session state."""
from logging import getLogger
from pathlib import Path

import joblib
import pandas as pd
import streamlit as st
from top2vec import Top2Vec

logger = getLogger(__name__)

# Directory one level above the folder containing this file; the `models/`
# and `data/` folders are resolved against it.
proj_dir = Path(__file__).parents[1]

# Number of topics requested from the hierarchical reduction; also caps the
# topic-id -> word lookup built from topics.csv.
NUM_TOPICS = 20


def _format_topic_id(topic_id):
    """Return *topic_id* as a zero-padded two-digit string (e.g. 3 -> '03')."""
    return f'{topic_id:02d}'


def _build_topic_lookup(topics, limit=NUM_TOPICS):
    """Map each formatted topic id to its `topic_0` value.

    Only the first *limit* rows of *topics* are used. Rows are paired
    positionally, so the result does not depend on the frame's index labels
    and a frame with fewer than *limit* rows simply yields a shorter mapping.
    """
    head = topics.head(limit)
    return dict(zip(head['topic_id'], head['topic_0']))


def initialization():
    """Populate ``st.session_state`` with everything the app needs.

    Each entry is loaded only when its key is missing from the session state,
    so calling this function again does not reload it. Keys written:

    * ``model`` / ``umap_model`` -- Top2Vec model (reduced to ``NUM_TOPICS``
      topics) and the joblib-serialized object stored in ``models/umap.sav``.
    * ``data`` / ``selected_data`` / ``all_topics`` -- contents of
      ``data/data.csv`` with ``topic_id`` zero-padded, the same frame as the
      initial selection, and the list of unique topic ids.
    * ``topics`` / ``topic_str_to_word`` -- contents of ``data/topics.csv``
      and the topic-id -> ``topic_0`` lookup.
    * ``selected_points`` -- initialised to an empty list.
    """
    with st.spinner("Loading app..."):
        if 'model' not in st.session_state:
            logger.info("loading model...")
            # Resolve against proj_dir like every other artifact so loading
            # does not depend on the current working directory.
            model = Top2Vec.load(str(proj_dir / 'models' / 'model.pkl'))
            # Private Top2Vec helper; presumably ensures the embedding model
            # is ready before use -- TODO confirm against the top2vec source.
            model._check_model_status()
            model.hierarchical_topic_reduction(num_topics=NUM_TOPICS)
            st.session_state.model = model
            st.session_state.umap_model = joblib.load(
                proj_dir / 'models' / 'umap.sav'
            )

        if 'data' not in st.session_state:
            logger.info("loading data...")
            data = pd.read_csv(proj_dir / 'data' / 'data.csv')
            data['topic_id'] = data['topic_id'].apply(_format_topic_id)
            st.session_state.data = data
            # Initially the whole data set is selected.
            st.session_state.selected_data = data
            st.session_state.all_topics = list(data.topic_id.unique())

        if 'topics' not in st.session_state:
            logger.info("loading topics...")
            topics = pd.read_csv(proj_dir / 'data' / 'topics.csv')
            topics['topic_id'] = topics['topic_id'].apply(_format_topic_id)
            st.session_state.topics = topics
            st.session_state.topic_str_to_word = _build_topic_lookup(topics)

        if 'selected_points' not in st.session_state:
            st.session_state.selected_points = []