import streamlit as st import pandas as pd, numpy as np from bertopic import BERTopic from datetime import datetime import math from helper import visualize_topics_over_time, visualize_topics_per_class @st.cache_data def get_df(url): return pd.read_csv(url) @st.cache_resource def get_model(url): return BERTopic.load(url) @st.cache_data def get_topics_over_time(frame,lens): strings = frame.proc2.apply(lambda x: str(x)) date = pd.to_datetime(frame.date,format=st.session_state.datetime_format) return st.session_state.model.topics_over_time(strings, date, nr_bins=math.floor(len(frame.date.unique())/3)) @st.cache_data def get_topics_per_class(frame,colname): strings = frame.proc2.apply(lambda x: str(x)) classes = st.session_state.df[colname].apply(lambda x: str(x)) return st.session_state.model.topics_per_class(strings, classes=classes) st.set_page_config( page_title="BoardTopic", page_icon="🤖", layout="wide" ) st.header("🤖 BoardTopic") st.subheader("Turning your data into insight with behavioral data science") if "model" not in st.session_state: st.markdown("Welcome to BoardTopic, a friendly way to understand your big data.") st.markdown("If you do not have a BoardTopic model trained, please go to the 'Create Model' tab.") st.markdown("If you already have a BoardTopic model trained, please enter the information below:") model_name = st.text_input("Please enter model file name (e.g., 'model')") df_name = st.text_input("Please enter dataframe file name (e.g., 'df_small.csv')") uploaded_file2 = st.file_uploader("Choose a file") #datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="") st.session_state.datetime_format = None #if datetime_format == "" else datetime_format if uploaded_file2 is not None: st.session_state.model = get_model(f'models/{model_name}') st.session_state.df = get_df(f'models/{df_name}') st.success("Model and dataframe loaded!") if "model" in st.session_state: if "datetime_format" not in st.session_state: st.session_state.datetime_format = st.text_input("Please enter the date format (e.g., '%d.%m.%Y')", value="", key="datetime_format") st.session_state.datetime_format = None if st.session_state.datetime_format == "" else st.session_state.datetime_format #st.session_state.df = get_df("df_small.csv") st.session_state.model.set_topic_labels(st.session_state.model.generate_topic_labels(nr_words=6, topic_prefix=False, word_length=10, separator=", ")) st.session_state.model_df = st.session_state.model.get_document_info(st.session_state.df.proc) st.session_state.df["id"] = st.session_state.model_df.index st.session_state.model_df["id"] = st.session_state.model_df.index st.session_state.model_df = pd.merge(st.session_state.model_df,st.session_state.df,how="left",on="id") st.session_state.model_df["date"] = pd.to_datetime(st.session_state.model_df.date,format=st.session_state.datetime_format) topics_over_time = get_topics_over_time(st.session_state.df,len(st.session_state.df)) largest_topics = st.session_state.model_df.groupby("Topic").agg("count").sort_values("Document",ascending=False)[0:10] st.write(visualize_topics_over_time(st.session_state.model, topics_over_time, topics=list(largest_topics.index), custom_labels=True, title = "10 most popular narratives over time")) st.markdown("#### Overall document distribution") grouped = st.session_state.model_df.groupby("date").agg("count") grouped['date'] = pd.to_datetime(grouped.index,format=st.session_state.datetime_format) st.bar_chart(data=grouped, x='date', y='Document') st.markdown("#### Emotions") joy = st.session_state.model_df.joy.apply(lambda x: 1 if x > 0.9 else 0) sadness = st.session_state.model_df.sadness.apply(lambda x: 1 if x > 0.9 else 0) surprise = st.session_state.model_df.surprise.apply(lambda x: 1 if x > 0.9 else 0) fear = st.session_state.model_df.fear.apply(lambda x: 1 if x > 0.9 else 0) anger = st.session_state.model_df.anger.apply(lambda x: 1 if x > 0.9 else 0) emotions = pd.DataFrame({"date":st.session_state.model_df.date, "source": st.session_state.model_df.source, "joy":joy, "sadness":sadness, "surprise":surprise, "fear":fear, "anger":anger}) #dates = pd.to_datetime(emotions.date.unique(),format="%d.%m.%Y").sort_values() #emotions["date"] = pd.to_datetime(emotions.date,format="%d.%m.%Y") #emnew = emotions[(dates[-7] <= emotions.date) & (emotions.date <= dates[-1])].drop('date',axis=1, inplace=False).mean() #emplot = pd.DataFrame({f"Week of {str(dates[-14])[:10]}": emold, f"Week of {str(dates[-7])[:10]}": emnew}).T st.markdown("##### Percent with emotion by platform") st.bar_chart(emotions.groupby("source").agg("mean").T*100) st.markdown("##### Platform breakdown") st.bar_chart(emotions.groupby("source").agg("mean")*100) emotionsgr = emotions.groupby("date").agg("mean")*100 emotionsgr['date'] = pd.to_datetime(grouped.index,format=st.session_state.datetime_format) st.markdown("##### Emotional dynamics over time") st.line_chart(emotionsgr,x="date") st.markdown("#### Topics per class") if "source" in st.session_state.df.columns: topics_per_class1 = get_topics_per_class(st.session_state.df,"source") st.plotly_chart(visualize_topics_per_class(st.session_state.model, topics_per_class1, top_n_topics=20, width = 900, height = 600, custom_labels=True, title = "20 most popular narratives per platform")) st.session_state.df["emotion"] = st.session_state.df[["joy","sadness","surprise","fear",'anger','no_emotion']].idxmax(axis=1) topics_per_class2 = get_topics_per_class(st.session_state.df,"emotion") st.plotly_chart(visualize_topics_per_class(st.session_state.model, topics_per_class2, top_n_topics=20, width = 900, height = 600, custom_labels=True, title = "20 most popular narratives per emotion")) st.markdown("#### All topics") last_week = st.session_state.model_df largest_topics_last_week = last_week.groupby("Topic").agg("count").sort_values("Document",ascending=False) largest_topics_last_week["Name"] = [ list(last_week[last_week.Topic == i]["CustomName"])[0] for i in largest_topics_last_week.index ] largest_topics_last_week["Count"] = largest_topics_last_week["Document"] largest_topics_last_week["Percent"] = round(100*largest_topics_last_week["Count"]/len(st.session_state.model_df),3) st.table(largest_topics_last_week[["Name", "Count","Percent"]]) dictionary = {i:st.session_state.model.custom_labels_[i] for i in range(len(st.session_state.model.custom_labels_))} def mapping(item): return dictionary[item] st.markdown("#### Explore representative documents") st.selectbox("Select topic",list(st.session_state.model_df.Topic.unique()),key="selected_topic",format_func=mapping) repr_docs_mappings, repr_docs, repr_docs_indices = st.session_state.model._extract_representative_docs(st.session_state.model.c_tf_idf_,st.session_state.model_df,st.session_state.model.topic_representations_) ind = repr_docs_indices[st.session_state.selected_topic] j = 1 for doc in st.session_state.model_df.iloc[ind].Document: st.markdown(f"**Representative document {j}**") st.text(doc) j+=1 st.markdown("---") st.markdown("### Save current model") name = st.text_input("Please name this model file (e.g., 'my_cool_model')") if st.button("Save this model"): st.session_state.model.save(f"models/model_{name}") st.session_state.df.to_csv(f"models/df_{name}.csv") st.success(f"Model and dataframe saved in folder 'models'!") if st.button("Restart"): st.cache_data.clear() st.cache_resource.clear() for key in st.session_state.keys(): del st.session_state[key]