import json
import os

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

from utils import load_data_csv, load_data_pickle, load_model_pickle, load_numpy
from st_pages import add_indentation

st.set_page_config(layout="wide")

## Start of Streamlit app
st.title("Topic Modeling 📚")

st.markdown("### What is Topic Modeling?")

st.info("""Topic modeling is a text-mining technique used to **identify topics within a collection of documents**.
It is a useful tool for organizing and summarizing vast amounts of textual data, as well as for automatically discovering hidden thematic structures in a corpus, without any prior knowledge.""")

st.markdown(" ")

_, col, _ = st.columns([0.25, 0.4, 0.35])
with col:
    st.image("images/topic_modeling.gif", caption="An example of Topic Modeling", use_column_width=True)

st.markdown("""Common applications of Topic Modeling include:
- **Search Engine Optimization (SEO) 🔎**: Determine the main topics/keywords present on a website to optimize content and improve search engine rankings.
- **Customer Support ✍️**: Analyze customer support tickets, emails, and chat transcripts to identify common questions and complaints.
- **Fraud Detection and Risk Management 🏦**: Detect fraudulent activities, compliance violations, and operational risks by analyzing textual data such as transaction descriptions, audit reports, and regulatory filings.
- **Market Research 🌎**: Gain competitive intelligence and make informed decisions regarding product development, marketing strategies, and market positioning by analyzing research reports and industry news.
""")

st.markdown(" ")
st.divider()

st.markdown("# Topic modeling on product descriptions 🛍️")

st.info("""In this use case, we will use a topic model to categorize around 20,000 e-commerce products using their text descriptions and identify the main types of products sold.""")

_, col, _ = st.columns([0.2, 0.6, 0.2])
with col:
    st.image("images/e-commerce.jpg")

st.markdown(" ")

# Load data
path_data = "data/topic-modeling"
# data = load_data_csv(path_data, "data-topicmodeling.csv")

# Load the pre-computed topic information (one row per topic)
topic_info = load_data_pickle(path_data, 'topic_info.pkl')
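# --- Offline preprocessing sketch (illustration only, never called by the app) ---
# 'topic_info.pkl' is loaded above as a pre-computed artifact; the training code is not
# part of this page. The sketch below shows how such a table *could* be produced,
# assuming a BERTopic-style workflow. The library choice, the parameters, and the
# 'Percentage' computation are assumptions, not the pipeline actually used to build the file.
def _fit_topic_model_sketch(cleaned_descriptions):
    """Fit a topic model on cleaned product descriptions and build a topic summary table (sketch)."""
    from bertopic import BERTopic  # assumed dependency, not imported by this app

    topic_model = BERTopic(min_topic_size=50, verbose=True)  # hypothetical settings
    topics, _ = topic_model.fit_transform(cleaned_descriptions)

    # get_topic_info() returns one row per topic with its size ('Count') and
    # representative words ('Representation'); exact columns depend on the BERTopic version.
    info = topic_model.get_topic_info()
    info["Percentage"] = 100 * info["Count"] / info["Count"].sum()
    return topic_model, info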
##### ABOUT THE USE CASE
st.markdown("#### About the data 📋")

st.markdown("""You were provided a dataset with around 20,000 products from a large e-commerce retailer.
This dataset contains each product's title and description as shown on the website.""", unsafe_allow_html=True)

st.info("""**Note**: Some of the descriptions featured below are shown in their 'raw' form, meaning they contain unprocessed HTML code and special characters.
These descriptions were first 'cleaned' (by removing unwanted characters) before being used in the model.""")

see_data = st.checkbox('**See the data**', key="topic_modeling_data")

if see_data:
    st.markdown(" ")
    st.warning("This view only shows a subset of the 20,000 product descriptions used.")
    data = load_data_pickle(path_data, "data-tm-view.pkl")
    data_show = data[["TITLE", "DESCRIPTION"]]
    st.dataframe(data_show.reset_index(drop=True), use_container_width=True)

st.markdown(" ")
st.markdown(" ")

# RUN THE MODEL
st.markdown("#### About the model 📚")

st.markdown("""**Topic models** can be seen as unsupervised clustering models where text documents are grouped into topics/clusters based on their similarities.
Here, we use a topic model to automatically categorize/group the retailer's products based on their descriptions, as well as to understand the most common types of products being sold.""", unsafe_allow_html=True)

st.info("""**Note**: In topic modeling, the final topics are represented by the model using 'top words'.
A topic's top words are chosen based on how frequently they appear in the topic's documents.""")
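# --- Top-word scoring sketch (illustration only, never called by the app) ---
# The app reads per-topic word scores from 'topics_top_words.json'. Below is a minimal
# sketch of how such scores could be obtained with a class-based TF-IDF approach: all
# documents of a topic are merged into one "topic document" and TF-IDF is computed over
# these merged documents. The function, its arguments, and the use of scikit-learn are
# assumptions for illustration, not the actual pipeline behind the JSON file.
def _top_words_per_topic_sketch(docs_by_topic, n_top=10):
    """Return {topic_title: [(word, importance), ...]} given {topic_title: [doc, ...]} (sketch)."""
    from sklearn.feature_extraction.text import TfidfVectorizer  # assumed dependency

    titles = list(docs_by_topic.keys())
    topic_docs = [" ".join(docs_by_topic[t]) for t in titles]  # one merged document per topic

    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf = vectorizer.fit_transform(topic_docs)  # shape: (n_topics, n_words)
    vocab = vectorizer.get_feature_names_out()

    top_words = {}
    for i, title in enumerate(titles):
        row = tfidf[i].toarray().ravel()
        best = row.argsort()[::-1][:n_top]  # indices of the highest-scoring words
        top_words[title] = [(vocab[j], float(row[j])) for j in best]
    return top_words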
def show_results():
    st.markdown("#### See the results ☑️")

    tab1, tab2 = st.tabs(["Overall results", "Specific Topic Details"])  # , "Search Similar Topics"
    st.markdown(" ")

    # Tab 1: Summary Table
    with tab1:
        st.header("Overall results")
        st.markdown("""This tab showcases all of the **topics identified** within the product dataset,
        each topic's most significant words (**top words**), as well as the **proportion** of products assigned to each topic.""")

        summary_table = topic_info[['Title', 'Representation', 'Percentage']].copy()
        summary_table['Top Words'] = summary_table['Representation'].apply(lambda x: x[:5])  # keep the five top words
        summary_table = summary_table[["Title", "Top Words", "Percentage"]]
        summary_table.rename({"Title": "Topic Title"}, axis=1, inplace=True)

        st.data_editor(
            summary_table,
            column_config={
                "Percentage": st.column_config.ProgressColumn(
                    "Proportion %",
                    help="Proportion of documents within each topic",
                    format="%.1f%%",
                    min_value=0,
                    max_value=100)},
            use_container_width=True
        )

        st.info("""**Note**: The topic 'titles' were not provided by the model but were instead generated by feeding each topic's top words to an LLM.
        Traditional topic models define topics using representative/top words but weren't built to generate a specific title for each topic.""")

    # Tab 2: Specific Topic Details
    with tab2:
        # Load top words
        with open(os.path.join(path_data, "topics_top_words.json"), "r") as json_file:
            top_words_dict = json.load(json_file)

        # Load similarity dataframe and scores
        similarity_df = load_data_pickle(path_data, "similarity_topic_df.pkl")
        similarity_scores = load_numpy(path_data, "similarity_topic_scores.npy")

        # st.markdown(" ")
        st.header("Learn more about each topic")
        st.markdown("""You can **select a specific topic** to get more information on its **top words**,
        as well as the **other topics that are most similar to it**.""")
        # st.info("""In this section, you can find more information on each of the topics identified by the model.
        # This includes the topic's full list of top words, the importance of each word, as well as the five topics that are most similar to it.""")
        st.markdown(" ")

        # Select topic
        topics = topic_info["Title"].sort_values().to_list()
        selected_topic = st.selectbox('**Select a Topic**', topics)
        selected_topic_id = topic_info[topic_info['Title'] == selected_topic]["Topic"].to_numpy()[0] + 1
        st.markdown(" ")

        col1, col2 = st.columns(2)

        # Top words
        with col1:
            top_words_df = pd.DataFrame(top_words_dict[selected_topic], columns=["Word", "Importance"])
            top_words_df.sort_values(by=["Importance"], ascending=False, inplace=True)
            top_words_df["Importance"] = top_words_df["Importance"].round(2)

            fig = px.bar(top_words_df, x='Word', y='Importance', color="Importance",
                         title="Top words", text_auto=True)
            fig.update_layout(yaxis=dict(range=[0, 1]), xaxis_title="", showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

            st.info("""**Note:** Each score was computed based on the word's importance within the selected topic,
            using a popular NLP metric called TF-IDF (Term Frequency-Inverse Document Frequency).""")

        # Similar topics to the selected topic
        with col2:
            similarity_df = similarity_df.loc[similarity_df["Topic"] == selected_topic].copy()
            similarity_df["scores"] = 100 * similarity_scores[selected_topic_id, :]
            similarity_df.columns = ["Original Topic", "Rank", "Topic", "Similarity (%)"]

            fig = px.bar(similarity_df, y='Similarity (%)', x='Topic', color="Topic",
                         title="Five most similar topics", text_auto=True)
            fig.update_layout(yaxis=dict(range=[0, 100]), xaxis_title="", showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

            st.info("""**Note:** Topics with a high similarity score can be merged together to reduce the number of topics
            and improve topic coherence.""")


if 'button_clicked' not in st.session_state:
    st.session_state['button_clicked'] = False


def run_model():
    run_model_button = st.button("**Run the model**", type="primary")
    st.markdown(" ")
    st.markdown(" ")

    if not st.session_state['button_clicked']:
        if run_model_button:
            show_results()
            st.session_state['button_clicked'] = True
    else:
        show_results()


run_model()
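# --- Topic-similarity sketch (illustration only, never called by the app) ---
# 'similarity_topic_scores.npy' is loaded in tab2 as a pre-computed matrix of pairwise
# topic similarities. One common way to build such a matrix is cosine similarity between
# topic vectors (e.g. topic embeddings or per-topic TF-IDF vectors). The code below is a
# minimal sketch under that assumption; the method actually used to create the file may differ.
def _topic_similarity_sketch(topic_vectors):
    """Return an (n_topics, n_topics) matrix of pairwise cosine similarities (sketch)."""
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity  # assumed dependency

    scores = cosine_similarity(topic_vectors)  # non-negative vectors give values in [0, 1]
    np.fill_diagonal(scores, 0.0)              # ignore each topic's similarity to itself
    return scores
    # e.g. np.save(os.path.join(path_data, "similarity_topic_scores.npy"), scores)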