# Spaces: hi-paris/app-ai-ds-hec (status: Sleeping) - File size: 9,656 Bytes


import json
import os
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px 

from utils import load_data_pickle, load_numpy, check_password
from st_pages import add_indentation


st.set_page_config(layout="wide")

# The whole page is rendered only when check_password() returns a truthy value.
if check_password():
    # --- Introduction: what topic modeling is ---
    st.title("Topic Modeling 📚")
    st.markdown("### What is Topic Modeling ?")

    st.info("""
    Topic modeling is a text-mining technique used to **identify topics within a collection of documents**.
    It is a useful tool for organizing and summarizing vast amounts of textual data as well as automate the discovery of hidden thematic structures in a corpus of text data, without any prior knowledge.
    """)

    st.markdown(" ")
    # Three columns with only the middle one used, so the image is not full width.
    _, col, _ = st.columns([0.25,0.4,0.35])
    with col:
        st.image("images/topic_modeling.gif", caption="An example of Topic Modeling", use_column_width=True)


    # --- Typical applications (markdown bullet list) ---
    st.markdown("""Common applications of Topic Modeling include: 
- **Search Engine Optimization (SEO): 🔎** Determine the main topics/keywords present on a website to optimize content and improve search engine rankings.
- **Customer Support** ✍️: Analyze customer support tickets, emails, and chat transcripts to identify common questions and complaints.   
- **Fraud Detection and Risk Management: 🏦** : Detect fraudulent activities, compliance violations, and operational risks by analyzing textual data such as transaction descriptions, audit reports and regulatory filings.
- **Market Research 🌎**: Gain competitive intelligence and make informed decisions regarding product development, marketing strategies, and market positioning by analyzing research reports and industry news. 
    """)


    st.markdown("  ")
    st.divider()

    # --- Use case banner ---
    st.markdown("# Topic modeling on product descriptions 🛍️")
    # Wording fix: "products solds" -> "products sold".
    st.markdown("""In this use case, we will use a **topic model** to categorize around **20 000 e-commerce products** as well as identify
            the main types of products sold.""")

    _, col, _ = st.columns([0.2,0.6,0.2])
    with col:
        st.image("images/e-commerce.jpg")

    st.markdown(" ")

    # Directory the page's topic-modeling files are loaded from.
    path_data = "data/topic-modeling"

    # Topic table; its Title / Representation / Percentage / Topic columns are
    # read further down when the results are displayed.
    topic_info = load_data_pickle(path_data, 'topic_info.pkl')



    ##### ABOUT THE USE CASE
    st.markdown("#### About the data 📋")
    st.markdown("""You were provided a dataset with around 20 000 products from a large e-commerce retailer. <br>
                This dataset contains the products' title and description on the website.""", unsafe_allow_html=True)
    st.info("""**Note**: Some of the descriptions featured below are shown in their 'raw' form, meaning they contain unprocessed html code and special characters.
            These descriptions were first 'cleaned' (by removing unwanted characters) before being used in the model.""")
    # NOTE(review): the widget key mentions "credit_score" although this page is
    # about topic modeling - looks carried over from another page; confirm
    # before renaming, since the key is also the session-state entry name.
    see_data = st.checkbox('**See the data**', key="credit_score_data")
    if see_data:
        st.markdown(" ")
        # Wording fix: "product description used" -> "product descriptions used".
        st.warning("This view only shows a subset of the 20 000 product descriptions used.")
        # The preview file is only read when the checkbox is ticked.
        data = load_data_pickle(path_data,"data-tm-view.pkl")
        data_show = data[["TITLE", "DESCRIPTION"]]
        st.dataframe(data_show.reset_index(drop=True), use_container_width=True)


    st.markdown("  ")
    st.markdown("  ")



    # RUN THE MODEL
    # Section header followed by two explanatory texts; each text is bound to a
    # name first so the rendering calls stay short.
    st.markdown("#### About the model 📚")

    model_intro = """**Topic models** can be seen as unsupervised clustering models where text documents are grouped into topics/clusters based on their similarities.
                We will use here a topic model to automatically categorize/group the retailer's products based on their description, 
                as well as understand what are the most common type of products being sold."""
    st.markdown(model_intro, unsafe_allow_html=True)

    top_words_note = """**Note**: In topic modeling, the final topics are represented by the model using 'top words'. 
            A topic's top words are chosen based on how much they appear in the topic's documents."""
    st.info(top_words_note)

    def _render_overall_results():
        """Render the "Overall results" tab: one table row per topic.

        Reads the enclosing ``topic_info`` frame and displays, for every topic,
        its title, the first five entries of its ``Representation`` and its
        ``Percentage`` drawn as a progress bar on a 0-100 scale.
        """
        st.header("Overall results")
        st.markdown("""This tab showcases all of the **topics identified** within the product dataset. <br>
            Each topic's <b>most significant words</b> (top words), as well as the <b>proportion</b> of products that were assigned to it are given.""", 
            unsafe_allow_html=True)

        # Work on a copy so the shared topic_info frame is never modified.
        summary_table = topic_info[['Title', 'Representation', 'Percentage']].copy()
        # Only the first five top words are shown in the overview.
        summary_table['Top Words'] = summary_table['Representation'].apply(lambda x: x[:5])
        summary_table = summary_table[["Title", "Top Words", "Percentage"]]
        summary_table = summary_table.rename(columns={"Title": "Topic Title"})

        st.data_editor(
            summary_table,
            column_config={
                "Percentage": st.column_config.ProgressColumn(
                    "Proportion %",
                    help="Proportion of documents within each topic",
                    format="%.1f%%",
                    min_value=0,
                    max_value=100)},
            use_container_width=True
        )

        st.info("""**Note**: The topic 'titles' were not provided by the model but instead were generated by feeding the topic's top words to an LLM.
                Traditional topic models define topics using representative/top words but weren't built to generate a specific title to each topic.""")

    def _render_topic_details():
        """Render the "Specific Topic Details" tab for one user-selected topic.

        Loads ``topics_top_words.json``, ``similarity_topic_df.pkl`` and
        ``similarity_topic_scores.npy`` from ``path_data``, lets the user pick a
        topic title, then draws two bar charts side by side: the selected
        topic's top words and the topics most similar to it.
        """
        # Explicit UTF-8 so decoding does not depend on the platform default.
        with open(os.path.join(path_data, "topics_top_words.json"), "r", encoding="utf-8") as json_file:
            top_words_dict = json.load(json_file)

        # Load similarity df and scores
        similarity_df = load_data_pickle(path_data, "similarity_topic_df.pkl")
        similarity_scores = load_numpy(path_data, "similarity_topic_scores.npy")

        st.header("Learn more about each topic")
        st.markdown("""You can **select a specific topic** to get more information on its **top words**, as well as the
                        **other topics that are most similar to it**.""")

        st.markdown(" ")

        # Select topic
        topics = topic_info["Title"].sort_values().to_list()
        selected_topic = st.selectbox('**Select a Topic**', topics)
        # NOTE(review): the +1 presumably shifts topic ids that start at -1 onto
        # the row indices of similarity_scores - confirm against the data files.
        selected_topic_id = topic_info[topic_info['Title'] == selected_topic]["Topic"].to_numpy()[0] + 1

        st.markdown(" ")
        col1, col2 = st.columns(2)

        # Top words
        with col1:
            # Entries are keyed by topic title; each row supplies a word and its importance.
            top_words_df = pd.DataFrame(top_words_dict[selected_topic], columns=["Word", "Importance"])
            top_words_df = top_words_df.sort_values(by=["Importance"], ascending=False)
            top_words_df["Importance"] = top_words_df["Importance"].round(2)

            fig = px.bar(top_words_df, x='Word', y='Importance', color="Importance", title="Top words", text_auto=True)
            fig.update_layout(yaxis=dict(range=[0, 1]), xaxis_title="", showlegend=False)
            st.plotly_chart(fig, use_container_width=True)
            st.info("""**Note:** Each score was computed based on the words importance in the particular topic using 
                        a popular metric in NLP called TF-IDF (Term Frequency-Inverse Document Frequency). """)

        # Similar topics to the selected topic
        with col2:
            # .copy() gives an independent frame, so adding a column below does
            # not write into a slice of the loaded frame (SettingWithCopyWarning).
            similar_topics = similarity_df.loc[similarity_df["Topic"] == selected_topic].copy()
            similar_topics["scores"] = 100 * similarity_scores[selected_topic_id, :]
            # Positional rename: assumes the loaded frame has exactly three
            # columns before "scores" is appended - TODO confirm.
            similar_topics.columns = ["Original Topic", "Rank", "Topic", "Similarity (%)"]

            fig = px.bar(similar_topics, y='Similarity (%)', x='Topic', color="Topic", title="Five most similar topics", text_auto=True)
            fig.update_layout(yaxis=dict(range=[0, 100]),
                              xaxis_title="",
                              showlegend=False)

            st.plotly_chart(fig, use_container_width=True)
            st.info("""**Note:** Topics with a high similarity score can be merged together as to reduce the number of topics, as
                        well as improve the topics' coherence.""")

    def show_results():
        """Render the "See the results" section with its two tabs.

        The first tab lists every topic with its top words and proportion; the
        second shows details for one selected topic. Returns ``None``.
        """
        st.markdown("#### See the results ☑️")
        tab1, tab2 = st.tabs(["Overall results", "Specific Topic Details", ])
        st.markdown(" ")

        # Tab 1: Summary Table
        with tab1:
            _render_overall_results()

        # Tab 2: Specific Topic Details
        with tab2:
            _render_topic_details()

    # Session flag recording whether the results have been shown before; it is
    # created on the first run only, so later reruns keep its value.
    if 'button_clicked' not in st.session_state:
        st.session_state['button_clicked'] = False

    def run_model():
        """Draw the "Run the model" button and reveal the results when appropriate.

        The results are displayed when the button is pressed on this run, or
        when the session flag says they were displayed on an earlier run. The
        flag is switched on after the first display so the results remain
        visible on subsequent reruns.
        """
        pressed = st.button("**Run the model**", type="primary")
        st.markdown(" ")
        st.markdown(" ")

        already_shown = st.session_state['button_clicked']
        # Nothing to display until the button has been used at least once.
        if not (already_shown or pressed):
            return

        show_results()
        if not already_shown:
            # First display: remember it for the following reruns.
            st.session_state['button_clicked'] = True

    run_model()