File size: 9,656 Bytes
aa667a1
 
 
 
 
 
 
 
b2174f1
aa667a1
 
 
 
 
b2174f1
 
 
aa667a1
b2174f1
 
 
 
aa667a1
b2174f1
 
 
 
aa667a1
 
b2174f1
aa667a1
 
 
 
b2174f1
aa667a1
 
b2174f1
 
aa667a1
b2174f1
 
 
aa667a1
b2174f1
 
 
aa667a1
 
 
b2174f1
 
 
aa667a1
b2174f1
 
aa667a1
 
 
b2174f1
 
 
 
 
 
 
 
 
 
 
 
 
aa667a1
 
b2174f1
 
aa667a1
 
 
b2174f1
 
 
 
 
aa667a1
b2174f1
 
aa667a1
b2174f1
 
 
aa667a1
 
b2174f1
 
 
 
 
 
 
 
 
 
 
aa667a1
b2174f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa667a1
b2174f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa667a1
b2174f1
 
aa667a1
b2174f1
 
 
 
aa667a1
b2174f1
 
 
 
 
aa667a1
b2174f1
 
aa667a1
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198

import json
import os
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px 

from utils import load_data_pickle, load_numpy, check_password
from st_pages import add_indentation


# Use the full browser width for this page.
st.set_page_config(layout="wide")

# The page content is rendered only when check_password() returns a truthy value.
if check_password():
    ##### INTRODUCTION
    st.title("Topic Modeling 📚")
    st.markdown("### What is Topic Modeling ?")

    st.info("""
    Topic modeling is a text-mining technique used to **identify topics within a collection of documents**.
    It is a useful tool for organizing and summarizing vast amounts of textual data as well as automate the discovery of hidden thematic structures in a corpus of text data, without any prior knowledge.
    """)

    st.markdown(" ")
    # Three columns with empty outer ones: only the middle column holds the image.
    # NOTE(review): newer Streamlit releases deprecate `use_column_width` on st.image
    # in favour of `use_container_width` - switch once the pinned version allows it.
    _, col, _ = st.columns([0.25, 0.4, 0.35])
    with col:
        st.image("images/topic_modeling.gif", caption="An example of Topic Modeling", use_column_width=True)

    st.markdown("""Common applications of Topic Modeling include: 
- **Search Engine Optimization (SEO): 🔎** Determine the main topics/keywords present on a website to optimize content and improve search engine rankings.
- **Customer Support** ✍️: Analyze customer support tickets, emails, and chat transcripts to identify common questions and complaints.   
- **Fraud Detection and Risk Management: 🏦** : Detect fraudulent activities, compliance violations, and operational risks by analyzing textual data such as transaction descriptions, audit reports and regulatory filings.
- **Market Research 🌎**: Gain competitive intelligence and make informed decisions regarding product development, marketing strategies, and market positioning by analyzing research reports and industry news. 
    """)

    st.markdown("  ")
    st.divider()

    ##### USE CASE PRESENTATION
    st.markdown("# Topic modeling on product descriptions 🛍️")
    st.markdown("""In this use case, we will use a **topic model** to categorize around **20 000 e-commerce products** as well as identify
            the main types of products sold.""")

    _, col, _ = st.columns([0.2, 0.6, 0.2])
    with col:
        st.image("images/e-commerce.jpg")

    st.markdown(" ")

    # Folder holding every data artifact used by this page.
    path_data = "data/topic-modeling"

    # Per-topic table; later code reads its 'Title', 'Representation',
    # 'Percentage' and 'Topic' columns.
    topic_info = load_data_pickle(path_data, 'topic_info.pkl')

    ##### ABOUT THE USE CASE
    st.markdown("#### About the data 📋")
    st.markdown("""You were provided a dataset with around 20 000 products from a large e-commerce retailer. <br>
                This dataset contains the products' title and description on the website.""", unsafe_allow_html=True)
    st.info("""**Note**: Some of the descriptions featured below are shown in their 'raw' form, meaning they contain unprocessed html code and special characters.
            These descriptions were first 'cleaned' (by removing unwanted characters) before being used in the model.""")

    # NOTE(review): the widget key looks copied from another page ("credit_score");
    # it is kept unchanged here so existing widget state is not affected.
    see_data = st.checkbox('**See the data**', key="credit_score_data")
    if see_data:
        st.markdown(" ")
        st.warning("This view only shows a subset of the 20 000 product descriptions used.")
        # The preview file is only loaded when the user asks to see the data.
        data = load_data_pickle(path_data, "data-tm-view.pkl")
        data_show = data[["TITLE", "DESCRIPTION"]]
        st.dataframe(data_show.reset_index(drop=True), use_container_width=True)

    st.markdown("  ")
    st.markdown("  ")

    ##### ABOUT THE MODEL
    st.markdown("#### About the model 📚")
    st.markdown("""**Topic models** can be seen as unsupervised clustering models where text documents are grouped into topics/clusters based on their similarities.
                We will use here a topic model to automatically categorize/group the retailer's products based on their description, 
                as well as understand what are the most common type of products being sold.""", unsafe_allow_html=True)

    st.info("""**Note**: In topic modeling, the final topics are represented by the model using 'top words'. 
            A topic's top words are chosen based on how much they appear in the topic's documents.""")

    def show_results():
        """Render the model results in two tabs.

        The first tab lists every topic with its first five top words and the
        share of products assigned to it. The second tab lets the user pick one
        topic and shows a bar chart of its top words next to a bar chart of the
        topics most similar to it.

        Reads ``topic_info`` and ``path_data`` from the enclosing script scope
        and loads the top-words JSON and the similarity artifacts from
        ``path_data``. Everything is drawn with Streamlit; nothing is returned.
        """
        st.markdown("#### See the results ☑️")
        tab1, tab2 = st.tabs(["Overall results", "Specific Topic Details"])
        st.markdown(" ")

        # Tab 1: Summary Table
        with tab1:
            st.header("Overall results")
            st.markdown("""This tab showcases all of the **topics identified** within the product dataset. <br>
            Each topic's <b>most significant words</b> (top words), as well as the <b>proportion</b> of products that were assigned to it are given.""", 
            unsafe_allow_html=True)

            summary_table = topic_info[['Title', 'Representation', 'Percentage']].copy()
            # Only the first five entries of each topic's representation are displayed.
            summary_table['Top Words'] = summary_table['Representation'].apply(lambda words: words[:5])
            summary_table = summary_table[["Title", "Top Words", "Percentage"]].rename(
                columns={"Title": "Topic Title"}
            )

            # The table is a read-only summary, so editing is disabled.
            st.data_editor(
                summary_table,
                column_config={
                    "Percentage": st.column_config.ProgressColumn(
                        "Proportion %",
                        help="Proportion of documents within each topic",
                        format="%.1f%%",
                        min_value=0,
                        max_value=100,
                    ),
                },
                use_container_width=True,
                disabled=True,
            )

            st.info("""**Note**: The topic 'titles' were not provided by the model but instead were generated by feeding the topic's top words to an LLM.
                Traditional topic models define topics using representative/top words but weren't built to generate a specific title to each topic.""")

        # Tab 2: Specific Topic Details
        with tab2:
            # Load top words. The encoding is explicit so that reading the JSON
            # does not depend on the platform's default locale encoding.
            top_words_path = os.path.join(path_data, "topics_top_words.json")
            with open(top_words_path, "r", encoding="utf-8") as json_file:
                top_words_dict = json.load(json_file)

            # Load similarity df and scores
            similarity_df = load_data_pickle(path_data, "similarity_topic_df.pkl")
            similarity_scores = load_numpy(path_data, "similarity_topic_scores.npy")

            st.header("Learn more about each topic")
            st.markdown("""You can **select a specific topic** to get more information on its **top words**, as well as the
                        **other topics that are most similar to it**.""")

            st.markdown(" ")

            # Select topic
            topics = topic_info["Title"].sort_values().to_list()
            selected_topic = st.selectbox('**Select a Topic**', topics)
            # Row index into `similarity_scores` is the topic id shifted by one.
            # NOTE(review): presumably because topic ids start at -1 - confirm.
            is_selected = topic_info['Title'] == selected_topic
            selected_topic_id = topic_info.loc[is_selected, "Topic"].to_numpy()[0] + 1

            st.markdown(" ")
            col1, col2 = st.columns(2)

            # Top words
            with col1:
                top_words_df = pd.DataFrame(top_words_dict[selected_topic], columns=["Word", "Importance"])
                top_words_df = top_words_df.sort_values(by=["Importance"], ascending=False)
                top_words_df["Importance"] = top_words_df["Importance"].round(2)

                fig = px.bar(top_words_df, x='Word', y='Importance', color="Importance", title="Top words", text_auto=True)
                fig.update_layout(yaxis=dict(range=[0, 1]), xaxis_title="", showlegend=False)
                st.plotly_chart(fig, use_container_width=True)
                st.info("""**Note:** Each score was computed based on the words importance in the particular topic using 
                        a popular metric in NLP called TF-IDF (Term Frequency-Inverse Document Frequency). """)

            # Similar topics to the selected topic
            with col2:
                # Copy the filtered rows before adding a column: assigning into a
                # slice of the loaded frame triggers pandas' SettingWithCopyWarning
                # and may not write where intended.
                similar_topics = similarity_df.loc[similarity_df["Topic"] == selected_topic].copy()
                similar_topics["scores"] = 100 * similarity_scores[selected_topic_id, :]
                # Positional rename: assumes the loaded frame has exactly three
                # columns before "scores" is appended - TODO confirm.
                similar_topics.columns = ["Original Topic", "Rank", "Topic", "Similarity (%)"]

                fig = px.bar(similar_topics, y='Similarity (%)', x='Topic', color="Topic", title="Five most similar topics", text_auto=True)
                fig.update_layout(yaxis=dict(range=[0, 100]),
                                  xaxis_title="",
                                  showlegend=False)

                st.plotly_chart(fig, use_container_width=True)
                st.info("""**Note:** Topics with a high similarity score can be merged together as to reduce the number of topics, as
                        well as improve the topics' coherence.""")

    # Session flag remembering, across script reruns, that the results were
    # already revealed once.
    if 'button_clicked' not in st.session_state:
        st.session_state['button_clicked'] = False

    def run_model():
        """Draw the "Run the model" button and display the results.

        The results are shown when the button is pressed during this run or
        when the session flag says they were shown before. The flag is raised
        after the first display so that later reruns (for example those caused
        by widgets inside the results) keep the results visible.
        """
        pressed = st.button("**Run the model**", type="primary")
        st.markdown(" ")
        st.markdown(" ")

        already_shown = st.session_state['button_clicked']
        if not (already_shown or pressed):
            return

        show_results()
        if not already_shown:
            st.session_state['button_clicked'] = True

    run_model()