Spaces:
Running
Running
File size: 9,656 Bytes
aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 b2174f1 aa667a1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
import json
import os
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from utils import load_data_pickle, load_numpy, check_password
from st_pages import add_indentation
# Use the "wide" page layout for this Streamlit page.
st.set_page_config(layout="wide")

# Password gate: nothing below is rendered unless check_password() returns a
# truthy value. st.stop() ends the current script run, which keeps the rest of
# the page at module level instead of nesting it inside an `if` block.
if not check_password():
    st.stop()

# NOTE(review): several emoji in the UI strings of this file look mis-encoded
# in this copy (e.g. "π"). They are kept byte-for-byte here; restore them from
# the original source if it is available.

##### INTRODUCTION
st.title("Topic Modeling π")
st.markdown("### What is Topic Modeling ?")

# The continuation lines of the triple-quoted strings stay at column 0 on
# purpose: leading spaces would become part of the rendered markdown.
st.info("""
Topic modeling is a text-mining technique used to **identify topics within a collection of documents**.
It is a useful tool for organizing and summarizing vast amounts of textual data as well as automate the discovery of hidden thematic structures in a corpus of text data, without any prior knowledge.
""")
st.markdown(" ")

# Only the middle column is used, so the illustration sits roughly centred.
_, col, _ = st.columns([0.25,0.4,0.35])
with col:
    st.image("images/topic_modeling.gif", caption="An example of Topic Modeling", use_column_width=True)

st.markdown("""Common applications of Topic Modeling include:
- **Search Engine Optimization (SEO): π** Determine the main topics/keywords present on a website to optimize content and improve search engine rankings.
- **Customer Support** βοΈ: Analyze customer support tickets, emails, and chat transcripts to identify common questions and complaints.
- **Fraud Detection and Risk Management: π¦** : Detect fraudulent activities, compliance violations, and operational risks by analyzing textual data such as transaction descriptions, audit reports and regulatory filings.
- **Market Research π**: Gain competitive intelligence and make informed decisions regarding product development, marketing strategies, and market positioning by analyzing research reports and industry news.
""")
st.markdown(" ")
st.divider()

##### USE CASE PRESENTATION
st.markdown("# Topic modeling on product descriptions ποΈ")
st.markdown("""In this use case, we will use a **topic model** to categorize around **20 000 e-commerce products** as well as identify
the main types of products solds.""")

_, col, _ = st.columns([0.2,0.6,0.2])
with col:
    st.image("images/e-commerce.jpg")
st.markdown(" ")

# Load data
path_data = "data/topic-modeling"

# Load the topic data (read later by show_results() as a module-level name).
topic_info = load_data_pickle(path_data, 'topic_info.pkl')

##### ABOUT THE USE CASE
st.markdown("#### About the data π")
st.markdown("""You were provided a dataset with around 20 000 products from a large e-commerce retailer. <br>
This dataset contains the products' title and description on the website.""", unsafe_allow_html=True)
st.info("""**Note**: Some of the descriptions featured below are shown in their 'raw' form, meaning they contain unprocessed html code and special characters.
These descriptions were first 'cleaned' (by removing unwanted characters) before being used in the model.""")

# NOTE(review): the widget key "credit_score_data" looks copied from another
# page; it is kept unchanged so existing session state keeps working.
see_data = st.checkbox('**See the data**', key="credit_score_data")
if see_data:
    st.markdown(" ")
    st.warning("This view only shows a subset of the 20 000 product description used.")
    # The preview data is only loaded when the user asks to see it.
    data = load_data_pickle(path_data,"data-tm-view.pkl")
    data_show = data[["TITLE", "DESCRIPTION"]]
    st.dataframe(data_show.reset_index(drop=True), use_container_width=True)
st.markdown(" ")
st.markdown(" ")

# RUN THE MODEL
st.markdown("#### About the model π")
st.markdown("""**Topic models** can be seen as unsupervised clustering models where text documents are grouped into topics/clusters based on their similarities.
We will use here a topic model to automatically categorize/group the retailer's products based on their description,
as well as understand what are the most common type of products being sold.""", unsafe_allow_html=True)
st.info("""**Note**: In topic modeling, the final topics are represented by the model using 'top words'.
A topic's top words are chosen based on how much they appear in the topic's documents.""")
def show_results():
    """Render the two result tabs of the topic-modeling demo.

    Tab 1 shows a summary table with each topic's title, its first five top
    words and the percentage of products assigned to it. Tab 2 lets the user
    pick a topic and shows a bar chart of its top words next to a bar chart
    of the topics most similar to it.

    Reads the module-level names ``topic_info`` and ``path_data``; returns
    ``None``.
    """
    st.markdown("#### See the results βοΈ")
    tab1, tab2 = st.tabs(["Overall results", "Specific Topic Details"])
    st.markdown(" ")

    # Tab 1: Summary Table
    with tab1:
        st.header("Overall results")
        st.markdown("""This tab showcases all of the **topics identified** within the product dataset. <br>
Each topic's <b>most significant words</b> (top words), as well as the <b>proportion</b> of products that were assigned to it are given.""",
                    unsafe_allow_html=True)

        # Work on a copy so the shared topic_info frame is never mutated.
        summary_table = topic_info[['Title','Representation', 'Percentage']].copy()
        # Keep only the first five entries of each topic's representation.
        summary_table['Top Words'] = summary_table['Representation'].apply(lambda x: x[:5])
        summary_table = summary_table[["Title","Top Words","Percentage"]]
        summary_table.rename({"Title":"Topic Title"}, axis=1, inplace=True)

        st.data_editor(
            summary_table,
            column_config={
                "Percentage": st.column_config.ProgressColumn(
                    "Proportion %",
                    help="Propotion of documents within each topic",
                    format="%.1f%%",
                    min_value=0,
                    max_value=100)},
            use_container_width=True
        )

        st.info("""**Note**: The topic 'titles' were not provided by the model but instead were generated by feeding the topic's top words to an LLM.
Traditional topic models define topics using representative/top words but weren't built to generate a specific title to each topic.""")

    # Tab 2: Specific Topic Details
    with tab2:
        # Load top words (dict keyed by topic title, see the lookup below).
        # JSON is UTF-8 by specification, so the encoding is stated explicitly
        # instead of relying on the platform default.
        with open(os.path.join(path_data,"topics_top_words.json"), "r", encoding="utf-8") as json_file:
            top_words_dict = json.load(json_file)

        # Load similarity df and scores
        similarity_df = load_data_pickle(path_data, "similarity_topic_df.pkl")
        similarity_scores = load_numpy(path_data, "similarity_topic_scores.npy")

        st.header("Learn more about each topic")
        st.markdown("""You can **select a specific topic** to get more information on its **top words**, as well as the
**other topics that are most similar to it**.""")
        st.markdown(" ")

        # Select topic
        topics = topic_info["Title"].sort_values().to_list()
        selected_topic = st.selectbox('**Select a Topic**', topics)
        # NOTE(review): the +1 offset presumably maps the model's topic ids
        # (possibly starting at -1) onto row indices of similarity_scores —
        # confirm against how the .npy file was produced.
        selected_topic_id = topic_info[topic_info['Title'] == selected_topic]["Topic"].to_numpy()[0] + 1
        st.markdown(" ")

        col1, col2 = st.columns(2)

        # Top words
        with col1:
            top_words_df = pd.DataFrame(top_words_dict[selected_topic], columns=["Word", "Importance"])
            top_words_df.sort_values(by=["Importance"], ascending=False, inplace=True)
            top_words_df["Importance"] = top_words_df["Importance"].round(2)

            fig = px.bar(top_words_df, x='Word', y='Importance', color="Importance", title="Top words", text_auto=True)
            fig.update_layout(yaxis=dict(range=[0, 1]), xaxis_title="", showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

            st.info("""**Note:** Each score was computed based on the words importance in the particular topic using
a popular metric in NLP called TF-IDF (Term Frequency-Inverse Document Frequency). """)

        # Similar topics to the selected topic
        with col2:
            # Take an explicit copy of the filtered rows: adding a column to a
            # slice of the loaded frame is a chained assignment that pandas
            # warns about and may not apply reliably.
            similar_topics = similarity_df.loc[similarity_df["Topic"]==selected_topic].copy()
            # NOTE(review): assumes the selected row of similarity_scores has
            # exactly one entry per filtered row — confirm.
            similar_topics["scores"] = 100*similarity_scores[selected_topic_id,:]
            # Positional rename: requires the frame to have exactly four
            # columns at this point (three loaded ones plus "scores").
            similar_topics.columns = ["Original Topic", "Rank", "Topic", "Similarity (%)"]

            fig = px.bar(similar_topics, y='Similarity (%)', x='Topic', color="Topic", title="Five most similar topics", text_auto=True)
            fig.update_layout(yaxis=dict(range=[0, 100]),
                              xaxis_title="",
                              showlegend=False)
            st.plotly_chart(fig, use_container_width=True)

            st.info("""**Note:** Topics with a high similarity score can be merged together as to reduce the number of topics, as
well as improve the topics' coherence.""")
# Remember across script reruns whether the "Run the model" button has already
# been clicked; the flag starts out False.
if 'button_clicked' not in st.session_state:
    st.session_state['button_clicked'] = False


def run_model():
    """Render the "Run the model" button and, once clicked, the results.

    The results are shown when the button is clicked in the current run or
    when the session flag ``button_clicked`` is already set, so they remain
    visible on later reruns. Returns ``None``.
    """
    # Named `clicked` rather than `run_model` so the local does not shadow
    # this function's own name.
    clicked = st.button("**Run the model**", type="primary")
    st.markdown(" ")
    st.markdown(" ")

    if st.session_state['button_clicked'] or clicked:
        show_results()
        # Set after rendering, as before; re-assigning True is a no-op when
        # the flag is already set.
        st.session_state['button_clicked'] = True


run_model()
|