"""Streamlit page that clusters clients by their RFM metrics.

The page computes the Recency / Frequency / Revenue features for each
customer, fits one 3-component ``GaussianMixture`` per metric, and renders
the resulting clusters (tables, histograms, a heatmap and a per-customer
drill-down).
"""
from collections import defaultdict
import itertools
from typing import Dict, List, Tuple

import altair as alt
import numpy as np
import pandas as pd
import plotly.express as px
import streamlit as st
from sklearn.mixture import GaussianMixture

from utils import load_and_preprocess_data

SIDEBAR_DESCRIPTION = """
# Client clustering

To cluster a client, we adopt the RFM metrics. They stand for:

- R = recency, that is the number of days since the last purchase in the store
- F = frequency, that is the number of times a customer has ordered something
- M = monetary value, that is how much a customer has spent buying from your business.

Given these 3 metrics, we can cluster the customers and find a suitable
"definition" based on the clusters they belong to. Since the dataset
we're using right now has about 5000 distinct customers, we identify
3 clusters for each metric.

## How we compute the clusters

We resort to a GaussianMixture algorithm. We can think of GaussianMixture
as generalized k-means clustering that incorporates information about
the covariance structure of the data as well as the centers of the clusters.
""".lstrip()

# Each template below has six `{}` placeholders, filled in `explain_cluster`
# with (min, max) of cluster 1, then cluster 2, then cluster 3.
FREQUENCY_CLUSTERS_EXPLAIN = """
The **frequency** denotes how frequently a customer has ordered.

There 3 available clusters for this metric:

- cluster 1: denotes a customer that purchases one or few times (range [{}, {}])
- cluster 2: these customer have a discrete amount of orders (range [{}, {}])
- cluster 3: these customer purchases lots of times (range [{}, {}])

-------
""".lstrip()

RECENCY_CLUSTERS_EXPLAIN = """
The **recency** refers to how recently a customer has bought;

There 3 available clusters for this metric:

- cluster 1: the last order of these client is long time ago (range [{}, {}])
- cluster 2: these are clients that purchases something not very recently (range [{}, {}])
- cluster 3: the last order of these client is a few days/weeks ago (range [{}, {}])

-------
""".lstrip()

MONETARY_CLUSTERS_EXPLAIN = """
The **revenue** refers to how much a customer has spent buying
from your business.

There 3 available clusters for this metric:

- cluster 1: these clients spent little money (range [{}, {}])
- cluster 2: these clients spent a considerable amount of money (range [{}, {}])
- cluster 3: these clients spent lots of money (range [{}, {}])

-------
""".lstrip()

# Maps a cluster column of the RFM dataframe to its explanation template.
EXPLANATION_DICT = {
    "Frequency_cluster": FREQUENCY_CLUSTERS_EXPLAIN,
    "Recency_cluster": RECENCY_CLUSTERS_EXPLAIN,
    "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
}

# Profiles keyed by the exact "<recency><frequency><revenue>" cluster triple.
_EXACT_PROFILES = {
    "111": "Tourist",
    "133": "Former lover",
    "123": "Former passionate client",
    "113": "Spent a lot, but never come back",
    "313": "Potential lover",
    "312": "Interesting new client",
    "311": "New customer",
    "333": "Gold client",
    "322": "Lovers",
}

# Fallback profiles keyed by the recency cluster alone, used only when the
# full triple has no exact entry above.
_RECENCY_FALLBACK_PROFILES = {
    "1": "About to dump",
    "2": "Losing interest",
}

_DEFAULT_PROFILE = "Average client"


def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a new dataframe with the RFM features for each client.

    Args:
        df: purchases with at least the `CustomerID`, `InvoiceDate` and
            `Price` columns (`InvoiceDate` presumably datetime-like, since
            `.dt.days` is taken on date differences -- confirm with loader).

    Returns:
        One row per customer with the columns `CustomerID`, `Frequency`,
        `Revenue` and `Recency`.
    """
    by_customer = df.groupby("CustomerID")

    # Compute frequency, the number of distinct time a user purchased.
    client_features = by_customer["InvoiceDate"].nunique().reset_index()
    client_features.columns = ["CustomerID", "Frequency"]

    # Add monetary value, the total revenue for each single user.
    # Assigned positionally: both group-bys share the same key, hence the
    # same row order.
    client_takings = by_customer["Price"].sum()
    client_features["Revenue"] = client_takings.values

    # Add recency, i.e. the days since the last purchase in the store.
    # The reference point is the latest purchase found in the dataset,
    # not the current date.
    max_date = by_customer["InvoiceDate"].max().reset_index()
    max_date.columns = ["CustomerID", "LastPurchaseDate"]
    client_features["Recency"] = (
        max_date["LastPurchaseDate"].max() - max_date["LastPurchaseDate"]
    ).dt.days

    return client_features


@st.cache
def cluster_clients(df: pd.DataFrame) -> pd.DataFrame:
    """Computes the RFM features and clusters for each user based on the RFM metrics.

    Each metric is clustered on its own with a 3-component Gaussian mixture;
    the labels are stored in a `<metric>_cluster` column, numbered from 1.
    Revenue and Frequency clusters grow with the metric, Recency clusters
    shrink with it (cluster 1 holds the largest number of days).
    """
    df_rfm = create_features(df)

    metrics_and_order = (
        ("Revenue", "ascending"),
        ("Frequency", "ascending"),
        ("Recency", "descending"),
    )
    for to_cluster, order in metrics_and_order:
        mixture = GaussianMixture(n_components=3, random_state=42)
        labels = mixture.fit_predict(df_rfm[[to_cluster]])
        df_rfm[f"{to_cluster}_cluster"] = _order_cluster(mixture, labels, order)

    return df_rfm


def _order_cluster(cluster_model: GaussianMixture, clusters, order="ascending"):
    """Orders the cluster by `order`.

    Args:
        cluster_model: fitted mixture whose `means_` rank the components.
        clusters: component labels predicted by `cluster_model`.
        order: "ascending" (default) or "descending", case-insensitive.

    Returns:
        The labels remapped to 1..n_components following the rank of the
        component means.
    """
    centroids = cluster_model.means_.sum(axis=1)

    if order.lower() == "descending":
        # Negating the means makes argsort rank them from largest to smallest.
        centroids *= -1

    ascending_order = np.argsort(centroids)
    lookup_table = np.zeros_like(ascending_order)
    # Cluster will start from 1
    lookup_table[ascending_order] = np.arange(cluster_model.n_components) + 1
    return lookup_table[clusters]


def show_purchase_history(user: int, df: pd.DataFrame):
    """Draws a line chart with the total expenses of `user` per invoice date."""
    user_purchases = df.loc[df.CustomerID == user, ["Price", "InvoiceDate"]]
    expenses = user_purchases.groupby(user_purchases.InvoiceDate).sum()
    expenses.columns = ["Expenses"]
    expenses = expenses.reset_index()

    c = (
        alt.Chart(expenses)
        .mark_line(point=True)
        .encode(
            x=alt.X("InvoiceDate", timeUnit="yearmonthdate", title="Date"),
            y="Expenses",
        )
        .properties(title="User expenses")
    )

    st.altair_chart(c)


# Backward-compatible alias for the original (misspelled) public name.
show_purhcase_history = show_purchase_history


def show_user_info(user: int, df_rfm: pd.DataFrame):
    """Prints some information about the user.

    The main information are the total expenses, how
    many times he purchases in the store, and the clusters
    he belongs to.

    Returns:
        The `(recency, frequency, revenue)` clusters of the user, or
        `(None, None, None)` when `user` is not present in `df_rfm`.
    """
    user_row = df_rfm[df_rfm["CustomerID"] == user]
    if user_row.empty:
        st.write(f"No user with id {user}")
        # Stop here: there is nothing to describe, and squeezing an empty
        # selection would yield empty Series instead of scalar values.
        return None, None, None

    cluster_columns = [column for column in user_row.columns if "_cluster" in column]

    output = [
        f"The user purchased **{user_row['Frequency'].squeeze()} times**.\n",
        f"She/he spent **{user_row['Revenue'].squeeze()} dollars** in total.\n",
        "The last time she/he bought something was"
        f" **{user_row['Recency'].squeeze()} days ago**.\n",
        "She/he belongs to the clusters: ",
    ]
    output.extend(
        f"- {cluster} = {user_row[cluster].squeeze()}" for cluster in cluster_columns
    )

    st.write("\n".join(output))

    return (
        user_row["Recency_cluster"].squeeze(),
        user_row["Frequency_cluster"].squeeze(),
        user_row["Revenue_cluster"].squeeze(),
    )


def explain_cluster(cluster_info):
    """Displays an expander explaining the meaning of the clusters.

    Args:
        cluster_info: maps a cluster column name (a key of
            `EXPLANATION_DICT`) to a `(mins, maxs)` pair holding the
            per-cluster minimum and maximum of the metric.
    """
    with st.expander("Show information about the clusters"):
        st.write(
            "**Note**: these values are valid for these dataset."
            " Different dataset will have different number of clusters"
            " and values"
        )
        for cluster, info in cluster_info.items():
            # Transform the (mins, maxs) tuple into
            # [min_1, max_1, min_2, max_2, ...] list.
            min_max_interleaved = list(itertools.chain(*zip(info[0], info[1])))
            st.write(EXPLANATION_DICT[cluster].format(*min_max_interleaved))


def categorize_user(recency_cluster, frequency_cluster, monetary_cluster):
    """Describe the user with few words based on the cluster he belongs to."""
    score = f"{recency_cluster}{frequency_cluster}{monetary_cluster}"

    # Exact triples win; otherwise fall back on the recency cluster alone
    # (the first character of the score), then on the generic profile.
    description = _EXACT_PROFILES.get(score)
    if description is None:
        description = _RECENCY_FALLBACK_PROFILES.get(score[:1], _DEFAULT_PROFILE)

    st.write(f"The customer can be described as: **{description}**")


def plot_rfm_distribution(
    df_rfm: pd.DataFrame, cluster_info: Dict[str, Tuple[List[int], List[int]]]
):
    """Plots 3 histograms for the RFM metrics.

    A dotted vertical line marks the upper bound of every cluster except
    the one holding the largest values of the metric.
    """
    for x, to_reverse in zip(("Revenue", "Frequency", "Recency"), (False, False, True)):
        fig = px.histogram(
            df_rfm,
            x=x,
            log_y=True,
            title=f"{x} metric",
        )
        # Get the max value in the cluster info. The cluster_info_dict is a
        # tuple with first element the min values of the cluster, and second
        # element the max values of the cluster.
        values = cluster_info[f"{x}_cluster"][1]  # get max values

        # Add vertical bar on each cluster end. But skip the last cluster.
        if to_reverse:
            # Descending metric: the first cluster holds the largest values,
            # so that is the one to skip.
            loop_range = range(len(values) - 1, 0, -1)
        else:
            loop_range = range(len(values) - 1)

        for n_cluster in loop_range:
            fig.add_vline(
                x=values[n_cluster],
                annotation_text=f"End of cluster {n_cluster+1}",
                line_dash="dot",
                annotation=dict(textangle=90, font_color="red"),
            )

        fig.update_layout(
            yaxis_title="Count (log scale)",
        )

        st.plotly_chart(fig)


def display_dataframe_heatmap(df_rfm: pd.DataFrame, cluster_info_dict):
    """Displays an heatmap of how many clients lay in the clusters.

    Rows are indexed by (Revenue, Frequency) cluster, columns by Recency
    cluster; each cell is the number of clients with that combination.
    """

    def style_with_limits(x, column, cluster_limit_dict):
        """Simple function to transform the cluster number into a cluster + range string."""
        # Clusters are numbered from 1, the limits are indexed from 0.
        min_v = cluster_limit_dict[column][0][x - 1]
        max_v = cluster_limit_dict[column][1][x - 1]
        return f"{x}: [{int(min_v)}, {int(max_v)}]"

    # Create a dataframe with the count of clients for each group
    # of cluster. Grouping already yields one row per combination.
    count = (
        df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
            "CustomerID"
        ]
        .count()
        .reset_index()
    )
    count = count.rename(columns={"CustomerID": "Count"})

    # Add limits to the cells. In this way, we can better display
    # the heatmap.
    for cluster in ["Revenue_cluster", "Frequency_cluster", "Recency_cluster"]:
        count[cluster] = count[cluster].apply(
            style_with_limits, args=(cluster, cluster_info_dict)
        )

    # Use the count column as values, then index with the clusters.
    count = count.pivot(
        index=["Revenue_cluster", "Frequency_cluster"],
        columns="Recency_cluster",
        values="Count",
    )

    # Style manipulation
    cell_hover = {
        "selector": "td",
        "props": "font-size:1.2em",
    }
    index_names = {
        "selector": ".index_name",
        "props": "font-style: italic; color: Black; font-weight:normal;font-size:1.2em;",
    }
    headers = {
        "selector": "th:not(.index_name)",
        "props": "background-color: White; color: black; font-size:1.2em",
    }

    # Finally, display
    # We cannot directly print the dataframe since the streamlit
    # function removes the multiindex. Thus, we extract the html representation
    # and then display it.
    st.markdown("## Heatmap: how the client are distributed between clusters")
    st.write(
        count.style.format(thousands=" ", precision=0, na_rep="0")
        .set_table_styles([cell_hover, index_names, headers])
        .background_gradient(cmap="coolwarm")
        .to_html(),
        unsafe_allow_html=True,
    )


def main():
    """Renders the whole client clustering page."""
    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

    df, _, _ = load_and_preprocess_data()
    df_rfm = cluster_clients(df)

    st.markdown(
        "# Dataset "
        "\nThis is the processed dataset with information about the clients, such as"
        " the RFM values and the clusters they belong to."
    )
    st.dataframe(df_rfm.style.format(formatter={"Revenue": "{:.2f}"}))

    # Maps each `<metric>_cluster` column to the (mins, maxs) of the metric
    # inside every cluster.
    cluster_info_dict = defaultdict(list)

    with st.expander("Show more details about the clusters"):
        for cluster in [column for column in df_rfm.columns if "_cluster" in column]:
            st.write(cluster)
            # "Revenue_cluster" -> "Revenue": the metric the cluster refers to.
            cluster_info = (
                df_rfm.groupby(cluster)[cluster.split("_")[0]]
                .describe()
                .reset_index(names="Cluster")
            )
            min_cluster = cluster_info["min"].astype(int)
            max_cluster = cluster_info["max"].astype(int)
            cluster_info_dict[cluster] = (min_cluster, max_cluster)
            st.dataframe(cluster_info)

    st.markdown("## RFM metric distribution")

    plot_rfm_distribution(df_rfm, cluster_info_dict)
    display_dataframe_heatmap(df_rfm, cluster_info_dict)

    st.markdown("## Interactive exploration")

    filter_by_cluster = st.checkbox(
        "Filter client: only one client per cluster type",
        value=True,
    )

    client_to_select = (
        df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
            "CustomerID"
        ]
        .first()
        .values
        if filter_by_cluster
        else df["CustomerID"].unique()
    )

    # Let the user select the user to investigate
    user = st.selectbox(
        "Select a customer to show more information about him.",
        client_to_select,
    )

    show_purchase_history(user, df)

    recency, frequency, revenue = show_user_info(user, df_rfm)

    # Ids taken from the raw dataset may have no RFM row; in that case
    # there is no cluster triple to categorize.
    if recency is not None:
        categorize_user(recency, frequency, revenue)

    explain_cluster(cluster_info_dict)


# NOTE(review): assumes Streamlit executes the script with
# `__name__ == "__main__"`, so the page still renders under `streamlit run`
# while a plain import of this module stays free of side effects -- confirm.
if __name__ == "__main__":
    main()