tave-st commited on
Commit
e618873
1 Parent(s): 86bb7fc

initial commit

Browse files
.gitattributes CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ *.csv filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
Data/OnlineRetail.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d07aec9960083af2339975a3f9d3b26313b342dcd9f86cce0b919b1cde639a44
3
+ size 45580638
README.md CHANGED
@@ -1,13 +1,54 @@
1
- ---
2
- title: Demo Confindustria
3
- emoji: 🐨
4
- colorFrom: purple
5
- colorTo: blue
6
- sdk: streamlit
7
- sdk_version: 1.10.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Demo Confindustria
2
+
3
+ Demo with recsys and clustering for the [online retail](https://www.kaggle.com/datasets/vijayuv/onlineretail?select=OnlineRetail.csv) dataset.
4
+
5
+ ## Objective
6
+
7
+ Recommender system:
8
+
9
+ 1. interactively select a user
10
+ 2. show all the recommendations for the user
11
+ 3. explain why we get these suggestions (which purchased object influences the most)
12
+ 4. plot the purchases and suggested articles
13
+
14
+ Clustering:
15
+
16
+ 1. compute the user clustering
17
+ 2. plot users and their clusters
18
+ 3. explain the meaning of the clusters (compute the mean metrics or literally explain them)
19
+
20
+ ## Setup
21
+
22
+ In your terminal run:
23
+
24
+ ```bash
25
+ # Enable the env
26
+ source .venv/bin/activate
27
+
28
+ # Install the dependencies
29
+
30
+ pip install -r requirements.txt
31
+
32
+ # Or install the frozen dependencies from requirements_freezed.txt
33
+
34
+ # You are ready to rock!
35
+ ```
36
+
37
+ ## Run
38
+
39
+ In your terminal run:
40
+
41
+ ```bash
42
+ streamlit run recommender_system.py
43
+
44
+ # Now the default browser will be opened with
45
+ # the Streamlit page. If you want to customize the
46
+ # execution of Streamlit, refer to its documentation.
47
+ ```
48
+
49
+ ## Resources
50
+
51
+ - [streamlit](https://streamlit.io/)
52
+ - [implicit](https://github.com/benfred/implicit), recsys library
53
+ - [t-sne guide](https://distill.pub/2016/misread-tsne/)
54
+ - [RFM segmentation](https://www.omniconvert.com/blog/rfm-score/)
pages/clustering.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import streamlit as st
3
+ from utils import load_and_preprocess_data
4
+ import pandas as pd
5
+ import numpy as np
6
+ import altair as alt
7
+ from sklearn.mixture import GaussianMixture
8
+ import plotly.express as px
9
+ import itertools
10
+ from typing import Dict, List
11
+
12
# Markdown shown in the sidebar of the clustering page.
# NOTE: the clustering is done with sklearn's GaussianMixture (see
# cluster_clients), so the text must not claim KMeans.
SIDEBAR_DESCRIPTION = """
# Client clustering

To cluster a client, we adopt the RFM metrics. They stand for:

- R = recency, that is the number of days since the last purchase
in the store
- F = frequency, that is the number of times a customer has ordered something
- M = monetary value, that is how much a customer has spent buying
from your business.

Given these 3 metrics, we can cluster the customers and find a suitable
"definition" based on the clusters they belong to. Since the dataset
we're using right now as about 5000 distinct customers, we identify
3 clusters for each metric.

## How we compute the clusters

We resort to a simple Gaussian mixture model. It fits one Gaussian component
per cluster and assigns each point to its most likely component. In particular, near points tend to be associated
with the same cluster, while further points should belong to different clusters.
""".lstrip()

# The templates below take six positional values: (min, max) for cluster 1,
# then cluster 2, then cluster 3. Cluster labels start at 1 because
# _order_cluster assigns labels 1..n_components.
FREQUENCY_CLUSTERS_EXPLAIN = """
The **frequency** denotes how frequently a customer has ordered.

There 3 available clusters for this metric:

- cluster 1: denotes a customer that purchases one or few times (range [{}, {}])
- cluster 2: these customer have a discrete amount of orders (range [{}, {}])
- cluster 3: these customer purchases lots of times (range [{}, {}])

-------
""".lstrip()

RECENCY_CLUSTERS_EXPLAIN = """
The **recency** refers to how recently a customer has bought;

There 3 available clusters for this metric:

- cluster 1: the last order of these client is long time ago (range [{}, {}])
- cluster 2: these are clients that purchases something not very recently (range [{}, {}])
- cluster 3: the last order of these client is a few days/weeks ago (range [{}, {}])

-------
""".lstrip()

MONETARY_CLUSTERS_EXPLAIN = """
The **revenue** refers to how much a customer has spent buying
from your business.

There 3 available clusters for this metric:

- cluster 1: these clients spent little money (range [{}, {}])
- cluster 2: these clients spent a considerable amount of money (range [{}, {}])
- cluster 3: these clients spent lots of money (range [{}, {}])

-------
""".lstrip()

# Maps each "<metric>_cluster" column name to its explanation template.
EXPLANATION_DICT = {
    "Frequency_cluster": FREQUENCY_CLUSTERS_EXPLAIN,
    "Recency_cluster": RECENCY_CLUSTERS_EXPLAIN,
    "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
}
77
+
78
+
79
def create_features(df: pd.DataFrame):
    """Build the per-customer RFM (recency, frequency, monetary) table.

    Reads the ``CustomerID``, ``InvoiceDate`` and ``Price`` columns of ``df``
    and returns a new frame with one row per customer and the columns
    ``CustomerID``, ``Frequency``, ``Revenue`` and ``Recency``.
    """
    per_customer = df.groupby("CustomerID")

    # Frequency: number of distinct invoice dates per customer.
    rfm = per_customer["InvoiceDate"].nunique().reset_index()
    rfm.columns = ["CustomerID", "Frequency"]

    # Monetary value: summed price per customer. The groupby output is in
    # the same customer order as above, so a positional assignment is enough.
    rfm["Revenue"] = per_customer["Price"].sum().values

    # Recency: days between each customer's last purchase and the most
    # recent "last purchase" across all customers.
    last_purchase = per_customer["InvoiceDate"].max().reset_index(drop=True)
    rfm["Recency"] = (last_purchase.max() - last_purchase).dt.days

    return rfm
98
+
99
+
100
@st.cache
def cluster_clients(df: pd.DataFrame):
    """Compute the RFM features and add one cluster column per metric.

    Each metric is clustered on its own with a 3-component Gaussian mixture;
    the raw labels are then renumbered by ``_order_cluster`` so that they
    follow the requested ordering of the component means.
    """
    df_rfm = create_features(df)

    # Metric -> ordering applied to the component means when relabelling.
    orderings = {
        "Revenue": "ascending",
        "Frequency": "ascending",
        "Recency": "descending",
    }

    for metric, direction in orderings.items():
        mixture = GaussianMixture(n_components=3, random_state=42)
        assignments = mixture.fit_predict(df_rfm[[metric]])
        df_rfm[f"{metric}_cluster"] = _order_cluster(mixture, assignments, direction)

    return df_rfm
114
+
115
+
116
def _order_cluster(cluster_model: GaussianMixture, clusters, order="ascending"):
    """Relabel ``clusters`` by the rank of each component's mean.

    The component with the smallest summed mean gets label 1 when ``order``
    is "ascending"; with "descending" the largest mean gets label 1.
    """
    sort_keys = cluster_model.means_.sum(axis=1)
    if order.lower() == "descending":
        sort_keys = sort_keys * -1

    by_rank = np.argsort(sort_keys)
    # Invert the permutation: new_label[old_label] = rank, starting from 1.
    new_label = np.zeros_like(by_rank)
    for rank, old_label in enumerate(by_rank, start=1):
        new_label[old_label] = rank
    return new_label[clusters]
128
+
129
+
130
def show_purhcase_history(user: int, df: pd.DataFrame):
    """Draw a line chart of the selected customer's expenses per invoice date."""
    is_user = df.CustomerID == user
    daily = df.loc[is_user, ["Price", "InvoiceDate"]]

    # Sum the prices per invoice date and expose them as "Expenses".
    daily = daily.groupby(daily.InvoiceDate).sum()
    daily.columns = ["Expenses"]
    daily = daily.reset_index()

    date_axis = alt.X("InvoiceDate", timeUnit="yearmonthdate", title="Date")
    chart = alt.Chart(daily).mark_line(point=True)
    chart = chart.encode(x=date_axis, y="Expenses")
    chart = chart.properties(title="User expenses")

    st.altair_chart(chart, use_container_width=True)
147
+
148
+
149
def show_user_info(user: int, df_rfm: pd.DataFrame):
    """Print a short summary of the user and return the clusters he belongs to.

    The summary lists how many times the user purchased, the total spent,
    the days since the last purchase and the value of every ``*_cluster``
    column.

    Returns:
        A ``(recency_cluster, frequency_cluster, revenue_cluster)`` tuple.
        If no row of ``df_rfm`` matches ``user``, a notice is written and
        ``(None, None, None)`` is returned.
    """

    user_row = df_rfm[df_rfm["CustomerID"] == user]
    if user_row.empty:
        st.write(f"No user with id {user}")
        # Stop here: formatting an empty selection would only render
        # empty-Series reprs in the page.
        return None, None, None

    cluster_columns = [column for column in user_row.columns if "_cluster" in column]

    output = [
        f"The user purchased **{user_row['Frequency'].squeeze()} times**.\n",
        f"She/he spent **{user_row['Revenue'].squeeze()} dollars** in total.\n",
        f"The last time she/he bought something was **{user_row['Recency'].squeeze()} days ago**.\n",
        "She/he belongs to the clusters: ",
    ]
    output.extend(
        f"- {cluster} = {user_row[cluster].squeeze()}" for cluster in cluster_columns
    )

    st.write("\n".join(output))

    return (
        user_row["Recency_cluster"].squeeze(),
        user_row["Frequency_cluster"].squeeze(),
        user_row["Revenue_cluster"].squeeze(),
    )
181
+
182
+
183
def explain_cluster(cluster_info):
    """Display an expander explaining the meaning of the clusters.

    ``cluster_info`` maps a cluster column name (a key of
    ``EXPLANATION_DICT``) to the values substituted into the matching
    explanation template.
    """

    with st.expander("Show information about the clusters"):
        # The literals are concatenated, so the separating space after the
        # first sentence has to be written explicitly.
        st.write(
            "**Note**: these values are valid for these dataset. "
            "Different dataset will have different number of clusters"
            " and values"
        )
        for cluster, info in cluster_info.items():
            st.write(EXPLANATION_DICT[cluster].format(*info))
194
+
195
+
196
def categorize_user(recency_cluster, frequency_cluster, monetary_cluster):
    """Write a short label for the user derived from his three clusters.

    The three cluster values are concatenated (recency, frequency, monetary)
    into a score such as "313", which is looked up first among the exact
    matches and then among the recency-prefix fallbacks.
    """

    score = f"{recency_cluster}{frequency_cluster}{monetary_cluster}"

    # Scores with a dedicated description.
    exact_labels = {
        "111": "Tourist",
        "133": "Former lover",
        "123": "Former passionate client",
        "113": "Spent a lot, but never come back",
        "313": "Potential lover",
        "312": "Interesting new client",
        "311": "New customer",
        "333": "Gold client",
        "322": "Lovers",
    }

    if score in exact_labels:
        description = exact_labels[score]
    elif score.startswith("2"):
        description = "Losing interest"
    elif score.startswith("1"):
        description = "About to dump"
    else:
        description = "Average client"

    st.write(f"The customer can be described as: **{description}**")
231
+
232
+
233
def plot_rfm_distribution(df_rfm: pd.DataFrame, cluster_info: Dict[str, List[int]]):
    """Plot one log-scaled histogram per RFM metric with cluster boundaries.

    ``cluster_info["<metric>_cluster"]`` is a flat list of min/max values per
    cluster; every second entry (the max values) is drawn as a dotted
    vertical line.
    """

    for metric in ("Revenue", "Frequency", "Recency"):
        histogram = px.histogram(df_rfm, x=metric, log_y=True, title=f"{metric} metric")

        bounds = cluster_info[f"{metric}_cluster"]
        # Odd positions hold the max value of each cluster.
        for cluster_number, upper_bound in enumerate(bounds[1::2], start=1):
            histogram.add_vline(
                x=upper_bound,
                annotation_text=f"End of cluster {cluster_number}",
                line_dash="dot",
                annotation=dict(textangle=90, font_color="red"),
            )

        st.plotly_chart(histogram, use_container_width=True)
249
+
250
+
251
def display_dataframe_heatmap(df_rfm: pd.DataFrame):
    """Display a heatmap of how many clients fall in each cluster combination.

    Rows are indexed by (Revenue_cluster, Frequency_cluster), columns by
    Recency_cluster. The table is rendered through the pandas Styler HTML.
    """

    cluster_keys = ["Recency_cluster", "Frequency_cluster", "Revenue_cluster"]

    # Number of clients per combination of clusters; the group keys are
    # unique by construction, so no de-duplication is required.
    clients_per_group = df_rfm.groupby(cluster_keys)["CustomerID"].count()
    table = clients_per_group.rename("Count").reset_index()

    # Counts become the cell values, clusters become the axes.
    table = table.pivot(
        index=["Revenue_cluster", "Frequency_cluster"],
        columns="Recency_cluster",
        values="Count",
    )

    # CSS rules handed to the Styler.
    table_styles = [
        {
            "selector": "td",
            "props": "font-size:1.5em",
        },
        {
            "selector": ".index_name",
            "props": "font-style: italic; color: Black; font-weight:normal;font-size:1.5em;",
        },
        {
            "selector": "th:not(.index_name)",
            "props": "background-color: White; color: black; font-size:1.5em",
        },
    ]

    # The streamlit dataframe widget drops the multi-index, so the table is
    # rendered to HTML and written as raw markup instead.
    st.markdown("## Heatmap: how the client are distributed between clusters")
    styled = table.style.format(thousands=" ", precision=0, na_rep="Missing")
    styled = styled.set_table_styles(table_styles)
    styled = styled.background_gradient(cmap="coolwarm")
    st.write(styled.to_html(), unsafe_allow_html=True)
308
+
309
+
310
def main():
    """Render the client-clustering page.

    Loads the data, computes the RFM clusters, shows the dataset, the
    per-cluster statistics, the metric histograms and the heatmap, then lets
    the reader pick a customer to inspect.
    """
    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

    df, _, _ = load_and_preprocess_data()
    df_rfm = cluster_clients(df)

    st.markdown(
        "# Dataset "
        "\nThis is the processed dataset with information about the clients, such as"
        " the RFM values and the clusters they belong to."
    )
    st.dataframe(df_rfm)

    # Cluster column name -> flat list [min_1, max_1, min_2, max_2, ...].
    cluster_info_dict = defaultdict(list)

    with st.expander("Show more details about the clusters"):
        for cluster in [column for column in df_rfm.columns if "_cluster" in column]:
            st.write(cluster)
            # Describe the metric (the column name before "_cluster")
            # separately for each cluster.
            cluster_info = (
                df_rfm.groupby(cluster)[cluster.split("_")[0]]
                .describe()
                .reset_index(names="Cluster")
            )
            min_cluster = cluster_info["min"].astype(int)
            max_cluster = cluster_info["max"].astype(int)
            # Interleave min and max so each cluster contributes a (min, max) pair.
            min_max_interlieved = list(itertools.chain(*zip(min_cluster, max_cluster)))
            cluster_info_dict[cluster].extend(min_max_interlieved)
            st.dataframe(cluster_info)

    st.markdown("## RFM metric distribution")

    plot_rfm_distribution(df_rfm, cluster_info_dict)

    display_dataframe_heatmap(df_rfm)

    st.markdown("## Interactive exploration")

    filter_by_cluster = st.checkbox(
        "Filter client: only one client per cluster type",
        value=True,
    )

    # Either the first customer of every cluster combination, or all customers.
    client_to_select = (
        df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])["CustomerID"].first().values
        if filter_by_cluster
        else df["CustomerID"].unique()
    )

    # Let the user select the user to investigate
    user = st.selectbox(
        "Select a customer to show more information about him.",
        client_to_select,
    )

    show_purhcase_history(user, df)

    recency, frequency, revenue = show_user_info(user, df_rfm)

    categorize_user(recency, frequency, revenue)

    explain_cluster(cluster_info_dict)
371
+
372
+
373
+ main()
recommender.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from implicit.als import AlternatingLeastSquares
2
+ from implicit.lmf import LogisticMatrixFactorization
3
+ from implicit.bpr import BayesianPersonalizedRanking
4
+ from implicit.nearest_neighbours import bm25_weight
5
+ from scipy.sparse import csr_matrix
6
+ from typing import Dict, Any
7
+
8
+ MODEL = {
9
+ "lmf": LogisticMatrixFactorization,
10
+ "als": AlternatingLeastSquares,
11
+ "bpr": BayesianPersonalizedRanking,
12
+ }
13
+
14
+
15
+ def _get_sparse_matrix(values, user_idx, product_idx):
16
+ return csr_matrix(
17
+ (values, (user_idx, product_idx)),
18
+ shape=(len(user_idx.unique()), len(product_idx.unique())),
19
+ )
20
+
21
+
22
def _get_model(name: str, **params):
    """Instantiate the model registered in ``MODEL`` under ``name``.

    Raises:
        ValueError: if ``name`` is not a key of ``MODEL``.
    """
    factory = MODEL.get(name)
    if factory is not None:
        return factory(**params)
    raise ValueError("No model with name {}".format(name))
27
+
28
+
29
class InternalStatusError(Exception):
    """Raised when an operation is requested in an invalid internal state."""
31
+
32
+
33
class Recommender:
    """Wrapper around an ``implicit`` model trained on a user x product matrix."""

    def __init__(
        self,
        values,
        user_idx,
        product_idx,
    ):
        """Store the indexes and build the sparse user x product matrix."""
        self.user_product_matrix = _get_sparse_matrix(values, user_idx, product_idx)
        self.user_idx = user_idx
        self.product_idx = product_idx

        # These attributes are set during the training phase.
        self.model = None
        self.fitted = False

    def create_and_fit(
        self,
        model_name: str,
        weight_strategy: str = "bm25",
        model_params: Dict[str, Any] = None,
    ):
        """Create the model named ``model_name`` and fit it on the weighted matrix.

        Args:
            model_name: key of the ``MODEL`` registry.
            weight_strategy: "bm25", "balanced" or "same" (case-insensitive).
            model_params: keyword arguments for the model constructor;
                ``None`` means no extra arguments.

        Returns:
            ``self``, to allow call chaining.

        Raises:
            ValueError: if the weight strategy (or the model name, through
                ``_get_model``) is not supported.
        """
        # A fresh dict per call instead of a shared mutable default.
        if model_params is None:
            model_params = {}

        weight_strategy = weight_strategy.lower()
        if weight_strategy == "bm25":
            data = bm25_weight(
                self.user_product_matrix,
                K1=1.2,
                B=0.75,
            )
        elif weight_strategy == "balanced":
            # Balance the positive and negative (nan) entries
            # http://stanford.edu/~rezab/nips2014workshop/submits/logmat.pdf
            total_size = (
                self.user_product_matrix.shape[0] * self.user_product_matrix.shape[1]
            )
            total_value = self.user_product_matrix.sum()
            num_zeros = total_size - self.user_product_matrix.count_nonzero()
            data = self.user_product_matrix.multiply(num_zeros / total_value)
        elif weight_strategy == "same":
            data = self.user_product_matrix
        else:
            raise ValueError("Weight strategy not supported")

        model = _get_model(model_name, **model_params)
        model.fit(data)

        # Publish the model only after a successful fit, so a failed
        # training never leaves the instance flagged as fitted.
        self.model = model
        self.fitted = True

        return self

    def recommend_products(
        self,
        user_id,
        items_to_recommend = 5,
    ):
        """Finds the recommended items for the user.

        Items the user already interacted with are filtered out.

        Returns:
            Whatever the underlying model's ``recommend`` returns; the caller
            in this project treats it as an (item indexes, scores) pair.

        Raises:
            InternalStatusError: if the model has not been fitted yet.
        """

        if not self.fitted:
            raise InternalStatusError(
                "Cannot recommend products without previously fitting the model."
                " Please, consider fitting the model before recommening products."
            )

        return self.model.recommend(
            user_id,
            self.user_product_matrix[user_id],
            filter_already_liked_items=True,
            N=items_to_recommend,
        )

    def explain_recommendation(
        self,
        user_id,
        suggested_item_id,
        recommended_items,
    ):
        """Return the second element of the model's ``explain`` output.

        The callers in this project read it as a sequence whose entries start
        with the index of a contributing item.
        """
        _, items_score_contrib, _ = self.model.explain(
            user_id,
            self.user_product_matrix,
            suggested_item_id,
            N=recommended_items,
        )

        return items_score_contrib

    def similar_users(self, user_id):
        """Return the model's ``similar_users`` result for ``user_id``."""
        return self.model.similar_users(user_id)

    @property
    def item_factors(self):
        """The ``item_factors`` attribute of the fitted model."""
        return self.model.item_factors
recommender_system.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import altair as alt
4
+ from recommender import Recommender
5
+ from sklearn.decomposition import PCA
6
+ from sklearn.manifold import TSNE
7
+ from os import cpu_count
8
+ import numpy as np
9
+ import time
10
+ import random
11
+
12
+ from utils import load_and_preprocess_data
13
+
14
+ import matplotlib.pyplot as plt
15
+ from typing import Union, List, Dict, Any, TYPE_CHECKING
16
+ import plotly.graph_objects as go
17
+
18
+
19
+ COLUMN_NOT_DISPLAY = [
20
+ "StockCode",
21
+ "UnitPrice",
22
+ "Country",
23
+ "CustomerIndex",
24
+ "ProductIndex",
25
+ ]
26
+
27
+
28
+ SIDEBAR_DESCRIPTION = """
29
+ # Recommender system
30
+
31
+ ## What is it?
32
+ A recommender system is a tool that suggests something new to a particular
33
+ user that she/he might be interest in. It becomes really useful when
34
+ the number of items that a user can choose from is high.
35
+
36
+ ## How does it work?
37
+ A recommender system internally finds similar users and similar items,
38
+ based on a suitable definition of "similarity".
39
+ For example, users that purchased the same items can be considered similar.
40
+ When we want to suggest new items to a user, a recommender system exploits
41
+ the items bought by similar users as a starting point for the suggestion.
42
+ The items bought by similar users are compared to the items that the user
43
+ already bought. If they are new and similar, the model suggests them.
44
+
45
+ ## How we prepare the data
46
+ For each user, we compute the quantity purchased for every single item.
47
+ This will be the metric the value considered by the modele to compute
48
+ the similarity. The item that a user has never bought will
49
+ be left at zero. These zeros will be the subject of the recommendation.
50
+ """.lstrip()
51
+
52
+
53
@st.cache(allow_output_mutation=True)
def create_and_fit_recommender(
    model_name: str,
    values: Union[pd.DataFrame, "np.ndarray"],
    users: Union[pd.DataFrame, "np.ndarray"],
    products: Union[pd.DataFrame, "np.ndarray"],
) -> Recommender:
    """Build a ``Recommender`` and fit the requested model (cached by streamlit)."""
    # Fine-tuned values
    tuned_params = {
        "factors": 190,
        "alpha": 0.6,
        "regularization": 0.06,
    }

    recommender = Recommender(values, users, products)
    recommender.create_and_fit(model_name, model_params=tuned_params)
    return recommender
76
+
77
+
78
def explain_recommendation(
    recommender: Recommender,
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
):
    """Show, for each suggested product, the products that explain it.

    The text is written inside an expander, followed by a horizontal rule.
    """
    catalogue = df[["Description", "ProductIndex"]]
    lines = []
    how_many = len(suggestions)

    for suggestion in suggestions:
        contributors = recommender.explain_recommendation(
            user_id, suggestion, how_many
        )
        contributor_ids = [entry[0] for entry in contributors]

        # Description of the suggested product.
        suggested_rows = catalogue.loc[catalogue.ProductIndex == suggestion]
        suggested_rows = suggested_rows.drop_duplicates(subset=["ProductIndex"])
        suggested_description = suggested_rows["Description"].unique()[0]

        # Descriptions of the products that contributed to the suggestion.
        contributor_rows = catalogue.loc[catalogue["ProductIndex"].isin(contributor_ids)]
        contributor_rows = contributor_rows.drop_duplicates(subset=["ProductIndex"])
        contributor_descriptions = contributor_rows["Description"].unique()

        lines.append(
            f"The item **{suggested_description.strip()}** "
            "has been suggested because it is similar to the following products"
            " bought by the user:"
        )
        lines.extend(f"- {text.strip()}" for text in contributor_descriptions)

    with st.expander("See why the model recommended these products"):
        st.write("\n".join(lines))

    st.write("------")
119
+
120
+
121
def print_suggestions(suggestions: List[int], df: pd.DataFrame):
    """Write the descriptions of the suggested products as a bullet list."""
    is_suggested = df["ProductIndex"].isin(suggestions)
    rows = df.loc[is_suggested][["Description", "ProductIndex"]]
    rows = rows.drop_duplicates(subset=["ProductIndex"])
    descriptions = rows["Description"].unique()

    bullets = [f"- {text.strip()}" for text in descriptions]
    st.write("\n".join(["The model suggests the following products:"] + bullets))
133
+
134
+
135
def display_user_char(user: int, data: pd.DataFrame):
    """Show how many distinct products the user bought and his purchase history."""
    purchases = data[data.CustomerIndex == user]
    distinct_products = purchases["Description"].nunique()

    st.write(
        "The user {} bought {} distinct products. Here is the purchase history: ".format(
            user, distinct_products
        )
    )

    # The customer column is hidden too, since the table is about a single
    # customer.
    hidden_columns = COLUMN_NOT_DISPLAY + ["CustomerID"]
    history = purchases.sort_values("InvoiceDate").drop(hidden_columns, axis=1)
    st.dataframe(history)

    st.write("-----")
155
+
156
+
157
+ def _extract_description(df, products):
158
+ desc = df[df["ProductIndex"].isin(products)].drop_duplicates(
159
+ "ProductIndex", ignore_index=True
160
+ )[["ProductIndex", "Description"]]
161
+ return desc.set_index("ProductIndex")
162
+
163
+
164
def display_recommendation_plots(
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
    model: Recommender,
):
    """Plots a t-SNE with the suggested items, together with the purchases of
    similar users.

    Three groups of item factors are projected together in 2D: the suggested
    items, the items that contribute to the suggestions, and the items bought
    by similar users.

    Returns:
        The plotly figure with one scatter trace per group.
    """
    # Get the purchased items that contribute the most to the suggestions
    contributions = []
    n_recommended = len(suggestions)
    for suggestion in suggestions:
        items_and_score = model.explain_recommendation(
            user_id, suggestion, n_recommended
        )
        contributions.append([t[0] for t in items_and_score])

    contributions = np.unique(np.concatenate(contributions))

    print("Contribution computed")
    print(contributions)
    print("=" * 80)

    # Find the purchases of similar users
    bought_by_similar_users = []

    sim_users, _ = model.similar_users(user_id)

    for u in sim_users:
        _, sim_purchases = model.user_product_matrix[u].nonzero()
        bought_by_similar_users.append(sim_purchases)

    bought_by_similar_users = np.unique(np.concatenate(bought_by_similar_users))

    print("Similar bought computed")
    print(bought_by_similar_users)
    print("=" * 80)

    # Compute the t-sne

    # Concatenate all the vectors to compute the decomposition a single time
    to_decompose = np.concatenate(
        (
            model.item_factors[suggestions],
            model.item_factors[contributions],
            model.item_factors[bought_by_similar_users],
        )
    )

    print(f"Shape to decompose: {to_decompose.shape}")

    with st.spinner("Computing plots (this might take around 60 seconds)..."):
        elapsed = time.time()
        decomposed = _tsne_decomposition(
            to_decompose,
            dict(
                perplexity=30,
                metric="euclidean",
                n_iter=1_000,
                random_state=42,
            ),
        )
        elapsed = time.time() - elapsed
        print(f"TSNE computed in {elapsed}")
        print("=" * 80)

    # Extract the decomposed vectors. Explicit offsets are used for every
    # group: a negative slice such as [-0:] would return the whole array
    # when the last group is empty.
    contribution_start = len(suggestions)
    others_start = contribution_start + len(contributions)

    suggestion_dec = decomposed[:contribution_start, :]
    contribution_dec = decomposed[contribution_start:others_start, :]
    items_others_dec = decomposed[others_start:, :]

    # Also, extract the description to create a nice hover in
    # the final plot.

    contribution_description = _extract_description(df, contributions)
    items_other_description = _extract_description(df, bought_by_similar_users)
    suggestion_description = _extract_description(df, suggestions)

    # Plot the scatterplot

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=contribution_dec[:, 0],
            y=contribution_dec[:, 1],
            mode="markers",
            opacity=0.8,
            name="Similar bought by user",
            marker_symbol="square-open",
            marker_color="darkviolet",
            marker_size=10,
            hovertext=contribution_description.loc[contributions].values.squeeze(),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=items_others_dec[:, 0],
            y=items_others_dec[:, 1],
            mode="markers",
            name="Product bought by similar users",
            opacity=0.7,
            marker_symbol="circle-open",
            marker_size=10,
            hovertext=items_other_description.loc[
                bought_by_similar_users
            ].values.squeeze(),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=suggestion_dec[:, 0],
            y=suggestion_dec[:, 1],
            mode="markers",
            name="Suggested",
            marker_color="red",
            marker_symbol="star",
            marker_size=10,
            hovertext=suggestion_description.loc[suggestions].values.squeeze(),
        )
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    fig.update_layout(plot_bgcolor="white")

    return fig
297
+
298
+
299
def _tsne_decomposition(data: np.ndarray, tsne_args: Dict[str, Any]):
    """Project ``data`` to 2 dimensions with t-SNE.

    When ``data`` has more than 50 columns it is first reduced with PCA.

    Args:
        data: 2D array, one row per sample.
        tsne_args: extra keyword arguments forwarded to ``TSNE``.

    Returns:
        The output of ``TSNE.fit_transform`` on the (possibly reduced) data.
    """
    max_pca_components = 50
    if data.shape[1] > max_pca_components:
        print("Performing PCA...")
        # PCA cannot extract more components than there are samples, so the
        # target dimension is capped by the number of rows.
        n_components = min(max_pca_components, data.shape[0])
        data = PCA(n_components=n_components).fit_transform(data)
    return TSNE(
        n_components=2,
        n_jobs=cpu_count(),
        **tsne_args,
    ).fit_transform(data)
308
+
309
+
310
def main():
    """Render the recommender-system page.

    Loads the data, fits the cached "als" recommender, shows the dataset and
    a form to pick a customer and the number of items to recommend. After
    submission it shows the purchase history, the suggestions, their
    explanation and the t-SNE plot.
    """
    # Load and process data
    data, users, products = load_and_preprocess_data()
    recommender = create_and_fit_recommender(
        "als",
        data["Quantity"],
        users,
        products,
    )

    st.markdown(
        """# Recommender system
The dataset used for these computations is the following:
"""
    )
    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

    # Show the data
    st.dataframe(
        data.drop(
            COLUMN_NOT_DISPLAY,
            axis=1,
        ),
        use_container_width=True,
    )

    st.markdown("## Interactive suggestion")
    with st.form("recommend"):
        # Let the user select the user to investigate
        user = st.selectbox(
            "Select a customer to get his recommendations",
            users.unique(),
        )

        items_to_recommend = st.slider("How many items to recommend?", 1, 10, 5)
        print(items_to_recommend)

        submitted = st.form_submit_button("Recommend!")
        if submitted:
            # show_purhcase_history(user, data)
            display_user_char(user, data)
            # The first element of the result is used below as the list of
            # suggested product indexes.
            suggestions_and_score = recommender.recommend_products(
                user, items_to_recommend
            )
            print_suggestions(suggestions_and_score[0], data)
            explain_recommendation(recommender, user, suggestions_and_score[0], data)

            st.markdown(
                "## How the purchases of similar users influnce the recommendation"
            )
            fig = display_recommendation_plots(
                user, suggestions_and_score[0], data, recommender
            )
            st.plotly_chart(fig)
364
+
365
+
366
+ main()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ scikit-learn
3
+ streamlit
4
+ implicit
5
+ scipy
6
+ tqdm
7
+ numpy
8
+ matplotlib
9
+ seaborn
10
+ mlxtend
11
+ plotly==5.9.0
requirements_freezed.txt ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==4.2.0
2
+ attrs==22.1.0
3
+ black==22.10.0
4
+ blinker==1.5
5
+ cachetools==5.2.0
6
+ certifi==2022.9.24
7
+ charset-normalizer==2.1.1
8
+ click==8.1.3
9
+ commonmark==0.9.1
10
+ contourpy==1.0.5
11
+ cycler==0.11.0
12
+ decorator==5.1.1
13
+ entrypoints==0.4
14
+ fonttools==4.37.4
15
+ gitdb==4.0.9
16
+ GitPython==3.1.29
17
+ idna==3.4
18
+ implicit==0.6.1
19
+ importlib-metadata==5.0.0
20
+ Jinja2==3.1.2
21
+ joblib==1.2.0
22
+ jsonschema==4.16.0
23
+ kiwisolver==1.4.4
24
+ MarkupSafe==2.1.1
25
+ matplotlib==3.6.0
26
+ mlxtend==0.21.0
27
+ mypy-extensions==0.4.3
28
+ numpy==1.23.4
29
+ packaging==21.3
30
+ pandas==1.5.0
31
+ pathspec==0.10.1
32
+ Pillow==9.2.0
33
+ platformdirs==2.5.2
34
+ plotly==5.9.0
35
+ protobuf==3.20.3
36
+ pyarrow==9.0.0
37
+ pydeck==0.8.0b4
38
+ Pygments==2.13.0
39
+ Pympler==1.0.1
40
+ pyparsing==3.0.9
41
+ pyrsistent==0.18.1
42
+ python-dateutil==2.8.2
43
+ pytz==2022.5
44
+ pytz-deprecation-shim==0.1.0.post0
45
+ requests==2.28.1
46
+ rich==12.6.0
47
+ scikit-learn==1.1.2
48
+ scipy==1.9.2
49
+ seaborn==0.12.1
50
+ semver==2.13.0
51
+ six==1.16.0
52
+ sklearn==0.0
53
+ smmap==5.0.0
54
+ streamlit==1.13.0
55
+ tenacity==8.1.0
56
+ threadpoolctl==3.1.0
57
+ toml==0.10.2
58
+ tomli==2.0.1
59
+ toolz==0.12.0
60
+ tornado==6.2
61
+ tqdm==4.64.1
62
+ typing_extensions==4.4.0
63
+ tzdata==2022.5
64
+ tzlocal==4.2
65
+ urllib3==1.26.12
66
+ validators==0.20.0
67
+ watchdog==2.1.9
68
+ zipp==3.9.0
utils.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+
5
@st.cache
def load_and_preprocess_data():
    """Load the online-retail CSV and prepare it for the demo pages.

    Rows with missing values or non-positive quantities are dropped, the
    invoice date is truncated to the day, and the ``Price``,
    ``CustomerIndex`` and ``ProductIndex`` columns are added.

    Returns:
        A ``(df, user_idx, product_idx)`` tuple, where the two indexes are
        the categorical codes of ``CustomerID`` and ``StockCode``.
    """
    df = pd.read_csv(
        "Data/OnlineRetail.csv",
        encoding="latin-1",
    )

    # Remove nans values
    df = df.dropna()

    # Use only positive quantities. This is not a robust approach,
    # but to keep things simple it is good enough.
    # The explicit copy makes the column assignments below act on an
    # independent frame instead of a slice (no SettingWithCopyWarning).
    df = df[df["Quantity"] > 0].copy()

    # Parse the date column
    df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"]).dt.floor("d")

    # Change customer id to int
    df["CustomerID"] = df["CustomerID"].astype(int)

    # Add price column
    df["Price"] = df["Quantity"] * df["UnitPrice"]

    # Get unique entries in the dataset of users and products
    users = df["CustomerID"].unique()
    products = df["StockCode"].unique()

    # Create a categorical type for users and product. The categories are
    # sorted to ensure reproducibility.
    user_cat = pd.CategoricalDtype(categories=sorted(users), ordered=True)
    product_cat = pd.CategoricalDtype(categories=sorted(products), ordered=True)

    # Transform and get the indexes of the columns
    user_idx = df["CustomerID"].astype(user_cat).cat.codes
    product_idx = df["StockCode"].astype(product_cat).cat.codes

    # Add the categorical index to the starting dataframe
    df["CustomerIndex"] = user_idx
    df["ProductIndex"] = product_idx

    return df, user_idx, product_idx
+ return df, user_idx, product_idx