juanmartip95 committed on
Commit
bb08294
1 Parent(s): 3861db2

Update pages_clustering.py

Browse files
Files changed (1) hide show
  1. pages_clustering.py +75 -52
pages_clustering.py CHANGED
@@ -8,6 +8,12 @@ from sklearn.mixture import GaussianMixture
8
  import plotly.express as px
9
  import itertools
10
  from typing import Dict, List, Tuple
 
 
 
 
 
 
11
 
12
 
13
  SIDEBAR_DESCRIPTION = """
@@ -76,26 +82,40 @@ EXPLANATION_DICT = {
76
  "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
77
  }
78
 
 
 
 
 
79
 
80
  def create_features(df: pd.DataFrame):
81
- """Creates a new dataframe with the RFM features for each client."""
82
- # Compute frequency, the number of distinct time a user purchased.
83
- client_features = df.groupby("CustomerID")["InvoiceDate"].nunique().reset_index()
84
- client_features.columns = ["CustomerID", "Frequency"]
85
-
86
- # Add monetary value, the total revenue for each single user.
87
- client_takings = df.groupby("CustomerID")["Price"].sum()
88
- client_features["Revenue"] = client_takings.values
89
-
90
- # Add recency, i.e. the days since the last purchase in the store.
91
- max_date = df.groupby("CustomerID")["InvoiceDate"].max().reset_index()
92
- max_date.columns = ["CustomerID", "LastPurchaseDate"]
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
- client_features["Recency"] = (
95
- max_date["LastPurchaseDate"].max() - max_date["LastPurchaseDate"]
96
- ).dt.days
97
 
98
- return client_features
99
 
100
 
101
  @st.cache
@@ -105,7 +125,7 @@ def cluster_clients(df: pd.DataFrame):
105
  df_rfm = create_features(df)
106
 
107
  for to_cluster, order in zip(
108
- ["Revenue", "Frequency", "Recency"], ["ascending", "ascending", "descending"]
109
  ):
110
  kmeans = GaussianMixture(n_components=3, random_state=42)
111
  labels = kmeans.fit_predict(df_rfm[[to_cluster]])
@@ -128,59 +148,62 @@ def _order_cluster(cluster_model: GaussianMixture, clusters, order="ascending"):
128
  return lookup_table[clusters]
129
 
130
 
131
- def show_purhcase_history(user: int, df: pd.DataFrame):
132
- user_purchases = df.loc[df.CustomerID == user, ["Price", "InvoiceDate"]]
133
- expenses = user_purchases.groupby(user_purchases.InvoiceDate).sum()
134
- expenses.columns = ["Expenses"]
135
- expenses = expenses.reset_index()
136
-
137
- c = (
138
- alt.Chart(expenses)
139
- .mark_line(point=True)
140
- .encode(
141
- x=alt.X("InvoiceDate", timeUnit="yearmonthdate", title="Date"),
142
- y="Expenses",
143
- )
144
- .properties(title="User expenses")
145
  )
 
 
 
 
 
 
 
146
 
147
- st.altair_chart(c)
148
 
149
 
150
- def show_user_info(user: int, df_rfm: pd.DataFrame):
151
  """Prints some information about the user.
152
 
153
- The main information are the total expenses, how
154
- many times he purchases in the store, and the clusters
155
- he belongs to.
156
  """
157
 
158
- user_row = df_rfm[df_rfm["CustomerID"] == user]
159
  if len(user_row) == 0:
160
  st.write(f"No user with id {user}")
161
 
162
  output = []
163
 
164
- output.append(f"The user purchased **{user_row['Frequency'].squeeze()} times**.\n")
165
- output.append(
166
- f"She/he spent **{user_row['Revenue'].squeeze()} dollars** in total.\n"
167
- )
168
- output.append(
169
- f"The last time she/he bought something was **{user_row['Recency'].squeeze()} days ago**.\n"
170
- )
171
- output.append(f"She/he belongs to the clusters: ")
 
 
 
172
  for cluster in [column for column in user_row.columns if "_cluster" in column]:
173
  output.append(f"- {cluster} = {user_row[cluster].squeeze()}")
174
 
175
  st.write("\n".join(output))
176
 
177
  return (
178
- user_row["Recency_cluster"].squeeze(),
179
- user_row["Frequency_cluster"].squeeze(),
180
- user_row["Revenue_cluster"].squeeze(),
181
  )
182
 
183
 
 
184
  def explain_cluster(cluster_info):
185
  """Displays a popup menu explaining the meaning of the clusters."""
186
 
@@ -292,12 +315,12 @@ def display_dataframe_heatmap(df_rfm: pd.DataFrame, cluster_info_dict):
292
 
293
  count = (
294
  df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
295
- "CustomerID"
296
  ]
297
  .count()
298
  .reset_index()
299
  )
300
- count = count.rename(columns={"CustomerID": "Count"})
301
 
302
  # Remove duplicates
303
  count = count.drop_duplicates(
@@ -389,12 +412,12 @@ def main():
389
 
390
  client_to_select = (
391
  df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
392
- "CustomerID"
393
  ]
394
  .first()
395
  .values
396
  if filter_by_cluster
397
- else df["CustomerID"].unique()
398
  )
399
 
400
  # Let the user select the user to investigate
 
8
  import plotly.express as px
9
  import itertools
10
  from typing import Dict, List, Tuple
11
+ from sklearn.preprocessing import LabelEncoder
12
+
13
+ # Create an instance of LabelEncoder
14
+ label_encoder = LabelEncoder()
15
+
16
+
17
 
18
 
19
  SIDEBAR_DESCRIPTION = """
 
82
  "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
83
  }
84
 
85
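+ # Fit the encoder on the 'Location' column and store the integer codes.
+ # NOTE(review): `merged_df` is not defined in any visible part of this file; confirm it exists before this module-level code runs.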
86
+ merged_df['Location_Encoded'] = label_encoder.fit_transform(merged_df['Location'])
87
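+ # Assuming 'Age' contains categorical values (e.g., 'young', 'middle-aged', 'old').
+ # NOTE(review): this refits the shared `label_encoder`, so the 'Location' classes learned above are replaced; use one encoder per column if inverse_transform is needed.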
88
+ merged_df['Age_Encoded'] = label_encoder.fit_transform(merged_df['Age'])
89
 
def create_features(df: pd.DataFrame):
    """Build one row of clustering features per user.

    Args:
        df: Interaction records with at least the columns ``User-ID``,
            ``ISBN``, ``Location`` and ``Age``.

    Returns:
        A dataframe with exactly one row per ``User-ID`` and the columns
        ``User-ID``, ``Frequency``, ``Total_Books_Read``, ``Recency``,
        ``Location`` and ``Age``:

        * ``Frequency`` - number of distinct ISBNs recorded for the user.
        * ``Total_Books_Read`` - number of rows recorded for the user.
        * ``Recency`` - currently the same distinct-ISBN count as
          ``Frequency``; it is a placeholder proxy, not a date-based value.
        * ``Location`` / ``Age`` - taken from the user's first row in ``df``.
    """
    per_user_isbn = df.groupby("User-ID")["ISBN"]

    # Both aggregates share the "User-ID" index, so building the frame from
    # them aligns by user rather than by row position.
    client_features = pd.DataFrame(
        {
            "Frequency": per_user_isbn.nunique(),
            "Total_Books_Read": per_user_isbn.size(),
        }
    ).reset_index()

    # Recency is defined as the distinct-ISBN count as well; reuse the column
    # instead of repeating the identical groupby.
    client_features["Recency"] = client_features["Frequency"]

    # Keep a single demographic record per user. De-duplicating on "User-ID"
    # alone guarantees the merge cannot multiply rows for users that appear
    # with more than one Location/Age value, and avoids requiring the
    # "*_Encoded" columns, which are not part of the output.
    demographics = df[["User-ID", "Location", "Age"]].drop_duplicates(
        subset="User-ID"
    )
    client_features = client_features.merge(demographics, on="User-ID", how="left")

    return client_features[
        ["User-ID", "Frequency", "Total_Books_Read", "Recency", "Location", "Age"]
    ]
 
 
118
 
 
119
 
120
 
121
  @st.cache
 
125
  df_rfm = create_features(df)
126
 
127
  for to_cluster, order in zip(
128
+ ["Total_Books_Read", "Frequency", "Recency"], ["ascending", "ascending", "descending"]
129
  ):
130
  kmeans = GaussianMixture(n_components=3, random_state=42)
131
  labels = kmeans.fit_predict(df_rfm[[to_cluster]])
 
148
  return lookup_table[clusters]
149
 
150
 
151
def show_rating_history(user: int, df: pd.DataFrame):
    """Show the rating history of ``user`` on the Streamlit page.

    Selects the ``Book-Title`` and ``Book-Rating`` columns of the rows in
    ``df`` whose ``User-ID`` equals ``user``, then writes a summary sentence,
    the selected rows, and a closing total. Returns nothing.
    """
    belongs_to_user = df["User-ID"] == user
    history = df.loc[belongs_to_user, ["Book-Title", "Book-Rating"]]

    # The same row count is reported both as "rated" and as "read".
    n_rated = len(history)

    st.write(
        f"The user {user} has rated {n_rated} books. Here is the rating history:"
    )
    st.dataframe(history)
    st.write(f"Total number of books read by user {user}: {n_rated}")
167
 
 
168
 
169
 
170
def show_user_info(user: int, df_rfm: pd.DataFrame, df_books_read: pd.DataFrame):
    """Print a short profile of ``user`` and return their cluster labels.

    Args:
        user: Value matched against the ``User-ID`` column of both frames.
        df_rfm: Per-user feature frame; read for ``Age``, ``Location`` and
            every column whose name contains ``_cluster``.
        df_books_read: Frame whose rows with a matching ``User-ID`` are
            counted and reported as the number of books read.

    Returns:
        The tuple ``(Recency_cluster, Frequency_cluster, Revenue_cluster)``
        taken from the user's first row in ``df_rfm``, or
        ``(None, None, None)`` when the user is not present.
    """
    user_row = df_rfm[df_rfm["User-ID"] == user]
    if user_row.empty:
        st.write(f"No user with id {user}")
        # Stop here: taking the first row of an empty frame would raise
        # IndexError instead of showing the message above.
        return None, None, None

    user_info = user_row.iloc[0]
    total_books_read = int((df_books_read["User-ID"] == user).sum())

    summary = [
        f"Age: {user_info['Age']}",
        f"Location: {user_info['Location']}",
        f"Total books read: {total_books_read}",
        "Cluster memberships:",
    ]
    memberships = [
        f"- {column} = {user_info[column]}"
        for column in user_row.columns
        if "_cluster" in column
    ]

    # Markdown folds single newlines into one paragraph, so the summary lines
    # are separated by blank lines; the bullet list only needs one newline.
    st.write("\n\n".join(summary) + "\n\n" + "\n".join(memberships))

    # NOTE(review): the visible clustering loop iterates over
    # "Total_Books_Read" rather than "Revenue"; confirm that a
    # "Revenue_cluster" column is still produced for df_rfm.
    return (
        user_info["Recency_cluster"],
        user_info["Frequency_cluster"],
        user_info["Revenue_cluster"],
    )
204
 
205
 
206
+
207
  def explain_cluster(cluster_info):
208
  """Displays a popup menu explaining the meaning of the clusters."""
209
 
 
315
 
316
  count = (
317
  df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
318
+ "User-ID"
319
  ]
320
  .count()
321
  .reset_index()
322
  )
323
+ count = count.rename(columns={"User-ID": "Count"})
324
 
325
  # Remove duplicates
326
  count = count.drop_duplicates(
 
412
 
413
  client_to_select = (
414
  df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
415
+ "User-ID"
416
  ]
417
  .first()
418
  .values
419
  if filter_by_cluster
420
+ else df["User-ID"].unique()
421
  )
422
 
423
  # Let the user select the user to investigate