Spaces:

juanmartip95
/

recomenderlacocreadora

Sleeping

App Files Files Community

juanmartip95 committed on Jan 10

Commit

bb08294

•

1 Parent(s): 3861db2

Update pages_clustering.py

Browse files

Files changed (1) hide show

pages_clustering.py +75 -52

pages_clustering.py CHANGED Viewed

@@ -8,6 +8,12 @@ from sklearn.mixture import GaussianMixture
 import plotly.express as px
 import itertools
 from typing import Dict, List, Tuple
 SIDEBAR_DESCRIPTION = """
@@ -76,26 +82,40 @@ EXPLANATION_DICT = {
     "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
 }
 def create_features(df: pd.DataFrame):
-    """Creates a new dataframe with the RFM features for each client."""
-    # Compute frequency, the number of distinct time a user purchased.
-    client_features = df.groupby("CustomerID")["InvoiceDate"].nunique().reset_index()
-    client_features.columns = ["CustomerID", "Frequency"]
-    # Add monetary value, the total revenue for  each single user.
-    client_takings = df.groupby("CustomerID")["Price"].sum()
-    client_features["Revenue"] = client_takings.values
-    # Add recency, i.e. the days since the last purchase in the store.
-    max_date = df.groupby("CustomerID")["InvoiceDate"].max().reset_index()
-    max_date.columns = ["CustomerID", "LastPurchaseDate"]
-    client_features["Recency"] = (
-        max_date["LastPurchaseDate"].max() - max_date["LastPurchaseDate"]
-    ).dt.days
-    return client_features
 @st.cache
@@ -105,7 +125,7 @@ def cluster_clients(df: pd.DataFrame):
     df_rfm = create_features(df)
     for to_cluster, order in zip(
-        ["Revenue", "Frequency", "Recency"], ["ascending", "ascending", "descending"]
     ):
         kmeans = GaussianMixture(n_components=3, random_state=42)
         labels = kmeans.fit_predict(df_rfm[[to_cluster]])
@@ -128,59 +148,62 @@ def _order_cluster(cluster_model: GaussianMixture, clusters, order="ascending"):
     return lookup_table[clusters]
-def show_purhcase_history(user: int, df: pd.DataFrame):
-    user_purchases = df.loc[df.CustomerID == user, ["Price", "InvoiceDate"]]
-    expenses = user_purchases.groupby(user_purchases.InvoiceDate).sum()
-    expenses.columns = ["Expenses"]
-    expenses = expenses.reset_index()
-    c = (
-        alt.Chart(expenses)
-        .mark_line(point=True)
-        .encode(
-            x=alt.X("InvoiceDate", timeUnit="yearmonthdate", title="Date"),
-            y="Expenses",
-        )
-        .properties(title="User expenses")
     )
-    st.altair_chart(c)
-def show_user_info(user: int, df_rfm: pd.DataFrame):
     """Prints some information about the user.
-    The main information are the total expenses, how
-    many times he purchases in the store, and the clusters
-    he belongs to.
     """
-    user_row = df_rfm[df_rfm["CustomerID"] == user]
     if len(user_row) == 0:
         st.write(f"No user with id {user}")
     output = []
-    output.append(f"The user purchased **{user_row['Frequency'].squeeze()} times**.\n")
-    output.append(
-        f"She/he spent **{user_row['Revenue'].squeeze()} dollars** in total.\n"
-    )
-    output.append(
-        f"The last time she/he bought something was **{user_row['Recency'].squeeze()} days ago**.\n"
-    )
-    output.append(f"She/he belongs to the clusters: ")
     for cluster in [column for column in user_row.columns if "_cluster" in column]:
         output.append(f"- {cluster} = {user_row[cluster].squeeze()}")
     st.write("\n".join(output))
     return (
-        user_row["Recency_cluster"].squeeze(),
-        user_row["Frequency_cluster"].squeeze(),
-        user_row["Revenue_cluster"].squeeze(),
     )
 def explain_cluster(cluster_info):
     """Displays a popup menu explinging the meanining of the clusters."""
@@ -292,12 +315,12 @@ def display_dataframe_heatmap(df_rfm: pd.DataFrame, cluster_info_dict):
     count = (
         df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
-            "CustomerID"
         ]
         .count()
         .reset_index()
     )
-    count = count.rename(columns={"CustomerID": "Count"})
     # Remove duplicates
     count = count.drop_duplicates(
@@ -389,12 +412,12 @@ def main():
     client_to_select = (
         df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
-            "CustomerID"
         ]
         .first()
         .values
         if filter_by_cluster
-        else df["CustomerID"].unique()
     )
     # Let the user select the user to investigate

 import plotly.express as px
 import itertools
 from typing import Dict, List, Tuple
+from sklearn.preprocessing import LabelEncoder
+# Create an instance of LabelEncoder
+label_encoder = LabelEncoder()
 SIDEBAR_DESCRIPTION = """
     "Revenue_cluster": MONETARY_CLUSTERS_EXPLAIN,
 }
+# Fit and transform the 'Location' column
+merged_df['Location_Encoded'] = label_encoder.fit_transform(merged_df['Location'])
+# Assuming 'Age' contains categorical values (e.g., 'young', 'middle-aged', 'old')
+merged_df['Age_Encoded'] = label_encoder.fit_transform(merged_df['Age'])
 def create_features(df: pd.DataFrame):
+    """Creates a new dataframe with the RFM features for each client based on Location and Age."""
+    # Compute frequency, the number of distinct books a user has read.
+    client_features = df.groupby("User-ID")["ISBN"].nunique().reset_index()
+    client_features.columns = ["User-ID", "Frequency"]
+    # For this example, let's assume the 'Price' column represents monetary value.
+    # Add monetary value, the total revenue for each single user (total books read by the user).
+    client_takings = df.groupby("User-ID").size()
+    client_features["Total_Books_Read"] = client_takings.values
+    # Add recency, let's use the count of unique 'ISBN' as a proxy for recency.
+    # You can adjust this based on your specific context.
+    client_recency = df.groupby("User-ID")["ISBN"].nunique().reset_index()
+    client_recency.columns = ["User-ID", "Recency"]
+    client_features["Recency"] = client_recency["Recency"]
+    # Incorporating location and age for clustering purposes
+    # You might consider encoding location or age if they're categorical
+    # For simplicity, assuming both 'Location' and 'Age' are categorical here
+    client_location_age = df.drop_duplicates(subset=["User-ID", "Location_Encoded", "Age_Encoded"])
+    client_features = client_features.merge(
+        client_location_age[["User-ID", "Location", "Age"]],
+        on="User-ID",
+        how="left",
+    )
+    return client_features[["User-ID", "Frequency", "Total_Books_Read", "Recency", "Location", "Age"]]
 @st.cache
     df_rfm = create_features(df)
     for to_cluster, order in zip(
+        ["Total_Books_Read", "Frequency", "Recency"], ["ascending", "ascending", "descending"]
     ):
         kmeans = GaussianMixture(n_components=3, random_state=42)
         labels = kmeans.fit_predict(df_rfm[[to_cluster]])
     return lookup_table[clusters]
+def show_rating_history(user: int, df: pd.DataFrame):
+    user_ratings = df.loc[df["User-ID"] == user, ["Book-Title", "Book-Rating"]]
+    # Count of books rated by the user
+    rated_books_count = user_ratings.shape[0]
+    st.write(
+        f"The user {user} has rated {rated_books_count} books. Here is the rating history:"
     )
+    st.dataframe(user_ratings)
+    # Total number of books read by the user
+    total_books_read = rated_books_count
+    st.write(f"Total number of books read by user {user}: {total_books_read}")
+def show_user_info(user: int, df_rfm: pd.DataFrame, df_books_read: pd.DataFrame):
     """Prints some information about the user.
+    The main information includes age, location,
+    total_books_read, and the clusters they belong to.
     """
+    user_row = df_rfm[df_rfm["User-ID"] == user]
     if len(user_row) == 0:
         st.write(f"No user with id {user}")
     output = []
+    # Fetch user's information from df_rfm
+    user_info = user_row.iloc[0]
+    output.append(f"Age: {user_info['Age']}")
+    output.append(f"Location: {user_info['Location']}")
+    # Calculate total_books_read from df_books_read
+    total_books_read = df_books_read[df_books_read["User-ID"] == user].shape[0]
+    output.append(f"Total books read: {total_books_read}")
+    # Display cluster memberships
+    output.append("Cluster memberships:")
     for cluster in [column for column in user_row.columns if "_cluster" in column]:
         output.append(f"- {cluster} = {user_row[cluster].squeeze()}")
     st.write("\n".join(output))
     return (
+        user_info["Recency_cluster"],
+        user_info["Frequency_cluster"],
+        user_info["Revenue_cluster"],
     )
 def explain_cluster(cluster_info):
     """Displays a popup menu explinging the meanining of the clusters."""
     count = (
         df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
+            "User-ID"
         ]
         .count()
         .reset_index()
     )
+    count = count.rename(columns={"User-ID": "Count"})
     # Remove duplicates
     count = count.drop_duplicates(
     client_to_select = (
         df_rfm.groupby(["Recency_cluster", "Frequency_cluster", "Revenue_cluster"])[
+            "User-ID"
         ]
         .first()
         .values
         if filter_by_cluster
+        else df["User-ID"].unique()
     )
     # Let the user select the user to investigate