Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import altair as alt | |
from recommender import Recommender | |
from sklearn.decomposition import PCA | |
from sklearn.manifold import TSNE | |
from os import cpu_count | |
import numpy as np | |
import time | |
from utils import load_and_preprocess_data | |
import matplotlib.pyplot as plt | |
from typing import Union, List, Dict, Any | |
import plotly.graph_objects as go | |
# Columns removed with ``DataFrame.drop(..., axis=1)`` before a table is
# rendered in the app. Kept as a list because it is concatenated with another
# list where the per-user history is displayed.
COLUMN_NOT_DISPLAY = [
    "ISBN",
    "Location",
    "Age",
    "CustomerIndex",
    "ProductIndex",
]
# Markdown text passed to ``st.sidebar.markdown`` in ``main``. The leading
# newline that follows the opening quotes is removed by ``lstrip()``.
SIDEBAR_DESCRIPTION = """
# Recommender system
## What is it?
A recommender system is a tool that suggests something new to a particular
user that she/he might be interested in. It becomes useful when
the number of items a user can choose from is high.
## How does it work?
A recommender system internally finds similar users and similar items,
based on a suitable definition of "similarity".
For example, users that purchased the same items can be considered similar.
When we want to suggest new items to a user, a recommender system exploits
the items bought by similar users as a starting point for the suggestion.
The items bought by similar users are compared to the items that the user
already bought. If they are new and similar, the model suggests them.
## How we prepare the data
For each user, we compute the quantity purchased for every single item.
This will be the metric the value considered by the model to compute
the similarity. The item that a user has never bought will
be left at zero. These zeros will be the subject of the recommendation.
""".lstrip()
def create_and_fit_recommender(
    model_name: str,
    values: Union[pd.DataFrame, "np.ndarray"],
    users: Union[pd.DataFrame, "np.ndarray"],
    products: Union[pd.DataFrame, "np.ndarray"],
) -> Recommender:
    """Instantiate a ``Recommender`` and fit the requested model on it.

    Args:
        model_name: Name of the model, forwarded to
            ``Recommender.create_and_fit``.
        values: Interaction values handed to the ``Recommender`` constructor.
        users: Users handed to the ``Recommender`` constructor.
        products: Products handed to the ``Recommender`` constructor.

    Returns:
        The ``Recommender`` instance after ``create_and_fit`` has been called.
    """
    # Fine-tuned values
    tuned_params = {
        "factors": 190,
        "alpha": 0.6,
        "regularization": 0.06,
        "random_state": 42,
    }

    fitted = Recommender(values, users, products)
    fitted.create_and_fit(model_name, model_params=tuned_params)
    return fitted
def explain_recommendation(
    recommender: Recommender,
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
):
    """Show, inside an expander, which rated items led to each suggestion.

    For every suggestion, ``recommender.explain_recommendation`` is queried
    and the first element of each returned entry is treated as the
    ``ProductIndex`` of a contributing item. Titles of the suggestion and of
    its contributors are looked up in ``df`` and written as markdown.

    Args:
        recommender: Object exposing ``explain_recommendation(user, item, n)``.
        user_id: User the suggestions were computed for.
        suggestions: Suggested items, matched against ``df["ProductIndex"]``
            (the same column every other lookup in this file uses).
        df: Table with at least the ``ProductIndex`` and ``Book-Title``
            columns.
    """
    # One row per product; computed once instead of once per suggestion.
    titles = df[["Book-Title", "ProductIndex"]].drop_duplicates(
        subset=["ProductIndex"]
    )

    output = []
    n_recommended = len(suggestions)
    for suggestion in suggestions:
        explained = recommender.explain_recommendation(
            user_id, suggestion, n_recommended
        )
        contributing_ids = [entry[0] for entry in explained]

        # NOTE(review): the lookup used to compare the ISBN column with the
        # suggestion, although suggestions are used as ProductIndex values
        # everywhere else in this file - confirm against the Recommender API.
        matching_titles = titles.loc[
            titles["ProductIndex"] == suggestion, "Book-Title"
        ].unique()
        # Fall back to the raw index instead of raising IndexError when the
        # suggested product is not present in ``df``.
        if len(matching_titles) > 0:
            suggested_description = matching_titles[0].strip()
        else:
            suggested_description = str(suggestion)

        similar_items_description = titles.loc[
            titles["ProductIndex"].isin(contributing_ids), "Book-Title"
        ].unique()

        output.append(
            f"The item **{suggested_description}** "
            "has been suggested because it is similar to the following products"
            " rated by the user:"
        )
        output.extend(
            f"- {description.strip()}" for description in similar_items_description
        )

    with st.expander("See why the model recommended these products"):
        st.write("\n".join(output))
    st.write("------")
def print_suggestions(suggestions: List[int], df: pd.DataFrame):
    """Write the titles of the suggested products as a markdown bullet list.

    Args:
        suggestions: Values matched against ``df["ProductIndex"]``.
        df: Table with at least the ``ProductIndex`` and ``Book-Title``
            columns.
    """
    is_suggested = df["ProductIndex"].isin(suggestions)
    one_row_per_product = df.loc[is_suggested][
        ["Book-Title", "ProductIndex"]
    ].drop_duplicates(subset=["ProductIndex"])
    titles = one_row_per_product["Book-Title"].unique()

    lines = ["The model suggests the following products:"]
    lines.extend(f"- {title.strip()}" for title in titles)
    st.write("\n".join(lines))
def display_user_rat(user: int, data: pd.DataFrame):
    """Show how many distinct books a user rated, then the rating table.

    Args:
        user: Value matched against ``data.CustomerIndex``.
        data: Table holding the ratings; must contain ``CustomerIndex``,
            ``Book-Title``, ``CustomerID`` and the ``COLUMN_NOT_DISPLAY``
            columns.
    """
    user_rows = data[data.CustomerIndex == user]
    distinct_books = user_rows["Book-Title"].nunique()
    st.write(
        "The user {} rated {} distinct books. Here is the rating history: ".format(
            user, distinct_books
        )
    )

    # The customer column is hidden as well: the table already refers to a
    # single, explicitly selected customer.
    hidden_columns = COLUMN_NOT_DISPLAY + ["CustomerID"]
    history = user_rows.sort_values("CustomerIndex").drop(hidden_columns, axis=1)
    st.dataframe(history)
    st.write("-----")
def _extract_author(df, products): | |
desc = merged_df[merged_df["ProductIndex"].isin(products)].drop_duplicates( | |
"ProductIndex", ignore_index=True | |
)[["ISBN", "Book-Author"]] | |
return desc.set_index("ProductIndex") | |
def _extract_title(df, products): | |
desc = merged_df[merged_df["ProductIndex"].isin(products)].drop_duplicates( | |
"ProductIndex", ignore_index=True | |
)[["ProductIndex", "Book-Title"]] | |
return desc.set_index("ProductIndex") | |
def display_recommendation_plots(
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
    model: Recommender,
):
    """Plot a t-SNE with the suggested items, together with the purchases of
    similar users.

    Three groups of item vectors taken from ``model.item_factors`` are
    projected together in 2D and drawn as separate traces: the suggestions,
    the items returned by ``model.explain_recommendation`` for them, and the
    items with a non-zero entry in ``model.user_product_matrix`` for the users
    returned by ``model.similar_users``.

    Args:
        user_id: User the suggestions were computed for.
        suggestions: Suggested items, used both as row indices of
            ``model.item_factors`` and as ``ProductIndex`` values in ``df``.
        df: Table with at least the ``ProductIndex`` and ``Book-Title``
            columns; used for the hover text.
        model: Recommender exposing ``explain_recommendation``,
            ``similar_users``, ``user_product_matrix`` and ``item_factors``.

    Returns:
        The ``plotly.graph_objects.Figure`` holding the scatter plot.
    """
    # Get the purchased items that contribute the most to the suggestions
    contributions = []
    n_recommended = len(suggestions)
    for suggestion in suggestions:
        items_and_score = model.explain_recommendation(
            user_id, suggestion, n_recommended
        )
        contributions.append([t[0] for t in items_and_score])
    contributions = np.unique(np.concatenate(contributions))
    print("Contribution computed")
    print(contributions)
    print("=" * 80)

    # Find the purchases of similar users
    rated_by_similar_users = []
    sim_users, _ = model.similar_users(user_id)
    for u in sim_users:
        _, sim_purchases = model.user_product_matrix[u].nonzero()
        rated_by_similar_users.append(sim_purchases)
    rated_by_similar_users = np.unique(np.concatenate(rated_by_similar_users))
    print("Similar rated computed")
    print(rated_by_similar_users)
    print("=" * 80)

    # Compute the t-SNE.
    # Concatenate all the vectors so the decomposition is computed only once.
    to_decompose = np.concatenate(
        (
            model.item_factors[suggestions],
            model.item_factors[contributions],
            model.item_factors[rated_by_similar_users],
        )
    )
    print(f"Shape to decompose: {to_decompose.shape}")

    # t-SNE requires the perplexity to be smaller than the number of samples;
    # keep the tuned value of 30 whenever enough samples are available.
    perplexity = min(30, max(1, to_decompose.shape[0] - 1))

    with st.spinner("Computing plots (this might take around 60 seconds)..."):
        elapsed = time.time()
        decomposed = _tsne_decomposition(
            to_decompose,
            dict(
                perplexity=perplexity,
                metric="euclidean",
                n_iter=1_000,
                random_state=42,
            ),
        )
        elapsed = time.time() - elapsed
    print(f"TSNE computed in {elapsed}")
    print("=" * 80)

    # Extract the decomposed vectors, in the same order they were stacked.
    # Explicit offsets are used because a negative slice such as
    # ``decomposed[-0:]`` would return every row for an empty group.
    n_suggestions = len(suggestions)
    n_contributions = len(contributions)
    suggestion_dec = decomposed[:n_suggestions, :]
    contribution_dec = decomposed[n_suggestions : n_suggestions + n_contributions, :]
    items_others_dec = decomposed[n_suggestions + n_contributions :, :]

    # Also, extract the description to create a nice hover in
    # the final plot.
    contribution_description = _extract_title(df, contributions)
    items_other_description = _extract_title(df, rated_by_similar_users)
    suggestion_description = _extract_title(df, suggestions)

    # Plot the scatterplot
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=contribution_dec[:, 0],
            y=contribution_dec[:, 1],
            mode="markers",
            opacity=0.8,
            name="Similar rated by user",
            marker_symbol="square-open",
            marker_color="#010CFA",
            marker_size=10,
            hovertext=contribution_description.loc[contributions].values.squeeze(),
        )
    )
    fig.add_trace(
        go.Scatter(
            x=items_others_dec[:, 0],
            y=items_others_dec[:, 1],
            mode="markers",
            name="Product rated by similar users",
            opacity=0.7,
            marker_symbol="circle-open",
            marker_color="#FA5F19",
            marker_size=10,
            hovertext=items_other_description.loc[
                rated_by_similar_users
            ].values.squeeze(),
        )
    )
    fig.add_trace(
        go.Scatter(
            x=suggestion_dec[:, 0],
            y=suggestion_dec[:, 1],
            mode="markers",
            name="Suggested",
            marker_color="#1A9626",
            marker_symbol="star",
            marker_size=10,
            hovertext=suggestion_description.loc[suggestions].values.squeeze(),
        )
    )
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    fig.update_layout(plot_bgcolor="white")
    return fig
def _tsne_decomposition(data: np.ndarray, tsne_args: Dict[str, Any]):
    """Project ``data`` to two dimensions with t-SNE.

    When ``data`` has more than 50 columns it is first reduced with PCA, so
    that t-SNE runs on at most 50 features.

    Args:
        data: 2D array with one row per sample.
        tsne_args: Extra keyword arguments forwarded to ``TSNE``.

    Returns:
        The array returned by ``TSNE(...).fit_transform``, with two columns.
    """
    if data.shape[1] > 50:
        print("Performing PCA...")
        # PCA cannot extract more components than there are samples, so the
        # target of 50 is capped for small inputs instead of raising.
        n_components = min(50, data.shape[0])
        data = PCA(n_components=n_components).fit_transform(data)
    return TSNE(
        n_components=2,
        n_jobs=cpu_count(),
        **tsne_args,
    ).fit_transform(data)
def main():
    """Build the Streamlit page: dataset table, sidebar and suggestion form.

    The data is loaded, the "als" model is fitted on the ``Book-Rating``
    column, and a form lets the visitor pick a customer and the number of
    items to recommend. Once the form is submitted, the rating history, the
    suggestions, their explanation and the t-SNE plot are rendered.
    """
    # Load the data and fit the model
    data, users, products = load_and_preprocess_data()
    recommender = create_and_fit_recommender(
        "als", data["Book-Rating"], users, products
    )

    st.markdown(
        "# Recommender system\n"
        "The dataset used for these computations is the following:\n"
    )
    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

    # Cast the ratings to int only to avoid trailing decimals in the table;
    # the pandas formatting helpers proved too slow on large tables.
    visible = data.drop(COLUMN_NOT_DISPLAY, axis=1)
    visible["Book-Rating"] = visible["Book-Rating"].astype(int)
    st.dataframe(visible)

    st.markdown("## Interactive suggestion")
    with st.form("recommend"):
        # Customer to investigate
        user = st.selectbox(
            "Select a customer to get his recommendations",
            users.unique(),
        )
        items_to_recommend = st.slider("How many items to recommend?", 1, 10, 5)
        print(items_to_recommend)

        # Nothing else to render until the form has been submitted.
        if not st.form_submit_button("Recommend!"):
            return

        display_user_rat(user, data)

        suggestions_and_score = recommender.recommend_products(
            user, items_to_recommend
        )
        suggested_items = suggestions_and_score[0]
        print_suggestions(suggested_items, data)
        explain_recommendation(recommender, user, suggested_items, data)

        st.markdown(
            "## How the purchases of similar users influnce the recommendation"
        )
        st.plotly_chart(
            display_recommendation_plots(user, suggested_items, data, recommender)
        )


main()