File size: 11,266 Bytes
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9112b55
 
 
e6fd8db
 
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6fd8db
 
172edb9
 
 
e6fd8db
 
172edb9
e6fd8db
172edb9
 
 
 
 
 
e6fd8db
172edb9
 
 
 
 
 
 
 
 
 
 
 
e6fd8db
 
172edb9
 
 
 
 
 
 
 
 
9112b55
e6fd8db
172edb9
 
9112b55
 
172edb9
 
9112b55
 
172edb9
e6fd8db
172edb9
 
e6fd8db
172edb9
 
 
9112b55
172edb9
 
 
 
9112b55
e6fd8db
 
9112b55
e6fd8db
9112b55
e6fd8db
 
 
9112b55
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6fd8db
172edb9
e6fd8db
172edb9
e6fd8db
 
 
172edb9
e6fd8db
172edb9
e6fd8db
 
 
172edb9
 
 
 
e6fd8db
 
 
 
 
 
 
172edb9
e6fd8db
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e6fd8db
172edb9
 
 
 
f982372
e6fd8db
f982372
172edb9
 
 
 
 
 
 
 
 
 
 
e6fd8db
172edb9
 
 
 
 
 
 
e6fd8db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a949a92
172edb9
 
9112b55
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9112b55
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9112b55
172edb9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
import streamlit as st
import pandas as pd
import altair as alt
from recommender import Recommender
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from os import cpu_count
import numpy as np
import time

from utils import load_and_preprocess_data

import matplotlib.pyplot as plt
from typing import Union, List, Dict, Any
import plotly.graph_objects as go


# Columns that are dropped from a dataframe before it is shown in the UI
# (see the ``drop`` calls in ``display_user_rat`` and ``main``).
COLUMN_NOT_DISPLAY = [
    "ISBN",
    "Location",
    "Age",
    "CustomerIndex",
    "ProductIndex",
]


# Markdown text rendered in the sidebar (passed to ``st.sidebar.markdown``
# in ``main``). Leading whitespace is stripped so the text starts directly
# with the top-level heading.
SIDEBAR_DESCRIPTION = """
# Recommender system

## What is it?
A recommender system is a tool that suggests something new to a particular
user that she/he might be interested in. It becomes useful when
the number of items a user can choose from is high.

## How does it work?
A recommender system internally finds similar users and similar items,
based on a suitable definition of "similarity".
For example, users that purchased the same items can be considered similar.
When we want to suggest new items to a user, a recommender system exploits
the items bought by similar users as a starting point for the suggestion.
The items bought by similar users are compared to the items that the user
already bought. If they are new and similar, the model suggests them.

## How we prepare the data
For each user, we compute the quantity purchased for every single item.
This will be the value considered by the model to compute
the similarity. The item that a user has never bought will
be left at zero. These zeros will be the subject of the recommendation.
""".lstrip()


@st.cache(allow_output_mutation=True)
def create_and_fit_recommender(
    model_name: str,
    values: Union[pd.DataFrame, "np.ndarray"],
    users: Union[pd.DataFrame, "np.ndarray"],
    products: Union[pd.DataFrame, "np.ndarray"],
) -> Recommender:
    """Build a ``Recommender`` on the given data and fit the requested model.

    The function is wrapped with ``st.cache`` so Streamlit can reuse the
    fitted object instead of refitting it on every call with the same
    arguments.

    Args:
        model_name: Name of the model, forwarded to
            ``Recommender.create_and_fit``.
        values: Interaction values, forwarded to the ``Recommender``
            constructor as first argument.
        users: User identifiers, forwarded as second argument.
        products: Product identifiers, forwarded as third argument.

    Returns:
        The fitted ``Recommender`` instance.
    """
    # Fine-tuned values
    tuned_params = {
        "factors": 190,
        "alpha": 0.6,
        "regularization": 0.06,
        "random_state": 42,
    }

    fitted = Recommender(values, users, products)
    fitted.create_and_fit(model_name, model_params=tuned_params)
    return fitted


def explain_recommendation(
    recommender: Recommender,
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
):
    """Render, inside an expander, why each suggested product was recommended.

    For every suggestion the recommender is asked which products contributed
    to it; the title of the suggestion and the titles of the contributing
    products are then written out as markdown.

    Args:
        recommender: Fitted model. Its ``explain_recommendation`` method is
            called once per suggestion and must return an iterable of
            sequences whose first element is a product index.
        user_id: Index of the user the suggestions were computed for.
        suggestions: Product indices (``ProductIndex`` values) to explain.
        df: Rating data holding at least the ``ProductIndex`` and
            ``Book-Title`` columns.
    """
    # Resolve product index -> title once (first row per product) instead of
    # re-scanning the whole frame for every suggestion.
    titles = df[["Book-Title", "ProductIndex"]].drop_duplicates(
        subset=["ProductIndex"]
    )

    output = []
    n_recommended = len(suggestions)
    for suggestion in suggestions:
        explained = recommender.explain_recommendation(
            user_id, suggestion, n_recommended
        )
        contributing_ids = [item[0] for item in explained]

        # Suggestions are product indices, so they are matched against
        # ``ProductIndex`` -- the same column used for the contributing
        # items below -- and not against ``ISBN``.
        suggested_titles = titles.loc[
            titles["ProductIndex"] == suggestion, "Book-Title"
        ]
        if suggested_titles.empty:
            # Keep the explanation readable even when the product has no
            # row in ``df`` instead of failing with an IndexError.
            suggested_description = f"product {suggestion}"
        else:
            suggested_description = suggested_titles.iloc[0]

        similar_items_description = titles.loc[
            titles["ProductIndex"].isin(contributing_ids), "Book-Title"
        ].unique()

        if output:
            # A blank line ends the previous markdown list; without it the
            # next sentence is rendered as part of the last bullet.
            output.append("")
        output.append(
            f"The item **{suggested_description.strip()}** "
            "has been suggested because it is similar to the following products"
            " rated by the user:"
        )
        output.extend(
            f"- {description.strip()}" for description in similar_items_description
        )

    with st.expander("See why the model recommended these products"):
        st.write("\n".join(output))

    st.write("------")


def print_suggestions(suggestions: List[int], df: pd.DataFrame):
    """Write the titles of the suggested products as a markdown bullet list.

    Args:
        suggestions: Product indices, matched against ``df["ProductIndex"]``.
        df: Data holding the ``ProductIndex`` and ``Book-Title`` columns.
    """
    suggested_rows = df.loc[df["ProductIndex"].isin(suggestions)]
    # Keep a single row per product before reading the titles.
    one_row_per_product = suggested_rows[
        ["Book-Title", "ProductIndex"]
    ].drop_duplicates(subset=["ProductIndex"])
    titles = one_row_per_product["Book-Title"].unique()

    lines = ["The model suggests the following products:"]
    lines.extend(f"- {title.strip()}" for title in titles)

    st.write("\n".join(lines))

def display_user_rat(user: int, data: pd.DataFrame):
    """Show how many distinct books a user rated, followed by the ratings.

    Args:
        user: Value matched against ``data.CustomerIndex``.
        data: Rating data; it must contain the ``Book-Title`` column, the
            columns listed in ``COLUMN_NOT_DISPLAY`` and ``CustomerID``.
    """
    user_rows = data[data.CustomerIndex == user]
    n_books = user_rows["Book-Title"].nunique()

    st.write(
        f"The user {user} rated {n_books} distinct books. "
        "Here is the rating history: "
    )

    # The customer identifier is hidden as well: the whole table refers to
    # the single customer that was selected.
    hidden_columns = COLUMN_NOT_DISPLAY + ["CustomerID"]
    history = user_rows.sort_values("CustomerIndex").drop(hidden_columns, axis=1)
    st.dataframe(history)

    st.write("-----")



def _extract_author(df, products):
    desc = merged_df[merged_df["ProductIndex"].isin(products)].drop_duplicates(
        "ProductIndex", ignore_index=True
    )[["ISBN", "Book-Author"]]
    return desc.set_index("ProductIndex")
def _extract_title(df, products):
    desc = merged_df[merged_df["ProductIndex"].isin(products)].drop_duplicates(
        "ProductIndex", ignore_index=True
    )[["ProductIndex", "Book-Title"]]
    return desc.set_index("ProductIndex")

def display_recommendation_plots(
    user_id: int,
    suggestions: List[int],
    df: pd.DataFrame,
    model: Recommender,
):
    """Plot a t-SNE with the suggested items, together with the purchases of
    similar users.

    Three groups of item vectors are projected to 2D in a single t-SNE run:
    the suggested items, the items that contributed to the suggestions and
    the items rated by similar users.

    Args:
        user_id: Index of the user the suggestions were computed for.
        suggestions: Product indices suggested to the user.
        df: Data holding the ``ProductIndex`` and ``Book-Title`` columns;
            used to build the hover labels.
        model: Fitted recommender exposing ``explain_recommendation``,
            ``similar_users``, ``user_product_matrix`` and ``item_factors``.

    Returns:
        A plotly ``Figure`` with one scatter trace per group.
    """
    # Get the purchased items that contribute the most to the suggestions
    contributions = []
    n_recommended = len(suggestions)
    for suggestion in suggestions:
        items_and_score = model.explain_recommendation(
            user_id, suggestion, n_recommended
        )
        contributions.append([t[0] for t in items_and_score])

    contributions = np.unique(np.concatenate(contributions))

    print("Contribution computed")
    print(contributions)
    print("=" * 80)

    # Find the purchases of similar users
    rated_by_similar_users = []

    sim_users, _ = model.similar_users(user_id)

    for u in sim_users:
        _, sim_purchases = model.user_product_matrix[u].nonzero()
        rated_by_similar_users.append(sim_purchases)

    rated_by_similar_users = np.unique(np.concatenate(rated_by_similar_users))

    print("Similar rated computed")
    print(rated_by_similar_users)
    print("=" * 80)

    # Compute the t-sne

    # Concatenate all the vectors to compute the decomposition a single time
    to_decompose = np.concatenate(
        (
            model.item_factors[suggestions],
            model.item_factors[contributions],
            model.item_factors[rated_by_similar_users],
        )
    )

    print(f"Shape to decompose: {to_decompose.shape}")

    with st.spinner("Computing plots (this might take around 60 seconds)..."):
        start = time.time()
        decomposed = _tsne_decomposition(
            to_decompose,
            dict(
                perplexity=30,
                metric="euclidean",
                n_iter=1_000,
                random_state=42,
            ),
        )
    elapsed = time.time() - start
    print(f"TSNE computed in {elapsed}")
    print("=" * 80)

    # Extract the decomposed vectors. Explicit offsets are used for every
    # group: a negative slice such as ``[-0:]`` would return the whole array
    # when the last group is empty.
    n_suggestions = len(suggestions)
    n_contributions = len(contributions)
    suggestion_dec = decomposed[:n_suggestions, :]
    contribution_dec = decomposed[
        n_suggestions : n_suggestions + n_contributions, :
    ]
    items_others_dec = decomposed[n_suggestions + n_contributions :, :]

    # Also, extract the description to create a nice hover in
    # the final plot: one title per product index, taken from ``df``.
    title_by_product = df.drop_duplicates(subset="ProductIndex").set_index(
        "ProductIndex"
    )["Book-Title"]

    # Plot the scatterplot

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=contribution_dec[:, 0],
            y=contribution_dec[:, 1],
            mode="markers",
            opacity=0.8,
            name="Similar rated by user",
            marker_symbol="square-open",
            marker_color="#010CFA",
            marker_size=10,
            hovertext=title_by_product.loc[contributions].to_numpy(),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=items_others_dec[:, 0],
            y=items_others_dec[:, 1],
            mode="markers",
            name="Product rated by similar users",
            opacity=0.7,
            marker_symbol="circle-open",
            marker_color="#FA5F19",
            marker_size=10,
            hovertext=title_by_product.loc[rated_by_similar_users].to_numpy(),
        )
    )

    fig.add_trace(
        go.Scatter(
            x=suggestion_dec[:, 0],
            y=suggestion_dec[:, 1],
            mode="markers",
            name="Suggested",
            marker_color="#1A9626",
            marker_symbol="star",
            marker_size=10,
            hovertext=title_by_product.loc[suggestions].to_numpy(),
        )
    )

    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    fig.update_layout(plot_bgcolor="white")

    return fig


def _tsne_decomposition(data: np.ndarray, tsne_args: Dict[str, Any]):
    """Project ``data`` to two dimensions with t-SNE.

    When ``data`` has more than 50 columns it is first reduced with PCA, so
    that t-SNE runs on at most 50 features.

    Args:
        data: 2D array with one row per sample.
        tsne_args: Extra keyword arguments forwarded to ``TSNE``.

    Returns:
        The 2D embedding returned by ``TSNE.fit_transform``.
    """
    max_pca_components = 50
    if data.shape[1] > max_pca_components:
        print("Performing PCA...")
        # PCA cannot extract more components than there are samples, so the
        # target dimension is capped by the number of rows to avoid a
        # ValueError on small inputs.
        n_components = min(max_pca_components, data.shape[0])
        data = PCA(n_components=n_components).fit_transform(data)
    return TSNE(
        n_components=2,
        n_jobs=cpu_count(),
        **tsne_args,
    ).fit_transform(data)


def main():
    """Build the Streamlit page.

    Loads the data, fits (or reuses) the recommender, shows the dataset and
    exposes a form where a customer can be picked to see the rating history,
    the suggested products, their explanation and the t-SNE plot.
    """
    # Load and process data
    data, users, products = load_and_preprocess_data()
    recommender = create_and_fit_recommender(
        "als", data["Book-Rating"], users, products
    )

    st.markdown(
        "# Recommender system\n"
        "The dataset used for these computations is the following:\n"
        "        "
    )
    st.sidebar.markdown(SIDEBAR_DESCRIPTION)

    visible_data = data.drop(COLUMN_NOT_DISPLAY, axis=1)

    # Ratings are cast to int only to hide the trailing decimals; the pandas
    # formatting helpers turned out to be too slow on large tables.
    visible_data["Book-Rating"] = visible_data["Book-Rating"].astype(int)

    # Show the data
    st.dataframe(visible_data)

    st.markdown("## Interactive suggestion")
    with st.form("recommend"):
        # Let the user select the user to investigate
        user = st.selectbox(
            "Select a customer to get his recommendations", users.unique()
        )

        items_to_recommend = st.slider("How many items to recommend?", 1, 10, 5)
        print(items_to_recommend)

        # Nothing else to render until the form has been submitted.
        if not st.form_submit_button("Recommend!"):
            return

        display_user_rat(user, data)

        suggestions_and_score = recommender.recommend_products(
            user, items_to_recommend
        )
        suggested_ids = suggestions_and_score[0]

        print_suggestions(suggested_ids, data)
        explain_recommendation(recommender, user, suggested_ids, data)

        st.markdown(
            "## How the purchases of similar users influnce the recommendation"
        )
        st.plotly_chart(
            display_recommendation_plots(user, suggested_ids, data, recommender)
        )


# Entry point: the page is built unconditionally whenever this script runs.
main()