File size: 3,886 Bytes
3b528be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a388efc
3b528be
 
 
 
 
 
 
88d6858
3b528be
a388efc
88d6858
 
a388efc
3b528be
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import pandas as pd
import numpy as np
from fuzzywuzzy import process


def match_books(user_input: str, df: pd.DataFrame, min_score: float = 0.8):
    """Return the title in ``df['Book-Title']`` that best fuzzy-matches the input.

    Args:
        user_input: Free-text title typed by the user.
        df: DataFrame with a ``'Book-Title'`` column to search.
        min_score: Minimum similarity required to accept the best match.
            Values up to and including 1 are read as a fraction (0.8 means
            80 %); larger values are read directly on fuzzywuzzy's 0-100
            scale.

    Returns:
        The best-matching title, or ``None`` when there is no candidate or
        the best candidate scores below ``min_score``.
    """
    # fuzzywuzzy scores range from 0 to 100, so a fractional threshold such as
    # the default 0.8 has to be rescaled; compared as-is it would accept
    # practically every candidate.
    cutoff = min_score * 100 if min_score <= 1 else min_score

    # Missing titles (NaN) are not strings and cannot be scored, so drop them.
    book_titles = df['Book-Title'].dropna().unique()

    # extractOne yields a (title, score) pair, or None when nothing qualifies.
    best_match = process.extractOne(user_input, book_titles)
    if best_match is not None and best_match[1] >= cutoff:
        return best_match[0]
    return None


def recommend_books(df: pd.DataFrame, book_to_be_recommended: str) -> pd.DataFrame:
    """Recommend books whose ratings correlate with those of a given book.

    The function finds every user who rated ``book_to_be_recommended``,
    collects the other books those users rated, keeps the most frequently
    rated ones, and computes the Pearson correlation between each of them and
    the given book across those users.

    Args:
        df: Ratings table with ``'User-ID'``, ``'Book-Title'`` and
            ``'Book-Rating'`` columns.
        book_to_be_recommended: Exact title (as it appears in
            ``df['Book-Title']``) to base the recommendations on.

    Returns:
        A DataFrame with the columns ``'Book-Title'``, ``'Correlation [%]'``
        (Pearson correlation times 100, rounded to two decimals) and
        ``'Average ratings'`` (mean rating among the readers of the given
        book), sorted by correlation in descending order and limited to ten
        rows. The given book itself is excluded. An empty DataFrame with the
        same columns is returned when nobody in ``df`` rated the book.
    """
    result_columns = ['Book-Title', 'Correlation [%]', 'Average ratings']

    # Users who rated the requested book.
    is_target = df['Book-Title'] == book_to_be_recommended
    book_readers = df.loc[is_target, 'User-ID'].unique()
    if len(book_readers) == 0:
        # Unknown title: there is nothing to correlate against.
        return pd.DataFrame(columns=result_columns)

    # Every rating made by those users, and how many ratings each book got.
    books_of_book_readers = df[df['User-ID'].isin(book_readers)]
    ratings_per_book = books_of_book_readers.groupby('Book-Title')['User-ID'].count()

    # Keep the most-rated books while retaining more than 11 candidates when
    # that many exist: the cutoff is the 12th-largest rating count. With 11
    # or fewer books every book is kept.
    if len(ratings_per_book) > 11:
        min_ratings = ratings_per_book.nlargest(12).min()
    else:
        min_ratings = 0

    # Always keep the requested book itself; it is the reference column for
    # the correlation below and must not be filtered out by the cutoff.
    keep = (ratings_per_book >= min_ratings) | (
        ratings_per_book.index == book_to_be_recommended
    )
    books_to_compare = ratings_per_book[keep].index

    ratings_data_raw = books_of_book_readers.loc[
        books_of_book_readers['Book-Title'].isin(books_to_compare),
        ['User-ID', 'Book-Rating', 'Book-Title'],
    ]

    # Collapse duplicate (user, book) ratings to their mean so the pivot
    # below has a unique value per cell.
    ratings_data_raw_nodup = (
        ratings_data_raw.groupby(['User-ID', 'Book-Title'])['Book-Rating']
        .mean()
        .reset_index()
    )

    # Users as rows, books as columns, ratings as values.
    dataset_for_corr = ratings_data_raw_nodup.pivot(
        index='User-ID', columns='Book-Title', values='Book-Rating'
    )

    # Pearson correlation of every book column with the requested book.
    correlations = dataset_for_corr.corrwith(
        dataset_for_corr[book_to_be_recommended], method='pearson'
    )

    # Average rating of each compared book among the selected readers.
    average_ratings = (
        ratings_data_raw_nodup.groupby('Book-Title')['Book-Rating'].mean().reset_index()
    )

    correlations_df = pd.DataFrame({
        'Book-Title': correlations.index,
        'Correlation [%]': correlations.values,
    })
    correlations_df = pd.merge(correlations_df, average_ratings, on='Book-Title')
    correlations_df = correlations_df.rename(columns={'Book-Rating': 'Average ratings'})

    # Best correlations first; express them as percentages with two decimals.
    correlations_df = correlations_df.sort_values('Correlation [%]', ascending=False)
    correlations_df['Correlation [%]'] = (correlations_df['Correlation [%]'] * 100).round(2)

    # The requested book trivially correlates with itself, so drop it.
    correlations_df = correlations_df[correlations_df['Book-Title'] != book_to_be_recommended]
    return correlations_df.head(10)