# Sistema de recomendação de filmes usando filtro colaborativo

## Preparando conjunto de dados

### Importações do projeto

In [220]:
import numpy as np
import pandas as pd
from fuzzywuzzy import process
from sklearn.metrics.pairwise import cosine_similarity
import math

### Importando conjunto de dados
ratings: Avaliações dos usuários para cada filme

movies: informações dos filmes que foram avaliados

In [221]:
ratings = pd.read_csv('../data/reduced/ratings_m10.csv')
ratings.reindex()
movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')
movies_title = movies[['title']]

### Junção dos dois conjuntos de dados
união feita pela coluna 'movieId' presente em ambos

In [222]:
ratings_movies = ratings.merge(movies_title, on='movieId')

### Separação do conjunto de dados baseado no timestamp
Para cada usuário foram divididas 90% das suas avaliações para o conjunto de treino e o restante para o conjunto de teste

In [223]:
def train_test_column_split(df, group_column, split_column, y_label, train_size):
    df = df.sort_values(by=split_column, ascending=True)   
    train = pd.DataFrame(columns=df.columns)
    test = pd.DataFrame(columns=df.columns)

    for idx in df[group_column].unique():
        group = df.loc[df[group_column] == idx]

        q_user = group[group[split_column].le(group[split_column].quantile(train_size))]
        p_user = group[group[split_column].ge(group[split_column].quantile(train_size))]

        train = pd.concat([train, q_user])
        test = pd.concat([test, p_user])
    train = train.sort_index(ascending=True)
    test = test.sort_index(ascending=True)

    X_labels = [c for c in df.columns]

    X_train = train[X_labels]
    X_test = test[X_labels]

    return (X_train, X_test)

In [224]:
X_train, X_test = train_test_column_split(ratings_movies, 'userId', 'timestamp', 'rating', .9)

### Criando uma Pivot Matrix
Matriz: {userId x movieId}, cada célula corresponde à avaliação de cada usuário para cada filme, em que na ausência será preenchido com 0 (zero).

In [225]:
#user_movie_mat = ratings_movies.pivot(index='movieId', columns='userId', values='rating').fillna(0)
user_movie_train = X_train.pivot(index='movieId', columns='userId', values='rating').fillna(0)
user_movie_test = X_test.pivot(index='movieId', columns='userId', values='rating').fillna(0)

### Criando Matriz de similaridade dos usuários baseado nas avaliações

In [226]:
def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):
    """Find correlation between two users based on their rated movies using Pearson correlation"""
    rated_movies_by_both = ratings_df[[user1, user2]].dropna(axis=0).values
    user1_ratings = rated_movies_by_both[:, 0].reshape(1, -1)
    user2_ratings = rated_movies_by_both[:, 1].reshape(1, -1)
    return cosine_similarity(user1_ratings, user2_ratings)

In [227]:
users_list = list(user_movie_train.columns)
movies_list = list(user_movie_train.index)

#users_similarity_mat = np.array([[find_correlation_between_two_users(user_movie_train, user1, user2) for user1 in users_list] for user2 in users_list])
##users_similarity_mat = users_similarity_mat.reshape(608, 608)
#users_similarity_mat = pd.DataFrame(users_similarity_mat, index=users_list, columns=users_list)
users_similarity_mat = pd.read_pickle('../data/preprocessed/users_similarity_mat_cosim.pkl')

## Métodos para prever as notas que o usuário dará para cada filme

In [228]:
def get_rated_user_for_a_movie(ratings_df: pd.DataFrame, movie: str):
    return ratings_df.loc[movie, :].dropna().index.values


def get_top_neighbors(
    similarity_df: pd.DataFrame, user: str, rated_users: str, n_neighbors: int
):
    return similarity_df[user][rated_users].nlargest(n_neighbors).to_dict()


def subtract_bias(rating: float, mean_rating: float):
    return rating - mean_rating


def get_neighbor_rating_without_bias_per_movie(
    ratings_df: pd.DataFrame, user: str, movie: str
):
    """Substract the rating of a user from the mean rating of that user to eliminate bias"""
    mean_rating = ratings_df[user].mean()
    rating = ratings_df.loc[movie, user]
    return subtract_bias(rating, mean_rating)


def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, movie: str):
    """Get the ratings of all neighbors after adjusting for biases"""
    return [
        get_neighbor_rating_without_bias_per_movie(ratings_df, neighbor, movie)
        for neighbor in neighbors
    ]

def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):
    weighted_sum = np.array(ratings).dot(np.array(neighbor_distance))
    abs_neigbor_distance = np.abs(neighbor_distance)
    return weighted_sum / np.sum(abs_neigbor_distance)


def ger_user_rating(ratings_df: pd.DataFrame, user: str, avg_neighbor_rating: float):
    user_avg_rating = ratings_df[user].mean()
    return round(user_avg_rating + avg_neighbor_rating, 2)


In [229]:
def predict_rating(
    df: pd.DataFrame,
    similarity_df: pd.DataFrame,
    user: str,
    movie: str,
    n_neighbors: int = 2,
):
    """Predict the rating of a user for a movie based on the ratings of neighbors"""
    ratings_df = df.copy()

    rated_users = get_rated_user_for_a_movie(ratings_df, movie)

    top_neighbors_distance = get_top_neighbors(
        similarity_df, user, rated_users, n_neighbors
    )
    neighbors, distance = top_neighbors_distance.keys(), top_neighbors_distance.values()

    #print(f"Top {n_neighbors} neighbors of user {user}, {movie}: {list(neighbors)}, distance: {list(distance)}")

    ratings = get_ratings_of_neighbors(ratings_df, neighbors, movie)
    avg_neighbor_rating = get_weighted_average_rating_of_neighbors(
        ratings, list(distance)
    )

    return ger_user_rating(ratings_df, user, avg_neighbor_rating)

In [230]:
def adjust_rating(nota):
    if nota < 0:
        return 0
    elif nota > 5:
        return 5
    else:
        # Arredonda para o valor mais próximo em incrementos de 0.5
        return round(nota * 2) / 2


In [231]:
def get_n_recommendations(user: int, n: int, user_movie_mat: pd.DataFrame, movies: pd.DataFrame, n_neighbors: int):
    df = user_movie_mat.copy()
    recommendations = pd.DataFrame(columns=['movieId', 'title', 'pred_rating'])

    for movie, _ in df[user].items():
        if df.loc[movie, user] == 0:
            df.loc[movie, user] = predict_rating(user_movie_mat, users_similarity_mat, user, movie, n_neighbors)
            new_row = {'movieId': movie, 'title': movies.loc[movie]['title'], 'pred_rating': adjust_rating(df.loc[movie, user])}
            recommendations.loc[len(recommendations)] = new_row

    recommendations = recommendations.sort_values(by='pred_rating', ascending=False)
    return recommendations.head(n) if n > 0  else recommendations

In [232]:
movie_name = 'White Squall'
user1 = 1
movie = process.extractOne(movie_name, movies['title'])[2]
rating = predict_rating(user_movie_train, users_similarity_mat, user1, movie, 30)
rating

3.03

In [236]:
user_id = 1
n_top_neighbors = 30
n_recommendations = -1

n_recommendations = get_n_recommendations(user_id, n_recommendations, user_movie_train, movies, n_top_neighbors)
n_recommendations

Unnamed: 0,movieId,title,pred_rating
248,589,Fallen,3.5
31,47,White Squall,3.0
542,1527,Closer,3.0
310,858,"South Park: Bigger, Longer and Uncut",3.0
364,1036,"Great Muppet Caper, The",3.0
...,...,...,...
385,1100,Starman,0.0
384,1096,Flatliners,0.0
383,1095,Blood Simple,0.0
185,419,Henry V,0.0


## Avaliação dos resultados

In [234]:
def eval_ratings():
    test = user_movie_test.copy()

    real = []
    preds = []

    for user in test.columns:
        for movie, _ in test[user].items():
            if test.loc[movie, user] != 0 and len(n_recommendations[n_recommendations['movieId'] == movie]['pred_rating'].values) > 0:
                title = movies.loc[movie]['title']
                real_rating = test.loc[movie, user]
                pred_rating = n_recommendations[n_recommendations['movieId'] == movie]['pred_rating'].values[0]
                
                real.append(real_rating)
                preds.append(pred_rating)
                
                #print(f'{user:10} - {title:50} - true rating: {real_rating}, pred rating: {pred_rating}, DIFF:{abs(real_rating - pred_rating)}')

    MSE = np.square(np.subtract(real, preds)).mean() 
    
    RMSE = math.sqrt(MSE)
    print("Root Mean Square Error:\n")
    print(RMSE)

In [235]:
eval_ratings()

Root Mean Square Error:

3.0086445530929264
