In [277]:
import numpy as np
import pandas as pd
from fuzzywuzzy import process

In [278]:
ratings = pd.read_csv('../data/reduced/ratings_m10.csv')
ratings.reindex()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
81111,610,159093,3.0,1493847704
81112,610,164179,5.0,1493845631
81113,610,166528,4.0,1493879365
81114,610,168250,5.0,1494273047


In [279]:
movies = pd.read_csv('../data/reduced/movies_m10_rich_pre.csv', index_col='movieId')
movies.sample()

Unnamed: 0_level_0,Unnamed: 0,tmdbId,imdbId,cast,director,keywords,overview,title,genres,year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
290,290,9100,115963,"['Fairuza Balk', 'Neve Campbell', 'Robin Tunne...",Andrew Fleming,"['witch', 'suicide attempt', 'becoming an adul...",A Catholic school newcomer falls in with a cli...,"Craft, The","['Drama', 'Fantasy', 'Horror', 'Thriller']",1996


In [280]:
movies_title = movies[['title']]

In [281]:
rating_movie = ratings.merge(movies_title, on='movieId')
rating_movie

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story
1,5,1,4.0,847434962,Toy Story
2,7,1,4.5,1106635946,Toy Story
3,15,1,2.5,1510577970,Toy Story
4,17,1,4.5,1305696483,Toy Story
...,...,...,...,...,...
35851,546,1327,3.0,973588711,Scooby-Doo
35852,555,1327,3.0,978748648,Scooby-Doo
35853,571,1327,5.0,966900601,Scooby-Doo
35854,600,1327,2.0,1237710102,Scooby-Doo


In [282]:
rating_movie

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story
1,5,1,4.0,847434962,Toy Story
2,7,1,4.5,1106635946,Toy Story
3,15,1,2.5,1510577970,Toy Story
4,17,1,4.5,1305696483,Toy Story
...,...,...,...,...,...
35851,546,1327,3.0,973588711,Scooby-Doo
35852,555,1327,3.0,978748648,Scooby-Doo
35853,571,1327,5.0,966900601,Scooby-Doo
35854,600,1327,2.0,1237710102,Scooby-Doo


In [283]:
def train_test_column_split(df: pd.DataFrame, group_column: str, split_column: str, y_label: str, train_size: float):
    df = df.sort_values(by=split_column, ascending=True)   
    train = pd.DataFrame(columns=df.columns)
    test = pd.DataFrame(columns=df.columns)

    for idx in df[group_column].unique():
        group = df.loc[df[group_column] == idx]

        q_user = group[group[split_column].le(group[split_column].quantile(train_size))]
        p_user = group[group[split_column].ge(group[split_column].quantile(train_size))]

        train = pd.concat([train, q_user])
        test = pd.concat([test, p_user])
    train = train.sort_index(ascending=True)
    test = test.sort_index(ascending=True)

    X_labels = [c for c in df.columns if c != y_label]

    X_train = train[X_labels]
    X_test = test[X_labels]
    y_train = train[y_label]
    y_test = test[y_label]

    return (X_train, X_test, y_train, y_test)

In [284]:
X_train, X_test, y_train, y_test = train_test_column_split(rating_movie, 'userId', 'timestamp', 'rating', .8)

In [285]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [286]:
user_movie_mat = rating_movie.pivot(index='movieId', columns='userId', values='rating').fillna(0)
user_movie_mat_train = train.pivot(index='movieId', columns='userId', values='rating').fillna(0)
user_movie_mat_test = test.pivot(index='movieId', columns='userId', values='rating').fillna(0)

In [287]:
user_movie_mat_train

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,0.0,0.0,2.5,0.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0


In [288]:
user_movie_mat_test

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.5,4.0,0.0,3.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0


In [289]:
user_movie_mat

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019,0.0,0.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,...,4.5,0.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.5,0.0,0.0


In [290]:
def find_correlation_between_two_users(ratings_df: pd.DataFrame, user1: str, user2: str):
    """Find correlation between two users based on their rated movies using Pearson correlation"""
    rated_movies_by_both = ratings_df[[user1, user2]].dropna(axis=0).values
    user1_ratings = rated_movies_by_both[:, 0]
    user2_ratings = rated_movies_by_both[:, 1]
    return np.corrcoef(user1_ratings, user2_ratings)[0, 1]

In [291]:
users_list = list(user_movie_mat.columns)
movies_list = list(user_movie_mat.index)

#users_similarity_mat = np.array([[corr_between_users(user_movie_mat, user1, user2) for user1 in users_list] for user2 in users_list])
#users_similarity_mat = pd.DataFrame(users_similarity_mat, index=users_list, columns=users_list)
users_similarity_mat = pd.read_pickle('../data/preprocessed/users_similarity_mat.pkl')

In [292]:
def get_rated_user_for_a_movie(ratings_df: pd.DataFrame, movie: str):
    return ratings_df.loc[movie, :].dropna().index.values


def get_top_neighbors(
    similarity_df: pd.DataFrame, user: str, rated_users: str, n_neighbors: int
):
    return similarity_df[user][rated_users].nlargest(n_neighbors).to_dict()


def subtract_bias(rating: float, mean_rating: float):
    return rating - mean_rating


def get_neighbor_rating_without_bias_per_movie(
    ratings_df: pd.DataFrame, user: str, movie: str
):
    """Substract the rating of a user from the mean rating of that user to eliminate bias"""
    mean_rating = ratings_df[user].mean()
    rating = ratings_df.loc[movie, user]
    return subtract_bias(rating, mean_rating)


def get_ratings_of_neighbors(ratings_df: pd.DataFrame, neighbors: list, movie: str):
    """Get the ratings of all neighbors after adjusting for biases"""
    return [
        get_neighbor_rating_without_bias_per_movie(ratings_df, neighbor, movie)
        for neighbor in neighbors
    ]

def get_weighted_average_rating_of_neighbors(ratings: list, neighbor_distance: list):
    weighted_sum = np.array(ratings).dot(np.array(neighbor_distance))
    abs_neigbor_distance = np.abs(neighbor_distance)
    return weighted_sum / np.sum(abs_neigbor_distance)


def ger_user_rating(ratings_df: pd.DataFrame, user: str, avg_neighbor_rating: float):
    user_avg_rating = ratings_df[user].mean()
    return round(user_avg_rating + avg_neighbor_rating, 2)


In [293]:
def predict_rating(
    df: pd.DataFrame,
    similarity_df: pd.DataFrame,
    user: str,
    movie: str,
    n_neighbors: int = 2,
):
    """Predict the rating of a user for a movie based on the ratings of neighbors"""
    ratings_df = df.copy()

    rated_users = get_rated_user_for_a_movie(ratings_df, movie)

    top_neighbors_distance = get_top_neighbors(
        similarity_df, user, rated_users, n_neighbors
    )
    neighbors, distance = top_neighbors_distance.keys(), top_neighbors_distance.values()

    #print(f"Top {n_neighbors} neighbors of user {user}, {movie}: {list(neighbors)}, distance: {list(distance)}")

    ratings = get_ratings_of_neighbors(ratings_df, neighbors, movie)
    avg_neighbor_rating = get_weighted_average_rating_of_neighbors(
        ratings, list(distance)
    )

    return ger_user_rating(ratings_df, user, avg_neighbor_rating)

In [294]:
movie_name = 'Heartbreakers'
user1 = 1
movie = process.extractOne(movie_name, movies['title'])[2]
rating = predict_rating(user_movie_mat, users_similarity_mat, user1, movie, 10)
rating

3.02

In [295]:
def get_n_recommendations(user: int, n: int, user_movie_mat: pd.DataFrame, movies: pd.DataFrame):
    full_ratings = user_movie_mat.copy()
    recommendations = pd.DataFrame(columns=['movieId', 'title', 'rating'])

    for movie, _ in full_ratings[user].items():
        if np.isnan(full_ratings.loc[movie, user]) or full_ratings.loc[movie, user] == 0:
            full_ratings.loc[movie, user] = predict_rating(user_movie_mat, users_similarity_mat, user, movie, 10)
            new_row = {'movieId': movie, 'title': movies.loc[movie]['title'], 'rating': full_ratings.loc[movie, user]}
            recommendations.loc[len(recommendations)] = new_row

    recommendations = recommendations.sort_values(by='rating', ascending=False)
    return recommendations.head(n) if n > 0 else recommendations

In [296]:
full_ratings = pd.read_csv('../data/preprocessed/full_ratings_comp.csv', index_col='movieId')

In [310]:
user_id = 1
n_recommendations = 10

get_n_recommendations(user_id, n_recommendations, user_movie_mat, movies)

Unnamed: 0,movieId,title,rating
403,1200,Heartbreakers,3.02
303,858,"South Park: Bigger, Longer and Uncut",2.75
243,589,Fallen,2.58
232,541,George of the Jungle,2.51
494,1374,Final Destination 2,2.46
415,1221,Evolution,2.42
326,924,Goldfinger,2.4
549,1610,Hard Candy,2.16
357,1036,"Great Muppet Caper, The",2.13
384,1129,Hollow Man,2.09


In [298]:
user_movie_mat

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019,0.0,0.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,...,4.5,0.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.5,0.0,0.0


In [299]:
def store_ratings(user: int, n: int, user_movie_mat: pd.DataFrame, movies: pd.DataFrame):
    full_ratings = user_movie_mat.copy()

    for movie, _ in user_movie_mat[user].items():
        if np.isnan(user_movie_mat.loc[movie, user]) or user_movie_mat.loc[movie, user] == 0:
            user_movie_mat.loc[movie, user] = predict_rating(user_movie_mat, users_similarity_mat, user, movie, 100)

In [300]:
get_n_recommendations(user_id, n_recommendations, user_movie_mat_train, movies)

Unnamed: 0,movieId,title,rating
438,1240,Dirty Rotten Scoundrels,2.98
413,1200,Heartbreakers,2.81
311,858,"South Park: Bigger, Longer and Uncut",2.78
237,541,George of the Jungle,2.53
426,1221,Evolution,2.44
...,...,...,...
389,1103,"Road Warrior, The (Mad Max 2)",-0.12
96,207,Free Willy,-0.12
386,1096,Flatliners,-0.12
385,1095,Blood Simple,-0.12


In [301]:
from math import sqrt

In [302]:
user_movie_mat

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019,0.0,0.0,0.0,2.0,0.0,0.0,5.0,0.0,0.0,0.0,...,4.5,0.0,5.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.5,0.0,0.0


In [303]:
user = 1
SSE = 0
c = 0

In [304]:
for movie, _ in user_movie_mat_train[user].items():
        if np.isnan(user_movie_mat_train.loc[movie, user]) or user_movie_mat_train.loc[movie, user] == 0:
            user_movie_mat_train.loc[movie, user] = predict_rating(user_movie_mat_train, users_similarity_mat, user, movie, 100)

In [305]:
user_movie_mat_train

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,0.0,0.0,2.5,0.0,5.0
2,0.90,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,5.0,0.0,0.0,0.0,2.0,0.0,0.0
3,4.00,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
5,0.12,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.00,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018,5.00,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019,1.21,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.5,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020,0.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
2021,1.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,0.0


In [306]:

for movie, _ in user_movie_mat_test[user].items():
    if user_movie_mat_test.loc[movie, user] != 0:
        #print(user_movie_mat_test.loc[movie, user], user_movie_mat_train.loc[movie, user])
        E = user_movie_mat_test.loc[movie, user] - user_movie_mat_train.loc[movie, user]
        SSE = SSE + pow(E, 2)
        c = c+1
MSE = SSE/c
RMSE = sqrt(MSE)
print(RMSE)

3.1835473787967214


In [307]:
def ger_full_ratings():
    full_ratings = user_movie_mat.copy()

    for user, movies in full_ratings.items():
        for movie in movies.keys():
            if np.isnan(full_ratings.loc[movie, user]) or full_ratings.loc[movie, user] == 0:
                full_ratings.loc[movie, user] = predict_rating(
                    user_movie_mat, users_similarity_mat, user, movie
                )

In [308]:
full_ratings

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.00,-0.0,-0.0,0.78,4.00,0.05,4.50,0.01,0.01,0.02,...,4.00,0.03,4.00,3.0,4.00,2.50,4.00,2.50,3.00,5.00
2,0.03,-0.0,-0.0,-0.13,1.04,4.00,0.04,4.00,0.01,0.02,...,-0.03,4.00,0.04,5.0,3.50,0.65,0.87,2.00,-0.01,1.23
3,4.00,-0.0,-0.0,-0.13,1.04,5.00,0.04,0.01,0.01,0.02,...,-0.03,0.03,0.04,0.0,0.05,-0.17,-0.00,2.00,-0.01,-0.02
5,0.03,-0.0,-0.0,-0.13,1.04,5.00,0.04,0.01,0.01,0.02,...,-0.03,0.03,0.04,3.0,0.05,0.24,-0.00,0.19,-0.01,-0.02
6,4.00,-0.0,-0.0,0.78,1.04,4.00,0.04,0.01,0.01,0.02,...,-0.03,3.00,4.00,3.0,0.05,0.65,-0.00,1.39,-0.01,5.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139385,0.03,-0.0,-0.0,-0.13,-0.02,0.05,0.04,0.01,0.01,0.02,...,1.79,0.03,0.04,0.0,0.05,-0.17,-0.00,0.19,-0.01,4.50
139644,0.03,-0.0,-0.0,-0.13,-0.02,0.05,0.04,0.01,0.01,0.02,...,-0.03,0.03,0.04,0.0,0.05,-0.17,-0.00,0.19,-0.01,4.50
140110,0.03,-0.0,-0.0,-0.13,-0.02,0.05,0.04,0.01,0.01,5.00,...,-0.03,0.03,0.04,0.0,0.05,-0.17,-0.00,0.19,-0.01,-0.02
142488,0.03,-0.0,-0.0,-0.13,-0.02,0.05,0.04,0.01,0.01,0.02,...,1.79,0.03,0.04,0.0,0.05,-0.17,-0.00,0.19,-0.01,3.50


In [309]:
user_movie_mat.to_csv('../data/preprocessed/user_movie_mat.csv')
users_similarity_mat.to_pickle('../data/preprocessed/users_similarity_mat.pkl')

em tese podemos usar a similaridade por cosseno para prever a nota de um usuário no filtor por conteudo tbm. 

assim teremos as previsões de avaliações em 2 sistemas (baseado em conteúdo e colaborativo), com uma média poderada obtemos um previsão final, usando ela podemos obter uma lista final.