##Import the required libraries

In [28]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import pickle


##Load the Dataset

In [11]:
ratings = pd.read_csv("/content/drive/MyDrive/Recommendation_Project/ml-latest-small/ratings.csv")
movies = pd.read_csv("/content/drive/MyDrive/Recommendation_Project/ml-latest-small/movies.csv")


##Merge Dataset


In [12]:
movie_data = pd.merge(ratings, movies, on='movieId')

##Extract the year

In [13]:
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').astype(float)
movies['year'].fillna(movies['year'].median(), inplace=True)


##Encode movie title

In [14]:
movies['title_year'] = movies['title'] + " (" + movies['year'].astype(int).astype(str) + ")"
movie_titles = movies.set_index('movieId')['title_year'].to_dict()


##Encode the genres

In [15]:
movie_genres = movies['genres'].str.get_dummies(sep='|')


##Create Utility Matrix

In [16]:
utility_matrix = movie_data.pivot_table(index='movieId', columns='userId', values='rating')
utility_matrix.fillna(0, inplace=True)


##Normalize Utility Matrix

In [17]:
utility_matrix_normalized = utility_matrix.sub(utility_matrix.mean(axis=1), axis=0)


##Combine data

In [18]:

final = utility_matrix_normalized.merge(movie_genres, left_index=True, right_index=True, how='left')
movies.set_index('movieId', inplace=True)

final = final.merge(movies[['year']], left_index=True, right_index=True, how='left')

final['year'] = (final['year'] - final['year'].min()) / (final['year'].max() - final['year'].min())


##Calculate the similarity matrix

In [30]:
def calculate_pearson_similarity(matrix):
    df = pd.DataFrame(matrix)
    similarity_matrix = df.T.corr(method='pearson')
    similarity_matrix = similarity_matrix.to_numpy()

    return similarity_matrix


In [31]:
similarity_matrix = calculate_pearson_similarity(final.values)
similarity_df = pd.DataFrame(similarity_matrix, index=final.index, columns=final.index)


##Saving the similarity matrix

In [37]:
with open('/content/drive/MyDrive/Recommendation_Project/similarity_matrix.pkl', 'wb') as file:
    pickle.dump(similarity_df, file)

In [38]:
with open('/content/drive/MyDrive/Recommendation_Project/movie_titles.pkl', 'wb') as file:
    pickle.dump(movie_titles, file)

##Get similar movies


In [34]:
def similarMovies(movieid, topn=10):
    with open('similarity_matrix.pkl', 'rb') as file:
        similarity_df = pickle.load(file)

    with open('movie_titles.pkl', 'rb') as file:
        movie_titles = pickle.load(file)


    similar_movies = similarity_df[movieid].sort_values(ascending=False).index[1:topn+1]
    similarities = similarity_df[movieid].sort_values(ascending=False).values[1:topn+1]

    results = pd.DataFrame({
        'movieId': similar_movies,
        'similarity': similarities
    })

    results['title_year'] = results['movieId'].map(movie_titles)

    return results[['movieId', 'title_year', 'similarity']]


In [35]:
df = similarMovies(1, 10)
df


Unnamed: 0,movieId,title_year,similarity
0,3114,Toy Story 2 (1999) (1999),0.461476
1,1265,Groundhog Day (1993) (1993),0.361295
2,780,Independence Day (a.k.a. ID4) (1996) (1996),0.358919
3,1073,Willy Wonka & the Chocolate Factory (1971) (1971),0.357005
4,648,Mission: Impossible (1996) (1996),0.353017
5,788,"Nutty Professor, The (1996) (1996)",0.351191
6,2355,"Bug's Life, A (1998) (1998)",0.346571
7,364,"Lion King, The (1994) (1994)",0.34387
8,34,Babe (1995) (1995),0.341445
9,4886,"Monsters, Inc. (2001) (2001)",0.330622
