{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "source": [ "##Import the required libraries" ], "metadata": { "id": "8iCgYULHnMZ2" } }, { "cell_type": "code", "execution_count": 28, "metadata": { "id": "45nBTOSLmwMP" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from scipy.stats import pearsonr\n", "import pickle\n" ] }, { "cell_type": "markdown", "source": [ "##Load the Dataset" ], "metadata": { "id": "cKBkwH2pnVFb" } }, { "cell_type": "code", "source": [ "ratings = pd.read_csv(\"/content/drive/MyDrive/Recommendation_Project/ml-latest-small/ratings.csv\")\n", "movies = pd.read_csv(\"/content/drive/MyDrive/Recommendation_Project/ml-latest-small/movies.csv\")\n" ], "metadata": { "id": "puRTWNYim2fL" }, "execution_count": 11, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Merge Dataset\n" ], "metadata": { "id": "Hr7wu9SJnbdO" } }, { "cell_type": "code", "source": [ "movie_data = pd.merge(ratings, movies, on='movieId')" ], "metadata": { "id": "lyJ0udfLnINx" }, "execution_count": 12, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Extract the year" ], "metadata": { "id": "DfleDEVqnhFM" } }, { "cell_type": "code", "source": [ "movies['year'] = movies['title'].str.extract(r'\\((\\d{4})\\)').astype(float)\n", "movies['year'].fillna(movies['year'].median(), inplace=True)\n" ], "metadata": { "id": "4ZxsNL0UnLnn" }, "execution_count": 13, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Encode movie title" ], "metadata": { "id": "WGhpQBZLnu2D" } }, { "cell_type": "code", "source": [ "movies['title_year'] = movies['title'] + \" (\" + movies['year'].astype(int).astype(str) + \")\"\n", "movie_titles = movies.set_index('movieId')['title_year'].to_dict()\n" ], "metadata": { "id": "PMrBhmoAnuGo" }, "execution_count": 14, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Encode the genres" ], "metadata": { "id": "ccY0EENon1Yp" } }, { "cell_type": "code", "source": [ "movie_genres = movies['genres'].str.get_dummies(sep='|')\n" ], "metadata": { "id": "RLlQYBffnySw" }, "execution_count": 15, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Create Utility Matrix" ], "metadata": { "id": "qr7mRrUzn-gQ" } }, { "cell_type": "code", "source": [ "utility_matrix = movie_data.pivot_table(index='movieId', columns='userId', values='rating')\n", "utility_matrix.fillna(0, inplace=True)\n" ], "metadata": { "id": "zleyKyMrn9fS" }, "execution_count": 16, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Normalize Utility Matrix" ], "metadata": { "id": "nDKXoMzKoFw9" } }, { "cell_type": "code", "source": [ "utility_matrix_normalized = utility_matrix.sub(utility_matrix.mean(axis=1), axis=0)\n" ], "metadata": { "id": "U2ktQzWZoEvj" }, "execution_count": 17, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Combine data" ], "metadata": { "id": "O1evgm7CoNe9" } }, { "cell_type": "code", "source": [ "\n", "final = utility_matrix_normalized.merge(movie_genres, left_index=True, right_index=True, how='left')\n", "movies.set_index('movieId', inplace=True)\n", "\n", "final = final.merge(movies[['year']], left_index=True, right_index=True, how='left')\n", "\n", "final['year'] = (final['year'] - final['year'].min()) / (final['year'].max() - final['year'].min())\n" ], "metadata": { "id": "yuPUkw0boKgx" }, "execution_count": 18, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Calculate the similarity matrix" ], "metadata": { "id": "xm8K4wRVobAx" } }, { "cell_type": "code", "source": [ "def calculate_pearson_similarity(matrix):\n", " df = pd.DataFrame(matrix)\n", " similarity_matrix = df.T.corr(method='pearson')\n", " similarity_matrix = similarity_matrix.to_numpy()\n", "\n", " return similarity_matrix\n" ], "metadata": { "id": "neJ1NVlroYKX" }, "execution_count": 30, "outputs": [] }, { "cell_type": "code", "source": [ "similarity_matrix = calculate_pearson_similarity(final.values)\n", "similarity_df = pd.DataFrame(similarity_matrix, index=final.index, columns=final.index)\n" ], "metadata": { "id": "9n8zbfslueD9" }, "execution_count": 31, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Saving the similarity matrix" ], "metadata": { "id": "fvD4Vh-qpAFL" } }, { "cell_type": "code", "source": [ "with open('/content/drive/MyDrive/Recommendation_Project/similarity_matrix.pkl', 'wb') as file:\n", " pickle.dump(similarity_df, file)" ], "metadata": { "id": "yW03QM-KogPb" }, "execution_count": 37, "outputs": [] }, { "cell_type": "code", "source": [ "with open('/content/drive/MyDrive/Recommendation_Project/movie_titles.pkl', 'wb') as file:\n", " pickle.dump(movie_titles, file)" ], "metadata": { "id": "Lut-GEUPpj5a" }, "execution_count": 38, "outputs": [] }, { "cell_type": "markdown", "source": [ "##Get similar movies\n" ], "metadata": { "id": "7rekdXOQpIek" } }, { "cell_type": "code", "source": [ "def similarMovies(movieid, topn=10):\n", " with open('similarity_matrix.pkl', 'rb') as file:\n", " similarity_df = pickle.load(file)\n", "\n", " with open('movie_titles.pkl', 'rb') as file:\n", " movie_titles = pickle.load(file)\n", "\n", "\n", " similar_movies = similarity_df[movieid].sort_values(ascending=False).index[1:topn+1]\n", " similarities = similarity_df[movieid].sort_values(ascending=False).values[1:topn+1]\n", "\n", " results = pd.DataFrame({\n", " 'movieId': similar_movies,\n", " 'similarity': similarities\n", " })\n", "\n", " results['title_year'] = results['movieId'].map(movie_titles)\n", "\n", " return results[['movieId', 'title_year', 'similarity']]\n" ], "metadata": { "id": "e5YJbVafpEnj" }, "execution_count": 34, "outputs": [] }, { "cell_type": "code", "source": [ "df = similarMovies(1, 10)\n", "df\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "id": "Y4t4eUIlpS3M", "outputId": "62c80fa7-54ea-4b93-bffe-2434093016e7" }, "execution_count": 35, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId title_year similarity\n", "0 3114 Toy Story 2 (1999) (1999) 0.461476\n", "1 1265 Groundhog Day (1993) (1993) 0.361295\n", "2 780 Independence Day (a.k.a. ID4) (1996) (1996) 0.358919\n", "3 1073 Willy Wonka & the Chocolate Factory (1971) (1971) 0.357005\n", "4 648 Mission: Impossible (1996) (1996) 0.353017\n", "5 788 Nutty Professor, The (1996) (1996) 0.351191\n", "6 2355 Bug's Life, A (1998) (1998) 0.346571\n", "7 364 Lion King, The (1994) (1994) 0.343870\n", "8 34 Babe (1995) (1995) 0.341445\n", "9 4886 Monsters, Inc. (2001) (2001) 0.330622" ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitle_yearsimilarity
03114Toy Story 2 (1999) (1999)0.461476
11265Groundhog Day (1993) (1993)0.361295
2780Independence Day (a.k.a. ID4) (1996) (1996)0.358919
31073Willy Wonka & the Chocolate Factory (1971) (1971)0.357005
4648Mission: Impossible (1996) (1996)0.353017
5788Nutty Professor, The (1996) (1996)0.351191
62355Bug's Life, A (1998) (1998)0.346571
7364Lion King, The (1994) (1994)0.343870
834Babe (1995) (1995)0.341445
94886Monsters, Inc. (2001) (2001)0.330622
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", " \n", " \n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "df", "summary": "{\n \"name\": \"df\",\n \"rows\": 10,\n \"fields\": [\n {\n \"column\": \"movieId\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 1498,\n \"min\": 34,\n \"max\": 4886,\n \"num_unique_values\": 10,\n \"samples\": [\n 34,\n 1265,\n 788\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"title_year\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 10,\n \"samples\": [\n \"Babe (1995) (1995)\",\n \"Groundhog Day (1993) (1993)\",\n \"Nutty Professor, The (1996) (1996)\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"similarity\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 0.03663724828764377,\n \"min\": 0.3306216446172369,\n \"max\": 0.4614763983999328,\n \"num_unique_values\": 10,\n \"samples\": [\n 0.341445139872491,\n 0.3612947447903729,\n 0.3511905125596912\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 35 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "D1oSS4BDpbGa" }, "execution_count": null, "outputs": [] } ] }