{ "cells": [ { "cell_type": "markdown", "id": "322a393e-ee13-445d-93e1-338d824e6219", "metadata": {}, "source": [ "# FILM RECOMMENDATION SYSTEM" ] }, { "cell_type": "code", "execution_count": 1, "id": "f7135c09-f799-4d79-b0c3-fba4a3a048b0", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Importing libraries\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "import seaborn as sns\n", "import missingno as msno\n", "from sklearn.impute import KNNImputer\n", "from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "import joblib\n", "from joblib import dump, load\n", "import time\n", "\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from tensorflow.keras.layers import Input, Dense\n", "from tensorflow.keras.models import Model\n", "\n", "pd.set_option('display.max_rows', 500)\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.width', 1000)\n", "import warnings\n", "warnings.filterwarnings('ignore')" ] }, { "cell_type": "code", "execution_count": 2, "id": "cfe9b983-c2c6-426e-89c5-7b7ee1838b31", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Importing data\n", "DATA_DIR = '../data'\n", "\n", "def load_movie_titles(platform):\n", "    \"\"\"Read DATA_DIR/platform/titles.csv and keep only the rows whose type is MOVIE.\"\"\"\n", "    titles = pd.read_csv(f'{DATA_DIR}/{platform}/titles.csv')\n", "    return titles[titles['type'] == 'MOVIE']\n", "\n", "def load_actor_names(platform):\n", "    \"\"\"Read DATA_DIR/platform/credits.csv and return one row per title id with its actor names joined by ', '.\"\"\"\n", "    credits_df = pd.read_csv(f'{DATA_DIR}/{platform}/credits.csv')\n", "    # One .loc call with a boolean row mask and a list of column labels,\n", "    # instead of chained indexing with a tuple of labels.\n", "    actors = credits_df.loc[credits_df['role'] == 'ACTOR', ['id', 'name']]\n", "    return actors.groupby('id')['name'].apply(', '.join).reset_index()\n", "\n", "# titles data\n", "netflix_titles = load_movie_titles('Netflix')\n", "amazon_titles = load_movie_titles('Amazon_prime')\n", "\n", "# credits data\n", "netflix_credits = load_actor_names('Netflix')\n", "amazon_credits = load_actor_names('Amazon_prime')" ] },
{ "cell_type": "code", "execution_count": 3, "id": "8f15ccaa-c740-44e8-b136-5a93f7a43f4d", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Merging data for each platform\n", "netflix = netflix_titles.merge(netflix_credits, on='id', how='left')\n", "amazon = amazon_titles.merge(amazon_credits, on='id', how='left')" ] }, { "cell_type": "code", "execution_count": 4, "id": "f771e728-27dd-4b99-b0b3-1f80ab1d561a", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Concatenating data\n", "dataset = pd.concat([netflix, amazon], axis=0).reset_index()" ] }, { "cell_type": "code", "execution_count": 5, "id": "8242540d-fb36-4655-a521-1b6eab3a4567", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "(13153, 17)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "7fc15ea3-4e67-42ec-a7ca-0528778f2327", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexidtitletypedescriptionrelease_yearage_certificationruntimegenresproduction_countriesseasonsimdb_idimdb_scoreimdb_votestmdb_popularitytmdb_scorename
00tm82169RockyMOVIEWhen world heavyweight boxing champion, Apollo...1976PG119['drama', 'sport']['US']NaNtt00751488.1588100.0106.3617.782Sylvester Stallone, Talia Shire, Burt Young, C...
11tm17823GreaseMOVIEAustralian good girl Sandy and greaser Danny f...1978PG110['romance', 'comedy']['US']NaNtt00776317.2283316.033.1607.406John Travolta, Olivia Newton-John, Stockard Ch...
\n", "
" ], "text/plain": [ " index id title type description release_year age_certification runtime genres production_countries seasons imdb_id imdb_score imdb_votes tmdb_popularity tmdb_score name\n", "0 0 tm82169 Rocky MOVIE When world heavyweight boxing champion, Apollo... 1976 PG 119 ['drama', 'sport'] ['US'] NaN tt0075148 8.1 588100.0 106.361 7.782 Sylvester Stallone, Talia Shire, Burt Young, C...\n", "1 1 tm17823 Grease MOVIE Australian good girl Sandy and greaser Danny f... 1978 PG 110 ['romance', 'comedy'] ['US'] NaN tt0077631 7.2 283316.0 33.160 7.406 John Travolta, Olivia Newton-John, Stockard Ch..." ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.head(2)" ] }, { "cell_type": "code", "execution_count": 7, "id": "d84823ed-b408-4658-8553-87ade8e82b79", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 13153 entries, 0 to 13152\n", "Data columns (total 17 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 index 13153 non-null int64 \n", " 1 id 13153 non-null object \n", " 2 title 13153 non-null object \n", " 3 type 13153 non-null object \n", " 4 description 13032 non-null object \n", " 5 release_year 13153 non-null int64 \n", " 6 age_certification 4209 non-null object \n", " 7 runtime 13153 non-null int64 \n", " 8 genres 13153 non-null object \n", " 9 production_countries 13153 non-null object \n", " 10 seasons 0 non-null float64\n", " 11 imdb_id 12315 non-null object \n", " 12 imdb_score 11941 non-null float64\n", " 13 imdb_votes 11916 non-null float64\n", " 14 tmdb_popularity 12670 non-null float64\n", " 15 tmdb_score 11217 non-null float64\n", " 16 name 12019 non-null object \n", "dtypes: float64(5), int64(3), object(9)\n", "memory usage: 1.7+ MB\n" ] } ], "source": [ "dataset.info()" ] }, { "cell_type": "code", "execution_count": 8, "id": "22c2e0da-7117-422d-91f3-06bb63e98e84", "metadata": { "tags": [] 
}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indexrelease_yearruntimeseasonsimdb_scoreimdb_votestmdb_popularitytmdb_score
count19455.00000019455.00000019455.0000000.017372.0000001.731500e+0418926.00000017050.000000
mean2966.4872272003.04379392.343922NaN6.0596712.717334e+0410.7703416.108614
std2567.45970725.63839230.800705NaN1.2446231.123930e+0543.9424641.332341
min0.0000001901.0000001.000000NaN1.1000005.000000e+000.0001530.500000
25%960.0000001999.00000078.000000NaN5.3000002.020000e+021.6710005.400000
50%2087.0000002015.00000092.000000NaN6.2000001.044000e+033.7130006.200000
75%4457.5000002019.000000107.000000NaN7.0000007.266500e+039.7737507.000000
max9321.0000002023.000000940.000000NaN9.9000002.684317e+063187.53100010.000000
\n", "
" ], "text/plain": [ " index release_year runtime seasons imdb_score imdb_votes tmdb_popularity tmdb_score\n", "count 19455.000000 19455.000000 19455.000000 0.0 17372.000000 1.731500e+04 18926.000000 17050.000000\n", "mean 2966.487227 2003.043793 92.343922 NaN 6.059671 2.717334e+04 10.770341 6.108614\n", "std 2567.459707 25.638392 30.800705 NaN 1.244623 1.123930e+05 43.942464 1.332341\n", "min 0.000000 1901.000000 1.000000 NaN 1.100000 5.000000e+00 0.000153 0.500000\n", "25% 960.000000 1999.000000 78.000000 NaN 5.300000 2.020000e+02 1.671000 5.400000\n", "50% 2087.000000 2015.000000 92.000000 NaN 6.200000 1.044000e+03 3.713000 6.200000\n", "75% 4457.500000 2019.000000 107.000000 NaN 7.000000 7.266500e+03 9.773750 7.000000\n", "max 9321.000000 2023.000000 940.000000 NaN 9.900000 2.684317e+06 3187.531000 10.000000" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.describe()" ] }, { "cell_type": "code", "execution_count": 8, "id": "1d43e6f7-8e86-488d-ab73-51aa31beeed8", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "index 0.000000\n", "id 0.000000\n", "title 0.000000\n", "type 0.000000\n", "description 0.009199\n", "release_year 0.000000\n", "age_certification 0.679997\n", "runtime 0.000000\n", "genres 0.000000\n", "production_countries 0.000000\n", "seasons 1.000000\n", "imdb_id 0.063712\n", "imdb_score 0.092146\n", "imdb_votes 0.094047\n", "tmdb_popularity 0.036722\n", "tmdb_score 0.147191\n", "name 0.086216\n", "dtype: float64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.isnull().sum()/dataset.shape[0]" ] }, { "cell_type": "code", "execution_count": 9, "id": "8d504a43-1d71-479d-baa6-fcadd362072d", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# msno.matrix() draws on its own figure, so the size is passed to it directly;\n", "# a preceding plt.figure() call only leaves an extra empty figure in the output.\n", "msno.matrix(dataset, figsize=(8, 5), sparkline=False)\n", "plt.title('Distribution of Missing Values', size=20)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 10, "id": "a2d071bb-3390-459b-8624-0f81454098a5", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking data duplicates\n", "dataset.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 11, "id": "2102e2a3-edf6-4c08-aa2c-6c8401eea99f", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "87" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking id duplicates\n", "dataset['id'].duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 12, "id": "c77d5092-4a7b-45bb-838d-b966869dcd09", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "278" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking title duplicates\n", "dataset['title'].duplicated().sum()" ] }, { "cell_type": "markdown", "id": "8c202099-e6e8-4312-be98-79d87a9d052e", "metadata": {}, "source": [ "## PREPROCESSING DATA" ] }, { "cell_type": "code", "execution_count": 13, "id": "c155d745-9212-4c54-8988-b5a50d037128", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Dropping 'index', 'type', 'seasons' and 'imdb_id' columns\n", "dataset.drop(['index', 'type', 'seasons', 'imdb_id'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 14, "id": "ae627626-53eb-49a8-9b27-ff7b4be2d0d4", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Removing duplicates\n", "dataset = dataset.drop_duplicates(subset=['title'], keep='first').reset_index()" ] }, { "cell_type": "markdown", "id": "8bcf5e01-c7b7-468a-8ed2-03f20758c93c", "metadata": {}, "source": [ 
"### release_year" ] }, { "cell_type": "code", "execution_count": 15, "id": "f94f7dc7-b6ab-4a52-b055-4308a78f5bbc", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "count 12875.000000\n", "mean 2006.637670\n", "std 23.435384\n", "min 1912.000000\n", "25% 2007.000000\n", "50% 2017.000000\n", "75% 2020.000000\n", "max 2023.000000\n", "Name: release_year, dtype: float64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Film release years range from 1912 to 2023 once duplicate titles are removed (see the output below)\n", "dataset['release_year'].describe()" ] }, { "cell_type": "code", "execution_count": 16, "id": "16b71394-165f-4c79-aba8-20e4af210019", "metadata": { "tags": [] }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Inspecting release year distribution\n", "plt.figure(figsize=(5,3))\n", "sns.histplot(dataset['release_year'])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 17, "id": "28c6a584-cf1e-4458-912c-5fa55bd02845", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Regrouping film release year into 2 groups: 0 = released before the cutoff year, 1 = cutoff year or later\n", "# (release_year has no missing values, see dataset.info() above)\n", "RELEASE_YEAR_CUTOFF = 2015\n", "dataset['release_year'] = (dataset['release_year'] >= RELEASE_YEAR_CUTOFF).astype(int)" ] }, { "cell_type": "code", "execution_count": 18, "id": "6e15aa62-616d-4b33-b60e-72be319351c4", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Converting release year into dummy variables\n", "dataset = pd.get_dummies(dataset, columns=['release_year'], prefix='year', drop_first=True)" ] }, { "cell_type": "markdown", "id": "ae6352f3-fd10-44b0-b579-82069328d0f9", "metadata": {}, "source": [ "### age_certification" ] }, { "cell_type": "code", "execution_count": 19, "id": "144579a3-dfc0-415c-9bfd-9f7d3cfe8cc2", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "R 1795\n", "PG-13 1090\n", "PG 817\n", "G 370\n", "NC-17 25\n", "Name: age_certification, dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['age_certification'].value_counts()\n", "# an R rating means that no one under 17 can see this movie unless the parent is with them\n", "# PG-13 means that this movie may not be suitable for kids under 13\n", "# PG means parental guidance. 
The movie is likely for children, but parents should watch the movie with their children\n", "# A movie that is meant for children will likely be rated as G\n", "# NC-17 is an “adults only” rating that signifies that no one under the age of 17 can see the film" ] }, { "cell_type": "code", "execution_count": 20, "id": "61912093-0fa2-4346-957f-7b0fdd5373f5", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Replacing null values by NR (not rated)\n", "# The result is assigned back: an in-place fillna on a selected column acts on a temporary\n", "# object under pandas Copy-on-Write and would leave the dataset unchanged.\n", "dataset['age_certification'] = dataset['age_certification'].fillna('NR')" ] }, { "cell_type": "code", "execution_count": 21, "id": "ee9dbd1e-ec5f-48fc-a473-6a040200f33a", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Encoding age certification\n", "dataset = pd.get_dummies(dataset, columns=['age_certification'], prefix='age', drop_first=True)" ] }, { "cell_type": "markdown", "id": "84f83b2a-d895-4191-b9ac-53e7a8a6c935", "metadata": {}, "source": [ "### runtime" ] }, { "cell_type": "code", "execution_count": 22, "id": "9e978beb-83e8-43e7-b17f-a94478ed2610", "metadata": { "tags": [] }, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.figure(figsize=(5,3))\n", "sns.histplot(dataset['runtime'])\n", "plt.xlim(10, 300)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 23, "id": "cd8be29f-930a-4e33-919f-123c30f93b28", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "191" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# How many films are shorter than 30 minutes?\n", "len(dataset[dataset['runtime']<30])" ] }, { "cell_type": "code", "execution_count": 24, "id": "b031a61e-e733-4bff-a891-10cf50c4bac5", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Raise every runtime below 30 minutes to 30 so that very short films do not act as outliers\n", "dataset['runtime'] = dataset['runtime'].clip(lower=30)" ] }, { "cell_type": "markdown", "id": "240bcab4-8784-4968-b2db-57fc0f82f4e8", "metadata": {}, "source": [ "### genre" ] }, { "cell_type": "code", "execution_count": 25, "id": "b10595d3-3f73-42bb-a991-76809d37f4bd", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Cleaning text: strip the list punctuation and the spaces, e.g. \"['drama', 'sport']\" becomes \"drama,sport\"\n", "# regex=False treats every pattern as a literal on all pandas versions ('[' alone is not a valid regex)\n", "for char in ('[', ']', \"'\", ' '):\n", "    dataset['genres'] = dataset['genres'].str.replace(char, '', regex=False)" ] }, { "cell_type": "code", "execution_count": 26, "id": "09f0d86c-22eb-442e-8d81-4071f8961017", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Getting genres list for each film\n", "dataset['genres']=dataset['genres'].str.split(',')" ] }, { "cell_type": "code", "execution_count": 27, "id": "718e7a33-bdfd-4f37-a0da-4460d8583db6", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Retrieving genres list of all films\n", "genres_list = [j for i in dataset['genres'] for j in i]\n", "genres_set = set(genres_list)\n", "genres_set.discard('')" ] }, { "cell_type": "code", "execution_count": 28, "id": 
"0d221ff1-ff33-4774-8350-749dbefcdbce", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "19" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking number of all genres\n", "len(genres_set)" ] }, { "cell_type": "code", "execution_count": 29, "id": "59e02576-63ce-4350-9bce-c3e5265f39a9", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Creating new columns corresponding to each genre\n", "\n", "## Building function to get dummies genres\n", "def get_dummies_genres(data, genres=None):\n", "    \"\"\"Return {genre: [1 if the film has the genre else 0, one entry per film]}.\n", "\n", "    data: iterable holding one list of genre strings per film.\n", "    genres: genres to encode; defaults to the notebook-level genres_set.\n", "    Keys are in alphabetical order and every list follows the order of data.\n", "    \"\"\"\n", "    if genres is None:\n", "        genres = genres_set\n", "    films = list(data)  # materialised once because it is scanned once per genre\n", "    return {genre: [int(genre in film) for film in films] for genre in sorted(genres)}\n", "\n", "## Getting the genre dummies data\n", "dummies_genres = get_dummies_genres(dataset['genres'])\n", "\n", "## Replacing the original genres column by the genre_* indicator columns, added in a single concat\n", "genre_columns = pd.DataFrame(dummies_genres, index=dataset.index).add_prefix('genre_')\n", "dataset = pd.concat([dataset.drop(columns=['genres']), genre_columns], axis=1)" ] }, { "cell_type": "markdown", "id": "eabfc81c-0f48-4daf-96f3-afe7fd70f73e", "metadata": {}, "source": [ "### production_countries" ] }, { "cell_type": "code", "execution_count": 30, "id": "7c013c0c-0f91-4052-aafd-1a1525b666ca", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "703" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking number of films with no production country recorded\n", "len(dataset[dataset['production_countries']=='[]'])" ] }, { "cell_type": "code", "execution_count": 31, "id": "faea79c6-3a60-4299-9566-3879994ed22b", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Replacing missing records 
by most frequent value (US)\n", "# The result is assigned back: an in-place replace on a selected column acts on a temporary\n", "# object under pandas Copy-on-Write and would leave the dataset unchanged.\n", "dataset['production_countries'] = dataset['production_countries'].replace(to_replace=\"[]\", value=\"['US']\")" ] }, { "cell_type": "code", "execution_count": 32, "id": "c40d2877-b0c1-4f81-8360-38be9384dbe7", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Cleaning text: strip the list punctuation\n", "# regex=False treats every pattern as a literal on all pandas versions ('[' alone is not a valid regex)\n", "for char in ('[', ']', \"'\"):\n", "    dataset['production_countries'] = dataset['production_countries'].str.replace(char, '', regex=False)" ] }, { "cell_type": "code", "execution_count": 33, "id": "785a7951-f660-418a-864b-67ed1dac36e2", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Getting countries list for each film\n", "dataset['production_countries']=dataset['production_countries'].str.split(', ')" ] }, { "cell_type": "code", "execution_count": 34, "id": "ae99de7c-680f-453a-920a-bae788d15a88", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Retrieving countries list of all films\n", "countries_list = [j for i in dataset['production_countries'] for j in i]\n", "countries_set = set(countries_list)" ] }, { "cell_type": "code", "execution_count": 35, "id": "4f59f0fb-219c-4109-8379-271756aa3423", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "145" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking number of all countries\n", "len(countries_set)" ] }, { "cell_type": "code", "execution_count": 36, "id": "f85c1721-0090-4501-b4fe-093889a56a99", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Creating new columns corresponding to each country\n", "\n", "## Building function to get dummies countries\n", "def get_dummies_countries(data, countries=None):\n", "    \"\"\"Return {country: [1 if the film lists the country else 0, one entry per film]}.\n", "\n", "    data: iterable holding one list of country codes per film.\n", "    countries: countries to encode; defaults to the notebook-level countries_set.\n", "    Keys are in alphabetical order and every list follows the order of data.\n", "    \"\"\"\n", "    if countries is None:\n", "        countries = countries_set\n", "    films = list(data)  # materialised once because it is scanned once per country\n", "    return {country: [int(country in film) for film in films] for country in sorted(countries)}\n", "\n", "## Getting the country dummies data\n", "dummies_countries = get_dummies_countries(dataset['production_countries'])\n", "\n", "## Replacing the original production_countries column by the country_* indicator columns, added in a single concat\n", "country_columns = pd.DataFrame(dummies_countries, index=dataset.index).add_prefix('country_')\n", "dataset = pd.concat([dataset.drop(columns=['production_countries']), country_columns], axis=1)" ] }, { "cell_type": "markdown", "id": "b7ccf6ed-c8e5-4e90-8786-5e2b143bbf7d", "metadata": {}, "source": [ "### name" ] }, { "cell_type": "code", "execution_count": 37, "id": "477c29e2-3829-4327-8f0d-fe6610daa4f8", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Replacing missing values (assigned back, an in-place fillna on a selected column is a no-op under Copy-on-Write)\n", "dataset['name'] = dataset['name'].fillna('NoName')" ] }, { "cell_type": "code", "execution_count": 38, "id": "5fdf52d2-f8b7-467d-a552-3c389ceb1466", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Getting actors list for each film\n", "dataset['name']=dataset['name'].str.split(', ')" ] }, { "cell_type": "code", "execution_count": 39, "id": "54ffff42-b3de-42b3-ac93-c4fe1d639b00", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Retrieving actors list of all films\n", "actors_list = [j for i in dataset['name'] for j in i]\n", "actors_set = set(actors_list)" ] }, { "cell_type": "code", "execution_count": 40, "id": "3c7f4e23-1665-4cce-898a-fad84006fbe1", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "111043" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Checking number of all actors\n", "len(actors_set)" ] }, { "cell_type": "code", "execution_count": 41, "id": "af784919-0705-4b5e-8508-972c2be10584", "metadata": { "tags": [] }, "outputs": [], "source": [ "# The actors list is too large, 
we limit it to 200 names (the 100 Greatest Actors and the 100 Greatest Actresses\n", "# of All Time following the IMDb selection)\n", "# (reference links: https://www.imdb.com/list/ls050274118/ ; https://www.imdb.com/list/ls000055475/)\n", "# Names are matched exactly against the cast lists, so the spelling has to be the credited one.\n", "# NOTE(review): \"Peter O'Toole\" had lost its apostrophe ('Peter OToole'); confirm the spelling used in credits.csv.\n", "greatest_actors = ['Jack Nicholson', 'Marlon Brando', 'Robert De Niro', 'Al Pacino', 'Daniel Day-Lewis', 'Dustin Hoffman', 'Tom Hanks',\n", "                   'Anthony Hopkins', 'Paul Newman', 'Denzel Washington', 'Spencer Tracy', 'Laurence Olivier', 'Jack Lemmon',\n", "                   'Michael Caine', 'James Stewart', 'Robin Williams', 'Robert Duvall', 'Sean Penn', 'Morgan Freeman', 'Jeff Bridges',\n", "                   'Sidney Poitier', \"Peter O'Toole\", 'Clint Eastwood', 'Gene Hackman', 'Charles Chaplin', 'Ben Kingsley',\n", "                   'Philip Seymour Hoffman', 'Leonardo DiCaprio', 'Russell Crowe', 'Kevin Spacey', 'Humphrey Bogart', 'Gregory Peck',\n", "                   'Clark Gable', 'Gary Cooper', 'George C. Scott', 'Jason Robards', 'Charles Laughton', 'Anthony Quinn', 'Peter Sellers',\n", "                   'James Cagney', 'Peter Finch', 'Henry Fonda', 'Cary Grant', 'Richard Burton', 'Burt Lancaster', 'William Holden',\n", "                   'John Wayne', 'Kirk Douglas', 'Alec Guinness', 'Christopher Plummer', 'Tommy Lee Jones', 'Sean Connery', 'Alan Arkin',\n", "                   'Christopher Walken', 'Joe Pesci', 'Ian McKellen', 'Michael Douglas', 'Jon Voight', 'Albert Finney', 'Geoffrey Rush',\n", "                   'Jeremy Irons', 'Javier Bardem', 'Heath Ledger', 'Christoph Waltz', 'Ralph Fiennes', 'Johnny Depp', 'Benicio Del Toro',\n", "                   'Jamie Foxx', 'Joaquin Phoenix', 'Colin Firth', 'Matthew McConaughey', 'Christian Bale', 'Gary Oldman', 'Edward Norton',\n", "                   'Brad Pitt', 'Tom Cruise', 'Matt Damon', 'Hugh Jackman', 'Robert Downey Jr.', 'Liam Neeson', 'Mel Gibson', 'Harrison Ford',\n", "                   'Woody Allen', 'Steve McQueen', 'Orson Welles', 'Robert Redford', 'James Dean', 'Charlton Heston', 'Gene Kelly',\n", "                   'Robert Mitchum', 'Bill Murray', 'Samuel L. Jackson', 'Jim Carrey', 'Don Cheadle', 'Martin Sheen', 'Alan Rickman',\n", "                   'Edward G. Robinson', 'Will Smith', 'John Goodman', 'Buster Keaton', 'Meryl Streep', 'Ingrid Bergman', 'Vivien Leigh',\n", "                   'Bette Davis', 'Jodie Foster', 'Katharine Hepburn', 'Elizabeth Taylor', 'Kate Winslet', 'Hilary Swank', 'Naomi Watts',\n", "                   'Shirley MacLaine', 'Natalie Portman', 'Jennifer Connelly', 'Susan Sarandon', 'Charlize Theron', 'Cate Blanchett',\n", "                   'Rachel Weisz', 'Diane Lane', 'Sharon Stone', 'Scarlett Johansson', 'Juliette Lewis', 'Gwyneth Paltrow', 'Helen Mirren',\n", "                   'Judi Dench', 'Monica Bellucci', 'Angelina Jolie', 'Nicole Kidman', 'Anjelica Huston', 'Romy Schneider', 'Ashley Judd',\n", "                   'Grace Kelly', 'Patricia Clarkson', 'Simone Signoret', 'Isabelle Adjani', 'Marion Cotillard', 'Sigourney Weaver',\n", "                   'Jane Fonda', 'Helen Hunt', 'Frances McDormand', 'Jessica Lange', 'Catherine Deneuve', 'Barbara Stanwyck', 'Holly Hunter',\n", "                   'Kathy Bates', 'Sissy Spacek', 'Sally Field', 'Diane Keaton', 'Faye Dunaway', 'Penélope Cruz', 'Laura Dern', 'Sophia Loren',\n", "                   'Ellen Burstyn', 'Julie Christie', 'Olivia de Havilland', 'Greer Garson', 'Helena Bonham Carter', 'Geena Davis',\n", "                   'Audrey Tautou', 'Rachel McAdams', 'Tilda Swinton', 'Elisabeth Shue', 'Maggie Gyllenhaal', 'Maria Bello', 'Franka Potente',\n", "                   'Kristin Scott Thomas', 'Barbra Streisand', 'Juliette Binoche', 'Kathleen Turner', 'Lauren Bacall', 'Ellen Barkin',\n", "                   'Shelley Winters', 'Vanessa Redgrave', 'Audrey Hepburn', 'Greta Garbo', 'Glenn Close', 'Catherine Zeta-Jones',\n", "                   'Emmanuelle Béart', 'Marisa Tomei', 'Geraldine Chaplin', 'Jennifer Jason Leigh', 'Birgit Minichmayr', 'Deborah Kerr',\n", "                   'Julianne Moore', 'Emmanuelle Seigner', 'Isabelle Huppert', 'Brittany Murphy', 'Uma Thurman', 'Michelle Pfeiffer',\n", "                   'Evan Rachel Wood', 'Toni Collette', 'Whoopi Goldberg', 'Charlotte Rampling', 'Demi Moore', 'Sibel Kekilli', 'Rita Hayworth',\n", "                   'Amanda Seyfried', 'Sophie Marceau', 'Victoria Abril', 'Giulietta Masina', 'Doris Day']" ] }, { "cell_type": "code", 
"execution_count": 42, "id": "eee6e04f-e821-46a2-b553-d4af0a1fb76e", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Creating new columns corresponding to each actor\n", "\n", "## Building function to get dummies actors\n", "def get_dummies_actors(data, actors=None):\n", "    \"\"\"Return {actor: [1 if the actor is in the film's cast else 0, one entry per film]}.\n", "\n", "    data: iterable holding one list of actor names per film.\n", "    actors: names to encode; defaults to the notebook-level greatest_actors.\n", "    Keys are in alphabetical order and every list follows the order of data.\n", "    The list of names passed in is not modified.\n", "    \"\"\"\n", "    if actors is None:\n", "        actors = greatest_actors\n", "    casts = [set(cast) for cast in data]  # set lookup: each cast is tested once per actor\n", "    return {actor: [int(actor in cast) for cast in casts] for actor in sorted(actors)}\n", "\n", "## Keeping the reference list alphabetically sorted (previously done inside the function as a side effect)\n", "greatest_actors.sort()\n", "\n", "## Getting the actors dummies matrix\n", "dummies_actors = get_dummies_actors(dataset['name'])\n", "\n", "## Replacing the original name column by the actor_* indicator columns, added in a single concat\n", "actor_columns = pd.DataFrame(dummies_actors, index=dataset.index).add_prefix('actor_')\n", "dataset = pd.concat([dataset.drop(columns=['name']), actor_columns], axis=1)" ] }, { "cell_type": "markdown", "id": "c61e2123-9a71-46af-bb06-2cdaf75ec7cb", "metadata": {}, "source": [ "### imdb_score, imdb_votes, tmdb_popularity, tmdb_score" ] }, { "cell_type": "code", "execution_count": 43, "id": "615dc07e-da4c-4abd-94c0-510856d3e120", "metadata": { "tags": [] }, "outputs": [], "source": [ "# processing missing values\n", "\n", "## imdb_votes, tmdb_popularity: replace missing values using column's mean\n", "## (assigned back: an in-place fillna on a selected column is a no-op under pandas Copy-on-Write)\n", "dataset['imdb_votes'] = dataset['imdb_votes'].fillna(int(dataset['imdb_votes'].mean()))\n", "dataset['tmdb_popularity'] = dataset['tmdb_popularity'].fillna(dataset['tmdb_popularity'].mean())\n", "\n", "## imdb_score, tmdb_score: fill missing values using KNN Imputation\n", "## The two scores are imputed together, so a film missing one score borrows it from the films\n", "## whose other score is closest. An imputer fitted on a single column has no feature left to\n", "## measure distances with and can only fall back to the column mean.\n", "score_cols = ['imdb_score', 'tmdb_score']\n", "imputer = KNNImputer(n_neighbors=10, weights='distance', metric='nan_euclidean')\n", "dataset[score_cols] = imputer.fit_transform(dataset[score_cols]).round(1)" ] }, { "cell_type": "markdown", "id": "a1adf6e6-8272-4a80-aea4-b88a92f50884", "metadata": {}, "source": [ "### Rescaling data" ] }, { "cell_type": "code", "execution_count": 44, "id": "4f3d31d5-fbd4-4376-85a1-d2ac6c62b81d", "metadata": { "tags": [] }, "outputs": [], "source": [ "# MinMaxScaler rescales every column independently, so one fit over the five columns gives the\n", "# same values as five separate fits and leaves `scaler` fitted on all of them, not only the last one\n", "scaled_cols = ['runtime', 'imdb_score', 'imdb_votes', 'tmdb_score', 'tmdb_popularity']\n", "scaler = MinMaxScaler()\n", "dataset[scaled_cols] = scaler.fit_transform(dataset[scaled_cols])" ] }, { "cell_type": "markdown", "id": "0d055f08-f94b-4360-9f7a-a7b9a8abae43", "metadata": {}, "source": [ "### description" ] }, { "cell_type": "code", "execution_count": 45, "id": "5db57c76-0fda-4bb3-842c-12eb136f2a32", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Removing every row that still holds a missing value in any column\n", "# (per dataset.info() above and the fills made so far, that should only be rows without a description)\n", "dataset.dropna(axis=0, inplace=True)" ] }, { "cell_type": "markdown", "id": "ad03ecad-4a32-4970-bafa-34e1430cd523", "metadata": {}, "source": [ "## MODELLING" ] }, { "cell_type": "markdown", "id": "2fec25e4-3952-4914-aaf0-7c03df50b670", "metadata": {}, "source": [ "Movie similarities are calculated from (1) the movies' characteristic features and (2) the movies' descriptions." ] }, { "cell_type": "code", "execution_count": 46, "id": "3d90d259-92a1-4e27-8a3d-ebb9aaa1ce6e", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Retrieving data for models building\n", "data_text = dataset[['title', 'description']]\n", "data_characteristics = dataset.drop(['index', 'id', 
'description'], axis=1)" ] }, { "cell_type": "markdown", "id": "47378e6b-8a4a-497f-bfbe-96aef36d73c1", "metadata": {}, "source": [ "### Calculating movies similarity based on movies characteristics" ] }, { "cell_type": "code", "execution_count": 47, "id": "e06fe451-6c1e-4d3a-99d5-bc142146b33c", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Building movies characteristics matrix: one row per film, in the same row order as data_text\n", "# (taken straight from the frame, so its rows do not depend on titles being unique)\n", "movie_features = data_characteristics.set_index('title')\n", "movie_matrix = movie_features.to_numpy(dtype=float)\n", "# Creating movies dictionary {title: list of characteristic values}\n", "movies_dict = dict(zip(movie_features.index, movie_matrix.tolist()))\n", "# Building movies similarity matrix\n", "similarity_matrix_char = cosine_similarity(movie_matrix)" ] }, { "cell_type": "markdown", "id": "a6d70718-7fc3-459d-b9f5-a6211330674b", "metadata": {}, "source": [ "### Calculating movies similarity based on movies description" ] }, { "cell_type": "markdown", "id": "b16c2c26-762f-4e29-ab7c-73b3ec35261b", "metadata": {}, "source": [ "An AutoEncoder model is implemented here to capture text similarity." ] }, { "cell_type": "code", "execution_count": 48, "id": "0009e687-736e-42eb-bba5-a63d3878bcf4", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Set model parameters\n", "max_words = 1000\n", "latent_dim = 100\n", "batch_size = 6\n", "epochs = 20" ] }, { "cell_type": "code", "execution_count": 49, "id": "ceb8c0b1-6c71-4cb3-9766-597a4e5c7518", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Create tokenizer for converting text\n", "tokenizer = Tokenizer(num_words=max_words)\n", "tokenizer.fit_on_texts(list(data_text['description']))\n", "sequences = tokenizer.texts_to_sequences(list(data_text['description']))" ] }, { "cell_type": "code", "execution_count": 50, "id": "64a48dce-ab2b-4cab-8560-893cb8007771", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Convert text sequences into binary bag-of-words vectors (one vector per description)\n", "input_data = tokenizer.sequences_to_matrix(sequences, mode='binary') # use 
binary mode since data values are all 0 or 1" ] }, { "cell_type": "code", "execution_count": 51, "id": "5ebbae96-396a-45bf-a32c-8dfd1b50ea62", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Autoencoder layout: input -> 256 -> 128 -> latent_dim -> 128 -> 256 -> input\n", "input_dim = input_data.shape[1]\n", "inputs = Input(shape=(input_dim,))\n", "\n", "# Encoder: stack of ReLU layers narrowing down to the latent representation\n", "encoded = inputs\n", "for units in (256, 128, latent_dim):\n", "    encoded = Dense(units, activation='relu')(encoded)\n", "\n", "# Standalone encoder model, sharing its layers with the autoencoder\n", "encoder = Model(inputs, encoded)\n", "\n", "# Decoder: mirror of the encoder, closed by a sigmoid layer of the input width\n", "decoded = encoded\n", "for units in (128, 256):\n", "    decoded = Dense(units, activation='relu')(decoded)\n", "decoded = Dense(input_dim, activation='sigmoid')(decoded)\n", "\n", "# Full autoencoder model: input -> reconstruction\n", "autoencoder = Model(inputs, decoded)" ] }, { "cell_type": "code", "execution_count": 52, "id": "cccc32bf-9b11-4819-bb61-413fb38ed414", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"model_1\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " input_1 (InputLayer) [(None, 1000)] 0 \n", " \n", " dense (Dense) (None, 256) 256256 \n", " \n", " dense_1 (Dense) (None, 128) 32896 \n", " \n", " dense_2 (Dense) (None, 100) 12900 \n", " \n", " dense_3 (Dense) (None, 128) 12928 \n", " \n", " dense_4 (Dense) (None, 256) 33024 \n", " \n", " dense_5 (Dense) (None, 1000) 257000 \n", " \n", "=================================================================\n", "Total params: 605,004\n", "Trainable params: 605,004\n", "Non-trainable params: 0\n", "_________________________________________________________________\n" ] } ], "source": [ "# Get model summary\n", "autoencoder.summary()" ] }, { "cell_type": "code", "execution_count": 53, "id": "999d1c3c-8af9-4551-bbeb-ada875f5f2fb", "metadata": { "tags": [] },
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/20\n", "2126/2126 [==============================] - 26s 10ms/step - loss: 0.0849\n", "Epoch 2/20\n", "2126/2126 [==============================] - 19s 9ms/step - loss: 0.0690\n", "Epoch 3/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0627\n", "Epoch 4/20\n", "2126/2126 [==============================] - 18s 9ms/step - loss: 0.0590\n", "Epoch 5/20\n", "2126/2126 [==============================] - 18s 9ms/step - loss: 0.0563\n", "Epoch 6/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0542\n", "Epoch 7/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0523\n", "Epoch 8/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0507\n", "Epoch 9/20\n", "2126/2126 [==============================] - 18s 9ms/step - loss: 0.0494\n", "Epoch 10/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0481\n", "Epoch 11/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0471\n", "Epoch 12/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0460\n", "Epoch 13/20\n", "2126/2126 [==============================] - 19s 9ms/step - loss: 0.0451\n", "Epoch 14/20\n", "2126/2126 [==============================] - 18s 8ms/step - loss: 0.0443\n", "Epoch 15/20\n", "2126/2126 [==============================] - 19s 9ms/step - loss: 0.0436\n", "Epoch 16/20\n", "2126/2126 [==============================] - 19s 9ms/step - loss: 0.0429\n", "Epoch 17/20\n", "2126/2126 [==============================] - 18s 9ms/step - loss: 0.0422\n", "Epoch 18/20\n", "2126/2126 [==============================] - 18s 9ms/step - loss: 0.0416\n", "Epoch 19/20\n", "2126/2126 [==============================] - 20s 9ms/step - loss: 0.0411\n", "Epoch 20/20\n", "2126/2126 [==============================] - 20s 9ms/step - loss: 0.0406\n" ] }, { "data": { "text/plain": [ "" ] }, 
"execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Train model\n", "autoencoder.compile(optimizer='adam', loss='binary_crossentropy')\n", "autoencoder.fit(input_data, input_data, epochs=epochs, batch_size=batch_size)" ] }, { "cell_type": "code", "execution_count": 54, "id": "7dc181ed-281c-41ad-ba30-1935a9768d3d", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "399/399 [==============================] - 3s 7ms/step\n" ] } ], "source": [ "# Recuperate encoded version of input data\n", "encoded_texts = encoder.predict(input_data) " ] }, { "cell_type": "code", "execution_count": 55, "id": "bd4f41f4-97dc-4d1a-bc3a-fbe4edc08d31", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Recuperate similarity matrix\n", "similarity_matrix_ae = cosine_similarity(encoded_texts)" ] }, { "cell_type": "markdown", "id": "2b7b0d89-9e17-44a5-ae5d-f27394d75803", "metadata": {}, "source": [ "### Generate final similarity matrix" ] }, { "cell_type": "code", "execution_count": 56, "id": "44b7766a-dd5a-4bfb-b29d-e8b898a92765", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Generate similarity matrix\n", "similarity_matrix = (np.array(similarity_matrix_char) + np.array(similarity_matrix_ae))/2\n", "# Create similarity dataframe\n", "movies_list = [movie for movie in dataset['title']]\n", "similarity_frame = pd.DataFrame(data=similarity_matrix, index=movies_list, columns=movies_list)" ] }, { "cell_type": "code", "execution_count": 57, "id": "0d87e15b-8786-4764-a229-40ece3ed8967", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Raw dump duration: 3.323s\n" ] } ], "source": [ "pickle_file = 'pickle_frame.joblib'\n", "start = time.time()\n", "with open(pickle_file, 'wb') as f:\n", " dump(similarity_frame, f)\n", "raw_dump_duration = time.time() - start\n", "print(\"Raw dump duration: %0.3fs\" % raw_dump_duration)" ] } ], "metadata": { "kernelspec": { 
"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 5 }