diff --git "a/DF_Construction.ipynb" "b/DF_Construction.ipynb" new file mode 100644--- /dev/null +++ "b/DF_Construction.ipynb" @@ -0,0 +1,3348 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "39139b70", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Import relevant libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import json\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import seaborn as sns\n", + "from dotenv import load_dotenv\n", + "from os import environ\n", + "import requests\n", + "from time import sleep\n", + "import re\n", + "\n", + "load_dotenv() # Read local .env file" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2fa58a90", + "metadata": {}, + "source": [ + "# Construct Item Dataframe\n", + "\n", + "The goal here is to construct a dataframe consisting of relevant information for each movie. For this, I'll be using only one of the original csv files:\n", + "\n", + "- **movies_metadata.csv**: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.\n", + "\n", + "> Data description based on [Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)\n", + "\n", + "First, I'll read the csv file and list the different columns in the dataframe:" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "03fa2e2a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Saqi\\AppData\\Local\\Temp\\ipykernel_34392\\3934408411.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " meta_df = pd.read_csv('./data/movies_metadata.csv')\n" + ] + }, + { + "data": { + "text/plain": [ + "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n", + " 'imdb_id', 'original_language', 'original_title', 'overview',\n", + " 'popularity', 'poster_path', 'production_companies',\n", + " 'production_countries', 'release_date', 'revenue', 'runtime',\n", + " 'spoken_languages', 'status', 'tagline', 'title', 'video',\n", + " 'vote_average', 'vote_count'],\n", + " dtype='object')" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df = pd.read_csv('./data/movies_metadata.csv')\n", + "meta_df.columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cb5a13bf", + "metadata": {}, + "source": [ + "Relevant columsn include:\n", + "\n", + "- `adult`: Whether or not a movie has been rated adult. While not relevant in the core recommendation algorithm, it could be useful in the final result such that the user would be able to filter adult content if they so wished.\n", + "- `budget`: The movie's budget. Should a user have a preference for high-budget movies, this column could be a good indicator of that.\n", + "- `genres`: The movie genre.\n", + "- `popularity`: Movie popularity. Will be relevant for the user interface.\n", + "- `revenue`: The amount of money the movie had made. Will be more relevant later during feature engineering.\n", + "- `runtime`: How long the movie was. Could also be useful for filtering purposes.\n", + "- `status`: Whether or not the movie has been released or not.\n", + "- `vote_average`: The average vote that viewers had given this specific movie.\n", + "- `production_companies` and `production_countries`: Which company made the movie in what countries. This could be useful if a user prefers movies made by a certain company or from a certain country\n", + "\n", + "Other columns such as `id`, `imdb_id`, `title` and `overview` will be useful for descriptive purposes later on." + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "464a0be1", + "metadata": {}, + "outputs": [], + "source": [ + "relevant_columns = [\"adult\", \"budget\", \"genres\", \"id\", \"imdb_id\",\n", + " \"overview\", \"popularity\", \"revenue\", \"runtime\", \"status\", \"vote_average\", \"title\", \"overview\",\n", + " \"production_companies\", \"production_countries\"]\n", + "cols_to_drop = [col for col in meta_df.columns if col not in relevant_columns]\n", + "\n", + "meta_df.drop(cols_to_drop, axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b0b309c9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
adultbudgetgenresidimdb_idoverviewpopularityproduction_companiesproduction_countriesrevenueruntimestatustitlevote_average
0False30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...862tt0114709Led by Woody, Andy's toys live happily in his ...21.946943[{'name': 'Pixar Animation Studios', 'id': 3}][{'iso_3166_1': 'US', 'name': 'United States o...373554033.081.0ReleasedToy Story7.7
1False65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...8844tt0113497When siblings Judy and Peter discover an encha...17.015539[{'name': 'TriStar Pictures', 'id': 559}, {'na...[{'iso_3166_1': 'US', 'name': 'United States o...262797249.0104.0ReleasedJumanji6.9
2False0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...15602tt0113228A family wedding reignites the ancient feud be...11.7129[{'name': 'Warner Bros.', 'id': 6194}, {'name'...[{'iso_3166_1': 'US', 'name': 'United States o...0.0101.0ReleasedGrumpier Old Men6.5
3False16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...31357tt0114885Cheated on, mistreated and stepped on, the wom...3.859495[{'name': 'Twentieth Century Fox Film Corporat...[{'iso_3166_1': 'US', 'name': 'United States o...81452156.0127.0ReleasedWaiting to Exhale6.1
4False0[{'id': 35, 'name': 'Comedy'}]11862tt0113041Just when George Banks has recovered from his ...8.387519[{'name': 'Sandollar Productions', 'id': 5842}...[{'iso_3166_1': 'US', 'name': 'United States o...76578911.0106.0ReleasedFather of the Bride Part II5.7
\n", + "
" + ], + "text/plain": [ + " adult budget genres id \n", + "0 False 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 862 \\\n", + "1 False 65000000 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 8844 \n", + "2 False 0 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 15602 \n", + "3 False 16000000 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 31357 \n", + "4 False 0 [{'id': 35, 'name': 'Comedy'}] 11862 \n", + "\n", + " imdb_id overview popularity \n", + "0 tt0114709 Led by Woody, Andy's toys live happily in his ... 21.946943 \\\n", + "1 tt0113497 When siblings Judy and Peter discover an encha... 17.015539 \n", + "2 tt0113228 A family wedding reignites the ancient feud be... 11.7129 \n", + "3 tt0114885 Cheated on, mistreated and stepped on, the wom... 3.859495 \n", + "4 tt0113041 Just when George Banks has recovered from his ... 8.387519 \n", + "\n", + " production_companies \n", + "0 [{'name': 'Pixar Animation Studios', 'id': 3}] \\\n", + "1 [{'name': 'TriStar Pictures', 'id': 559}, {'na... \n", + "2 [{'name': 'Warner Bros.', 'id': 6194}, {'name'... \n", + "3 [{'name': 'Twentieth Century Fox Film Corporat... \n", + "4 [{'name': 'Sandollar Productions', 'id': 5842}... \n", + "\n", + " production_countries revenue runtime \n", + "0 [{'iso_3166_1': 'US', 'name': 'United States o... 373554033.0 81.0 \\\n", + "1 [{'iso_3166_1': 'US', 'name': 'United States o... 262797249.0 104.0 \n", + "2 [{'iso_3166_1': 'US', 'name': 'United States o... 0.0 101.0 \n", + "3 [{'iso_3166_1': 'US', 'name': 'United States o... 81452156.0 127.0 \n", + "4 [{'iso_3166_1': 'US', 'name': 'United States o... 76578911.0 106.0 \n", + "\n", + " status title vote_average \n", + "0 Released Toy Story 7.7 \n", + "1 Released Jumanji 6.9 \n", + "2 Released Grumpier Old Men 6.5 \n", + "3 Released Waiting to Exhale 6.1 \n", + "4 Released Father of the Bride Part II 5.7 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "0304a62e", + "metadata": {}, + "source": [ + "Now that we've obtained the filtered dataframe, I'll do routine data pre-processing, such as:\n", + "- Checking for `NaN` data and filling where necessary\n", + "- Making sure that the data in columns is \"clean\", i.e. each quantitive column has the right type and there are no strings in said numerical columns.\n", + "\n", + "#### 1. Checking for NaN data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "987d221f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "adult 0\n", + "budget 0\n", + "genres 0\n", + "id 0\n", + "imdb_id 17\n", + "overview 954\n", + "popularity 5\n", + "production_companies 3\n", + "production_countries 3\n", + "revenue 6\n", + "runtime 263\n", + "status 87\n", + "title 6\n", + "vote_average 6\n", + "dtype: int64" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Checking for NaN\n", + "meta_df.isna().sum()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "47e0f2f7", + "metadata": {}, + "source": [ + "For filling in the gaps, I'll be using the [OMDb API](https://www.omdbapi.com/). But first things first, I'll be focusing on the `status` column, as I'll be dropping all rows dataframe-wide that do not have a `status` of released.\n", + "- As this is a movie recommendation task, there is no point in recommending movies that have not been released yet." + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "0c8cc4fb", + "metadata": {}, + "outputs": [], + "source": [ + "# Only keep movies that have not been released yet.\n", + "meta_df = meta_df[meta_df['status'] == 'Released']\n", + "meta_df.drop('status', axis=1, inplace=True) # After this, we no longer need this column" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8dc5e293", + "metadata": {}, + "source": [ + "I'll now recalculate the number of `Nan` values per column:" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "f0ea83cf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "adult 0\n", + "budget 0\n", + "genres 0\n", + "id 0\n", + "imdb_id 15\n", + "overview 920\n", + "popularity 0\n", + "production_companies 0\n", + "production_countries 0\n", + "revenue 0\n", + "runtime 251\n", + "status 0\n", + "title 0\n", + "vote_average 0\n", + "dtype: int64" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df.isna().sum() # No change in NaN values for other columns " + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "8758b1b9", + "metadata": {}, + "source": [ + "Here, I'll fill in the missing information as best I can with the OMDb API. I've put an .env file in the same location as this notebook containing my OMDb API key. For making the API requests, I'm using the incredibly well-known [requests](https://pypi.org/project/requests/) package.\n", + "\n", + "I'll first load that, and then define a function for fetching movies based on titles or IMDB ids:" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "29d0f249", + "metadata": {}, + "outputs": [], + "source": [ + "api_key = environ.get('OMDB_KEY')\n", + "\n", + "# We can either pass in imdb_id or movie title\n", + "def make_omdb_req(identifier, is_imdb_id=True):\n", + " if is_imdb_id:\n", + " query = f\"i={identifier}\"\n", + " else:\n", + " identifier = identifier.replace(\" \", \"\\ \")\n", + " query = f\"t={identifier}\"\n", + " \n", + " url = f\"http://www.omdbapi.com/?apikey={api_key}&{query}&type=movie\"\n", + " res = requests.get(url)\n", + " try:\n", + " if res.status_code == 200:\n", + " return json.loads(res.content)\n", + " except:\n", + " pass\n", + " return {'Response': 'False'}" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d4998ee2", + "metadata": {}, + "source": [ + "We can then test this for the first element in `meta_df`:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a3a6c508", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Title': 'Toy Story',\n", + " 'Year': '1995',\n", + " 'Rated': 'G',\n", + " 'Released': '25 Nov 1995',\n", + " 'Runtime': '81 min',\n", + " 'Genre': 'Animation, Adventure, Comedy',\n", + " 'Director': 'John Lasseter',\n", + " 'Writer': 'John Lasseter, Pete Docter, Andrew Stanton',\n", + " 'Actors': 'Tom Hanks, Tim Allen, Don Rickles',\n", + " 'Plot': \"A cowboy doll is profoundly threatened and jealous when a new spaceman action figure supplants him as top toy in a boy's bedroom.\",\n", + " 'Language': 'English',\n", + " 'Country': 'United States, Japan',\n", + " 'Awards': 'Nominated for 3 Oscars. 29 wins & 23 nominations total',\n", + " 'Poster': 'https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_SX300.jpg',\n", + " 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.3/10'},\n", + " {'Source': 'Rotten Tomatoes', 'Value': '100%'},\n", + " {'Source': 'Metacritic', 'Value': '96/100'}],\n", + " 'Metascore': '96',\n", + " 'imdbRating': '8.3',\n", + " 'imdbVotes': '1,018,595',\n", + " 'imdbID': 'tt0114709',\n", + " 'Type': 'movie',\n", + " 'DVD': '23 Mar 2010',\n", + " 'BoxOffice': '$223,225,679',\n", + " 'Production': 'N/A',\n", + " 'Website': 'N/A',\n", + " 'Response': 'True'}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_id = meta_df[\"imdb_id\"][0]\n", + "make_omdb_req(test_id)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "706d238f", + "metadata": {}, + "source": [ + "I'll now iterate over all rows with `NaN` values and fill in the gaps accordingly:\n", + "\n", + "> A thing to note here is that only the `imdb_id`, `overview` and `runtime` columns have missing data, so for each row, I only need to check these three columns." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d944b1ea", + "metadata": {}, + "outputs": [], + "source": [ + "invalid_movies = [] # For movies not found using the API" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "6b25b861", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Row with ID 30146 not found!\n", + "Row with ID 65256 not found!\n", + "Row with ID 342011 not found!\n", + "Row with ID 391438 not found!\n", + "Row with ID 416569 not found!\n", + "Row with ID 109861 not found!\n", + "Row with ID 362617 not found!\n", + "Row with ID 227964 not found!\n", + "Row with ID 342684 not found!\n", + "Row with ID 359413 not found!\n", + "Row with ID 77564 not found!\n", + "Row with ID 327909 not found!\n", + "Row with ID 449863 not found!\n", + "Row with ID 142478 not found!\n", + "Row with ID 41663 not found!\n", + "Row with ID 185180 not found!\n", + "Row with ID 428950 not found!\n", + "Row with ID 440508 not found!\n", + "Row with ID 152100 not found!\n", + "Row with ID 167330 not found!\n", + "Row with ID 220669 not found!\n", + "Row with ID 240992 not found!\n", + "Row with ID 38547 not found!\n", + "Row with ID 366759 not found!\n", + "Row with ID 148697 not found!\n", + "Row with ID 49833 not found!\n", + "Row with ID 452606 not found!\n", + "Row with ID 65010 not found!\n", + "Row with ID 101217 not found!\n", + "Row with ID 236053 not found!\n", + "Row with ID 123592 not found!\n", + "Row with ID 109671 not found!\n", + "Row with ID 327935 not found!\n", + "Row with ID 123601 not found!\n", + "Row with ID 123611 not found!\n", + "Row with ID 453596 not found!\n", + "Row with ID 142802 not found!\n", + "Row with ID 77534 not found!\n", + "Row with ID 143883 not found!\n", + "Row with ID 354133 not found!\n", + "Row with ID 191486 not found!\n", + "Row with ID 127803 not found!\n", + "Row with ID 271495 not found!\n", + "Row with ID 244575 not found!\n", + "Row with ID 246438 not found!\n", + "Row with ID 362844 not found!\n", + "Row with ID 36264 not found!\n", + "Row with ID 270908 not found!\n", + "Row with ID 14210 not found!\n", + "Row with ID 376934 not found!\n", + "Row with ID 213321 not found!\n", + "Row with ID 380438 not found!\n", + "Row with ID 41493 not found!\n", + "Row with ID 452922 not found!\n", + "Row with ID 93461 not found!\n", + "Row with ID 63838 not found!\n", + "Row with ID 197057 not found!\n", + "Row with ID 143005 not found!\n", + "Row with ID 336484 not found!\n", + "Row with ID 159810 not found!\n", + "Row with ID 51275 not found!\n", + "Row with ID 420481 not found!\n", + "Row with ID 69976 not found!\n", + "Row with ID 26792 not found!\n", + "Row with ID 37603 not found!\n", + "Row with ID 48209 not found!\n", + "Row with ID 57382 not found!\n", + "Row with ID 110131 not found!\n", + "Row with ID 41689 not found!\n", + "Row with ID 458808 not found!\n", + "Row with ID 400552 not found!\n", + "Row with ID 419601 not found!\n", + "Row with ID 14644 not found!\n", + "Row with ID 82495 not found!\n", + "Row with ID 64827 not found!\n", + "Row with ID 103301 not found!\n", + "Row with ID 301876 not found!\n", + "Row with ID 73545 not found!\n", + "Row with ID 448879 not found!\n", + "Row with ID 457307 not found!\n", + "Row with ID 396987 not found!\n", + "Row with ID 153561 not found!\n", + "Row with ID 366860 not found!\n", + "Row with ID 202865 not found!\n", + "Row with ID 9765 not found!\n", + "Row with ID 213683 not found!\n", + "Row with ID 57770 not found!\n", + "Row with ID 142320 not found!\n", + "Row with ID 430058 not found!\n", + "Row with ID 54309 not found!\n", + "Row with ID 445840 not found!\n", + "Row with ID 64043 not found!\n", + "Row with ID 73649 not found!\n", + "Row with ID 57996 not found!\n", + "Row with ID 63179 not found!\n", + "Row with ID 398295 not found!\n", + "Row with ID 353713 not found!\n", + "Row with ID 458335 not found!\n", + "Row with ID 298207 not found!\n", + "Row with ID 382995 not found!\n", + "Row with ID 439314 not found!\n", + "Row with ID 422005 not found!\n", + "Row with ID 26969 not found!\n", + "Row with ID 91673 not found!\n", + "Row with ID 68063 not found!\n", + "Row with ID 103344 not found!\n", + "Row with ID 275272 not found!\n", + "Row with ID 231216 not found!\n", + "Row with ID 79343 not found!\n", + "Row with ID 418757 not found!\n", + "Row with ID 369444 not found!\n", + "Row with ID 395767 not found!\n", + "Row with ID 199887 not found!\n", + "Row with ID 317389 not found!\n", + "Row with ID 468707 not found!\n", + "Row with ID 280422 not found!\n", + "Row with ID 449131 not found!\n" + ] + } + ], + "source": [ + "for idx, row in meta_df[meta_df.isnull().any(axis=1)].iterrows():\n", + "\n", + " # No use in re-trying movies that don't exist in OMDb's database\n", + " if row['id'] in invalid_movies:\n", + " continue\n", + "\n", + " # First fetch row data from API\n", + " api_data = make_omdb_req(row['title'], False)\n", + "\n", + " # Movie not found\n", + " if api_data['Response'] == 'False':\n", + " # Try and make request with imdb_id, if it exists\n", + " api_data = make_omdb_req(row['imdb_id'])\n", + "\n", + " # If movie still not found\n", + " if api_data['Response'] == 'False':\n", + " print(f\"Row with ID {row['id']} not found!\")\n", + " invalid_movies.append(row['id'])\n", + " continue\n", + " \n", + " # If API res was okay, start filling in data:\n", + " if pd.isnull(row['overview']):\n", + " row[\"overview\"] = api_data['Plot']\n", + " \n", + " if pd.isnull(row['imdb_id']):\n", + " row['imdb_id'] = api_data['imdbID']\n", + "\n", + " if pd.isnull(row['runtime']):\n", + " # API data needs to be parsed; response has a \"min\" at the end but runtime col is float\n", + " try:\n", + " row['runtime'] = float(api_data['Runtime'].replace('min', ''))\n", + " except:\n", + " pass\n", + "\n", + " meta_df[meta_df['id'] == row['id']] = row\n", + "\n", + " # To avoid sending too many requests to the API at once\n", + " sleep(1.0)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "d06e3248", + "metadata": {}, + "source": [ + "Now, I'll check to see if there are any more `NaN` values left:" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "1b245bbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "adult 0\n", + "budget 0\n", + "genres 0\n", + "id 0\n", + "imdb_id 6\n", + "overview 114\n", + "popularity 0\n", + "production_companies 0\n", + "production_countries 0\n", + "revenue 0\n", + "runtime 51\n", + "status 0\n", + "title 0\n", + "vote_average 0\n", + "dtype: int64" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df.isna().sum()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "41edb363", + "metadata": {}, + "source": [ + "The remaining `NaN` values will be handled as such:\n", + "- `imdb_id`: `NaN` values will be replaced with `-1`, to indicate that this movie has no imdb id.\n", + "- `overview`: `NaN` values will be replaced with \"No description available\".\n", + "- `runtime`: `NaN` values will be replaced with _average_ runtime." + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "06d8691a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "adult 0\n", + "budget 0\n", + "genres 0\n", + "id 0\n", + "imdb_id 0\n", + "overview 0\n", + "popularity 0\n", + "production_companies 0\n", + "production_countries 0\n", + "revenue 0\n", + "runtime 0\n", + "status 0\n", + "title 0\n", + "vote_average 0\n", + "dtype: int64" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df['imdb_id'] = meta_df['imdb_id'].fillna(-1)\n", + "meta_df['overview'] = meta_df['overview'].fillna(\"No description available\")\n", + "meta_df['runtime'] = meta_df['runtime'].fillna(meta_df['runtime'].median())\n", + "\n", + "meta_df.isna().sum()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "a2bbae24", + "metadata": {}, + "source": [ + "#### 2. Making sure data is clean\n", + "\n", + "An issue that exists in this data is that while there are columns comprising solely of numerical data, these columns are sometimes dirty in the sense that there may be erronous string data in that specific column that prevents us from using that column properly. Here, I'll iterate over all of the numerical and boolean columns and make sure they've been casted to the correct type.\n", + "\n", + "> The columns I'll be checking are: `adult` (bool), `budget` (int), `popularity` (float), `revenue` (int), `runtime` (float), `vote_average` (float)" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "45c516bc", + "metadata": {}, + "outputs": [], + "source": [ + "meta_df['adult'] = meta_df['adult'].astype(bool)\n", + "meta_df['budget'] = meta_df['budget'].astype(int)\n", + "meta_df['popularity'] = meta_df['popularity'].astype(float)\n", + "meta_df['revenue'] = meta_df['revenue'].astype(int)\n", + "meta_df['runtime'] = meta_df['runtime'].astype(float)\n", + "meta_df['vote_average'] = meta_df['vote_average'].astype(float)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3b7593f7", + "metadata": {}, + "source": [ + "---\n", + "\n", + "### Additional Feature Engineering\n", + "\n", + "#### Genre column" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "819aa271", + "metadata": {}, + "source": [ + "Here, I'll reconstruct the `meta_df` dataframe such that for each possible genre a movie could have, there is a column. If that movie falls under that specific genre, the value of the respective column is `1`, otherwise the value of that specific column in that specific row is `0`\n", + "\n", + "Next is to make a separate column for each genre. For this, I'll first need a list of all genres.\n", + "\n", + "What's important to note here is that elements in the genre column are json strings, thus I'll be using the `json` module to properly parse the string." + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "4b33d433", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie']\n" + ] + } + ], + "source": [ + "\n", + "genre_col = meta_df['genres'].values.tolist() # Get list of every element in the genres column\n", + "\n", + "# Iterate over every element, generate list of *unique* genres.\n", + "genres = []\n", + "for item in genre_col:\n", + " item_gs = json.loads(item.replace('\\'', '\"')) # json.loads expects double quotes (\") for property names\n", + " for g in item_gs:\n", + " if g['name'] not in genres:\n", + " genres.append(g['name'])\n", + "\n", + "print(genres)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b5a8a1c2", + "metadata": {}, + "source": [ + "Obviously, some data cleaning is needed here. Thankfully, there aren't very many candidate \"genres\", so I'll filter them by hand;" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "9b43758f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idgenresAnimationComedyFamilyAdventureFantasyRomanceDramaAction...HorrorHistoryScience FictionMysteryWarForeignMusicDocumentaryWesternTV Movie
0862[{'id': 16, 'name': 'Animation'}, {'id': 35, '...11100000...0000000000
18844[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...00111000...0000000000
215602[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...01000100...0000000000
331357[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...01000110...0000000000
411862[{'id': 35, 'name': 'Comedy'}]01000000...0000000000
\n", + "

5 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " id genres Animation \n", + "0 862 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 1 \\\n", + "1 8844 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 0 \n", + "2 15602 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 0 \n", + "3 31357 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 0 \n", + "4 11862 [{'id': 35, 'name': 'Comedy'}] 0 \n", + "\n", + " Comedy Family Adventure Fantasy Romance Drama Action ... Horror \n", + "0 1 1 0 0 0 0 0 ... 0 \\\n", + "1 0 1 1 1 0 0 0 ... 0 \n", + "2 1 0 0 0 1 0 0 ... 0 \n", + "3 1 0 0 0 1 1 0 ... 0 \n", + "4 1 0 0 0 0 0 0 ... 0 \n", + "\n", + " History Science Fiction Mystery War Foreign Music Documentary \n", + "0 0 0 0 0 0 0 0 \\\n", + "1 0 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 \n", + "\n", + " Western TV Movie \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "\n", + "[5 rows x 22 columns]" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres = ['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie']\n", + "\n", + "# Only get a copy of relevant columns\n", + "genre_df = meta_df[['id', 'genres']].copy()\n", + "\n", + "# Make a column for every genre\n", + "for genre in genres:\n", + " genre_df[genre] = 0\n", + "\n", + "# Iterate over every row and set genres accordingly:\n", + "for idx, row in genre_df.iterrows():\n", + " g_json = json.loads(row['genres'].replace('\\'', '\"')) \n", + " for g in g_json:\n", + " if g['name'] in genres:\n", + " genre_df.at[idx, g['name']] = 1\n", + "\n", + "genre_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "2a84ca58", + "metadata": {}, + "source": [ + "Now not only can we safely merge `meta_df` and `genre_df` to contruct the new dataframe with the genre columns, but we can also safely drop the `genres` column as it is no longer needed:" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "21afdde2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
adultbudgetidimdb_idoverviewpopularityproduction_companiesproduction_countriesrevenueruntime...HorrorHistoryScience FictionMysteryWarForeignMusicDocumentaryWesternTV Movie
0True30000000862tt0114709Led by Woody, Andy's toys live happily in his ...21.946943[{'name': 'Pixar Animation Studios', 'id': 3}][{'iso_3166_1': 'US', 'name': 'United States o...37355403381.0...0000000000
1True650000008844tt0113497When siblings Judy and Peter discover an encha...17.015539[{'name': 'TriStar Pictures', 'id': 559}, {'na...[{'iso_3166_1': 'US', 'name': 'United States o...262797249104.0...0000000000
2True015602tt0113228A family wedding reignites the ancient feud be...11.712900[{'name': 'Warner Bros.', 'id': 6194}, {'name'...[{'iso_3166_1': 'US', 'name': 'United States o...0101.0...0000000000
3True1600000031357tt0114885Cheated on, mistreated and stepped on, the wom...3.859495[{'name': 'Twentieth Century Fox Film Corporat...[{'iso_3166_1': 'US', 'name': 'United States o...81452156127.0...0000000000
4True011862tt0113041Just when George Banks has recovered from his ...8.387519[{'name': 'Sandollar Productions', 'id': 5842}...[{'iso_3166_1': 'US', 'name': 'United States o...76578911106.0...0000000000
\n", + "

5 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " adult budget id imdb_id \n", + "0 True 30000000 862 tt0114709 \\\n", + "1 True 65000000 8844 tt0113497 \n", + "2 True 0 15602 tt0113228 \n", + "3 True 16000000 31357 tt0114885 \n", + "4 True 0 11862 tt0113041 \n", + "\n", + " overview popularity \n", + "0 Led by Woody, Andy's toys live happily in his ... 21.946943 \\\n", + "1 When siblings Judy and Peter discover an encha... 17.015539 \n", + "2 A family wedding reignites the ancient feud be... 11.712900 \n", + "3 Cheated on, mistreated and stepped on, the wom... 3.859495 \n", + "4 Just when George Banks has recovered from his ... 8.387519 \n", + "\n", + " production_companies \n", + "0 [{'name': 'Pixar Animation Studios', 'id': 3}] \\\n", + "1 [{'name': 'TriStar Pictures', 'id': 559}, {'na... \n", + "2 [{'name': 'Warner Bros.', 'id': 6194}, {'name'... \n", + "3 [{'name': 'Twentieth Century Fox Film Corporat... \n", + "4 [{'name': 'Sandollar Productions', 'id': 5842}... \n", + "\n", + " production_countries revenue runtime ... \n", + "0 [{'iso_3166_1': 'US', 'name': 'United States o... 373554033 81.0 ... \\\n", + "1 [{'iso_3166_1': 'US', 'name': 'United States o... 262797249 104.0 ... \n", + "2 [{'iso_3166_1': 'US', 'name': 'United States o... 0 101.0 ... \n", + "3 [{'iso_3166_1': 'US', 'name': 'United States o... 81452156 127.0 ... \n", + "4 [{'iso_3166_1': 'US', 'name': 'United States o... 76578911 106.0 ... \n", + "\n", + " Horror History Science Fiction Mystery War Foreign Music Documentary \n", + "0 0 0 0 0 0 0 0 0 \\\n", + "1 0 0 0 0 0 0 0 0 \n", + "2 0 0 0 0 0 0 0 0 \n", + "3 0 0 0 0 0 0 0 0 \n", + "4 0 0 0 0 0 0 0 0 \n", + "\n", + " Western TV Movie \n", + "0 0 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "\n", + "[5 rows x 32 columns]" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df = pd.merge(meta_df, genre_df)\n", + "meta_df.drop('genres', axis=1, inplace=True)\n", + "meta_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f5f329bc", + "metadata": {}, + "source": [ + "#### Extracting useful information from revenue and budget\n", + "\n", + "As noted previously, the `revenue` and `budget` column represent the movie's total revenue upon release and the budget for the movie's production respectively. \n", + "\n", + "On their own, they might not be incredibly useful in the final recommender system, however, combining the two might yield useful information. For instance, I could add an extra column with the ratio between a movie's revenue vs. it's budget.\n", + "\n", + "- If this ratio is greater than 1, this indicated that a production studio has made a return on their investment. The higher this number, the better the movie did relative to it's production budget. If the revenue is less than the budget, pushing this ratio to be less than 1, that would be an indicator that the movie did not do very well in the box office.\n", + "\n", + "However, before I do that I need to check the values for `revenue` and budget to make sure no values for these columns are zero." + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "4e06d20a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
revenuebudget
count4.507400e+044.507400e+04
mean1.120213e+074.265201e+06
std6.407433e+071.749784e+07
min-2.147484e+090.000000e+00
25%0.000000e+000.000000e+00
50%0.000000e+000.000000e+00
75%0.000000e+000.000000e+00
max2.068224e+093.800000e+08
\n", + "
" + ], + "text/plain": [ + " revenue budget\n", + "count 4.507400e+04 4.507400e+04\n", + "mean 1.120213e+07 4.265201e+06\n", + "std 6.407433e+07 1.749784e+07\n", + "min -2.147484e+09 0.000000e+00\n", + "25% 0.000000e+00 0.000000e+00\n", + "50% 0.000000e+00 0.000000e+00\n", + "75% 0.000000e+00 0.000000e+00\n", + "max 2.068224e+09 3.800000e+08" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df[['revenue', 'budget']].describe()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "33ffa8e8", + "metadata": {}, + "source": [ + "This is bad, as there are movies with no budget, or no revenue (or both) recorded. A quick (albeit debatably \"dirty\") solution to this is to group by genre for _budget_ (as say, a fantasy movie is likely to need more of a budget than a drama or romance movie), and then by popularity for revenue.\n", + "\n", + "> However, we'll need to bin popularity values first, as the popularity column is continuous, and the possible values are too large to group the values properly. For this, I'll use [Pandas's qcut](https://pandas.pydata.org/docs/reference/api/pandas.qcut.html) to bin values based on sample quantiles.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "b8014aaf", + "metadata": {}, + "outputs": [], + "source": [ + "# Bin popularity\n", + "meta_df['pop_bin'] = pd.qcut(meta_df['popularity'], q=10, labels=[i for i in range(10)])" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e3796f09", + "metadata": {}, + "source": [ + "Next, I'll bring back the genre column from before, and pick the first genre in the list in the original genre column. For this, I'll reload the old unprocessed `movies_metadata.csv`." + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "760b33d0", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Saqi\\AppData\\Local\\Temp\\ipykernel_34392\\1858458685.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " old_meta_df = pd.read_csv('./data/movies_metadata.csv')\n" + ] + } + ], + "source": [ + "old_meta_df = pd.read_csv('./data/movies_metadata.csv')\n", + "\n", + "meta_df['main_genre'] = \"Unknown\"\n", + "\n", + "\n", + "for idx, row in meta_df.iterrows():\n", + " org_row = old_meta_df[old_meta_df['id'] == str(row['id'])]\n", + " old_gen = str(org_row['genres'].values[0].replace(\"'\", '\"'))\n", + " try:\n", + " meta_df.at[idx, 'main_genre'] = json.loads(old_gen)[0]['name']\n", + " except:\n", + " pass\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9bab8729", + "metadata": {}, + "source": [ + "I'll quickly illustrate the difference in budget and revenue based on popularity using simple barplots;" + ] + }, + { + "cell_type": "code", + "execution_count": 233, + "id": "d5c4a46f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 233, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Difference in revenue vs. popularity bins\n", + "pop_df = pd.DataFrame({\"means\": meta_df.groupby(meta_df['pop_bin'])['revenue'].mean(), \"bins\": [i for i in range(10)]}) # Will be useful later\n", + "pop_df['normalized_means'] = pop_df['means']/pop_df['means'].sum()\n", + "sns.barplot(data=pop_df, x='bins', y='normalized_means')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "id": "c30759c4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", + " 17, 18, 19, 20]),\n", + " [Text(0, 0, 'Animation'),\n", + " Text(1, 0, 'Adventure'),\n", + " Text(2, 0, 'Romance'),\n", + " Text(3, 0, 'Comedy'),\n", + " Text(4, 0, 'Action'),\n", + " Text(5, 0, 'Family'),\n", + " Text(6, 0, 'History'),\n", + " Text(7, 0, 'Drama'),\n", + " Text(8, 0, 'Crime'),\n", + " Text(9, 0, 'Fantasy'),\n", + " Text(10, 0, 'Science Fiction'),\n", + " Text(11, 0, 'Unknown'),\n", + " Text(12, 0, 'Thriller'),\n", + " Text(13, 0, 'Music'),\n", + " Text(14, 0, 'Horror'),\n", + " Text(15, 0, 'Documentary'),\n", + " Text(16, 0, 'Mystery'),\n", + " Text(17, 0, 'Western'),\n", + " Text(18, 0, 'TV Movie'),\n", + " Text(19, 0, 'War'),\n", + " Text(20, 0, 'Foreign')])" + ] + }, + "execution_count": 235, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Difference in budget based on genre\n", + "genre_df = pd.DataFrame({'genre': meta_df['main_genre'].unique(), 'means': meta_df.groupby(meta_df['main_genre'])['budget'].mean()})\n", + "genre_df['normalized_means'] = genre_df['means']/genre_df['means'].sum()\n", + "sns.barplot(data=genre_df, x='genre', y='normalized_means')\n", + "plt.xticks(rotation=90)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "35504723", + "metadata": {}, + "source": [ + "Finally, I'll set the budget and revenue based on the aforementioned metrics:" + ] + }, + { + "cell_type": "code", + "execution_count": 257, + "id": "0f04a7b9", + "metadata": {}, + "outputs": [], + "source": [ + "for idx, row in meta_df.iterrows():\n", + " if abs(row['budget']) < 100.0:\n", + " median_budget = genre_df[genre_df['genre'] == row['main_genre']]['means']\n", + " meta_df.at[idx, 'budget'] = median_budget.values[0]\n", + "\n", + " if abs(row['revenue']) < 100.0:\n", + " median_revenue = pop_df[pop_df['bins'] == row['pop_bin']]['means']\n", + " meta_df.at[idx, 'revenue'] = median_revenue.values[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 258, + "id": "7bb19ce3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
revenuebudget
count4.507400e+044.507400e+04
mean1.543365e+078.652409e+06
std6.519887e+071.683259e+07
min-2.147484e+091.000000e+02
25%1.584208e+053.198537e+06
50%5.501059e+054.500000e+06
75%3.308691e+068.524772e+06
max2.068224e+093.800000e+08
\n", + "
" + ], + "text/plain": [ + " revenue budget\n", + "count 4.507400e+04 4.507400e+04\n", + "mean 1.543365e+07 8.652409e+06\n", + "std 6.519887e+07 1.683259e+07\n", + "min -2.147484e+09 1.000000e+02\n", + "25% 1.584208e+05 3.198537e+06\n", + "50% 5.501059e+05 4.500000e+06\n", + "75% 3.308691e+06 8.524772e+06\n", + "max 2.068224e+09 3.800000e+08" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df[['revenue', 'budget']].describe()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b130b8a6", + "metadata": {}, + "source": [ + "Now that these values have been fixed, we can finally add the engineered column." + ] + }, + { + "cell_type": "code", + "execution_count": 262, + "id": "a1a96923", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 4.507400e+04\n", + "mean 5.942046e+01\n", + "std 5.133602e+03\n", + "min -9.061112e+00\n", + "25% 3.124410e-02\n", + "50% 1.680270e-01\n", + "75% 1.000000e+00\n", + "max 1.018619e+06\n", + "Name: rb_ratio, dtype: float64" + ] + }, + "execution_count": 262, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df['rb_ratio'] = meta_df['revenue'] / meta_df['budget'].astype(np.float32)\n", + "meta_df['rb_ratio'].describe()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "cb79615a", + "metadata": {}, + "source": [ + "This column will be useful later on as an extra source of information when calculating similarity." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "69a5fa99", + "metadata": {}, + "source": [ + "## Finalizing Item Dataframe\n", + "\n", + "Before I finish off, I'll take one last look at the columns inside `meta_df` to see if there's anything irrelevant that I might want to drop to save on space:" + ] + }, + { + "cell_type": "code", + "execution_count": 345, + "id": "ce1b995e", + "metadata": {}, + "outputs": [], + "source": [ + "meta_df.to_csv('meta_bu_6.csv',index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 344, + "id": "53417423", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['adult', 'budget', 'id', 'imdb_id', 'overview', 'popularity',\n", + " 'production_companies', 'production_countries', 'revenue', 'runtime',\n", + " 'title', 'vote_average', 'Animation', 'Comedy', 'Family', 'Adventure',\n", + " 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',\n", + " 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music',\n", + " 'Documentary', 'Western', 'TV Movie', 'rb_ratio', 'pop_bin',\n", + " 'main_genre'],\n", + " dtype='object')" + ] + }, + "execution_count": 344, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df.columns" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b6219f22", + "metadata": {}, + "source": [ + "Now that there are so many extra columns, it might be best to drop less informative columns such as `production_companies`. If needed, I'll re-add this column in future steps, much like how I had re-read the original `movies_metadata.csv` file to obtain all genre data in previous steps." + ] + }, + { + "cell_type": "code", + "execution_count": 346, + "id": "cc623a2c", + "metadata": {}, + "outputs": [], + "source": [ + "cols_to_drop = ['production_companies', 'production_countries', 'revenue', 'budget', 'main_genre']\n", + "meta_df.drop(cols_to_drop, axis=1, inplace=True)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f77699da", + "metadata": {}, + "source": [ + "And the final item dataframe looks like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 348, + "id": "d4f5f74d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
adultidimdb_idoverviewpopularityruntimetitlevote_averageAnimationComedy...Science FictionMysteryWarForeignMusicDocumentaryWesternTV Movierb_ratiopop_bin
0True862tt0114709Led by Woody, Andy's toys live happily in his ...21.94694381.0Toy Story7.711...0000000012.4518019
1True8844tt0113497When siblings Judy and Peter discover an encha...17.015539104.0Jumanji6.900...000000004.0430359
2True15602tt0113228A family wedding reignites the ancient feud be...11.712900101.0Grumpier Old Men6.501...000000009.6090789
\n", + "

3 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " adult id imdb_id overview \n", + "0 True 862 tt0114709 Led by Woody, Andy's toys live happily in his ... \\\n", + "1 True 8844 tt0113497 When siblings Judy and Peter discover an encha... \n", + "2 True 15602 tt0113228 A family wedding reignites the ancient feud be... \n", + "\n", + " popularity runtime title vote_average Animation Comedy \n", + "0 21.946943 81.0 Toy Story 7.7 1 1 \\\n", + "1 17.015539 104.0 Jumanji 6.9 0 0 \n", + "2 11.712900 101.0 Grumpier Old Men 6.5 0 1 \n", + "\n", + " ... Science Fiction Mystery War Foreign Music Documentary Western \n", + "0 ... 0 0 0 0 0 0 0 \\\n", + "1 ... 0 0 0 0 0 0 0 \n", + "2 ... 0 0 0 0 0 0 0 \n", + "\n", + " TV Movie rb_ratio pop_bin \n", + "0 0 12.451801 9 \n", + "1 0 4.043035 9 \n", + "2 0 9.609078 9 \n", + "\n", + "[3 rows x 30 columns]" + ] + }, + "execution_count": 348, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "meta_df.head(3)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "b98e7683", + "metadata": {}, + "source": [ + "Here, I'll save `meta_df` in a csv file for later use in our recommendation algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 349, + "id": "ba5d21ea", + "metadata": {}, + "outputs": [], + "source": [ + "meta_df.to_csv('items.csv', index=False)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "376e9299", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Construct User Dataframe\n", + "\n", + "For this, I'll be using the full `ratings.csv` Dataframe. It's very big though.\n", + "\n", + "I'll construct a new dataframe here, with a column for each movie, and a row for each user. However, I won't be taking every movie into account; only movies that exist in the previously constructed item dataframe. Additionally, I won't be taking all the users into account either; only the first 500 users to avoid making the data too large." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1edbca48", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdmovieIdratingtimestamp
011101.01425941529
111474.51425942435
218585.01425941523
3112215.01425941546
4112465.01425941556
\n", + "
" + ], + "text/plain": [ + " userId movieId rating timestamp\n", + "0 1 110 1.0 1425941529\n", + "1 1 147 4.5 1425942435\n", + "2 1 858 5.0 1425941523\n", + "3 1 1221 5.0 1425941546\n", + "4 1 1246 5.0 1425941556" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ratings_df = pd.read_csv('./data/ratings.csv')\n", + "ratings_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "ff07831a", + "metadata": {}, + "source": [ + "To construct the final users dataframe, I'll first get a list of unique movie ids from the `meta_df` I had created in the previous step. I could also load `items.csv`:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3ba8a2ab", + "metadata": {}, + "outputs": [], + "source": [ + "items_df = pd.read_csv('items.csv')\n", + "movie_ids = [str(x) for x in items_df['id'].unique().tolist()] # This will be our column names\n", + "col_names = [\"user_id\"] + movie_ids" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2b61d065", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_id862884415602313571186294911860453259091...844193909592899232228483084043905011110967758227506461257
\n", + "

0 rows × 44986 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [user_id, 862, 8844, 15602, 31357, 11862, 949, 11860, 45325, 9091, 710, 9087, 12110, 21032, 10858, 1408, 524, 4584, 5, 9273, 11517, 8012, 1710, 9691, 12665, 451, 16420, 9263, 17015, 902, 37557, 9909, 63, 78802, 9598, 47018, 687, 139405, 33689, 9603, 34615, 31174, 11443, 35196, 9312, 577, 11861, 807, 10530, 8391, 629, 117164, 11448, 49133, 26441, 97406, 124057, 9089, 11010, 99040, 11359, 17182, 2054, 10607, 19760, 9536, 11525, 40628, 4482, 10634, 755, 11859, 28387, 48750, 20927, 36929, 9102, 124626, 27526, 9623, 46785, 400, 880, 146599, 188588, 8447, 10534, 17414, 13997, 2086, 61548, 9095, 12158, 9283, 9208, 40154, 406, 45549, 63076, 11062, ...]\n", + "Index: []\n", + "\n", + "[0 rows x 44986 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "user_df = pd.DataFrame(columns=col_names)\n", + "user_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "fd214104", + "metadata": {}, + "source": [ + "Next, I'll iterate over 500 users in `ratings_df`, and fill out `user_df`." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "60c41da7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User 0\n", + "User 1\n", + "User 2\n", + "User 3\n", + "User 4\n", + "User 5\n", + "User 6\n", + "User 7\n", + "User 8\n", + "User 9\n", + "User 10\n", + "User 11\n", + "User 12\n", + "User 13\n", + "User 14\n", + "User 15\n", + "User 16\n", + "User 17\n", + "User 18\n", + "User 19\n", + "User 20\n", + "User 21\n", + "User 22\n", + "User 23\n", + "User 24\n", + "User 25\n", + "User 26\n", + "User 27\n", + "User 28\n", + "User 29\n", + "User 30\n", + "User 31\n", + "User 32\n", + "User 33\n", + "User 34\n", + "User 35\n", + "User 36\n", + "User 37\n", + "User 38\n", + "User 39\n", + "User 40\n", + "User 41\n", + "User 42\n", + "User 43\n", + "User 44\n", + "User 44\n", + "User 45\n", + "User 46\n", + "User 47\n", + "User 48\n", + "User 49\n", + "User 50\n", + "User 51\n", + "User 52\n", + "User 53\n", + "User 54\n", + "User 55\n", + "User 56\n", + "User 57\n", + "User 58\n", + "User 59\n", + "User 60\n", + "User 61\n", + "User 62\n", + "User 63\n", + "User 64\n", + "User 65\n", + "User 66\n", + "User 67\n", + "User 68\n", + "User 69\n", + "User 70\n", + "User 71\n", + "User 72\n", + "User 73\n", + "User 74\n", + "User 75\n", + "User 76\n", + "User 77\n", + "User 78\n", + "User 79\n", + "User 80\n", + "User 81\n", + "User 82\n", + "User 83\n", + "User 84\n", + "User 85\n", + "User 86\n", + "User 87\n", + "User 88\n", + "User 89\n", + "User 90\n", + "User 91\n", + "User 92\n", + "User 93\n", + "User 94\n", + "User 95\n", + "User 96\n", + "User 97\n", + "User 98\n", + "User 99\n", + "User 100\n", + "User 101\n", + "User 102\n", + "User 103\n", + "User 104\n", + "User 105\n", + "User 106\n", + "User 107\n", + "User 108\n", + "User 109\n", + "User 110\n", + "User 111\n", + "User 112\n", + "User 113\n", + "User 114\n", + "User 115\n", + "User 116\n", + "User 117\n", + "User 118\n", + "User 119\n", + "User 120\n", + "User 121\n", + "User 122\n", + "User 123\n", + "User 124\n", + "User 125\n", + "User 126\n", + "User 127\n", + "User 128\n", + "User 129\n", + "User 130\n", + "User 131\n", + "User 132\n", + "User 133\n", + "User 134\n", + "User 135\n", + "User 136\n", + "User 137\n", + "User 138\n", + "User 139\n", + "User 140\n", + "User 141\n", + "User 142\n", + "User 143\n", + "User 144\n", + "User 145\n", + "User 146\n", + "User 147\n", + "User 148\n", + "User 149\n", + "User 150\n", + "User 151\n", + "User 152\n", + "User 153\n", + "User 154\n", + "User 155\n", + "User 156\n", + "User 157\n", + "User 158\n", + "User 159\n", + "User 160\n", + "User 161\n", + "User 162\n", + "User 163\n", + "User 164\n", + "User 165\n", + "User 166\n", + "User 167\n", + "User 168\n", + "User 169\n", + "User 170\n", + "User 171\n", + "User 172\n", + "User 173\n", + "User 174\n", + "User 175\n", + "User 176\n", + "User 177\n", + "User 178\n", + "User 179\n", + "User 180\n", + "User 181\n", + "User 182\n", + "User 183\n", + "User 184\n", + "User 185\n", + "User 186\n", + "User 187\n", + "User 188\n", + "User 189\n", + "User 190\n", + "User 191\n", + "User 192\n", + "User 193\n", + "User 194\n", + "User 195\n", + "User 196\n", + "User 197\n", + "User 198\n", + "User 199\n", + "User 200\n", + "User 201\n", + "User 202\n", + "User 203\n", + "User 204\n", + "User 205\n", + "User 206\n", + "User 207\n", + "User 208\n", + "User 209\n", + "User 210\n", + "User 211\n", + "User 212\n", + "User 213\n", + "User 214\n", + "User 215\n", + "User 216\n", + "User 217\n", + "User 218\n", + "User 219\n", + "User 220\n", + "User 221\n", + "User 222\n", + "User 223\n", + "User 224\n", + "User 225\n", + "User 226\n", + "User 227\n", + "User 228\n", + "User 229\n", + "User 230\n", + "User 231\n", + "User 232\n", + "User 233\n", + "User 234\n", + "User 235\n", + "User 236\n", + "User 237\n", + "User 238\n", + "User 239\n", + "User 240\n", + "User 241\n", + "User 242\n", + "User 243\n", + "User 244\n", + "User 245\n", + "User 246\n", + "User 247\n", + "User 248\n", + "User 249\n", + "User 250\n", + "User 251\n", + "User 252\n", + "User 253\n", + "User 254\n", + "User 255\n", + "User 256\n", + "User 257\n", + "User 258\n", + "User 259\n", + "User 260\n", + "User 261\n", + "User 262\n", + "User 263\n", + "User 264\n", + "User 265\n", + "User 266\n", + "User 267\n", + "User 268\n", + "User 269\n", + "User 270\n", + "User 271\n", + "User 272\n", + "User 273\n", + "User 274\n", + "User 275\n", + "User 276\n", + "User 277\n", + "User 278\n", + "User 279\n", + "User 280\n", + "User 281\n", + "User 282\n", + "User 283\n", + "User 284\n", + "User 285\n", + "User 286\n", + "User 287\n", + "User 288\n", + "User 289\n", + "User 290\n", + "User 291\n", + "User 292\n", + "User 293\n", + "User 294\n", + "User 295\n", + "User 296\n", + "User 297\n", + "User 298\n", + "User 299\n", + "User 300\n", + "User 301\n", + "User 302\n", + "User 303\n", + "User 304\n", + "User 305\n", + "User 306\n", + "User 307\n", + "User 308\n", + "User 309\n", + "User 310\n", + "User 311\n", + "User 312\n", + "User 313\n", + "User 314\n", + "User 315\n", + "User 316\n", + "User 317\n", + "User 318\n", + "User 319\n", + "User 320\n", + "User 321\n", + "User 322\n", + "User 323\n", + "User 324\n", + "User 325\n", + "User 325\n", + "User 326\n", + "User 327\n", + "User 328\n", + "User 329\n", + "User 330\n", + "User 331\n", + "User 332\n", + "User 333\n", + "User 334\n", + "User 335\n", + "User 336\n", + "User 337\n", + "User 338\n", + "User 339\n", + "User 340\n", + "User 341\n", + "User 342\n", + "User 343\n", + "User 344\n", + "User 345\n", + "User 346\n", + "User 347\n", + "User 348\n", + "User 348\n", + "User 349\n", + "User 350\n", + "User 351\n", + "User 352\n", + "User 353\n", + "User 354\n", + "User 355\n", + "User 356\n", + "User 357\n", + "User 358\n", + "User 359\n", + "User 360\n", + "User 361\n", + "User 362\n", + "User 363\n", + "User 364\n", + "User 365\n", + "User 366\n", + "User 367\n", + "User 368\n", + "User 369\n", + "User 370\n", + "User 371\n", + "User 372\n", + "User 373\n", + "User 374\n", + "User 375\n", + "User 376\n", + "User 377\n", + "User 377\n", + "User 378\n", + "User 379\n", + "User 380\n", + "User 381\n", + "User 382\n", + "User 383\n", + "User 384\n", + "User 385\n", + "User 386\n", + "User 387\n", + "User 388\n", + "User 389\n", + "User 389\n", + "User 390\n", + "User 391\n", + "User 392\n", + "User 393\n", + "User 393\n", + "User 394\n", + "User 395\n", + "User 396\n", + "User 397\n", + "User 398\n", + "User 399\n", + "User 400\n", + "User 401\n", + "User 402\n", + "User 403\n", + "User 404\n", + "User 405\n", + "User 406\n", + "User 407\n", + "User 408\n", + "User 409\n", + "User 410\n", + "User 411\n", + "User 412\n", + "User 413\n", + "User 414\n", + "User 415\n", + "User 416\n", + "User 417\n", + "User 418\n", + "User 419\n", + "User 420\n", + "User 421\n", + "User 422\n", + "User 423\n", + "User 424\n", + "User 425\n", + "User 426\n", + "User 427\n", + "User 428\n", + "User 429\n", + "User 430\n", + "User 431\n", + "User 432\n", + "User 433\n", + "User 434\n", + "User 435\n", + "User 436\n", + "User 437\n", + "User 438\n", + "User 439\n", + "User 440\n", + "User 441\n", + "User 442\n", + "User 443\n", + "User 444\n", + "User 445\n", + "User 446\n", + "User 447\n", + "User 448\n", + "User 449\n", + "User 450\n", + "User 451\n", + "User 452\n", + "User 453\n", + "User 454\n", + "User 455\n", + "User 456\n", + "User 457\n", + "User 458\n", + "User 459\n", + "User 460\n", + "User 461\n", + "User 462\n", + "User 463\n", + "User 464\n", + "User 465\n", + "User 466\n", + "User 466\n", + "User 467\n", + "User 468\n", + "User 469\n", + "User 470\n", + "User 471\n", + "User 472\n", + "User 473\n", + "User 474\n", + "User 475\n", + "User 476\n", + "User 476\n", + "User 477\n", + "User 478\n", + "User 479\n", + "User 480\n", + "User 481\n", + "User 482\n", + "User 483\n", + "User 484\n", + "User 485\n", + "User 486\n", + "User 487\n", + "User 488\n", + "User 489\n", + "User 490\n", + "User 491\n", + "User 492\n", + "User 493\n", + "User 494\n", + "User 495\n", + "User 496\n", + "User 497\n", + "User 498\n", + "User 499\n" + ] + } + ], + "source": [ + "# Takes a while to run\n", + "\n", + "# Get 500 unique users that have given a rating for at least one of the movies\n", + "uuids = ratings_df['userId'].unique().tolist()\n", + "count = 0\n", + "n_users = 500\n", + "\n", + "for i, uuid in enumerate(uuids):\n", + " print(f\"User {count}\")\n", + " row = np.zeros_like(col_names)\n", + " row[0] = uuid # Set user id\n", + " # Get list of user reviews so I don't keep doing a search on the entire ratings_df dataframe every time\n", + " ratings_filtered_df = ratings_df[ratings_df['userId'] == uuid]\n", + " for i, movie_id in enumerate(movie_ids):\n", + " movie_rating = ratings_filtered_df[ratings_filtered_df['movieId'] == int(movie_id)]\n", + " if not movie_rating.empty:\n", + " row[i] = movie_rating.rating.values[0]\n", + " made_rating = True\n", + " \n", + " # Only add a specific user rating if the user has made a prediction for at least one of the listed movies (i.e. check to see row is not all 0)\n", + " row[row == ''] = '0'\n", + " row = row.astype(float)\n", + " \n", + " if np.any(row[1:]):\n", + " user_df.loc[len(user_df.index)] = row\n", + " count += 1\n", + " \n", + " if count >= 500:\n", + " break" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f46f28fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([1, 0, 0, ..., 0, 0, 0])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row[row == ''] = '0'\n", + "row.astype(float)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c7a55120", + "metadata": {}, + "source": [ + "Now that I've constructed this dataframe, I'll save it as a csv file;" + ] + }, + { + "cell_type": "code", + "execution_count": 393, + "id": "14dd70a3", + "metadata": {}, + "outputs": [], + "source": [ + "user_df.to_csv('users.csv', index=None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "bff569a8", + "metadata": {}, + "source": [ + "And that concludes the data preprocessing step." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "95c48c10", + "metadata": {}, + "source": [ + "---\n", + "\n", + "# Making Smaller files\n", + "\n", + "However, the resulting `items.csv` and `users.csv` are quite large, so I'm splitting these csv files into 5 and 12 parts respectively, so I can upload them up onto hugging face." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "05811f9a", + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "from math import ceil\n", + "from glob import glob\n", + "\n", + "\n", + "# Read dataframes\n", + "\n", + "items_df = pd.read_csv('items.csv')\n", + "users_df = pd.read_csv('users.csv')" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "32855011", + "metadata": {}, + "source": [ + "## Splitting users_df up into 12 parts" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "78eb378f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "42" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_parts_users = 12\n", + "\n", + "n_rows = ceil(users_df.shape[0]/n_parts_users)\n", + "n_rows" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b7a07409", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving part 1/12\n", + "Saving part 2/12\n", + "Saving part 3/12\n", + "Saving part 4/12\n", + "Saving part 5/12\n", + "Saving part 6/12\n", + "Saving part 7/12\n", + "Saving part 8/12\n", + "Saving part 9/12\n", + "Saving part 10/12\n", + "Saving part 11/12\n", + "Saving part 12/12\n" + ] + } + ], + "source": [ + "for part in range(n_parts_users):\n", + " print(f\"Saving part {part+1}/{n_parts_users}\")\n", + " filename = \"users_{}.csv\".format(part)\n", + " start_idx = part * n_rows\n", + " end_idx = min((part+1) * n_rows, users_df.shape[0])\n", + " users_df.loc[start_idx:start_idx].to_csv(filename, index=None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e7653833", + "metadata": {}, + "source": [ + "## Splitting items_df up into 5 parts" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "101ade69", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9015" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_parts_items = 5\n", + "\n", + "n_rows = ceil(items_df.shape[0]/n_parts_items)\n", + "n_rows" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "35266a30", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving part 1/5\n", + "Saving part 2/5\n", + "Saving part 3/5\n", + "Saving part 4/5\n", + "Saving part 5/5\n" + ] + } + ], + "source": [ + "for part in range(n_parts_items):\n", + " print(f\"Saving part {part+1}/{n_parts_items}\")\n", + " filename = \"items_{}.csv\".format(part)\n", + " start_idx = part * n_rows\n", + " end_idx = min((part+1) * n_rows, items_df.shape[0])\n", + " items_df.loc[start_idx:start_idx].to_csv(filename, index=None)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "3e3d3988", + "metadata": {}, + "source": [ + "## Reconstructing original DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9f3a5743", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_id862884415602313571186294911860453259091...844193909592899232228483084043905011110967758227506461257
01.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
043.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", + "

2 rows × 44986 columns

\n", + "
" + ], + "text/plain": [ + " user_id 862 8844 15602 31357 11862 949 11860 45325 9091 ... \n", + "0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \\\n", + "0 43.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... \n", + "\n", + " 84419 390959 289923 222848 30840 439050 111109 67758 227506 461257 \n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", + "\n", + "[2 rows x 44986 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Reconstructor\n", + "\n", + "# users_df\n", + "users_df = pd.concat([pd.read_csv(f) for f in glob(\"users_*.csv\")])\n", + "users_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "ac68d277", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
adultidimdb_idoverviewpopularityruntimetitlevote_averageAnimationComedy...Science FictionMysteryWarForeignMusicDocumentaryWesternTV Movierb_ratiopop_bin
0True862tt0114709Led by Woody, Andy's toys live happily in his ...21.94694381.0Toy Story7.711...0000000012.4518019
0True27678tt0106356A television movie based upon the book by Brya...1.685697107.0Barbarians at the Gate6.801...000000000.3026615
\n", + "

2 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " adult id imdb_id overview \n", + "0 True 862 tt0114709 Led by Woody, Andy's toys live happily in his ... \\\n", + "0 True 27678 tt0106356 A television movie based upon the book by Brya... \n", + "\n", + " popularity runtime title vote_average Animation \n", + "0 21.946943 81.0 Toy Story 7.7 1 \\\n", + "0 1.685697 107.0 Barbarians at the Gate 6.8 0 \n", + "\n", + " Comedy ... Science Fiction Mystery War Foreign Music Documentary \n", + "0 1 ... 0 0 0 0 0 0 \\\n", + "0 1 ... 0 0 0 0 0 0 \n", + "\n", + " Western TV Movie rb_ratio pop_bin \n", + "0 0 0 12.451801 9 \n", + "0 0 0 0.302661 5 \n", + "\n", + "[2 rows x 30 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# items_df\n", + "items_df = pd.concat([pd.read_csv(f) for f in glob(\"items_*.csv\")])\n", + "items_df.head(2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "vscode": { + "interpreter": { + "hash": "e084279891f6f4db1ee843a72e2e91611a252795aeda8ffc8cf83a1802c1e7e8" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}