diff --git "a/DF_Construction.ipynb" "b/DF_Construction.ipynb"
new file mode 100644--- /dev/null
+++ "b/DF_Construction.ipynb"
@@ -0,0 +1,3348 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "39139b70",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Import relevant libraries\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import json\n",
+ "import matplotlib.pyplot as plt\n",
+ "%matplotlib inline\n",
+ "import seaborn as sns\n",
+ "from dotenv import load_dotenv\n",
+ "from os import environ\n",
+ "import requests\n",
+ "from time import sleep\n",
+ "import re\n",
+ "\n",
+ "load_dotenv() # Read local .env file"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "2fa58a90",
+ "metadata": {},
+ "source": [
+ "# Construct Item Dataframe\n",
+ "\n",
+ "The goal here is to construct a dataframe consisting of relevant information for each movie. For this, I'll be using only one of the original csv files:\n",
+ "\n",
+ "- **movies_metadata.csv**: The main Movies Metadata file. Contains information on 45,000 movies featured in the Full MovieLens dataset. Features include posters, backdrops, budget, revenue, release dates, languages, production countries and companies.\n",
+ "\n",
+ "> Data description based on [Kaggle](https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset)\n",
+ "\n",
+ "First, I'll read the csv file and list the different columns in the dataframe:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "03fa2e2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\Saqi\\AppData\\Local\\Temp\\ipykernel_34392\\3934408411.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " meta_df = pd.read_csv('./data/movies_metadata.csv')\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',\n",
+ " 'imdb_id', 'original_language', 'original_title', 'overview',\n",
+ " 'popularity', 'poster_path', 'production_companies',\n",
+ " 'production_countries', 'release_date', 'revenue', 'runtime',\n",
+ " 'spoken_languages', 'status', 'tagline', 'title', 'video',\n",
+ " 'vote_average', 'vote_count'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta_df = pd.read_csv('./data/movies_metadata.csv')\n",
+ "meta_df.columns"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "cb5a13bf",
+ "metadata": {},
+ "source": [
+ "Relevant columsn include:\n",
+ "\n",
+ "- `adult`: Whether or not a movie has been rated adult. While not relevant in the core recommendation algorithm, it could be useful in the final result such that the user would be able to filter adult content if they so wished.\n",
+ "- `budget`: The movie's budget. Should a user have a preference for high-budget movies, this column could be a good indicator of that.\n",
+ "- `genres`: The movie genre.\n",
+ "- `popularity`: Movie popularity. Will be relevant for the user interface.\n",
+ "- `revenue`: The amount of money the movie had made. Will be more relevant later during feature engineering.\n",
+ "- `runtime`: How long the movie was. Could also be useful for filtering purposes.\n",
+ "- `status`: Whether or not the movie has been released or not.\n",
+ "- `vote_average`: The average vote that viewers had given this specific movie.\n",
+ "- `production_companies` and `production_countries`: Which company made the movie in what countries. This could be useful if a user prefers movies made by a certain company or from a certain country\n",
+ "\n",
+ "Other columns such as `id`, `imdb_id`, `title` and `overview` will be useful for descriptive purposes later on."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "id": "464a0be1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "relevant_columns = [\"adult\", \"budget\", \"genres\", \"id\", \"imdb_id\",\n",
+ " \"overview\", \"popularity\", \"revenue\", \"runtime\", \"status\", \"vote_average\", \"title\", \"overview\",\n",
+ " \"production_companies\", \"production_countries\"]\n",
+ "cols_to_drop = [col for col in meta_df.columns if col not in relevant_columns]\n",
+ "\n",
+ "meta_df.drop(cols_to_drop, axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "b0b309c9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
adult
\n",
+ "
budget
\n",
+ "
genres
\n",
+ "
id
\n",
+ "
imdb_id
\n",
+ "
overview
\n",
+ "
popularity
\n",
+ "
production_companies
\n",
+ "
production_countries
\n",
+ "
revenue
\n",
+ "
runtime
\n",
+ "
status
\n",
+ "
title
\n",
+ "
vote_average
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
False
\n",
+ "
30000000
\n",
+ "
[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
\n",
+ "
862
\n",
+ "
tt0114709
\n",
+ "
Led by Woody, Andy's toys live happily in his ...
\n",
+ "
21.946943
\n",
+ "
[{'name': 'Pixar Animation Studios', 'id': 3}]
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
373554033.0
\n",
+ "
81.0
\n",
+ "
Released
\n",
+ "
Toy Story
\n",
+ "
7.7
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
False
\n",
+ "
65000000
\n",
+ "
[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
\n",
+ "
8844
\n",
+ "
tt0113497
\n",
+ "
When siblings Judy and Peter discover an encha...
\n",
+ "
17.015539
\n",
+ "
[{'name': 'TriStar Pictures', 'id': 559}, {'na...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
262797249.0
\n",
+ "
104.0
\n",
+ "
Released
\n",
+ "
Jumanji
\n",
+ "
6.9
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
False
\n",
+ "
0
\n",
+ "
[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
\n",
+ "
15602
\n",
+ "
tt0113228
\n",
+ "
A family wedding reignites the ancient feud be...
\n",
+ "
11.7129
\n",
+ "
[{'name': 'Warner Bros.', 'id': 6194}, {'name'...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
0.0
\n",
+ "
101.0
\n",
+ "
Released
\n",
+ "
Grumpier Old Men
\n",
+ "
6.5
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
False
\n",
+ "
16000000
\n",
+ "
[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
\n",
+ "
31357
\n",
+ "
tt0114885
\n",
+ "
Cheated on, mistreated and stepped on, the wom...
\n",
+ "
3.859495
\n",
+ "
[{'name': 'Twentieth Century Fox Film Corporat...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
81452156.0
\n",
+ "
127.0
\n",
+ "
Released
\n",
+ "
Waiting to Exhale
\n",
+ "
6.1
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
False
\n",
+ "
0
\n",
+ "
[{'id': 35, 'name': 'Comedy'}]
\n",
+ "
11862
\n",
+ "
tt0113041
\n",
+ "
Just when George Banks has recovered from his ...
\n",
+ "
8.387519
\n",
+ "
[{'name': 'Sandollar Productions', 'id': 5842}...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
76578911.0
\n",
+ "
106.0
\n",
+ "
Released
\n",
+ "
Father of the Bride Part II
\n",
+ "
5.7
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " adult budget genres id \n",
+ "0 False 30000000 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 862 \\\n",
+ "1 False 65000000 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 8844 \n",
+ "2 False 0 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 15602 \n",
+ "3 False 16000000 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 31357 \n",
+ "4 False 0 [{'id': 35, 'name': 'Comedy'}] 11862 \n",
+ "\n",
+ " imdb_id overview popularity \n",
+ "0 tt0114709 Led by Woody, Andy's toys live happily in his ... 21.946943 \\\n",
+ "1 tt0113497 When siblings Judy and Peter discover an encha... 17.015539 \n",
+ "2 tt0113228 A family wedding reignites the ancient feud be... 11.7129 \n",
+ "3 tt0114885 Cheated on, mistreated and stepped on, the wom... 3.859495 \n",
+ "4 tt0113041 Just when George Banks has recovered from his ... 8.387519 \n",
+ "\n",
+ " production_companies \n",
+ "0 [{'name': 'Pixar Animation Studios', 'id': 3}] \\\n",
+ "1 [{'name': 'TriStar Pictures', 'id': 559}, {'na... \n",
+ "2 [{'name': 'Warner Bros.', 'id': 6194}, {'name'... \n",
+ "3 [{'name': 'Twentieth Century Fox Film Corporat... \n",
+ "4 [{'name': 'Sandollar Productions', 'id': 5842}... \n",
+ "\n",
+ " production_countries revenue runtime \n",
+ "0 [{'iso_3166_1': 'US', 'name': 'United States o... 373554033.0 81.0 \\\n",
+ "1 [{'iso_3166_1': 'US', 'name': 'United States o... 262797249.0 104.0 \n",
+ "2 [{'iso_3166_1': 'US', 'name': 'United States o... 0.0 101.0 \n",
+ "3 [{'iso_3166_1': 'US', 'name': 'United States o... 81452156.0 127.0 \n",
+ "4 [{'iso_3166_1': 'US', 'name': 'United States o... 76578911.0 106.0 \n",
+ "\n",
+ " status title vote_average \n",
+ "0 Released Toy Story 7.7 \n",
+ "1 Released Jumanji 6.9 \n",
+ "2 Released Grumpier Old Men 6.5 \n",
+ "3 Released Waiting to Exhale 6.1 \n",
+ "4 Released Father of the Bride Part II 5.7 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta_df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "0304a62e",
+ "metadata": {},
+ "source": [
+ "Now that we've obtained the filtered dataframe, I'll do routine data pre-processing, such as:\n",
+ "- Checking for `NaN` data and filling where necessary\n",
+ "- Making sure that the data in columns is \"clean\", i.e. each quantitive column has the right type and there are no strings in said numerical columns.\n",
+ "\n",
+ "#### 1. Checking for NaN data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "987d221f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "adult 0\n",
+ "budget 0\n",
+ "genres 0\n",
+ "id 0\n",
+ "imdb_id 17\n",
+ "overview 954\n",
+ "popularity 5\n",
+ "production_companies 3\n",
+ "production_countries 3\n",
+ "revenue 6\n",
+ "runtime 263\n",
+ "status 87\n",
+ "title 6\n",
+ "vote_average 6\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Checking for NaN\n",
+ "meta_df.isna().sum()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "47e0f2f7",
+ "metadata": {},
+ "source": [
+ "For filling in the gaps, I'll be using the [OMDb API](https://www.omdbapi.com/). But first things first, I'll be focusing on the `status` column, as I'll be dropping all rows dataframe-wide that do not have a `status` of released.\n",
+ "- As this is a movie recommendation task, there is no point in recommending movies that have not been released yet."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "id": "0c8cc4fb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Only keep movies that have not been released yet.\n",
+ "meta_df = meta_df[meta_df['status'] == 'Released']\n",
+ "meta_df.drop('status', axis=1, inplace=True) # After this, we no longer need this column"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "8dc5e293",
+ "metadata": {},
+ "source": [
+ "I'll now recalculate the number of `Nan` values per column:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "f0ea83cf",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "adult 0\n",
+ "budget 0\n",
+ "genres 0\n",
+ "id 0\n",
+ "imdb_id 15\n",
+ "overview 920\n",
+ "popularity 0\n",
+ "production_companies 0\n",
+ "production_countries 0\n",
+ "revenue 0\n",
+ "runtime 251\n",
+ "status 0\n",
+ "title 0\n",
+ "vote_average 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta_df.isna().sum() # No change in NaN values for other columns "
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "8758b1b9",
+ "metadata": {},
+ "source": [
+ "Here, I'll fill in the missing information as best I can with the OMDb API. I've put an .env file in the same location as this notebook containing my OMDb API key. For making the API requests, I'm using the incredibly well-known [requests](https://pypi.org/project/requests/) package.\n",
+ "\n",
+ "I'll first load that, and then define a function for fetching movies based on titles or IMDB ids:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "id": "29d0f249",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "api_key = environ.get('OMDB_KEY')\n",
+ "\n",
+ "# We can either pass in imdb_id or movie title\n",
+ "def make_omdb_req(identifier, is_imdb_id=True):\n",
+ " if is_imdb_id:\n",
+ " query = f\"i={identifier}\"\n",
+ " else:\n",
+ " identifier = identifier.replace(\" \", \"\\ \")\n",
+ " query = f\"t={identifier}\"\n",
+ " \n",
+ " url = f\"http://www.omdbapi.com/?apikey={api_key}&{query}&type=movie\"\n",
+ " res = requests.get(url)\n",
+ " try:\n",
+ " if res.status_code == 200:\n",
+ " return json.loads(res.content)\n",
+ " except:\n",
+ " pass\n",
+ " return {'Response': 'False'}"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "d4998ee2",
+ "metadata": {},
+ "source": [
+ "We can then test this for the first element in `meta_df`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "a3a6c508",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Title': 'Toy Story',\n",
+ " 'Year': '1995',\n",
+ " 'Rated': 'G',\n",
+ " 'Released': '25 Nov 1995',\n",
+ " 'Runtime': '81 min',\n",
+ " 'Genre': 'Animation, Adventure, Comedy',\n",
+ " 'Director': 'John Lasseter',\n",
+ " 'Writer': 'John Lasseter, Pete Docter, Andrew Stanton',\n",
+ " 'Actors': 'Tom Hanks, Tim Allen, Don Rickles',\n",
+ " 'Plot': \"A cowboy doll is profoundly threatened and jealous when a new spaceman action figure supplants him as top toy in a boy's bedroom.\",\n",
+ " 'Language': 'English',\n",
+ " 'Country': 'United States, Japan',\n",
+ " 'Awards': 'Nominated for 3 Oscars. 29 wins & 23 nominations total',\n",
+ " 'Poster': 'https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_SX300.jpg',\n",
+ " 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.3/10'},\n",
+ " {'Source': 'Rotten Tomatoes', 'Value': '100%'},\n",
+ " {'Source': 'Metacritic', 'Value': '96/100'}],\n",
+ " 'Metascore': '96',\n",
+ " 'imdbRating': '8.3',\n",
+ " 'imdbVotes': '1,018,595',\n",
+ " 'imdbID': 'tt0114709',\n",
+ " 'Type': 'movie',\n",
+ " 'DVD': '23 Mar 2010',\n",
+ " 'BoxOffice': '$223,225,679',\n",
+ " 'Production': 'N/A',\n",
+ " 'Website': 'N/A',\n",
+ " 'Response': 'True'}"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test_id = meta_df[\"imdb_id\"][0]\n",
+ "make_omdb_req(test_id)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "706d238f",
+ "metadata": {},
+ "source": [
+ "I'll now iterate over all rows with `NaN` values and fill in the gaps accordingly:\n",
+ "\n",
+ "> A thing to note here is that only the `imdb_id`, `overview` and `runtime` columns have missing data, so for each row, I only need to check these three columns."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d944b1ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "invalid_movies = [] # For movies not found using the API"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "6b25b861",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Row with ID 30146 not found!\n",
+ "Row with ID 65256 not found!\n",
+ "Row with ID 342011 not found!\n",
+ "Row with ID 391438 not found!\n",
+ "Row with ID 416569 not found!\n",
+ "Row with ID 109861 not found!\n",
+ "Row with ID 362617 not found!\n",
+ "Row with ID 227964 not found!\n",
+ "Row with ID 342684 not found!\n",
+ "Row with ID 359413 not found!\n",
+ "Row with ID 77564 not found!\n",
+ "Row with ID 327909 not found!\n",
+ "Row with ID 449863 not found!\n",
+ "Row with ID 142478 not found!\n",
+ "Row with ID 41663 not found!\n",
+ "Row with ID 185180 not found!\n",
+ "Row with ID 428950 not found!\n",
+ "Row with ID 440508 not found!\n",
+ "Row with ID 152100 not found!\n",
+ "Row with ID 167330 not found!\n",
+ "Row with ID 220669 not found!\n",
+ "Row with ID 240992 not found!\n",
+ "Row with ID 38547 not found!\n",
+ "Row with ID 366759 not found!\n",
+ "Row with ID 148697 not found!\n",
+ "Row with ID 49833 not found!\n",
+ "Row with ID 452606 not found!\n",
+ "Row with ID 65010 not found!\n",
+ "Row with ID 101217 not found!\n",
+ "Row with ID 236053 not found!\n",
+ "Row with ID 123592 not found!\n",
+ "Row with ID 109671 not found!\n",
+ "Row with ID 327935 not found!\n",
+ "Row with ID 123601 not found!\n",
+ "Row with ID 123611 not found!\n",
+ "Row with ID 453596 not found!\n",
+ "Row with ID 142802 not found!\n",
+ "Row with ID 77534 not found!\n",
+ "Row with ID 143883 not found!\n",
+ "Row with ID 354133 not found!\n",
+ "Row with ID 191486 not found!\n",
+ "Row with ID 127803 not found!\n",
+ "Row with ID 271495 not found!\n",
+ "Row with ID 244575 not found!\n",
+ "Row with ID 246438 not found!\n",
+ "Row with ID 362844 not found!\n",
+ "Row with ID 36264 not found!\n",
+ "Row with ID 270908 not found!\n",
+ "Row with ID 14210 not found!\n",
+ "Row with ID 376934 not found!\n",
+ "Row with ID 213321 not found!\n",
+ "Row with ID 380438 not found!\n",
+ "Row with ID 41493 not found!\n",
+ "Row with ID 452922 not found!\n",
+ "Row with ID 93461 not found!\n",
+ "Row with ID 63838 not found!\n",
+ "Row with ID 197057 not found!\n",
+ "Row with ID 143005 not found!\n",
+ "Row with ID 336484 not found!\n",
+ "Row with ID 159810 not found!\n",
+ "Row with ID 51275 not found!\n",
+ "Row with ID 420481 not found!\n",
+ "Row with ID 69976 not found!\n",
+ "Row with ID 26792 not found!\n",
+ "Row with ID 37603 not found!\n",
+ "Row with ID 48209 not found!\n",
+ "Row with ID 57382 not found!\n",
+ "Row with ID 110131 not found!\n",
+ "Row with ID 41689 not found!\n",
+ "Row with ID 458808 not found!\n",
+ "Row with ID 400552 not found!\n",
+ "Row with ID 419601 not found!\n",
+ "Row with ID 14644 not found!\n",
+ "Row with ID 82495 not found!\n",
+ "Row with ID 64827 not found!\n",
+ "Row with ID 103301 not found!\n",
+ "Row with ID 301876 not found!\n",
+ "Row with ID 73545 not found!\n",
+ "Row with ID 448879 not found!\n",
+ "Row with ID 457307 not found!\n",
+ "Row with ID 396987 not found!\n",
+ "Row with ID 153561 not found!\n",
+ "Row with ID 366860 not found!\n",
+ "Row with ID 202865 not found!\n",
+ "Row with ID 9765 not found!\n",
+ "Row with ID 213683 not found!\n",
+ "Row with ID 57770 not found!\n",
+ "Row with ID 142320 not found!\n",
+ "Row with ID 430058 not found!\n",
+ "Row with ID 54309 not found!\n",
+ "Row with ID 445840 not found!\n",
+ "Row with ID 64043 not found!\n",
+ "Row with ID 73649 not found!\n",
+ "Row with ID 57996 not found!\n",
+ "Row with ID 63179 not found!\n",
+ "Row with ID 398295 not found!\n",
+ "Row with ID 353713 not found!\n",
+ "Row with ID 458335 not found!\n",
+ "Row with ID 298207 not found!\n",
+ "Row with ID 382995 not found!\n",
+ "Row with ID 439314 not found!\n",
+ "Row with ID 422005 not found!\n",
+ "Row with ID 26969 not found!\n",
+ "Row with ID 91673 not found!\n",
+ "Row with ID 68063 not found!\n",
+ "Row with ID 103344 not found!\n",
+ "Row with ID 275272 not found!\n",
+ "Row with ID 231216 not found!\n",
+ "Row with ID 79343 not found!\n",
+ "Row with ID 418757 not found!\n",
+ "Row with ID 369444 not found!\n",
+ "Row with ID 395767 not found!\n",
+ "Row with ID 199887 not found!\n",
+ "Row with ID 317389 not found!\n",
+ "Row with ID 468707 not found!\n",
+ "Row with ID 280422 not found!\n",
+ "Row with ID 449131 not found!\n"
+ ]
+ }
+ ],
+ "source": [
+ "for idx, row in meta_df[meta_df.isnull().any(axis=1)].iterrows():\n",
+ "\n",
+ " # No use in re-trying movies that don't exist in OMDb's database\n",
+ " if row['id'] in invalid_movies:\n",
+ " continue\n",
+ "\n",
+ " # First fetch row data from API\n",
+ " api_data = make_omdb_req(row['title'], False)\n",
+ "\n",
+ " # Movie not found\n",
+ " if api_data['Response'] == 'False':\n",
+ " # Try and make request with imdb_id, if it exists\n",
+ " api_data = make_omdb_req(row['imdb_id'])\n",
+ "\n",
+ " # If movie still not found\n",
+ " if api_data['Response'] == 'False':\n",
+ " print(f\"Row with ID {row['id']} not found!\")\n",
+ " invalid_movies.append(row['id'])\n",
+ " continue\n",
+ " \n",
+ " # If API res was okay, start filling in data:\n",
+ " if pd.isnull(row['overview']):\n",
+ " row[\"overview\"] = api_data['Plot']\n",
+ " \n",
+ " if pd.isnull(row['imdb_id']):\n",
+ " row['imdb_id'] = api_data['imdbID']\n",
+ "\n",
+ " if pd.isnull(row['runtime']):\n",
+ " # API data needs to be parsed; response has a \"min\" at the end but runtime col is float\n",
+ " try:\n",
+ " row['runtime'] = float(api_data['Runtime'].replace('min', ''))\n",
+ " except:\n",
+ " pass\n",
+ "\n",
+ " meta_df[meta_df['id'] == row['id']] = row\n",
+ "\n",
+ " # To avoid sending too many requests to the API at once\n",
+ " sleep(1.0)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "d06e3248",
+ "metadata": {},
+ "source": [
+ "Now, I'll check to see if there are any more `NaN` values left:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "1b245bbc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "adult 0\n",
+ "budget 0\n",
+ "genres 0\n",
+ "id 0\n",
+ "imdb_id 6\n",
+ "overview 114\n",
+ "popularity 0\n",
+ "production_companies 0\n",
+ "production_countries 0\n",
+ "revenue 0\n",
+ "runtime 51\n",
+ "status 0\n",
+ "title 0\n",
+ "vote_average 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta_df.isna().sum()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "41edb363",
+ "metadata": {},
+ "source": [
+ "The remaining `NaN` values will be handled as such:\n",
+ "- `imdb_id`: `NaN` values will be replaced with `-1`, to indicate that this movie has no imdb id.\n",
+ "- `overview`: `NaN` values will be replaced with \"No description available\".\n",
+ "- `runtime`: `NaN` values will be replaced with _average_ runtime."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "id": "06d8691a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "adult 0\n",
+ "budget 0\n",
+ "genres 0\n",
+ "id 0\n",
+ "imdb_id 0\n",
+ "overview 0\n",
+ "popularity 0\n",
+ "production_companies 0\n",
+ "production_countries 0\n",
+ "revenue 0\n",
+ "runtime 0\n",
+ "status 0\n",
+ "title 0\n",
+ "vote_average 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta_df['imdb_id'] = meta_df['imdb_id'].fillna(-1)\n",
+ "meta_df['overview'] = meta_df['overview'].fillna(\"No description available\")\n",
+ "meta_df['runtime'] = meta_df['runtime'].fillna(meta_df['runtime'].median())\n",
+ "\n",
+ "meta_df.isna().sum()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "a2bbae24",
+ "metadata": {},
+ "source": [
+ "#### 2. Making sure data is clean\n",
+ "\n",
+ "An issue that exists in this data is that while there are columns comprising solely of numerical data, these columns are sometimes dirty in the sense that there may be erronous string data in that specific column that prevents us from using that column properly. Here, I'll iterate over all of the numerical and boolean columns and make sure they've been casted to the correct type.\n",
+ "\n",
+ "> The columns I'll be checking are: `adult` (bool), `budget` (int), `popularity` (float), `revenue` (int), `runtime` (float), `vote_average` (float)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "id": "45c516bc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "meta_df['adult'] = meta_df['adult'].astype(bool)\n",
+ "meta_df['budget'] = meta_df['budget'].astype(int)\n",
+ "meta_df['popularity'] = meta_df['popularity'].astype(float)\n",
+ "meta_df['revenue'] = meta_df['revenue'].astype(int)\n",
+ "meta_df['runtime'] = meta_df['runtime'].astype(float)\n",
+ "meta_df['vote_average'] = meta_df['vote_average'].astype(float)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "3b7593f7",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "\n",
+ "### Additional Feature Engineering\n",
+ "\n",
+ "#### Genre column"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "819aa271",
+ "metadata": {},
+ "source": [
+ "Here, I'll reconstruct the `meta_df` dataframe such that for each possible genre a movie could have, there is a column. If that movie falls under that specific genre, the value of the respective column is `1`, otherwise the value of that specific column in that specific row is `0`\n",
+ "\n",
+ "Next is to make a separate column for each genre. For this, I'll first need a list of all genres.\n",
+ "\n",
+ "What's important to note here is that elements in the genre column are json strings, thus I'll be using the `json` module to properly parse the string."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "id": "4b33d433",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie']\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "genre_col = meta_df['genres'].values.tolist() # Get list of every element in the genres column\n",
+ "\n",
+ "# Iterate over every element, generate list of *unique* genres.\n",
+ "genres = []\n",
+ "for item in genre_col:\n",
+ " item_gs = json.loads(item.replace('\\'', '\"')) # json.loads expects double quotes (\") for property names\n",
+ " for g in item_gs:\n",
+ " if g['name'] not in genres:\n",
+ " genres.append(g['name'])\n",
+ "\n",
+ "print(genres)"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "b5a8a1c2",
+ "metadata": {},
+ "source": [
+ "Obviously, some data cleaning is needed here. Thankfully, there aren't very many candidate \"genres\", so I'll filter them by hand;"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 97,
+ "id": "9b43758f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
id
\n",
+ "
genres
\n",
+ "
Animation
\n",
+ "
Comedy
\n",
+ "
Family
\n",
+ "
Adventure
\n",
+ "
Fantasy
\n",
+ "
Romance
\n",
+ "
Drama
\n",
+ "
Action
\n",
+ "
...
\n",
+ "
Horror
\n",
+ "
History
\n",
+ "
Science Fiction
\n",
+ "
Mystery
\n",
+ "
War
\n",
+ "
Foreign
\n",
+ "
Music
\n",
+ "
Documentary
\n",
+ "
Western
\n",
+ "
TV Movie
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
862
\n",
+ "
[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
\n",
+ "
1
\n",
+ "
1
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
8844
\n",
+ "
[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
1
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
15602
\n",
+ "
[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
31357
\n",
+ "
[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
1
\n",
+ "
1
\n",
+ "
0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
11862
\n",
+ "
[{'id': 35, 'name': 'Comedy'}]
\n",
+ "
0
\n",
+ "
1
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 22 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id genres Animation \n",
+ "0 862 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... 1 \\\n",
+ "1 8844 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... 0 \n",
+ "2 15602 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... 0 \n",
+ "3 31357 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... 0 \n",
+ "4 11862 [{'id': 35, 'name': 'Comedy'}] 0 \n",
+ "\n",
+ " Comedy Family Adventure Fantasy Romance Drama Action ... Horror \n",
+ "0 1 1 0 0 0 0 0 ... 0 \\\n",
+ "1 0 1 1 1 0 0 0 ... 0 \n",
+ "2 1 0 0 0 1 0 0 ... 0 \n",
+ "3 1 0 0 0 1 1 0 ... 0 \n",
+ "4 1 0 0 0 0 0 0 ... 0 \n",
+ "\n",
+ " History Science Fiction Mystery War Foreign Music Documentary \n",
+ "0 0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 0 0 \n",
+ "\n",
+ " Western TV Movie \n",
+ "0 0 0 \n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ "[5 rows x 22 columns]"
+ ]
+ },
+ "execution_count": 97,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "genres = ['Animation', 'Comedy', 'Family', 'Adventure', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'History', 'Science Fiction', 'Mystery', 'War', 'Foreign', 'Music', 'Documentary', 'Western', 'TV Movie']\n",
+ "\n",
+ "# Only get a copy of relevant columns\n",
+ "genre_df = meta_df[['id', 'genres']].copy()\n",
+ "\n",
+ "# Make a column for every genre\n",
+ "for genre in genres:\n",
+ " genre_df[genre] = 0\n",
+ "\n",
+ "# Iterate over every row and set genres accordingly:\n",
+ "for idx, row in genre_df.iterrows():\n",
+ " g_json = json.loads(row['genres'].replace('\\'', '\"')) \n",
+ " for g in g_json:\n",
+ " if g['name'] in genres:\n",
+ " genre_df.at[idx, g['name']] = 1\n",
+ "\n",
+ "genre_df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "2a84ca58",
+ "metadata": {},
+ "source": [
+ "Now not only can we safely merge `meta_df` and `genre_df` to contruct the new dataframe with the genre columns, but we can also safely drop the `genres` column as it is no longer needed:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 98,
+ "id": "21afdde2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
adult
\n",
+ "
budget
\n",
+ "
id
\n",
+ "
imdb_id
\n",
+ "
overview
\n",
+ "
popularity
\n",
+ "
production_companies
\n",
+ "
production_countries
\n",
+ "
revenue
\n",
+ "
runtime
\n",
+ "
...
\n",
+ "
Horror
\n",
+ "
History
\n",
+ "
Science Fiction
\n",
+ "
Mystery
\n",
+ "
War
\n",
+ "
Foreign
\n",
+ "
Music
\n",
+ "
Documentary
\n",
+ "
Western
\n",
+ "
TV Movie
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
True
\n",
+ "
30000000
\n",
+ "
862
\n",
+ "
tt0114709
\n",
+ "
Led by Woody, Andy's toys live happily in his ...
\n",
+ "
21.946943
\n",
+ "
[{'name': 'Pixar Animation Studios', 'id': 3}]
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
373554033
\n",
+ "
81.0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
True
\n",
+ "
65000000
\n",
+ "
8844
\n",
+ "
tt0113497
\n",
+ "
When siblings Judy and Peter discover an encha...
\n",
+ "
17.015539
\n",
+ "
[{'name': 'TriStar Pictures', 'id': 559}, {'na...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
262797249
\n",
+ "
104.0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
True
\n",
+ "
0
\n",
+ "
15602
\n",
+ "
tt0113228
\n",
+ "
A family wedding reignites the ancient feud be...
\n",
+ "
11.712900
\n",
+ "
[{'name': 'Warner Bros.', 'id': 6194}, {'name'...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
0
\n",
+ "
101.0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
True
\n",
+ "
16000000
\n",
+ "
31357
\n",
+ "
tt0114885
\n",
+ "
Cheated on, mistreated and stepped on, the wom...
\n",
+ "
3.859495
\n",
+ "
[{'name': 'Twentieth Century Fox Film Corporat...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
81452156
\n",
+ "
127.0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
True
\n",
+ "
0
\n",
+ "
11862
\n",
+ "
tt0113041
\n",
+ "
Just when George Banks has recovered from his ...
\n",
+ "
8.387519
\n",
+ "
[{'name': 'Sandollar Productions', 'id': 5842}...
\n",
+ "
[{'iso_3166_1': 'US', 'name': 'United States o...
\n",
+ "
76578911
\n",
+ "
106.0
\n",
+ "
...
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 32 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " adult budget id imdb_id \n",
+ "0 True 30000000 862 tt0114709 \\\n",
+ "1 True 65000000 8844 tt0113497 \n",
+ "2 True 0 15602 tt0113228 \n",
+ "3 True 16000000 31357 tt0114885 \n",
+ "4 True 0 11862 tt0113041 \n",
+ "\n",
+ " overview popularity \n",
+ "0 Led by Woody, Andy's toys live happily in his ... 21.946943 \\\n",
+ "1 When siblings Judy and Peter discover an encha... 17.015539 \n",
+ "2 A family wedding reignites the ancient feud be... 11.712900 \n",
+ "3 Cheated on, mistreated and stepped on, the wom... 3.859495 \n",
+ "4 Just when George Banks has recovered from his ... 8.387519 \n",
+ "\n",
+ " production_companies \n",
+ "0 [{'name': 'Pixar Animation Studios', 'id': 3}] \\\n",
+ "1 [{'name': 'TriStar Pictures', 'id': 559}, {'na... \n",
+ "2 [{'name': 'Warner Bros.', 'id': 6194}, {'name'... \n",
+ "3 [{'name': 'Twentieth Century Fox Film Corporat... \n",
+ "4 [{'name': 'Sandollar Productions', 'id': 5842}... \n",
+ "\n",
+ " production_countries revenue runtime ... \n",
+ "0 [{'iso_3166_1': 'US', 'name': 'United States o... 373554033 81.0 ... \\\n",
+ "1 [{'iso_3166_1': 'US', 'name': 'United States o... 262797249 104.0 ... \n",
+ "2 [{'iso_3166_1': 'US', 'name': 'United States o... 0 101.0 ... \n",
+ "3 [{'iso_3166_1': 'US', 'name': 'United States o... 81452156 127.0 ... \n",
+ "4 [{'iso_3166_1': 'US', 'name': 'United States o... 76578911 106.0 ... \n",
+ "\n",
+ " Horror History Science Fiction Mystery War Foreign Music Documentary \n",
+ "0 0 0 0 0 0 0 0 0 \\\n",
+ "1 0 0 0 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 0 0 0 \n",
+ "\n",
+ " Western TV Movie \n",
+ "0 0 0 \n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "\n",
+ "[5 rows x 32 columns]"
+ ]
+ },
+ "execution_count": 98,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta_df = pd.merge(meta_df, genre_df)\n",
+ "meta_df.drop('genres', axis=1, inplace=True)\n",
+ "meta_df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "f5f329bc",
+ "metadata": {},
+ "source": [
+ "#### Extracting useful information from revenue and budget\n",
+ "\n",
+ "As noted previously, the `revenue` and `budget` column represent the movie's total revenue upon release and the budget for the movie's production respectively. \n",
+ "\n",
+ "On their own, they might not be incredibly useful in the final recommender system, however, combining the two might yield useful information. For instance, I could add an extra column with the ratio between a movie's revenue vs. it's budget.\n",
+ "\n",
+ "- If this ratio is greater than 1, this indicated that a production studio has made a return on their investment. The higher this number, the better the movie did relative to it's production budget. If the revenue is less than the budget, pushing this ratio to be less than 1, that would be an indicator that the movie did not do very well in the box office.\n",
+ "\n",
+ "However, before I do that I need to check the values for `revenue` and budget to make sure no values for these columns are zero."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "id": "4e06d20a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
revenue
\n",
+ "
budget
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
count
\n",
+ "
4.507400e+04
\n",
+ "
4.507400e+04
\n",
+ "
\n",
+ "
\n",
+ "
mean
\n",
+ "
1.120213e+07
\n",
+ "
4.265201e+06
\n",
+ "
\n",
+ "
\n",
+ "
std
\n",
+ "
6.407433e+07
\n",
+ "
1.749784e+07
\n",
+ "
\n",
+ "
\n",
+ "
min
\n",
+ "
-2.147484e+09
\n",
+ "
0.000000e+00
\n",
+ "
\n",
+ "
\n",
+ "
25%
\n",
+ "
0.000000e+00
\n",
+ "
0.000000e+00
\n",
+ "
\n",
+ "
\n",
+ "
50%
\n",
+ "
0.000000e+00
\n",
+ "
0.000000e+00
\n",
+ "
\n",
+ "
\n",
+ "
75%
\n",
+ "
0.000000e+00
\n",
+ "
0.000000e+00
\n",
+ "
\n",
+ "
\n",
+ "
max
\n",
+ "
2.068224e+09
\n",
+ "
3.800000e+08
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " revenue budget\n",
+ "count 4.507400e+04 4.507400e+04\n",
+ "mean 1.120213e+07 4.265201e+06\n",
+ "std 6.407433e+07 1.749784e+07\n",
+ "min -2.147484e+09 0.000000e+00\n",
+ "25% 0.000000e+00 0.000000e+00\n",
+ "50% 0.000000e+00 0.000000e+00\n",
+ "75% 0.000000e+00 0.000000e+00\n",
+ "max 2.068224e+09 3.800000e+08"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "meta_df[['revenue', 'budget']].describe()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "33ffa8e8",
+ "metadata": {},
+ "source": [
+ "This is bad, as there are movies with no budget, or no revenue (or both) recorded. A quick (albeit debatably \"dirty\") solution to this is to group by genre for _budget_ (as say, a fantasy movie is likely to need more of a budget than a drama or romance movie), and then by popularity for revenue.\n",
+ "\n",
+ "> However, we'll need to bin popularity values first, as the popularity column is continuous, and the possible values are too large to group the values properly. For this, I'll use [Pandas's qcut](https://pandas.pydata.org/docs/reference/api/pandas.qcut.html) to bin values based on sample quantiles.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 112,
+ "id": "b8014aaf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Bin popularity\n",
+ "meta_df['pop_bin'] = pd.qcut(meta_df['popularity'], q=10, labels=[i for i in range(10)])"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "e3796f09",
+ "metadata": {},
+ "source": [
+ "Next, I'll bring back the genre column from before, and pick the first genre in the list in the original genre column. For this, I'll reload the old unprocessed `movies_metadata.csv`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 196,
+ "id": "760b33d0",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\Saqi\\AppData\\Local\\Temp\\ipykernel_34392\\1858458685.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " old_meta_df = pd.read_csv('./data/movies_metadata.csv')\n"
+ ]
+ }
+ ],
+ "source": [
+ "old_meta_df = pd.read_csv('./data/movies_metadata.csv')\n",
+ "\n",
+ "meta_df['main_genre'] = \"Unknown\"\n",
+ "\n",
+ "\n",
+ "for idx, row in meta_df.iterrows():\n",
+ " org_row = old_meta_df[old_meta_df['id'] == str(row['id'])]\n",
+ " old_gen = str(org_row['genres'].values[0].replace(\"'\", '\"'))\n",
+ " try:\n",
+ " meta_df.at[idx, 'main_genre'] = json.loads(old_gen)[0]['name']\n",
+ " except:\n",
+ " pass\n"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "9bab8729",
+ "metadata": {},
+ "source": [
+ "I'll quickly illustrate the difference in budget and revenue based on popularity using simple barplots;"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 233,
+ "id": "d5c4a46f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 233,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGwCAYAAABVdURTAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAyu0lEQVR4nO3df1RUdcLH8c8w8sNfoIWAEsparj9SwSAI+2FbFJseXffZjFwNws196tFdddY2yZS0ktotok2SNKnW1qTMdi1L16Zf6yNFQbpSZqulkAXIsUAxoWbm+aPT7MOKNnMZvXB7v8655zjf+d7L5261fLz3O3NtHo/HIwAAAIsIMjsAAABAIFFuAACApVBuAACApVBuAACApVBuAACApVBuAACApVBuAACApXQzO8CZ5na79dlnn6l3796y2WxmxwEAAD7weDw6cuSIBgwYoKCgU1+b+cGVm88++0xxcXFmxwAAAAbU1NTonHPOOeWcH1y56d27t6Rv/8cJDw83OQ0AAPBFU1OT4uLivL/HT+UHV26+uxUVHh5OuQEAoIvxZUkJC4oBAIClUG4AAIClUG4AAIClUG4AAIClUG4AAIClUG4AAIClUG4AAIClUG4AAICldIpyU1RUpPj4eIWFhSk1NVXl5eWnnF9YWKihQ4eqe/fuiouL07x583T8+PEzlBYAAHRmppeb0tJSORwO5eXlqbKyUgkJCcrIyFB9fX2789euXasFCxYoLy9Pu3fv1urVq1VaWqrbb7/9DCcHAACdkenlpqCgQDNnzlROTo5GjBih4uJi9ejRQyUlJe3O3759uy6++GL98pe/VHx8vK6++mpNnTr1e6/2AACAHwZTy01ra6sqKiqUnp7uHQsKClJ6errKysra3Wfs2LGqqKjwlpmPP/5YL730ksaPH9/u/JaWFjU1NbXZAACAdZn64MyGhga5XC5FR0e3GY+OjtaHH37Y7j6//OUv1dDQoEsuuUQej0fffPONbr755pPelsrPz9eSJUsCnh0AAHROpt+W8tfrr7+uZcuW6ZFHHlFlZaU2bNigTZs26a677mp3fm5urhobG71bTU3NGU4MAADOJFOv3ERGRsput6uurq7NeF1dnWJiYtrdZ9GiRbrhhht00003SZJGjRql5uZm/frXv9bChQsVFNS2r4WGhio0NPT0nAAAAOh0TC03ISEhSkpKktPp1OTJkyVJbrdbTqdTs2fPbnefY8eOnVBg7Ha7JMnj8ZzWvAAAWEnC+i1mRzipnddmGN7X1HIjSQ6HQ9nZ2UpOTlZKSooKCwvV3NysnJwcSVJWVpZiY2OVn58vSZo4caIKCgo0ZswYpaamau/evVq0aJEmTpzoLTkAAOCHy/Ryk5mZqUOHDmnx4sWqra1VYmKiNm/e7F1kXF1d3eZKzR133CGbzaY77rhDBw8eVL9+/TRx4kTdc889Zp0CAADoRGyeH9i9nKamJkVERKixsVHh4eFmxwEAwDRd6baUP7+/u9ynpQAAAE6FcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACyFcgMAACylU5SboqIixcfHKywsTKmpqSovLz/p3Msvv1w2m+2EbcKECWcwMQAA6KxMLzelpaVyOBzKy8tTZWWlEhISlJGRofr6+nbnb9iwQZ9//rl3q6qqkt1u15QpU85wcgAA0BmZXm4KCgo0c+ZM5eTkaMSIESouLlaPHj1UUlLS7vyzzjpLMTEx3m3r1q3q0aMH5QYAAEgyudy0traqoqJC6enp3rGgoCClp6errKzMp2OsXr1a119/vXr27Nnu+y0tLWpqamqzAQAA6zK13DQ0NMjlcik6OrrNeHR0tGpra793//LyclVVVemmm2466Zz8/HxFRER4t7i4uA7nBgAAnZfpt6U6YvXq1Ro1apRSUlJOOic3N1eNjY3eraam5gwmBAAAZ1o3M394ZGSk7Ha76urq2ozX1dUpJibmlPs2Nzdr3bp1Wrp06SnnhYaGKjQ0tMNZAQBA12DqlZuQkBAlJSXJ6XR6x9xut5xOp9LS0k6577PPPquWlhZNnz79dMcEAABdiKlXbiTJ4XAoOztbycnJSklJUWFhoZqbm5WTkyNJysrKUmxsrPLz89vst3r1ak2ePFlnn322GbEBAEAnZXq5yczM1KFDh7R48WLV1tYqMTFRmzdv9i4yrq6uVlBQ2wtMe/bs0bZt2/T3v//djMgAAKATs3k8Ho/ZIc6kpqYmRUREqLGxUeHh4WbHAQDANAnrt5gd4aR2XpvR5rU/v7+79KelAAAA/hPlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWArlBgAAWEqnKDdFRUWKj49XWFiYUlNTVV5efsr5X375pWbNmqX+/fsrNDRUP/7xj/XSSy+dobQAAKAz62Z2gNLSUjkcDhUXFys1NVWFhYXKyMjQnj17FBUVdcL81tZWXXXVVYqKitL69esVGxurAwcOqE+fPmc+PAAA6HRMLzcFBQWaOXOmcnJyJEnFxcXatGmTSkpKtGDBghPml5SU6PDhw9q+fbuCg4MlSfHx8Sc9fktLi1paWryvm5qaAnsCAACgUzH1tlRra6sqKiqUnp7uHQsKClJ6errKysra3Wfjxo1KS0vTrFmzFB0drZEjR2rZsmVyuVztzs/Pz1dERIR3i4uLOy3nAgAAOgdTy01DQ4NcLpeio6PbjEdHR6u2trbdfT7++GOtX79eLpdLL730khYtWqQHHnhAd999d7vzc3Nz1djY6N1qamoCfh4AAKDzMP22lL/cbreioqK0cuVK2e12JSUl6eDBg/rjH/+ovLy8E+aHhoYqNDTUhKQAAMAMppabyMhI2e121dXVtRmvq6tTTExMu/v0799fwcHBstvt3rHhw4ertrZWra2tCgkJOa2ZAQBA52bqbamQkBAlJSXJ6XR6x9xut5xOp9LS0trd5+KLL9bevXvldru9Yx999JH69+9PsQEAAOZ/z43D4dCqVav05JNPavfu3brlllvU3Nzs/fRUVlaWcnNzvfNvueUWHT58WHPmzNFHH32kTZs2admyZZo1a5ZZpwAAADoR09fcZGZm6tChQ1q8eLFqa2uVmJiozZs3excZV1dXKyjo3x0sLi5OW7Zs0bx58zR69GjFxsZqzpw5uu2228w6BQAA0InYPB6Px+wQZ1JTU5MiIiLU2Nio8PBws+MAAGCahPVbzI5wUjuvzWjz2p/f36bflgIAAAgkyg0AALAUyg0AALAUyg0AALAUyg0AALAUyg0AALAUyg0AALAUyg0AALAUyg0AALAUyg0AALAUyg0AALAUyg0AALCUgJQbl8ulHTt26IsvvgjE4QAAAAwzVG7mzp2r1atXS/q22IwbN04XXHCB4uLi9PrrrwcyHwAAgF8MlZv169crISFBkvTCCy/ok08+0Ycffqh58+Zp4cKFAQ0IAADgD0PlpqGhQTExMZKkl156SVOmTNGPf/xjzZgxQ7t27QpoQAAAAH8YKjfR0dH64IMP5HK5tHnzZl111VWSpGPHjslutwc0IAAAgD+6GdkpJydH1113nfr37y+bzab09HRJ0ttvv61hw4YFNCAAAIA/DJWbO++8UyNHjlRNTY2mTJmi0NBQSZLdbteCBQsCGhAAAMAfhsqNJF177bUnjGVnZ3coDAAAQEcZLjdOp1NOp1P19fVyu91t3ispKelwMAAAACMMlZslS5Zo6dKlSk5O9q67AQAA6AwMlZvi4mI98cQTuuGGGwKdBwAAoEMMfRS8tbVVY8eODXQWAACADjNUbm666SatXbs20FkAAAA6zNBtqePHj2vlypV65ZVXNHr0aAUHB7d5v6CgICDhAAAA/GWo3Pzzn/9UYmKiJKmqqqrNeywuBgAAZjJUbl577bVA5wAAAAgIQ2tuAAAAOivDX+L37rvv6plnnlF1dbVaW1vbvLdhw4YOBwMAADDC0JWbdevWaezYsdq9e7eef/55ff3113r//ff16quvKiIiItAZAQAAfGao3CxbtkwPPvigXnjhBYWEhOihhx7Shx9+qOuuu04DBw4MdEYAAACfGSo3+/bt04QJEyRJISEham5uls1m07x587Ry5cqABgQAAPCHoXLTt29fHTlyRJIUGxvr/Tj4l19+qWPHjvl9vKKiIsXHxyssLEypqakqLy8/6dwnnnhCNputzRYWFmbkNAAAgAUZKjeXXXaZtm7dKkmaMmWK5syZo5kzZ2rq1Km68sor/TpWaWmpHA6H8vLyVFlZqYSEBGVkZKi+vv6k+4SHh+vzzz/3bgcOHDByGgAAwIIMfVpq+fLlOn78uCRp4cKFCg4O1vbt2/WLX/xCd9xxh1/HKigo0MyZM5WTkyPp24dybtq0SSUlJVqwYEG7+9hsNsXExBiJDgAALM5QuTnrrLO8fw4KCjppCfk+ra2tqqioUG5ubpvjpaenq6ys7KT7HT16VIMGDZLb7dYFF1ygZcuW6fzzz293bktLi1paWryvm5qaDGUFAABdg+Ev8du3b5/uuOMOTZ061XsL6eWXX9b777/v8zEaGhrkcrkUHR3dZjw6Olq1tbXt7jN06FCVlJTob3/7m5566im53W6NHTtWn376abvz8/PzFRER4d3i4uJ8zgcAALoeQ+XmjTfe0KhRo/T2229rw4YNOnr0qCRp586dysvLC2jA/5SWlqasrCwlJiZq3Lhx2rBhg/r166dHH3203fm5ublqbGz0bjU1Nac1HwAAMJehcrNgwQLdfffd2rp1q0JCQrzjV1xxhd566y2fjxMZGSm73a66uro243V1dT6vqQkODtaYMWO0d+/edt8PDQ1VeHh4mw0AAFiXoXKza9cu/fznPz9hPCoqSg0NDT4fJyQkRElJSXI6nd4xt9stp9OptLQ0n47hcrm0a9cu9e/f3+efCwAArMtQuenTp48+//zzE8bfe+89xcbG+nUsh8OhVatW6cknn9Tu3bt1yy23qLm52fvpqaysrDYLjpcuXaq///3v+vjjj1VZWanp06frwIEDuummm4ycCgAAsBhDn5a6/vrrddttt+nZZ5+VzWaT2+3W//7v/2r+/PnKysry61iZmZk6dOiQFi9erNraWiUmJmrz5s3eRcbV1dUKCvp3B/viiy80c+ZM1dbWqm/fvkpKStL27ds1YsQII6cCAAAsxubxeDz+7tTa2qpZs2bpiSeekMvlUrdu3eRyufTLX/5STzzxhOx2++nIGhBNTU2KiIhQY2Mj628AAD9oCeu3mB3hpHZem9HmtT+/vw1duQkJCdGqVau0aNEiVVVV6ejRoxozZoyGDBli5HAAAAABY6jcfGfgwIE8BRwAAHQqhsqNx+PR+vXr9dprr6m+vl5ut7vN+xs2bAhIOAAAAH8ZKjdz587Vo48+qp/85CeKjo6WzWYLdC4AAABDDJWbNWvWaMOGDRo/fnyg8wAAAHSIoe+5iYiI0ODBgwOdBQAAoMMMlZs777xTS5Ys0VdffRXoPAAAAB1i6LbUddddp6efflpRUVGKj49XcHBwm/crKysDEg4AAMBfhspNdna2KioqNH36dBYUAwCATsVQudm0aZO2bNmiSy65JNB5AAAAOsTQmpu4uDgeXQAAADolQ+XmgQce0O9//3vt378/wHEAAAA6xtBtqenTp+vYsWM699xz1aNHjxMWFB8+fDgg4QAAAPxlqNwUFhYGOAYAAEBgGP60lC/uvfde3XzzzerTp4+RHwMAAOA3Q2tufLVs2TJuUQEAgDPqtJYbj8dzOg8PAABwgtNabgAAAM40yg0AALAUyg0AALAUyg0AALCU01puLr30UnXv3v10/ggAAIA2fP6em6amJp8P+t1zp1566SX/EwEAAHSAz+WmT58+stlsPs11uVyGAwEAAHSEz+Xmtdde8/55//79WrBggW688UalpaVJksrKyvTkk08qPz8/8CkBAAB85HO5GTdunPfPS5cuVUFBgaZOneodmzRpkkaNGqWVK1f6/HgGAACAQDO0oLisrEzJycknjCcnJ6u8vLzDoQAAAIwyVG7i4uK0atWqE8Yfe+wxxcXFdTgUAACAUYaeCv7ggw/qF7/4hV5++WWlpqZKksrLy/Wvf/1Lzz33XEADAgAA+MPQlZvx48fro48+0sSJE3X48GEdPnxYEydO1EcffaTx48cHOiMAAIDPDF25kb69NbVs2bJAZgEAAOgww99Q/I9//EPTp0/X2LFjdfDgQUnSmjVrtG3btoCFAwAA8JehcvPcc88pIyND3bt3V2VlpVpaWiRJjY2NXM0BAACmMlRu7r77bhUXF2vVqlUKDg72jl988cWqrKwMWDgAAAB/GSo3e/bs0WWXXXbCeEREhL788ku/j1dUVKT4+HiFhYUpNTXV5+/KWbdunWw2myZPnuz3zwQAANZkqNzExMRo7969J4xv27ZNgwcP9utYpaWlcjgcysvLU2VlpRISEpSRkaH6+vpT7rd//37Nnz9fl156qV8/DwAAWJuhcjNz5kzNmTNHb7/9tmw2mz777DP95S9/0fz583XLLbf4dayCggLNnDlTOTk5GjFihIqLi9WjRw+VlJScdB+Xy6Vp06ZpyZIl31umWlpa1NTU1GYDAADWZeij4AsWLJDb7daVV16pY8eO6bLLLlNoaKjmz5+v3/zmNz4fp7W1VRUVFcrNzfWOBQUFKT09XWVlZSfdb+nSpYqKitKvfvUr/eMf/zjlz8jPz9eSJUt8zgQAALo2Q1dubDabFi5cqMOHD6uqqkpvvfWWDh06pLvuusuv4zQ0NMjlcik6OrrNeHR0tGpra9vdZ9u2bVq9enW7j39oT25urhobG71bTU2NXxkBAEDXYqjc/PnPf9bu3bsVEhKiESNGKCUlRb169dLx48f15z//OdAZvY4cOaIbbrhBq1atUmRkpE/7hIaGKjw8vM0GAACsy1C5ufHGG5WSknLCc6QaGxuVk5Pj83EiIyNlt9tVV1fXZryurk4xMTEnzN+3b5/279+viRMnqlu3burWrZv+/Oc/a+PGjerWrZv27dtn5HQAAICFGP6G4iVLluiGG27QnXfeafiHh4SEKCkpSU6n0zvmdrvldDqVlpZ2wvxhw4Zp165d2rFjh3ebNGmSfvKTn2jHjh08kRwAABh/ttR3j174+c9/rqqqKq1Zs8bQcRwOh7Kzs5WcnKyUlBQVFhaqubnZewUoKytLsbGxys/PV1hYmEaOHNlm/z59+kjSCeMAAOCHyVC5sdlskqSLLrpIb7/9tiZNmqSxY8equLjY72NlZmbq0KFDWrx4sWpra5WYmKjNmzd7FxlXV1crKMjwBSYAAPADY/N4PB5/dwoKClJtba2ioqIkSceOHdO0adPkdDrV3Nwsl8sV8KCB0tTUpIiICDU2NrK4GADwg5awfovZEU5q57UZbV778/vb0CWRvLw89erVy/u6R48eev755zVv3rx2H8sAAABwphi6ctOVceUGAIBvWfXKjc9rbjZu3KhrrrlGwcHB2rhx40nn2Ww2TZw40dfDAgAABJTP5Wby5MnedTanegq3zWbr1GtuAACAtflcbtxud7t/BgAA6Ez4jDUAALAUn6/c/OlPf/L5oL/97W8NhQEAAOgon8vNgw8+6NM8m81GuQEAAKbxudx88sknpzMHAABAQLDmBgAAWIrhB2d++umn2rhxo6qrq9Xa2trmvYKCgg4HAwAAMMJQuXE6nZo0aZIGDx6sDz/8UCNHjtT+/fvl8Xh0wQUXBDojAACAzwzdlsrNzdX8+fO1a9cuhYWF6bnnnlNNTY3GjRunKVOmBDojAACAzwyVm927dysrK0uS1K1bN3311Vfq1auXli5dqvvuuy+gAQEAAPxhqNz07NnTu86mf//+2rdvn/e9hoaGwCQDAAAwwNCam4suukjbtm3T8OHDNX78eP3ud7/Trl27tGHDBl100UWBzggAAOAzQ+WmoKBAR48elSQtWbJER48eVWlpqYYMGcInpQAAgKkMlZvBgwd7/9yzZ08VFxcHLBAAAEBHGP6em+8cPXr0hKeEh4eHd/SwAAAAhhhaUPzJJ59owoQJ6tmzpyIiItS3b1/17dtXffr0Ud++fQOdEQAAwGeGrtxMnz5dHo9HJSUlio6Ols1mC3QuAAAAQwyVm507d6qiokJDhw4NdB4AAIAOMXRb6sILL1RNTU2gswAAAHSYoSs3jz32mG6++WYdPHhQI0eOVHBwcJv3R48eHZBwAAAA/jJUbg4dOqR9+/YpJyfHO2az2eTxeGSz2eRyuQIWEAAAwB+Gys2MGTM0ZswYPf300ywoBgAAnYqhcnPgwAFt3LhR5513XqDzAAAAdIihBcVXXHGFdu7cGegsAAAAHWboys3EiRM1b9487dq1S6NGjTphQfGkSZMCEg4AAMBfhsrNzTffLElaunTpCe+xoBgAAJjJULn5z2dJAQAAdBZ+r7n5+uuv1a1bN1VVVZ2OPAAAAB3id7kJDg7WwIEDufUEAAA6JUOfllq4cKFuv/12HT58OCAhioqKFB8fr7CwMKWmpqq8vPykczds2KDk5GT16dNHPXv2VGJiotasWROQHAAAoOsztOZm+fLl2rt3rwYMGKBBgwapZ8+ebd6vrKz0+VilpaVyOBwqLi5WamqqCgsLlZGRoT179igqKuqE+WeddZYWLlyoYcOGKSQkRC+++KJycnIUFRWljIwMI6cDAAAsxFC5mTx5csACFBQUaObMmd5HORQXF2vTpk0qKSnRggULTph/+eWXt3k9Z84cPfnkk9q2bRvlBgAAGCs3eXl5Afnhra2tqqioUG5urncsKChI6enpKisr+979PR6PXn31Ve3Zs0f33Xdfu3NaWlrU0tLifd3U1NTx4AAAoNMyVG6+U1FRod27d0uSzj//fI0ZM8av/RsaGuRyuRQdHd1mPDo6Wh9++OFJ92tsbFRsbKxaWlpkt9v1yCOP6Kqrrmp3bn5+vpYsWeJXLgAA0HUZKjf19fW6/vrr9frrr6tPnz6SpC+//FI/+clPtG7dOvXr1y+QGU/Qu3dv7dixQ0ePHpXT6ZTD4dDgwYNPuGUlSbm5uXI4HN7XTU1NiouLO635AACAeQx9Wuo3v/mNjhw5ovfff1+HDx/W4cOHVVVVpaamJv32t7/1+TiRkZGy2+2qq6trM15XV6eYmJiThw4K0nnnnafExET97ne/07XXXqv8/Px254aGhio8PLzNBgAArMtQudm8ebMeeeQRDR8+3Ds2YsQIFRUV6eWXX/b5OCEhIUpKSpLT6fSOud1uOZ1OpaWl+Xwct9vdZl0NAAD44TL8+IX/fFim9O0X/Pn7aAaHw6Hs7GwlJycrJSVFhYWFam5u9n56KisrS7Gxsd4rM/n5+UpOTta5556rlpYWvfTSS1qzZo1WrFhh5FQAAIDFGCo3V1xxhebMmaOnn35aAwYMkCQdPHhQ8+bN05VXXunXsTIzM3Xo0CEtXrxYtbW1SkxM1ObNm72LjKurqxUU9O8LTM3Nzfqf//kfffrpp+revbuGDRump556SpmZmUZOBQAAWIzN4/F4/N2ppqZGkyZN0vvvv+9dnFtdXa1Ro0Zp48aNOueccwIeNFCampoUERGhxsZG1t8AAH7QEtZvMTvCSe28tu131/nz+9vQlZu4uDhVVlbK6XR6Pwo+fPhwpaenGzkcAABAwBj+nptXX31Vr776qurr6+V2u/Xee+9p7dq1kqSSkpKABQQAAPCHoXKzZMkSLV26VMnJyerfv79sNlugcwEAABhiqNwUFxfriSee0A033BDoPAAAAB1i6HtuWltbNXbs2EBnAQAA6DBD5eamm27yrq8BAADoTAzdljp+/LhWrlypV155RaNHjz7hC/0KCgoCEg4AAMBfhsrNP//5TyUmJkqSqqqq2rzH4mIAAGAmQ+XmtddeC3QOAACAgDC05gYAAKCzotwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABLodwAAABL6RTlpqioSPHx8QoLC1NqaqrKy8tPOnfVqlW69NJL1bdvX/Xt21fp6emnnA8AAH5YTC83paWlcjgcysvLU2VlpRISEpSRkaH6+vp257/++uuaOnWqXnvtNZWVlSkuLk5XX321Dh48eIaTAwCAzsjm8Xg8ZgZITU3VhRdeqOXLl0uS3G634uLi9Jvf/EYLFiz43v1dLpf69u2r5cuXKysr63vnNzU1KSIiQo2NjQoPD+9wfgAAuqqE9VvMjnBSO6/NaPPan9/fpl65aW1tVUVFhdLT071jQUFBSk9PV1lZmU/HOHbsmL7++mudddZZ7b7f0tKipqamNhsAALAuU8tNQ0ODXC6XoqOj24xHR0ertrbWp2PcdtttGjBgQJuC9P/l5+crIiLCu8XFxXU4NwAA6LxMX3PTEffee6/WrVun559/XmFhYe3Oyc3NVWNjo3erqak5wykBAMCZ1M3MHx4ZGSm73a66uro243V1dYqJiTnlvvfff7/uvfdevfLKKxo9evRJ54WGhio0NDQgeQEAQOdn6pWbkJAQJSUlyel0esfcbrecTqfS0tJOut8f/vAH3XXXXdq8ebOSk5PPRFQAANBFmHrlRpIcDoeys7OVnJyslJQUFRYWqrm5WTk5OZKkrKwsxcbGKj8/X5J03333afHixVq7dq3i4+O9a3N69eqlXr16mXYeAACgczC93GRmZurQoUNavHixamtrlZiYqM2bN3sXGVdXVyso6N8XmFasWKHW1lZde+21bY6Tl5enO++880xGBwAAnZDp33NzpvE9NwAAfIvvuQEAAOgCKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSKDcAAMBSupkdAACAruaZZ1PMjnBK100pNzuCqbhyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALKVTlJuioiLFx8crLCxMqampKi8vP+nc999/X7/4xS8UHx8vm82mwsLCMxcUAAB0eqaXm9LSUjkcDuXl5amyslIJCQnKyMhQfX19u/OPHTumwYMH695771VMTMwZTgsAADo708tNQUGBZs6cqZycHI0YMULFxcXq0aOHSkpK2p1/4YUX6o9//KOuv/56hYaGfu/xW1pa1NTU1GYDAADWZWq5aW1tVUVFhdLT071jQUFBSk9PV1lZWUB+Rn5+viIiIrxbXFxcQI4LAAA6J1PLTUNDg1wul6Kjo9uMR0dHq7a2NiA/Izc3V42Njd6tpqYmIMcFAACdUzezA5xuoaGhPt2+AgAA1mDqlZvIyEjZ7XbV1dW1Ga+rq2OxMAAAMMTUchMSEqKkpCQ5nU7vmNvtltPpVFpamonJAABAV2X6bSmHw6Hs7GwlJycrJSVFhYWFam5uVk5OjiQpKytLsbGxys/Pl/TtIuQPPvjA++eDBw9qx44d6tWrl8477zzTzgMAAHQOppebzMxMHTp0SIsXL1Ztba0SExO1efNm7yLj6upqBQX9+wLTZ599pjFjxnhf33///br//vs1btw4vf7662c6PgAA6GRMLzeSNHv2bM2ePbvd9/6zsMTHx8vj8ZyBVAAAoCsy/Uv8AAAAAolyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALIVyAwAALKWb2QEAAD8sd955p9kRTqozZ4PvuHIDAAAshXIDAAAshXIDAAAshXIDAAAshXIDAAAshXIDAAAshXIDAAAshe+5AYAuYvc9r5od4ZSGL7zC7AiApE5SboqKivTHP/5RtbW1SkhI0MMPP6yUlJSTzn/22We1aNEi7d+/X0OGDNF9992n8ePHn8HEALqSe6Zfa3aEU1r41HqzIwCWYvptqdLSUjkcDuXl5amyslIJCQnKyMhQfX19u/O3b9+uqVOn6le/+pXee+89TZ48WZMnT1ZVVdUZTg4AADoj06/cFBQUaObMmcrJyZEkFRcXa9OmTSopKdGCBQtOmP/QQw/ppz/9qW699VZJ0l133aWtW7dq+fLlKi4uPqPZgVN547JxZkc4pXFvvuHTvOW/e+E0JzFu9gMTzY4AoBMytdy0traqoqJCubm53rGgoCClp6errKys3X3KysrkcDjajGVkZOivf/1ru/NbWlrU0tLifd3Y2ChJampq6mB6nE5XFV9ldoST2nrzVp/mNX/zzWlO0jG+/jfwVcux05zEOF/P4fjXX5/mJB3j63kcPd58mpN0jK/n8f//P7mz8fUcjh1zneYkHePrebiOdd5/p/7zHL577fF4vn9nj4kOHjzokeTZvn17m/Fbb73Vk5KS0u4+wcHBnrVr17YZKyoq8kRFRbU7Py8vzyOJjY2NjY2NzQJbTU3N9/YL029LnW65ubltrvS43W4dPnxYZ599tmw222n5mU1NTYqLi1NNTY3Cw8NPy884E6xwHlY4B4nz6EyscA6SNc7DCucgcR6+8ng8OnLkiAYMGPC9c00tN5GRkbLb7aqrq2szXldXp5iYmHb3iYmJ8Wt+aGioQkND24z16dPHeGg/hIeHd+l/Ub9jhfOwwjlInEdnYoVzkKxxHlY4B4nz8EVERIRP80z9tFRISIiSkpLkdDq9Y263W06nU2lpae3uk5aW1ma+JG3duvWk8wEAwA+L6belHA6HsrOzlZycrJSUFBUWFqq5udn76amsrCzFxsYqPz9fkjRnzhyNGzdODzzwgCZMmKB169bp3Xff1cqVK808DQAA0EmYXm4yMzN16NAhLV68WLW1tUpMTNTmzZsVHR0tSaqurlZQ0L8vMI0dO1Zr167VHXfcodtvv11DhgzRX//6V40cOdKsUzhBaGio8vLyTrgd1tVY4TyscA4S59GZWOEcJGuchxXOQeI8Tgebx+PLZ6oAAAC6BtO/oRgAACCQKDcAAMBSKDcAAMBSKDcAAMBSKDenQVFRkeLj4xUWFqbU1FSVl5ebHckvb775piZOnKgBAwbIZrOd9LldnVl+fr4uvPBC9e7dW1FRUZo8ebL27Nljdiy/rVixQqNHj/Z+KVZaWppefvlls2N1yL333iubzaa5c+eaHcUvd955p2w2W5tt2LBhZsfy28GDBzV9+nSdffbZ6t69u0aNGqV3333X7Fh+iY+PP+Gfhc1m06xZs8yO5heXy6VFixbpRz/6kbp3765zzz1Xd911l2/PTupEjhw5orlz52rQoEHq3r27xo4dq3feecfUTJSbACstLZXD4VBeXp4qKyuVkJCgjIwM1dfXmx3NZ83NzUpISFBRUZHZUQx74403NGvWLL311lvaunWrvv76a1199dVqbu68D4lrzznnnKN7771XFRUVevfdd3XFFVfoZz/7md5//32zoxnyzjvv6NFHH9Xo0aPNjmLI+eefr88//9y7bdu2zexIfvniiy908cUXKzg4WC+//LI++OADPfDAA+rbt6/Z0fzyzjvvtPnnsHXrtw+znTJlisnJ/HPfffdpxYoVWr58uXbv3q377rtPf/jDH/Twww+bHc0vN910k7Zu3ao1a9Zo165duvrqq5Wenq6DBw+aF8qH51vCDykpKZ5Zs2Z5X7tcLs+AAQM8+fn5JqYyTpLn+eefNztGh9XX13sked544w2zo3RY3759PY899pjZMfx25MgRz5AhQzxbt271jBs3zjNnzhyzI/klLy/Pk5CQYHaMDrnttts8l1xyidkxAm7OnDmec8891+N2u82O4pcJEyZ4ZsyY0Wbsv/7rvzzTpk0zKZH/jh075rHb7Z4XX3yxzfgFF1zgWbhwoUmpPB6u3ARQa2urKioqlJ6e7h0LCgpSenq6ysrKTEyGxsZGSdJZZ51lchLjXC6X1q1bp+bm5i75uJFZs2ZpwoQJbf776Gr+9a9/acCAARo8eLCmTZum6upqsyP5ZePGjUpOTtaUKVMUFRWlMWPGaNWqVWbH6pDW1lY99dRTmjFjxml7GPLpMnbsWDmdTn300UeSpJ07d2rbtm265pprTE7mu2+++UYul0thYWFtxrt3727qlU3Tv6HYShoaGuRyubzfrvyd6Ohoffjhhyalgtvt1ty5c3XxxRd3qm+y9tWuXbuUlpam48ePq1evXnr++ec1YsQIs2P5Zd26daqsrDT9PnxHpKam6oknntDQoUP1+eefa8mSJbr00ktVVVWl3r17mx3PJx9//LFWrFghh8Oh22+/Xe+8845++9vfKiQkRNnZ2WbHM+Svf/2rvvzyS914441mR/HbggUL1NTUpGHDhslut8vlcumee+7RtGnTzI7ms969eystLU133XWXhg8frujoaD399NMqKyvTeeedZ1ouyg0sb9asWaqqqupy6yO+M3ToUO3YsUONjY1av369srOz9cYbb3SZglNTU6M5c+Zo69atJ/ztriv5/3+bHj16tFJTUzVo0CA988wz+tWvfmViMt+53W4lJydr2bJlkqQxY8aoqqpKxcXFXbbcrF69Wtdcc40GDBhgdhS/PfPMM/rLX/6itWvX6vzzz9eOHTs0d+5cDRgwoEv981izZo1mzJih2NhY2e12XXDBBZo6daoqKipMy0S5CaDIyEjZ7XbV1dW1Ga+rq1NMTIxJqX7YZs+erRdffFFvvvmmzjnnHLPjGBISEuL9G1BSUpLeeecdPfTQQ3r00UdNTuabiooK1dfX64ILLvCOuVwuvfnmm1q+fLlaWlpkt9tNTGhMnz599OMf/1h79+41O4rP+vfvf0IpHj58uJ577jmTEnXMgQMH9Morr2jDhg1mRzHk1ltv1YIFC3T99ddLkkaNGqUDBw4oPz+/S5Wbc889V2+88Yaam5vV1NSk/v37KzMzU4MHDzYtE2tuAigkJERJSUlyOp3eMbfbLafT2SXXSHRlHo9Hs2fP1vPPP69XX31VP/rRj8yOFDBut1stLS1mx/DZlVdeqV27dmnHjh3eLTk5WdOmTdOOHTu6ZLGRpKNHj2rfvn3q37+/2VF8dvHFF5/wlQgfffSRBg0aZFKijnn88ccVFRWlCRMmmB3FkGPHjrV5MLQk2e12ud1ukxJ1TM+ePdW/f3998cUX2rJli372s5+ZloUrNwHmcDiUnZ2t5ORkpaSkqLCwUM3NzcrJyTE7ms+OHj3a5m+jn3zyiXbs2KGzzjpLAwcONDGZ72bNmqW1a9fqb3/7m3r37q3a2lpJUkREhLp3725yOt/l5ubqmmuu0cCBA3XkyBGtXbtWr7/+urZs2WJ2NJ/17t37hLVOPXv21Nlnn92l1kDNnz9fEydO1KBBg/TZZ58pLy9PdrtdU6dONTuaz+bNm6exY8dq2bJluu6661ReXq6VK1dq5cqVZkfzm9vt1uOPP67s7Gx169Y1f5VNnDhR99xzjwYOHKjzzz9f7733ngoKCjRjxgyzo/lly5Yt8ng8Gjp0qPbu3atbb71Vw4YNM/f3nmmf07Kwhx9+2DNw4EBPSEiIJyUlxfPWW2+ZHckvr732mkfSCVt2drbZ0XzWXn5Jnscff9zsaH6ZMWOGZ9CgQZ6QkBBPv379PFdeeaXn73//u9mxOqwrfhQ8MzPT079/f09ISIgnNjbWk5mZ6dm7d6/Zsfz2wgsveEaOHOkJDQ31DBs2zLNy5UqzIxmyZcsWjyTPnj17zI5iWFNTk2fOnDmegQMHesLCwjyDBw/2LFy40NPS0mJ2NL+UlpZ6Bg8e7AkJCfHExMR4Zs2a5fnyyy9NzWTzeLrYVyECAACcAmtuAACApVBuAACApVBuAACApVBuAACApVBuAACApVBuAACApVBuAACApVBuAACApVBuAHRal19+uebOnXvS9+Pj41VYWHjG8gDoGrrmAzkAQNI777yjnj17mh0DQCdDuQHQZfXr18/sCAA6IW5LAejUvvnmG82ePVsRERGKjIzUokWL9N0j8f7ztpTNZtNjjz2mn//85+rRo4eGDBmijRs3et//4osvNG3aNPXr10/du3fXkCFD9Pjjj5/pUwJwmlFuAHRqTz75pLp166by8nI99NBDKigo0GOPPXbS+UuWLNF1112nf/7znxo/frymTZumw4cPS5IWLVqkDz74QC+//LJ2796tFStWKDIy8kydCoAzhNtSADq1uLg4Pfjgg7LZbBo6dKh27dqlBx98UDNnzmx3/o033qipU6dKkpYtW6Y//elPKi8v109/+lNVV1drzJgxSk5OlvTtlR8A1sOVGwCd2kUXXSSbzeZ9nZaWpn/9619yuVztzh89erT3zz179lR4eLjq6+slSbfccovWrVunxMRE/f73v9f27dtPb3gApqDcALCU4ODgNq9tNpvcbrck6ZprrtGBAwc0b948ffbZZ7ryyis1f/58M2ICOI0oNwA6tbfffrvN67feektDhgyR3W43dLx+/fopOztbTz31lAoLC7Vy5cpAxATQibDmBkCnVl1dLYfDof/+7/9WZWWlHn74YT3wwAOGjrV48WIlJSXp/PPPV0tLi1588UUNHz48wIkBmI1yA6BTy8rK0ldffaWUlBTZ7XbNmTNHv/71rw0dKyQkRLm5udq/f7+6d++uSy+9VOvWrQtwYgBms3m++8IIAAAAC2DNDQAAsBTKDQAAsBTKDQAAsBTKDQAAsBTKDQAAsBTKDQAAsBTKDQAAsBTKDQAAsBTKDQAAsBTKDQAAsBTKDQAAsJT/AylzLaehRfT+AAAAAElFTkSuQmCC",
+ "text/plain": [
+ "