{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Import Data and Exploration "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from pandas_profiling import ProfileReport\n",
"import os\n",
"import sys\n",
"import plotly.express as px"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/joao_victor/netflix-recommendation-app\n"
]
}
],
"source": [
"# Set the working directory to the project root (the folder that contains data/input).\n",
"# The guard makes this cell idempotent: re-running it no longer climbs one more\n",
"# directory level each time, which would break every path built from `path` below.\n",
"if not os.path.isdir(os.path.join(os.getcwd(), \"data\", \"input\")):\n",
"    os.chdir(\"../\")  # presumably the notebook sits one level below the project root\n",
"path = os.getcwd()\n",
"print(path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" person_id \n",
" id \n",
" name \n",
" character \n",
" role \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 3748 \n",
" tm84618 \n",
" Robert De Niro \n",
" Travis Bickle \n",
" ACTOR \n",
" \n",
" \n",
" 1 \n",
" 14658 \n",
" tm84618 \n",
" Jodie Foster \n",
" Iris Steensma \n",
" ACTOR \n",
" \n",
" \n",
" 2 \n",
" 7064 \n",
" tm84618 \n",
" Albert Brooks \n",
" Tom \n",
" ACTOR \n",
" \n",
" \n",
" 3 \n",
" 3739 \n",
" tm84618 \n",
" Harvey Keitel \n",
" Matthew 'Sport' Higgins \n",
" ACTOR \n",
" \n",
" \n",
" 4 \n",
" 48933 \n",
" tm84618 \n",
" Cybill Shepherd \n",
" Betsy \n",
" ACTOR \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" person_id id name character role\n",
"0 3748 tm84618 Robert De Niro Travis Bickle ACTOR\n",
"1 14658 tm84618 Jodie Foster Iris Steensma ACTOR\n",
"2 7064 tm84618 Albert Brooks Tom ACTOR\n",
"3 3739 tm84618 Harvey Keitel Matthew 'Sport' Higgins ACTOR\n",
"4 48933 tm84618 Cybill Shepherd Betsy ACTOR"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the credits CSV from the project's data/input folder and preview it\n",
"\n",
"credits_csv = path + \"/data/input/credits.csv\"\n",
"df_credits = pd.read_csv(credits_csv)\n",
"df_credits.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 77213 entries, 0 to 77212\n",
"Data columns (total 5 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 person_id 77213 non-null int64 \n",
" 1 id 77213 non-null object\n",
" 2 name 77213 non-null object\n",
" 3 character 67586 non-null object\n",
" 4 role 77213 non-null object\n",
"dtypes: int64(1), object(4)\n",
"memory usage: 2.9+ MB\n"
]
}
],
"source": [
"# Column dtypes, non-null counts and memory usage of the credits table\n",
"df_credits.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" title \n",
" type \n",
" description \n",
" release_year \n",
" age_certification \n",
" runtime \n",
" genres \n",
" production_countries \n",
" seasons \n",
" imdb_id \n",
" imdb_score \n",
" imdb_votes \n",
" tmdb_popularity \n",
" tmdb_score \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" ts300399 \n",
" Five Came Back: The Reference Films \n",
" SHOW \n",
" This collection includes 12 World War II-era p... \n",
" 1945 \n",
" TV-MA \n",
" 48 \n",
" ['documentation'] \n",
" ['US'] \n",
" 1.0 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" 0.600 \n",
" NaN \n",
" \n",
" \n",
" 1 \n",
" tm84618 \n",
" Taxi Driver \n",
" MOVIE \n",
" A mentally unstable Vietnam War veteran works ... \n",
" 1976 \n",
" R \n",
" 113 \n",
" ['crime', 'drama'] \n",
" ['US'] \n",
" NaN \n",
" tt0075314 \n",
" 8.3 \n",
" 795222.0 \n",
" 27.612 \n",
" 8.2 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id title type \\\n",
"0 ts300399 Five Came Back: The Reference Films SHOW \n",
"1 tm84618 Taxi Driver MOVIE \n",
"\n",
" description release_year \\\n",
"0 This collection includes 12 World War II-era p... 1945 \n",
"1 A mentally unstable Vietnam War veteran works ... 1976 \n",
"\n",
" age_certification runtime genres production_countries \\\n",
"0 TV-MA 48 ['documentation'] ['US'] \n",
"1 R 113 ['crime', 'drama'] ['US'] \n",
"\n",
" seasons imdb_id imdb_score imdb_votes tmdb_popularity tmdb_score \n",
"0 1.0 NaN NaN NaN 0.600 NaN \n",
"1 NaN tt0075314 8.3 795222.0 27.612 8.2 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the titles CSV from the project's data/input folder and preview two rows\n",
"\n",
"titles_csv = path + \"/data/input/titles.csv\"\n",
"df_titles = pd.read_csv(titles_csv)\n",
"df_titles.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5806 entries, 0 to 5805\n",
"Data columns (total 15 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 id 5806 non-null object \n",
" 1 title 5805 non-null object \n",
" 2 type 5806 non-null object \n",
" 3 description 5788 non-null object \n",
" 4 release_year 5806 non-null int64 \n",
" 5 age_certification 3196 non-null object \n",
" 6 runtime 5806 non-null int64 \n",
" 7 genres 5806 non-null object \n",
" 8 production_countries 5806 non-null object \n",
" 9 seasons 2047 non-null float64\n",
" 10 imdb_id 5362 non-null object \n",
" 11 imdb_score 5283 non-null float64\n",
" 12 imdb_votes 5267 non-null float64\n",
" 13 tmdb_popularity 5712 non-null float64\n",
" 14 tmdb_score 5488 non-null float64\n",
"dtypes: float64(5), int64(2), object(8)\n",
"memory usage: 680.5+ KB\n"
]
}
],
"source": [
"# Column dtypes, non-null counts and memory usage of the titles table\n",
"df_titles.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# # Aggregate the name column per title id, separating the names with commas\n",
"\n",
"# df_credits_agg = df_credits.groupby('id')['name'].agg(','.join).to_frame().reset_index()\n",
"# df_credits_agg"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Validate the aggregation\n",
"\n",
"# df_credits.loc[df_credits['id'] == 'ts9794']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Merge the aggregated names from df_credits_agg into df_titles to get a single concatenated column with all credited names per title\n",
"\n",
"# df_merged_titles = pd.merge(df_titles, df_credits_agg, left_on = \"id\", right_on = \"id\")\n",
"# df_merged_titles"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Nulls Handling"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"seasons 3759\n",
"age_certification 2610\n",
"imdb_votes 539\n",
"imdb_score 523\n",
"imdb_id 444\n",
"tmdb_score 318\n",
"tmdb_popularity 94\n",
"description 18\n",
"title 1\n",
"id 0\n",
"type 0\n",
"release_year 0\n",
"runtime 0\n",
"genres 0\n",
"production_countries 0\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count missing values per column, most affected columns first\n",
"\n",
"df_titles.isna().sum().sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan])"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check whether every row with 'type' == 'MOVIE' has only null values in the seasons column\n",
"\n",
"movie_rows = df_titles[\"type\"] == \"MOVIE\"\n",
"df_titles.loc[movie_rows, \"seasons\"].unique()\n",
"\n",
"# Movies only ever hold null in the seasons column, so we'll replace those nulls with zeros"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" title \n",
" type \n",
" description \n",
" release_year \n",
" age_certification \n",
" runtime \n",
" genres \n",
" production_countries \n",
" seasons \n",
" imdb_id \n",
" imdb_score \n",
" imdb_votes \n",
" tmdb_popularity \n",
" tmdb_score \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" ts300399 \n",
" Five Came Back: The Reference Films \n",
" SHOW \n",
" This collection includes 12 World War II-era p... \n",
" 1945 \n",
" TV-MA \n",
" 48 \n",
" ['documentation'] \n",
" ['US'] \n",
" 1.0 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" 0.600 \n",
" NaN \n",
" \n",
" \n",
" 1 \n",
" tm84618 \n",
" Taxi Driver \n",
" MOVIE \n",
" A mentally unstable Vietnam War veteran works ... \n",
" 1976 \n",
" R \n",
" 113 \n",
" ['crime', 'drama'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0075314 \n",
" 8.3 \n",
" 795222.0 \n",
" 27.612 \n",
" 8.2 \n",
" \n",
" \n",
" 2 \n",
" tm127384 \n",
" Monty Python and the Holy Grail \n",
" MOVIE \n",
" King Arthur, accompanied by his squire, recrui... \n",
" 1975 \n",
" PG \n",
" 91 \n",
" ['comedy', 'fantasy'] \n",
" ['GB'] \n",
" 0.0 \n",
" tt0071853 \n",
" 8.2 \n",
" 530877.0 \n",
" 18.216 \n",
" 7.8 \n",
" \n",
" \n",
" 3 \n",
" tm70993 \n",
" Life of Brian \n",
" MOVIE \n",
" Brian Cohen is an average young Jewish man, bu... \n",
" 1979 \n",
" R \n",
" 94 \n",
" ['comedy'] \n",
" ['GB'] \n",
" 0.0 \n",
" tt0079470 \n",
" 8.0 \n",
" 392419.0 \n",
" 17.505 \n",
" 7.8 \n",
" \n",
" \n",
" 4 \n",
" tm190788 \n",
" The Exorcist \n",
" MOVIE \n",
" 12-year-old Regan MacNeil begins to adapt an e... \n",
" 1973 \n",
" R \n",
" 133 \n",
" ['horror'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0070047 \n",
" 8.1 \n",
" 391942.0 \n",
" 95.337 \n",
" 7.7 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 5801 \n",
" tm1014599 \n",
" Fine Wine \n",
" MOVIE \n",
" A beautiful love story that can happen between... \n",
" 2021 \n",
" no_certification \n",
" 100 \n",
" ['romance', 'drama'] \n",
" ['NG'] \n",
" 0.0 \n",
" tt13857480 \n",
" 6.9 \n",
" 39.0 \n",
" 0.966 \n",
" NaN \n",
" \n",
" \n",
" 5802 \n",
" tm1108171 \n",
" Edis Starlight \n",
" MOVIE \n",
" Rising star Edis's career journey with ups and... \n",
" 2021 \n",
" no_certification \n",
" 74 \n",
" ['music', 'documentation'] \n",
" [] \n",
" 0.0 \n",
" NaN \n",
" NaN \n",
" NaN \n",
" 1.036 \n",
" 8.5 \n",
" \n",
" \n",
" 5803 \n",
" tm1045018 \n",
" Clash \n",
" MOVIE \n",
" A man from Nigeria returns to his family in Ca... \n",
" 2021 \n",
" no_certification \n",
" 88 \n",
" ['family', 'drama'] \n",
" ['NG', 'CA'] \n",
" 0.0 \n",
" tt14620732 \n",
" 6.5 \n",
" 32.0 \n",
" 0.709 \n",
" NaN \n",
" \n",
" \n",
" 5804 \n",
" tm1098060 \n",
" Shadow Parties \n",
" MOVIE \n",
" A family faces destruction in a long-running c... \n",
" 2021 \n",
" no_certification \n",
" 116 \n",
" ['action', 'thriller'] \n",
" [] \n",
" 0.0 \n",
" tt10168094 \n",
" 6.2 \n",
" 9.0 \n",
" 2.186 \n",
" NaN \n",
" \n",
" \n",
" 5805 \n",
" ts271048 \n",
" Mighty Little Bheem: Kite Festival \n",
" SHOW \n",
" With winter behind them, Bheem and his townspe... \n",
" 2021 \n",
" no_certification \n",
" 0 \n",
" ['family', 'comedy', 'animation'] \n",
" [] \n",
" 1.0 \n",
" tt13711094 \n",
" 8.8 \n",
" 16.0 \n",
" 0.979 \n",
" 10.0 \n",
" \n",
" \n",
"
\n",
"
5806 rows × 15 columns
\n",
"
"
],
"text/plain": [
" id title type \\\n",
"0 ts300399 Five Came Back: The Reference Films SHOW \n",
"1 tm84618 Taxi Driver MOVIE \n",
"2 tm127384 Monty Python and the Holy Grail MOVIE \n",
"3 tm70993 Life of Brian MOVIE \n",
"4 tm190788 The Exorcist MOVIE \n",
"... ... ... ... \n",
"5801 tm1014599 Fine Wine MOVIE \n",
"5802 tm1108171 Edis Starlight MOVIE \n",
"5803 tm1045018 Clash MOVIE \n",
"5804 tm1098060 Shadow Parties MOVIE \n",
"5805 ts271048 Mighty Little Bheem: Kite Festival SHOW \n",
"\n",
" description release_year \\\n",
"0 This collection includes 12 World War II-era p... 1945 \n",
"1 A mentally unstable Vietnam War veteran works ... 1976 \n",
"2 King Arthur, accompanied by his squire, recrui... 1975 \n",
"3 Brian Cohen is an average young Jewish man, bu... 1979 \n",
"4 12-year-old Regan MacNeil begins to adapt an e... 1973 \n",
"... ... ... \n",
"5801 A beautiful love story that can happen between... 2021 \n",
"5802 Rising star Edis's career journey with ups and... 2021 \n",
"5803 A man from Nigeria returns to his family in Ca... 2021 \n",
"5804 A family faces destruction in a long-running c... 2021 \n",
"5805 With winter behind them, Bheem and his townspe... 2021 \n",
"\n",
" age_certification runtime genres \\\n",
"0 TV-MA 48 ['documentation'] \n",
"1 R 113 ['crime', 'drama'] \n",
"2 PG 91 ['comedy', 'fantasy'] \n",
"3 R 94 ['comedy'] \n",
"4 R 133 ['horror'] \n",
"... ... ... ... \n",
"5801 no_certification 100 ['romance', 'drama'] \n",
"5802 no_certification 74 ['music', 'documentation'] \n",
"5803 no_certification 88 ['family', 'drama'] \n",
"5804 no_certification 116 ['action', 'thriller'] \n",
"5805 no_certification 0 ['family', 'comedy', 'animation'] \n",
"\n",
" production_countries seasons imdb_id imdb_score imdb_votes \\\n",
"0 ['US'] 1.0 NaN NaN NaN \n",
"1 ['US'] 0.0 tt0075314 8.3 795222.0 \n",
"2 ['GB'] 0.0 tt0071853 8.2 530877.0 \n",
"3 ['GB'] 0.0 tt0079470 8.0 392419.0 \n",
"4 ['US'] 0.0 tt0070047 8.1 391942.0 \n",
"... ... ... ... ... ... \n",
"5801 ['NG'] 0.0 tt13857480 6.9 39.0 \n",
"5802 [] 0.0 NaN NaN NaN \n",
"5803 ['NG', 'CA'] 0.0 tt14620732 6.5 32.0 \n",
"5804 [] 0.0 tt10168094 6.2 9.0 \n",
"5805 [] 1.0 tt13711094 8.8 16.0 \n",
"\n",
" tmdb_popularity tmdb_score \n",
"0 0.600 NaN \n",
"1 27.612 8.2 \n",
"2 18.216 7.8 \n",
"3 17.505 7.8 \n",
"4 95.337 7.7 \n",
"... ... ... \n",
"5801 0.966 NaN \n",
"5802 1.036 8.5 \n",
"5803 0.709 NaN \n",
"5804 2.186 NaN \n",
"5805 0.979 10.0 \n",
"\n",
"[5806 rows x 15 columns]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Define function to replace null values\n",
"\n",
"\n",
"def replace_nulls(df: pd.DataFrame, columns: list, value: object) -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Replace null values in the given column(s) with ``value``, modifying ``df`` in place.\n",
"\n",
"    Args:\n",
"        df (pd.DataFrame): Dataframe to modify\n",
"        columns (list): Column name(s) to fill; a single name passed as a string also works\n",
"        value: Replacement for the nulls (a number or a string)\n",
"\n",
"    Returns:\n",
"        pd.DataFrame: The same dataframe object that was passed in\n",
"    \"\"\"\n",
"    if isinstance(columns, str):\n",
"        columns = [columns]  # otherwise the loop would iterate over single characters\n",
"    for col in columns:\n",
"        # Assign the filled column back instead of calling fillna(inplace=True) on\n",
"        # df[col]: that chained in-place call acts on an intermediate object, triggers a\n",
"        # FutureWarning on recent pandas and leaves df untouched under Copy-on-Write.\n",
"        df[col] = df[col].fillna(value)\n",
"    return df\n",
"\n",
"\n",
"# Titles without a season count get 0 seasons; titles without a rating get an explicit label\n",
"replace_nulls(df_titles, [\"seasons\"], 0)\n",
"# Preview the first rows only instead of rendering the whole dataframe as cell output\n",
"replace_nulls(df_titles, [\"age_certification\"], \"no_certification\").head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tmdb_score 318\n",
"imdb_score 523\n",
"dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The 'tmdb_score' and 'imdb_score' columns have nulls in different numbers of rows\n",
"\n",
"score_columns = [\"tmdb_score\", \"imdb_score\"]\n",
"df_titles[score_columns].isna().sum()\n",
"\n",
"# So we can replace these null values with the column mean"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
" warnings.warn(msg, FutureWarning)\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Check whether the imdb_score column is roughly normally distributed\n",
"\n",
"# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay on a\n",
"# density scale draws the equivalent figure (cut=3 mirrors distplot's KDE extent).\n",
"imdb_scores = df_titles[\"imdb_score\"].dropna()\n",
"sns.histplot(imdb_scores, kde=True, stat=\"density\", kde_kws=dict(cut=3))\n",
"plt.xlabel(\"Imdb Score\")\n",
"plt.ylabel(\"Frequencia\")\n",
"\n",
"plt.axvline(x=imdb_scores.mean(), color=\"red\", label=\"Média\")  # mean\n",
"plt.axvline(x=imdb_scores.median(), color=\"blue\", label=\"Mediana\")  # median\n",
"\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/seaborn/distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).\n",
" warnings.warn(msg, FutureWarning)\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Check whether the tmdb_score column is roughly normally distributed\n",
"\n",
"# sns.distplot is deprecated and scheduled for removal; histplot with a KDE overlay on a\n",
"# density scale draws the equivalent figure (cut=3 mirrors distplot's KDE extent).\n",
"tmdb_scores = df_titles[\"tmdb_score\"].dropna()\n",
"sns.histplot(tmdb_scores, kde=True, stat=\"density\", kde_kws=dict(cut=3))\n",
"plt.xlabel(\"Tmdb Score\")\n",
"plt.ylabel(\"Frequencia\")\n",
"\n",
"plt.axvline(x=tmdb_scores.mean(), color=\"red\", label=\"Média\")  # mean\n",
"plt.axvline(x=tmdb_scores.median(), color=\"blue\", label=\"Mediana\")  # median\n",
"\n",
"plt.legend()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 5283.000000\n",
"mean 6.533447\n",
"std 1.160932\n",
"min 1.500000\n",
"25% 5.800000\n",
"50% 6.600000\n",
"75% 7.400000\n",
"max 9.600000\n",
"Name: imdb_score, dtype: float64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We can also use the describe function to see the mean and median values\n",
"\n",
"df_titles[\"imdb_score\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# These columns are on the same number scale (same min and max values), thus we can create a column for universal score, using tmdb score whenever there is no imdb score\n",
"\n",
"# df_titles[['tmdb_score', 'imdb_score']].describe()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" column_name \n",
" percent_missing \n",
" \n",
" \n",
" \n",
" \n",
" id \n",
" id \n",
" 0.000000 \n",
" \n",
" \n",
" title \n",
" title \n",
" 0.017224 \n",
" \n",
" \n",
" type \n",
" type \n",
" 0.000000 \n",
" \n",
" \n",
" description \n",
" description \n",
" 0.310024 \n",
" \n",
" \n",
" release_year \n",
" release_year \n",
" 0.000000 \n",
" \n",
" \n",
" age_certification \n",
" age_certification \n",
" 0.000000 \n",
" \n",
" \n",
" runtime \n",
" runtime \n",
" 0.000000 \n",
" \n",
" \n",
" genres \n",
" genres \n",
" 0.000000 \n",
" \n",
" \n",
" production_countries \n",
" production_countries \n",
" 0.000000 \n",
" \n",
" \n",
" seasons \n",
" seasons \n",
" 0.000000 \n",
" \n",
" \n",
" imdb_id \n",
" imdb_id \n",
" 7.647261 \n",
" \n",
" \n",
" imdb_score \n",
" imdb_score \n",
" 9.007923 \n",
" \n",
" \n",
" imdb_votes \n",
" imdb_votes \n",
" 9.283500 \n",
" \n",
" \n",
" tmdb_popularity \n",
" tmdb_popularity \n",
" 1.619015 \n",
" \n",
" \n",
" tmdb_score \n",
" tmdb_score \n",
" 5.477093 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" column_name percent_missing\n",
"id id 0.000000\n",
"title title 0.017224\n",
"type type 0.000000\n",
"description description 0.310024\n",
"release_year release_year 0.000000\n",
"age_certification age_certification 0.000000\n",
"runtime runtime 0.000000\n",
"genres genres 0.000000\n",
"production_countries production_countries 0.000000\n",
"seasons seasons 0.000000\n",
"imdb_id imdb_id 7.647261\n",
"imdb_score imdb_score 9.007923\n",
"imdb_votes imdb_votes 9.283500\n",
"tmdb_popularity tmdb_popularity 1.619015\n",
"tmdb_score tmdb_score 5.477093"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Percentage of null values per column of the titles table\n",
"\n",
"df_missing = df_titles[[\"tmdb_score\", \"imdb_score\"]]\n",
"total_rows = len(df_titles)\n",
"null_counts = df_titles.isnull().sum()\n",
"percent_missing = null_counts * 100 / total_rows\n",
"missing_value_df = pd.DataFrame(\n",
"    dict(column_name=df_titles.columns, percent_missing=percent_missing)\n",
")\n",
"missing_value_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use \"substitute with the mean\" strategy "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" title \n",
" type \n",
" description \n",
" release_year \n",
" age_certification \n",
" runtime \n",
" genres \n",
" production_countries \n",
" seasons \n",
" imdb_id \n",
" imdb_score \n",
" imdb_votes \n",
" tmdb_popularity \n",
" tmdb_score \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" ts300399 \n",
" Five Came Back: The Reference Films \n",
" SHOW \n",
" This collection includes 12 World War II-era p... \n",
" 1945 \n",
" TV-MA \n",
" 48 \n",
" ['documentation'] \n",
" ['US'] \n",
" 1.0 \n",
" NaN \n",
" 6.533447 \n",
" NaN \n",
" 0.600 \n",
" 6.818039 \n",
" \n",
" \n",
" 1 \n",
" tm84618 \n",
" Taxi Driver \n",
" MOVIE \n",
" A mentally unstable Vietnam War veteran works ... \n",
" 1976 \n",
" R \n",
" 113 \n",
" ['crime', 'drama'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0075314 \n",
" 8.300000 \n",
" 795222.0 \n",
" 27.612 \n",
" 8.200000 \n",
" \n",
" \n",
" 2 \n",
" tm127384 \n",
" Monty Python and the Holy Grail \n",
" MOVIE \n",
" King Arthur, accompanied by his squire, recrui... \n",
" 1975 \n",
" PG \n",
" 91 \n",
" ['comedy', 'fantasy'] \n",
" ['GB'] \n",
" 0.0 \n",
" tt0071853 \n",
" 8.200000 \n",
" 530877.0 \n",
" 18.216 \n",
" 7.800000 \n",
" \n",
" \n",
" 3 \n",
" tm70993 \n",
" Life of Brian \n",
" MOVIE \n",
" Brian Cohen is an average young Jewish man, bu... \n",
" 1979 \n",
" R \n",
" 94 \n",
" ['comedy'] \n",
" ['GB'] \n",
" 0.0 \n",
" tt0079470 \n",
" 8.000000 \n",
" 392419.0 \n",
" 17.505 \n",
" 7.800000 \n",
" \n",
" \n",
" 4 \n",
" tm190788 \n",
" The Exorcist \n",
" MOVIE \n",
" 12-year-old Regan MacNeil begins to adapt an e... \n",
" 1973 \n",
" R \n",
" 133 \n",
" ['horror'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0070047 \n",
" 8.100000 \n",
" 391942.0 \n",
" 95.337 \n",
" 7.700000 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 5801 \n",
" tm1014599 \n",
" Fine Wine \n",
" MOVIE \n",
" A beautiful love story that can happen between... \n",
" 2021 \n",
" no_certification \n",
" 100 \n",
" ['romance', 'drama'] \n",
" ['NG'] \n",
" 0.0 \n",
" tt13857480 \n",
" 6.900000 \n",
" 39.0 \n",
" 0.966 \n",
" 6.818039 \n",
" \n",
" \n",
" 5802 \n",
" tm1108171 \n",
" Edis Starlight \n",
" MOVIE \n",
" Rising star Edis's career journey with ups and... \n",
" 2021 \n",
" no_certification \n",
" 74 \n",
" ['music', 'documentation'] \n",
" [] \n",
" 0.0 \n",
" NaN \n",
" 6.533447 \n",
" NaN \n",
" 1.036 \n",
" 8.500000 \n",
" \n",
" \n",
" 5803 \n",
" tm1045018 \n",
" Clash \n",
" MOVIE \n",
" A man from Nigeria returns to his family in Ca... \n",
" 2021 \n",
" no_certification \n",
" 88 \n",
" ['family', 'drama'] \n",
" ['NG', 'CA'] \n",
" 0.0 \n",
" tt14620732 \n",
" 6.500000 \n",
" 32.0 \n",
" 0.709 \n",
" 6.818039 \n",
" \n",
" \n",
" 5804 \n",
" tm1098060 \n",
" Shadow Parties \n",
" MOVIE \n",
" A family faces destruction in a long-running c... \n",
" 2021 \n",
" no_certification \n",
" 116 \n",
" ['action', 'thriller'] \n",
" [] \n",
" 0.0 \n",
" tt10168094 \n",
" 6.200000 \n",
" 9.0 \n",
" 2.186 \n",
" 6.818039 \n",
" \n",
" \n",
" 5805 \n",
" ts271048 \n",
" Mighty Little Bheem: Kite Festival \n",
" SHOW \n",
" With winter behind them, Bheem and his townspe... \n",
" 2021 \n",
" no_certification \n",
" 0 \n",
" ['family', 'comedy', 'animation'] \n",
" [] \n",
" 1.0 \n",
" tt13711094 \n",
" 8.800000 \n",
" 16.0 \n",
" 0.979 \n",
" 10.000000 \n",
" \n",
" \n",
"
\n",
"
5806 rows × 15 columns
\n",
"
"
],
"text/plain": [
" id title type \\\n",
"0 ts300399 Five Came Back: The Reference Films SHOW \n",
"1 tm84618 Taxi Driver MOVIE \n",
"2 tm127384 Monty Python and the Holy Grail MOVIE \n",
"3 tm70993 Life of Brian MOVIE \n",
"4 tm190788 The Exorcist MOVIE \n",
"... ... ... ... \n",
"5801 tm1014599 Fine Wine MOVIE \n",
"5802 tm1108171 Edis Starlight MOVIE \n",
"5803 tm1045018 Clash MOVIE \n",
"5804 tm1098060 Shadow Parties MOVIE \n",
"5805 ts271048 Mighty Little Bheem: Kite Festival SHOW \n",
"\n",
" description release_year \\\n",
"0 This collection includes 12 World War II-era p... 1945 \n",
"1 A mentally unstable Vietnam War veteran works ... 1976 \n",
"2 King Arthur, accompanied by his squire, recrui... 1975 \n",
"3 Brian Cohen is an average young Jewish man, bu... 1979 \n",
"4 12-year-old Regan MacNeil begins to adapt an e... 1973 \n",
"... ... ... \n",
"5801 A beautiful love story that can happen between... 2021 \n",
"5802 Rising star Edis's career journey with ups and... 2021 \n",
"5803 A man from Nigeria returns to his family in Ca... 2021 \n",
"5804 A family faces destruction in a long-running c... 2021 \n",
"5805 With winter behind them, Bheem and his townspe... 2021 \n",
"\n",
" age_certification runtime genres \\\n",
"0 TV-MA 48 ['documentation'] \n",
"1 R 113 ['crime', 'drama'] \n",
"2 PG 91 ['comedy', 'fantasy'] \n",
"3 R 94 ['comedy'] \n",
"4 R 133 ['horror'] \n",
"... ... ... ... \n",
"5801 no_certification 100 ['romance', 'drama'] \n",
"5802 no_certification 74 ['music', 'documentation'] \n",
"5803 no_certification 88 ['family', 'drama'] \n",
"5804 no_certification 116 ['action', 'thriller'] \n",
"5805 no_certification 0 ['family', 'comedy', 'animation'] \n",
"\n",
" production_countries seasons imdb_id imdb_score imdb_votes \\\n",
"0 ['US'] 1.0 NaN 6.533447 NaN \n",
"1 ['US'] 0.0 tt0075314 8.300000 795222.0 \n",
"2 ['GB'] 0.0 tt0071853 8.200000 530877.0 \n",
"3 ['GB'] 0.0 tt0079470 8.000000 392419.0 \n",
"4 ['US'] 0.0 tt0070047 8.100000 391942.0 \n",
"... ... ... ... ... ... \n",
"5801 ['NG'] 0.0 tt13857480 6.900000 39.0 \n",
"5802 [] 0.0 NaN 6.533447 NaN \n",
"5803 ['NG', 'CA'] 0.0 tt14620732 6.500000 32.0 \n",
"5804 [] 0.0 tt10168094 6.200000 9.0 \n",
"5805 [] 1.0 tt13711094 8.800000 16.0 \n",
"\n",
" tmdb_popularity tmdb_score \n",
"0 0.600 6.818039 \n",
"1 27.612 8.200000 \n",
"2 18.216 7.800000 \n",
"3 17.505 7.800000 \n",
"4 95.337 7.700000 \n",
"... ... ... \n",
"5801 0.966 6.818039 \n",
"5802 1.036 8.500000 \n",
"5803 0.709 6.818039 \n",
"5804 2.186 6.818039 \n",
"5805 0.979 10.000000 \n",
"\n",
"[5806 rows x 15 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Define function to replace null values for column mean\n",
"\n",
"\n",
"def null_for_mean(df: pd.DataFrame, columns: list) -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Replace null values in each given column with that column's mean, modifying ``df`` in place.\n",
"\n",
"    Args:\n",
"        df (pd.DataFrame): Dataframe to modify\n",
"        columns (list): Numeric column name(s) to fill; a single name passed as a string also works\n",
"\n",
"    Returns:\n",
"        pd.DataFrame: The same dataframe object that was passed in\n",
"    \"\"\"\n",
"    if isinstance(columns, str):\n",
"        columns = [columns]  # otherwise the loop would iterate over single characters\n",
"    for col in columns:\n",
"        # mean() skips nulls by default, so it is the mean of the observed values.\n",
"        # Assign the result back instead of calling fillna(inplace=True) on df[col]: that\n",
"        # chained in-place call acts on an intermediate object, triggers a FutureWarning on\n",
"        # recent pandas and leaves df untouched under Copy-on-Write.\n",
"        df[col] = df[col].fillna(df[col].mean())\n",
"    return df\n",
"\n",
"\n",
"# Replace nulls with the column mean in the 'imdb_score' and 'tmdb_score' columns;\n",
"# preview the first rows only instead of rendering the whole dataframe as cell output\n",
"null_for_mean(df_titles, [\"imdb_score\", \"tmdb_score\"]).head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tmdb_score 0\n",
"imdb_score 0\n",
"dtype: int64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confirm that no nulls are left in the two score columns after the mean fill\n",
"\n",
"filled_columns = [\"tmdb_score\", \"imdb_score\"]\n",
"df_titles[filled_columns].isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"imdb_votes 539\n",
"imdb_id 444\n",
"tmdb_popularity 94\n",
"description 18\n",
"title 1\n",
"id 0\n",
"type 0\n",
"release_year 0\n",
"age_certification 0\n",
"runtime 0\n",
"genres 0\n",
"production_countries 0\n",
"seasons 0\n",
"imdb_score 0\n",
"tmdb_score 0\n",
"dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the columns that still contain null values\n",
"\n",
"df_titles.isna().sum().sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_8096/1949421916.py:3: FutureWarning: In a future version of pandas all arguments of DataFrame.dropna will be keyword-only.\n",
" df_titles.dropna(0, subset=[\"description\", \"title\", \"tmdb_popularity\"], inplace=True)\n"
]
}
],
"source": [
"# Remove rows with null values in description, title and tmdb_popularity columns\n",
"\n",
"# axis is passed by keyword: positional arguments to DataFrame.dropna were deprecated\n",
"# and are rejected by pandas 2.x\n",
"df_titles.dropna(\n",
"    axis=0, subset=[\"description\", \"title\", \"tmdb_popularity\"], inplace=True\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"imdb_votes 516\n",
"imdb_id 431\n",
"id 0\n",
"title 0\n",
"type 0\n",
"description 0\n",
"release_year 0\n",
"age_certification 0\n",
"runtime 0\n",
"genres 0\n",
"production_countries 0\n",
"seasons 0\n",
"imdb_score 0\n",
"tmdb_popularity 0\n",
"tmdb_score 0\n",
"dtype: int64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Null counts per column after dropping the incomplete rows\n",
"df_titles.isna().sum().sort_values(ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# Extracting the first value from genres array\n",
"\n",
"genres = {}  # item -> number of occurrences seen by transform_genres\n",
"\n",
"\n",
"def transform_genres(row, counts=None):\n",
"    \"\"\"\n",
"    Return the first element of a stringified list such as ``['crime', 'drama']``.\n",
"\n",
"    Every non-empty element found in ``row`` is also tallied in ``counts``.\n",
"\n",
"    Args:\n",
"        row: Cell value holding the string form of a list of quoted items\n",
"        counts (dict, optional): Tally to update. Defaults to the module-level ``genres``\n",
"            dict; pass a separate dict when transforming another column so that its\n",
"            values are not mixed into ``genres``\n",
"\n",
"    Returns:\n",
"        str: The first item, or 'none' when the list has no items\n",
"    \"\"\"\n",
"    if counts is None:\n",
"        counts = genres\n",
"\n",
"    # Drop the surrounding brackets, split on commas, then strip the quotes of each item\n",
"    items = [item.strip()[1:-1] for item in str(row)[1:-1].split(\",\")]\n",
"    items = [item for item in items if item]  # '[]' yields one empty item; ignore it\n",
"\n",
"    for item in items:\n",
"        # Count the first occurrence as 1 (it used to be recorded as 0)\n",
"        counts[item] = counts.get(item, 0) + 1\n",
"\n",
"    return items[0] if items else \"none\"\n",
"\n",
"\n",
"# Add one *_transformed column per list-like column, holding its first element\n",
"for list_column in (\"genres\", \"production_countries\"):\n",
"    df_titles[list_column + \"_transformed\"] = df_titles[list_column].map(\n",
"        transform_genres\n",
"    )\n",
"\n",
"# another way\n",
"# import ast\n",
"# df_titles['new_col'] = df_titles['genres'].apply(ast.literal_eval).str[0]\n",
"# df_titles.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Exploratory Analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since we will build a recommendation app after the exploration, we need to analyze the data and look for the most promising features."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" title \n",
" type \n",
" description \n",
" release_year \n",
" age_certification \n",
" runtime \n",
" genres \n",
" production_countries \n",
" seasons \n",
" imdb_id \n",
" imdb_score \n",
" imdb_votes \n",
" tmdb_popularity \n",
" tmdb_score \n",
" genres_transformed \n",
" production_countries_transformed \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" ts300399 \n",
" Five Came Back: The Reference Films \n",
" SHOW \n",
" This collection includes 12 World War II-era p... \n",
" 1945 \n",
" TV-MA \n",
" 48 \n",
" ['documentation'] \n",
" ['US'] \n",
" 1.0 \n",
" NaN \n",
" 6.533447 \n",
" NaN \n",
" 0.600 \n",
" 6.818039 \n",
" documentation \n",
" US \n",
" \n",
" \n",
" 1 \n",
" tm84618 \n",
" Taxi Driver \n",
" MOVIE \n",
" A mentally unstable Vietnam War veteran works ... \n",
" 1976 \n",
" R \n",
" 113 \n",
" ['crime', 'drama'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0075314 \n",
" 8.300000 \n",
" 795222.0 \n",
" 27.612 \n",
" 8.200000 \n",
" crime \n",
" US \n",
" \n",
" \n",
" 2 \n",
" tm127384 \n",
" Monty Python and the Holy Grail \n",
" MOVIE \n",
" King Arthur, accompanied by his squire, recrui... \n",
" 1975 \n",
" PG \n",
" 91 \n",
" ['comedy', 'fantasy'] \n",
" ['GB'] \n",
" 0.0 \n",
" tt0071853 \n",
" 8.200000 \n",
" 530877.0 \n",
" 18.216 \n",
" 7.800000 \n",
" comedy \n",
" GB \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id title type \\\n",
"0 ts300399 Five Came Back: The Reference Films SHOW \n",
"1 tm84618 Taxi Driver MOVIE \n",
"2 tm127384 Monty Python and the Holy Grail MOVIE \n",
"\n",
" description release_year \\\n",
"0 This collection includes 12 World War II-era p... 1945 \n",
"1 A mentally unstable Vietnam War veteran works ... 1976 \n",
"2 King Arthur, accompanied by his squire, recrui... 1975 \n",
"\n",
" age_certification runtime genres production_countries \\\n",
"0 TV-MA 48 ['documentation'] ['US'] \n",
"1 R 113 ['crime', 'drama'] ['US'] \n",
"2 PG 91 ['comedy', 'fantasy'] ['GB'] \n",
"\n",
" seasons imdb_id imdb_score imdb_votes tmdb_popularity tmdb_score \\\n",
"0 1.0 NaN 6.533447 NaN 0.600 6.818039 \n",
"1 0.0 tt0075314 8.300000 795222.0 27.612 8.200000 \n",
"2 0.0 tt0071853 8.200000 530877.0 18.216 7.800000 \n",
"\n",
" genres_transformed production_countries_transformed \n",
"0 documentation US \n",
"1 crime US \n",
"2 comedy GB "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titles.head(3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# Check correlation\n",
"\n",
"sns.heatmap(df_titles.corr(), cmap=\"Blues\", annot=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Analyzing Age Certification\n",
"\n",
"- **G:** General audiences – All ages admitted.\n",
"- **PG:** Parental Guidance Suggested.\n",
"- **PG-13:** Parents Strongly Cautioned.\n",
"- **TV-MA:** Adults Only.\n",
"- **TV-G:** Suitable for All Ages.\n",
"- **TV-Y:** Appropriate for All Children.\n",
"- **TV-Y7:** Designed for Children Age 7 and Above.\n",
"- **TV-14** Parental Guidance Suggested for Children Under 14 Year of Age.\n",
"- **TV-PG:** Parental Guidance Suggested.\n",
"- **NC-17** Adults Only.\n",
"- **R** Restricted. "
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_8096/3813717038.py:2: FutureWarning: Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.\n",
" df_titles.groupby([\"age_certification\"])[\"imdb_score\", \"tmdb_score\"]\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"data = (\n",
" df_titles.groupby([\"age_certification\"])[\"imdb_score\", \"tmdb_score\"]\n",
" .mean()\n",
" .reset_index()\n",
")\n",
"plt.figure(figsize=(9, 6))\n",
"\n",
"sns.lineplot(data=data, x=\"age_certification\", y=\"tmdb_score\")\n",
"sns.lineplot(data=data, x=\"age_certification\", y=\"imdb_score\")\n",
"\n",
"# plt.grid()\n",
"plt.legend(labels=[\"Imdb Score\", \"Tmdb Score\"])\n",
"plt.xticks(rotation=45)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"fig, ax1 = plt.subplots(figsize=(10, 5))\n",
"\n",
"data_score = df_titles.groupby([\"age_certification\"])[\"tmdb_score\"].mean().reset_index()\n",
"data_popularity = (\n",
" df_titles.groupby([\"age_certification\"])[\"tmdb_popularity\"].mean().reset_index()\n",
")\n",
"\n",
"ax1 = sns.lineplot(\n",
" data=data_score,\n",
" x=\"age_certification\",\n",
" y=\"tmdb_score\",\n",
" color=\"red\",\n",
" ax=ax1,\n",
" label=\"Tmdb Score\",\n",
" marker=\"o\",\n",
")\n",
"\n",
"ax2 = ax1.twinx()\n",
"ax2 = sns.lineplot(\n",
" data=data_popularity,\n",
" x=\"age_certification\",\n",
" y=\"tmdb_popularity\",\n",
" ax=ax2,\n",
" label=\"Tmdb Popularity\",\n",
" marker=\"o\",\n",
")\n",
"\n",
"# Show two lines legends\n",
"\n",
"lines_1, labels_1 = ax1.get_legend_handles_labels()\n",
"lines_2, labels_2 = ax2.get_legend_handles_labels()\n",
"\n",
"lines = lines_1 + lines_2\n",
"labels = labels_1 + labels_2\n",
"\n",
"ax1.legend(lines, labels, loc=0)\n",
"ax2.get_legend().remove()\n",
"\n",
"plt.xticks(rotation=90)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['US', 'IN', 'GB', 'JP', 'KR']"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_top_country = (\n",
" df_titles.groupby([\"production_countries_transformed\"])[\"id\"]\n",
" .count()\n",
" .reset_index()\n",
" .sort_values(by=[\"id\"], ascending=False)\n",
" .head(5)\n",
")\n",
"top_countries = list(data_top_country[\"production_countries_transformed\"])\n",
"top_countries"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" index \n",
" production_countries_transformed \n",
" genres_transformed \n",
" id \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 493 \n",
" US \n",
" comedy \n",
" 569 \n",
" \n",
" \n",
" 1 \n",
" 496 \n",
" US \n",
" drama \n",
" 319 \n",
" \n",
" \n",
" 2 \n",
" 495 \n",
" US \n",
" documentation \n",
" 316 \n",
" \n",
" \n",
" 3 \n",
" 239 \n",
" IN \n",
" drama \n",
" 239 \n",
" \n",
" \n",
" 4 \n",
" 506 \n",
" US \n",
" thriller \n",
" 138 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 78 \n",
" 283 \n",
" JP \n",
" none \n",
" 1 \n",
" \n",
" \n",
" 79 \n",
" 279 \n",
" JP \n",
" family \n",
" 1 \n",
" \n",
" \n",
" 80 \n",
" 249 \n",
" IN \n",
" sport \n",
" 1 \n",
" \n",
" \n",
" 81 \n",
" 245 \n",
" IN \n",
" none \n",
" 1 \n",
" \n",
" \n",
" 82 \n",
" 242 \n",
" IN \n",
" history \n",
" 1 \n",
" \n",
" \n",
"
\n",
"
83 rows × 4 columns
\n",
"
"
],
"text/plain": [
" index production_countries_transformed genres_transformed id\n",
"0 493 US comedy 569\n",
"1 496 US drama 319\n",
"2 495 US documentation 316\n",
"3 239 IN drama 239\n",
"4 506 US thriller 138\n",
".. ... ... ... ...\n",
"78 283 JP none 1\n",
"79 279 JP family 1\n",
"80 249 IN sport 1\n",
"81 245 IN none 1\n",
"82 242 IN history 1\n",
"\n",
"[83 rows x 4 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_country = (\n",
" df_titles.groupby([\"production_countries_transformed\", \"genres_transformed\"])[\"id\"]\n",
" .count()\n",
" .reset_index()\n",
" .sort_values(by=[\"id\"], ascending=False)\n",
")\n",
"data_top_country_df = data_country[\n",
" data_country[\"production_countries_transformed\"].isin(top_countries)\n",
"].reset_index()\n",
"data_top_country_df"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n",
"/home/joao_victor/anaconda3/envs/netflix-app/lib/python3.10/site-packages/plotly/express/_core.py:271: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
" trace_data = trace_data.append(trace_data.iloc[0])\n"
]
},
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"hovertemplate": "genres_transformed=comedy id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "comedy",
"line": {
"color": "#636efa",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "comedy",
"r": [
569,
105,
67,
35,
19,
569
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"IN",
"GB",
"KR",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=drama id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "drama",
"line": {
"color": "#EF553B",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "drama",
"r": [
319,
239,
80,
59,
53,
319
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"IN",
"KR",
"GB",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=documentation id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "documentation",
"line": {
"color": "#00cc96",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "documentation",
"r": [
316,
50,
9,
6,
5,
316
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"GB",
"IN",
"JP",
"KR",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=thriller id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "thriller",
"line": {
"color": "#ab63fa",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "thriller",
"r": [
138,
96,
20,
16,
4,
138
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"IN",
"GB",
"KR",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=scifi id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "scifi",
"line": {
"color": "#FFA15A",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "scifi",
"r": [
103,
68,
17,
11,
3,
103
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"JP",
"KR",
"GB",
"IN",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=reality id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "reality",
"line": {
"color": "#19d3f3",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "reality",
"r": [
94,
12,
10,
5,
3,
94
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"GB",
"JP",
"KR",
"IN",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=action id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "action",
"line": {
"color": "#FF6692",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "action",
"r": [
92,
49,
26,
20,
11,
92
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"JP",
"IN",
"KR",
"GB",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=animation id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "animation",
"line": {
"color": "#B6E880",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "animation",
"r": [
87,
27,
13,
9,
5,
87
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"JP",
"GB",
"IN",
"KR",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=crime id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "crime",
"line": {
"color": "#FF97FF",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "crime",
"r": [
87,
27,
23,
12,
4,
87
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"GB",
"IN",
"KR",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=romance id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "romance",
"line": {
"color": "#FECB52",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "romance",
"r": [
65,
57,
9,
4,
3,
65
],
"showlegend": true,
"subplot": "polar",
"theta": [
"IN",
"US",
"GB",
"JP",
"KR",
"IN"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=fantasy id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "fantasy",
"line": {
"color": "#636efa",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "fantasy",
"r": [
54,
23,
12,
5,
3,
54
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"JP",
"IN",
"GB",
"KR",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=horror id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "horror",
"line": {
"color": "#EF553B",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "horror",
"r": [
51,
10,
5,
2,
2,
51
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"IN",
"GB",
"KR",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=family id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "family",
"line": {
"color": "#00cc96",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "family",
"r": [
32,
3,
3,
2,
1,
32
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"KR",
"GB",
"IN",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=music id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "music",
"line": {
"color": "#ab63fa",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "music",
"r": [
26,
3,
2,
2,
26
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"GB",
"IN",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=western id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "western",
"line": {
"color": "#FFA15A",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "western",
"r": [
19,
1,
19
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"JP",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=none id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "none",
"line": {
"color": "#19d3f3",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "none",
"r": [
17,
3,
2,
1,
1,
17
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"KR",
"GB",
"JP",
"IN",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=war id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "war",
"line": {
"color": "#FF6692",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "war",
"r": [
16,
6,
3,
2,
16
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"GB",
"KR",
"IN",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=history id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "history",
"line": {
"color": "#B6E880",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "history",
"r": [
2,
1,
2
],
"showlegend": true,
"subplot": "polar",
"theta": [
"US",
"IN",
"US"
],
"type": "scatterpolar"
},
{
"hovertemplate": "genres_transformed=sport id=%{r} production_countries_transformed=%{theta} ",
"legendgroup": "sport",
"line": {
"color": "#FF97FF",
"dash": "solid"
},
"marker": {
"symbol": "circle"
},
"mode": "lines",
"name": "sport",
"r": [
1,
1
],
"showlegend": true,
"subplot": "polar",
"theta": [
"IN",
"IN"
],
"type": "scatterpolar"
}
],
"layout": {
"legend": {
"title": {
"text": "genres_transformed"
},
"tracegroupgap": 0
},
"margin": {
"t": 60
},
"polar": {
"angularaxis": {
"direction": "clockwise",
"rotation": 90
},
"domain": {
"x": [
0,
1
],
"y": [
0,
1
]
}
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = px.line_polar(\n",
" data_top_country_df,\n",
" r=\"id\",\n",
" theta=\"production_countries_transformed\",\n",
" line_close=True,\n",
" color=\"genres_transformed\",\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"data_runtime = df_titles[[\"id\", \"runtime\", \"genres_transformed\"]]\n",
"plt.figure(figsize=(16, 6))\n",
"plt.suptitle(\n",
" \"Data Distribution Across Genres\", fontsize=18, weight=600, color=\"#333d29\"\n",
")\n",
"ax = sns.stripplot(\n",
" x=data_runtime[\"genres_transformed\"], y=data_runtime[\"runtime\"], jitter=0.05, size=5\n",
")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" title \n",
" type \n",
" description \n",
" release_year \n",
" age_certification \n",
" runtime \n",
" genres \n",
" production_countries \n",
" seasons \n",
" imdb_id \n",
" imdb_score \n",
" imdb_votes \n",
" tmdb_popularity \n",
" tmdb_score \n",
" genres_transformed \n",
" production_countries_transformed \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" ts300399 \n",
" Five Came Back: The Reference Films \n",
" SHOW \n",
" This collection includes 12 World War II-era p... \n",
" 1945 \n",
" TV-MA \n",
" 48 \n",
" ['documentation'] \n",
" ['US'] \n",
" 1.0 \n",
" NaN \n",
" 6.533447 \n",
" NaN \n",
" 0.600 \n",
" 6.818039 \n",
" documentation \n",
" US \n",
" \n",
" \n",
" 1 \n",
" tm84618 \n",
" Taxi Driver \n",
" MOVIE \n",
" A mentally unstable Vietnam War veteran works ... \n",
" 1976 \n",
" R \n",
" 113 \n",
" ['crime', 'drama'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0075314 \n",
" 8.300000 \n",
" 795222.0 \n",
" 27.612 \n",
" 8.200000 \n",
" crime \n",
" US \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id title type \\\n",
"0 ts300399 Five Came Back: The Reference Films SHOW \n",
"1 tm84618 Taxi Driver MOVIE \n",
"\n",
" description release_year \\\n",
"0 This collection includes 12 World War II-era p... 1945 \n",
"1 A mentally unstable Vietnam War veteran works ... 1976 \n",
"\n",
" age_certification runtime genres production_countries \\\n",
"0 TV-MA 48 ['documentation'] ['US'] \n",
"1 R 113 ['crime', 'drama'] ['US'] \n",
"\n",
" seasons imdb_id imdb_score imdb_votes tmdb_popularity tmdb_score \\\n",
"0 1.0 NaN 6.533447 NaN 0.600 6.818039 \n",
"1 0.0 tt0075314 8.300000 795222.0 27.612 8.200000 \n",
"\n",
" genres_transformed production_countries_transformed \n",
"0 documentation US \n",
"1 crime US "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titles.head(2)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" id \n",
" title \n",
" type \n",
" description \n",
" release_year \n",
" age_certification \n",
" runtime \n",
" genres \n",
" production_countries \n",
" seasons \n",
" imdb_id \n",
" imdb_score \n",
" imdb_votes \n",
" tmdb_popularity \n",
" tmdb_score \n",
" genres_transformed \n",
" production_countries_transformed \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" ts300399 \n",
" Five Came Back: The Reference Films \n",
" SHOW \n",
" This collection includes 12 World War II-era p... \n",
" 1945 \n",
" TV-MA \n",
" 48 \n",
" ['documentation'] \n",
" ['US'] \n",
" 1.0 \n",
" NaN \n",
" 6.533447 \n",
" NaN \n",
" 0.600 \n",
" 6.818039 \n",
" documentation \n",
" US \n",
" \n",
" \n",
" 1 \n",
" tm84618 \n",
" Taxi Driver \n",
" MOVIE \n",
" A mentally unstable Vietnam War veteran works ... \n",
" 1976 \n",
" R \n",
" 113 \n",
" ['crime', 'drama'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0075314 \n",
" 8.300000 \n",
" 795222.0 \n",
" 27.612 \n",
" 8.200000 \n",
" crime \n",
" US \n",
" \n",
" \n",
" 2 \n",
" tm127384 \n",
" Monty Python and the Holy Grail \n",
" MOVIE \n",
" King Arthur, accompanied by his squire, recrui... \n",
" 1975 \n",
" PG \n",
" 91 \n",
" ['comedy', 'fantasy'] \n",
" ['GB'] \n",
" 0.0 \n",
" tt0071853 \n",
" 8.200000 \n",
" 530877.0 \n",
" 18.216 \n",
" 7.800000 \n",
" comedy \n",
" GB \n",
" \n",
" \n",
" 3 \n",
" tm70993 \n",
" Life of Brian \n",
" MOVIE \n",
" Brian Cohen is an average young Jewish man, bu... \n",
" 1979 \n",
" R \n",
" 94 \n",
" ['comedy'] \n",
" ['GB'] \n",
" 0.0 \n",
" tt0079470 \n",
" 8.000000 \n",
" 392419.0 \n",
" 17.505 \n",
" 7.800000 \n",
" comedy \n",
" GB \n",
" \n",
" \n",
" 4 \n",
" tm190788 \n",
" The Exorcist \n",
" MOVIE \n",
" 12-year-old Regan MacNeil begins to adapt an e... \n",
" 1973 \n",
" R \n",
" 133 \n",
" ['horror'] \n",
" ['US'] \n",
" 0.0 \n",
" tt0070047 \n",
" 8.100000 \n",
" 391942.0 \n",
" 95.337 \n",
" 7.700000 \n",
" horror \n",
" US \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 5801 \n",
" tm1014599 \n",
" Fine Wine \n",
" MOVIE \n",
" A beautiful love story that can happen between... \n",
" 2021 \n",
" no_certification \n",
" 100 \n",
" ['romance', 'drama'] \n",
" ['NG'] \n",
" 0.0 \n",
" tt13857480 \n",
" 6.900000 \n",
" 39.0 \n",
" 0.966 \n",
" 6.818039 \n",
" romance \n",
" NG \n",
" \n",
" \n",
" 5802 \n",
" tm1108171 \n",
" Edis Starlight \n",
" MOVIE \n",
" Rising star Edis's career journey with ups and... \n",
" 2021 \n",
" no_certification \n",
" 74 \n",
" ['music', 'documentation'] \n",
" [] \n",
" 0.0 \n",
" NaN \n",
" 6.533447 \n",
" NaN \n",
" 1.036 \n",
" 8.500000 \n",
" music \n",
" none \n",
" \n",
" \n",
" 5803 \n",
" tm1045018 \n",
" Clash \n",
" MOVIE \n",
" A man from Nigeria returns to his family in Ca... \n",
" 2021 \n",
" no_certification \n",
" 88 \n",
" ['family', 'drama'] \n",
" ['NG', 'CA'] \n",
" 0.0 \n",
" tt14620732 \n",
" 6.500000 \n",
" 32.0 \n",
" 0.709 \n",
" 6.818039 \n",
" family \n",
" NG \n",
" \n",
" \n",
" 5804 \n",
" tm1098060 \n",
" Shadow Parties \n",
" MOVIE \n",
" A family faces destruction in a long-running c... \n",
" 2021 \n",
" no_certification \n",
" 116 \n",
" ['action', 'thriller'] \n",
" [] \n",
" 0.0 \n",
" tt10168094 \n",
" 6.200000 \n",
" 9.0 \n",
" 2.186 \n",
" 6.818039 \n",
" action \n",
" none \n",
" \n",
" \n",
" 5805 \n",
" ts271048 \n",
" Mighty Little Bheem: Kite Festival \n",
" SHOW \n",
" With winter behind them, Bheem and his townspe... \n",
" 2021 \n",
" no_certification \n",
" 0 \n",
" ['family', 'comedy', 'animation'] \n",
" [] \n",
" 1.0 \n",
" tt13711094 \n",
" 8.800000 \n",
" 16.0 \n",
" 0.979 \n",
" 10.000000 \n",
" family \n",
" none \n",
" \n",
" \n",
"
\n",
"
5699 rows × 17 columns
\n",
"
"
],
"text/plain": [
" id title type \\\n",
"0 ts300399 Five Came Back: The Reference Films SHOW \n",
"1 tm84618 Taxi Driver MOVIE \n",
"2 tm127384 Monty Python and the Holy Grail MOVIE \n",
"3 tm70993 Life of Brian MOVIE \n",
"4 tm190788 The Exorcist MOVIE \n",
"... ... ... ... \n",
"5801 tm1014599 Fine Wine MOVIE \n",
"5802 tm1108171 Edis Starlight MOVIE \n",
"5803 tm1045018 Clash MOVIE \n",
"5804 tm1098060 Shadow Parties MOVIE \n",
"5805 ts271048 Mighty Little Bheem: Kite Festival SHOW \n",
"\n",
" description release_year \\\n",
"0 This collection includes 12 World War II-era p... 1945 \n",
"1 A mentally unstable Vietnam War veteran works ... 1976 \n",
"2 King Arthur, accompanied by his squire, recrui... 1975 \n",
"3 Brian Cohen is an average young Jewish man, bu... 1979 \n",
"4 12-year-old Regan MacNeil begins to adapt an e... 1973 \n",
"... ... ... \n",
"5801 A beautiful love story that can happen between... 2021 \n",
"5802 Rising star Edis's career journey with ups and... 2021 \n",
"5803 A man from Nigeria returns to his family in Ca... 2021 \n",
"5804 A family faces destruction in a long-running c... 2021 \n",
"5805 With winter behind them, Bheem and his townspe... 2021 \n",
"\n",
" age_certification runtime genres \\\n",
"0 TV-MA 48 ['documentation'] \n",
"1 R 113 ['crime', 'drama'] \n",
"2 PG 91 ['comedy', 'fantasy'] \n",
"3 R 94 ['comedy'] \n",
"4 R 133 ['horror'] \n",
"... ... ... ... \n",
"5801 no_certification 100 ['romance', 'drama'] \n",
"5802 no_certification 74 ['music', 'documentation'] \n",
"5803 no_certification 88 ['family', 'drama'] \n",
"5804 no_certification 116 ['action', 'thriller'] \n",
"5805 no_certification 0 ['family', 'comedy', 'animation'] \n",
"\n",
" production_countries seasons imdb_id imdb_score imdb_votes \\\n",
"0 ['US'] 1.0 NaN 6.533447 NaN \n",
"1 ['US'] 0.0 tt0075314 8.300000 795222.0 \n",
"2 ['GB'] 0.0 tt0071853 8.200000 530877.0 \n",
"3 ['GB'] 0.0 tt0079470 8.000000 392419.0 \n",
"4 ['US'] 0.0 tt0070047 8.100000 391942.0 \n",
"... ... ... ... ... ... \n",
"5801 ['NG'] 0.0 tt13857480 6.900000 39.0 \n",
"5802 [] 0.0 NaN 6.533447 NaN \n",
"5803 ['NG', 'CA'] 0.0 tt14620732 6.500000 32.0 \n",
"5804 [] 0.0 tt10168094 6.200000 9.0 \n",
"5805 [] 1.0 tt13711094 8.800000 16.0 \n",
"\n",
" tmdb_popularity tmdb_score genres_transformed \\\n",
"0 0.600 6.818039 documentation \n",
"1 27.612 8.200000 crime \n",
"2 18.216 7.800000 comedy \n",
"3 17.505 7.800000 comedy \n",
"4 95.337 7.700000 horror \n",
"... ... ... ... \n",
"5801 0.966 6.818039 romance \n",
"5802 1.036 8.500000 music \n",
"5803 0.709 6.818039 family \n",
"5804 2.186 6.818039 action \n",
"5805 0.979 10.000000 family \n",
"\n",
" production_countries_transformed \n",
"0 US \n",
"1 US \n",
"2 GB \n",
"3 GB \n",
"4 US \n",
"... ... \n",
"5801 NG \n",
"5802 none \n",
"5803 NG \n",
"5804 none \n",
"5805 none \n",
"\n",
"[5699 rows x 17 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_titles"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/home/joao_victor/netflix-recommendation-app/data/output/df_titles.csv\n"
]
}
],
"source": [
"out_path = os.getcwd() + \"/data/output/df_titles.csv\"\n",
"print(out_path)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"df_titles.to_csv(out_path)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.10.4 ('netflix-app')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "43130da1fbae14895eb338e4f0e60d3310ef5c08adc6957cf17fde952d5329db"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}