{
"cells": [
{
"cell_type": "code",
"execution_count": 533,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 534,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" genres | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Adventure|Animation|Children|Comedy|Fantasy | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title genres\n",
"movieId \n",
"1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy"
]
},
"execution_count": 534,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load movies_m10.csv with movieId as the index and preview the first row.\n",
"movies = pd.read_csv('../data/reduced/movies_m10.csv', index_col='movieId')\n",
"movies.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 535,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tmdbId | \n",
" imdbId | \n",
" popularity | \n",
" budget | \n",
" revenue | \n",
" original_title | \n",
" cast | \n",
" homepage | \n",
" director | \n",
" tagline | \n",
" ... | \n",
" overview | \n",
" runtime | \n",
" genres | \n",
" production_companies | \n",
" release_date | \n",
" vote_count | \n",
" vote_average | \n",
" release_year | \n",
" budget_adj | \n",
" revenue_adj | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 135397 | \n",
" tt0369610 | \n",
" 32.985763 | \n",
" 150000000 | \n",
" 1513528810 | \n",
" Jurassic World | \n",
" Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | \n",
" http://www.jurassicworld.com/ | \n",
" Colin Trevorrow | \n",
" The park is open. | \n",
" ... | \n",
" Twenty-two years after the events of Jurassic ... | \n",
" 124 | \n",
" Action|Adventure|Science Fiction|Thriller | \n",
" Universal Studios|Amblin Entertainment|Legenda... | \n",
" 6/9/2015 | \n",
" 5562 | \n",
" 6.5 | \n",
" 2015 | \n",
" 137999939.3 | \n",
" 1.392446e+09 | \n",
"
\n",
" \n",
"
\n",
"
1 rows × 21 columns
\n",
"
"
],
"text/plain": [
" tmdbId imdbId popularity budget revenue original_title \\\n",
"0 135397 tt0369610 32.985763 150000000 1513528810 Jurassic World \n",
"\n",
" cast \\\n",
"0 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n",
"\n",
" homepage director tagline ... \\\n",
"0 http://www.jurassicworld.com/ Colin Trevorrow The park is open. ... \n",
"\n",
" overview runtime \\\n",
"0 Twenty-two years after the events of Jurassic ... 124 \n",
"\n",
" genres \\\n",
"0 Action|Adventure|Science Fiction|Thriller \n",
"\n",
" production_companies release_date vote_count \\\n",
"0 Universal Studios|Amblin Entertainment|Legenda... 6/9/2015 5562 \n",
"\n",
" vote_average release_year budget_adj revenue_adj \n",
"0 6.5 2015 137999939.3 1.392446e+09 \n",
"\n",
"[1 rows x 21 columns]"
]
},
"execution_count": 535,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the auxiliary tmdb_movies_data.csv (no index column given, so the\n",
"# default integer index is used) and preview the first row.\n",
"movies_data = pd.read_csv('../data/auxiliar/tmdb_movies_data.csv')\n",
"movies_data.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 536,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tmdbId | \n",
" imdbId | \n",
" cast | \n",
" director | \n",
" keywords | \n",
" overview | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 135397 | \n",
" tt0369610 | \n",
" Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | \n",
" Colin Trevorrow | \n",
" monster|dna|tyrannosaurus rex|velociraptor|island | \n",
" Twenty-two years after the events of Jurassic ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tmdbId imdbId cast \\\n",
"0 135397 tt0369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n",
"\n",
" director keywords \\\n",
"0 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n",
"\n",
" overview \n",
"0 Twenty-two years after the events of Jurassic ... "
]
},
"execution_count": 536,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the id columns and the text columns carried into the final export.\n",
"# .copy() makes the subset an independent frame, so assigning to its imdbId\n",
"# column in a later cell cannot trigger SettingWithCopyWarning.\n",
"movies_data = movies_data[['tmdbId', 'imdbId', 'cast', 'director', 'keywords', 'overview']].copy()\n",
"movies_data.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 537,
"metadata": {},
"outputs": [],
"source": [
"def remove_prefix(series):\n",
"    \"\"\"Return an IMDb id string without its 'tt' prefix and zero padding.\n",
"\n",
"    Despite the parameter name, this receives one value at a time (it is\n",
"    used with ``Series.apply``), e.g. ``'tt0369610'`` -> ``'369610'``.\n",
"    The input is converted with ``str`` first, so a missing value (float\n",
"    NaN) comes back as the literal string ``'nan'``.\n",
"\n",
"    Args:\n",
"        series: A single raw IMDb id; any type, converted with ``str``.\n",
"\n",
"    Returns:\n",
"        str: The id without the ``'tt'`` prefix or any leading zeros. An id\n",
"        consisting only of zeros is returned as ``'0'``.\n",
"    \"\"\"\n",
"    digits = str(series).removeprefix('tt')\n",
"    # Strip every padding zero, not just the first one.\n",
"    stripped = digits.lstrip('0')\n",
"    # Keep one '0' for an all-zero id so the result still parses as an integer.\n",
"    return stripped or digits[:1]"
]
},
{
"cell_type": "code",
"execution_count": 538,
"metadata": {},
"outputs": [],
"source": [
"# Run remove_prefix over every raw imdbId value and store the cleaned\n",
"# strings back in the same column.\n",
"imdb_ids = movies_data['imdbId'].apply(remove_prefix)\n",
"movies_data['imdbId'] = imdb_ids"
]
},
{
"cell_type": "code",
"execution_count": 539,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" cast | \n",
" director | \n",
" keywords | \n",
" overview | \n",
"
\n",
" \n",
" tmdbId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 135397 | \n",
" 369610 | \n",
" Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | \n",
" Colin Trevorrow | \n",
" monster|dna|tyrannosaurus rex|velociraptor|island | \n",
" Twenty-two years after the events of Jurassic ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId cast \\\n",
"tmdbId \n",
"135397 369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n",
"\n",
" director keywords \\\n",
"tmdbId \n",
"135397 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n",
"\n",
" overview \n",
"tmdbId \n",
"135397 Twenty-two years after the events of Jurassic ... "
]
},
"execution_count": 539,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-key the metadata by tmdbId; the final concat aligns the frames on this index.\n",
"movies_data = movies_data.set_index('tmdbId')\n",
"movies_data.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 540,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" tmdbId | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 114709 | \n",
" 862.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId tmdbId\n",
"movieId \n",
"1 114709 862.0"
]
},
"execution_count": 540,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load links_m10.csv (imdbId and tmdbId per movie) with movieId as the index.\n",
"links = pd.read_csv('../data/reduced/links_m10.csv', index_col='movieId')\n",
"links.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 541,
"metadata": {},
"outputs": [],
"source": [
"# tmdbId is parsed as float (862.0 in the preview above); convert it to int64.\n",
"links = links.astype({'tmdbId': 'int64'})"
]
},
{
"cell_type": "code",
"execution_count": 542,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Index: 2269 entries, 1 to 187593\n",
"Data columns (total 2 columns):\n",
" # Column Non-Null Count Dtype\n",
"--- ------ -------------- -----\n",
" 0 imdbId 2269 non-null int64\n",
" 1 tmdbId 2269 non-null int64\n",
"dtypes: int64(2)\n",
"memory usage: 53.2 KB\n"
]
}
],
"source": [
"# Check dtypes and non-null counts of links after the cast.\n",
"links.info()"
]
},
{
"cell_type": "code",
"execution_count": 543,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" genres | \n",
" imdbId | \n",
" tmdbId | \n",
"
\n",
" \n",
" movieId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" Toy Story (1995) | \n",
" Adventure|Animation|Children|Comedy|Fantasy | \n",
" 114709 | \n",
" 862 | \n",
"
\n",
" \n",
" 2 | \n",
" Jumanji (1995) | \n",
" Adventure|Children|Fantasy | \n",
" 113497 | \n",
" 8844 | \n",
"
\n",
" \n",
" 3 | \n",
" Grumpier Old Men (1995) | \n",
" Comedy|Romance | \n",
" 113228 | \n",
" 15602 | \n",
"
\n",
" \n",
" 5 | \n",
" Father of the Bride Part II (1995) | \n",
" Comedy | \n",
" 113041 | \n",
" 11862 | \n",
"
\n",
" \n",
" 6 | \n",
" Heat (1995) | \n",
" Action|Crime|Thriller | \n",
" 113277 | \n",
" 949 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title \\\n",
"movieId \n",
"1 Toy Story (1995) \n",
"2 Jumanji (1995) \n",
"3 Grumpier Old Men (1995) \n",
"5 Father of the Bride Part II (1995) \n",
"6 Heat (1995) \n",
"\n",
" genres imdbId tmdbId \n",
"movieId \n",
"1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 \n",
"2 Adventure|Children|Fantasy 113497 8844 \n",
"3 Comedy|Romance 113228 15602 \n",
"5 Comedy 113041 11862 \n",
"6 Action|Crime|Thriller 113277 949 "
]
},
"execution_count": 543,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Place movies and links side by side, keeping only the movieIds that\n",
"# appear in both frames.\n",
"result = pd.concat(\n",
"    [movies, links],\n",
"    axis='columns',\n",
"    join='inner',\n",
")\n",
"result.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 544,
"metadata": {},
"outputs": [],
"source": [
"# remove_prefix passes every id through str(), so a missing imdbId is the\n",
"# literal string 'nan' rather than a real NaN; filter on that first.\n",
"mask = movies_data['imdbId'] == 'nan'\n",
"\n",
"# dropna has to be called on the frame: calling it in place on\n",
"# movies_data['imdbId'] would only modify a temporary Series and leave the\n",
"# frame's rows untouched. The chain returns a new frame instead of mutating\n",
"# a filtered slice in place.\n",
"movies_data = (\n",
"    movies_data[~mask]\n",
"    .dropna(subset=['imdbId'])\n",
"    .drop_duplicates()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 545,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 10855\n",
"unique 10855\n",
"top 369610\n",
"freq 1\n",
"Name: imdbId, dtype: object"
]
},
"execution_count": 545,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summarise the cleaned ids; count == unique means no duplicate imdbId remains.\n",
"movies_data['imdbId'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 546,
"metadata": {},
"outputs": [],
"source": [
"# Cast to an explicit 64-bit integer. A plain `int` resolves to a\n",
"# platform-dependent width (the saved run of this notebook shows int32),\n",
"# which does not match the int64 ids in `links`. astype on the frame returns\n",
"# a new object instead of assigning into a filtered slice.\n",
"movies_data = movies_data.astype({'imdbId': 'int64'})"
]
},
{
"cell_type": "code",
"execution_count": 547,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" cast | \n",
" director | \n",
" keywords | \n",
" overview | \n",
"
\n",
" \n",
" tmdbId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 862 | \n",
" 114709 | \n",
" Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal... | \n",
" John Lasseter | \n",
" jealousy|toy|boy|friendship|friends | \n",
" Woody the cowboy is young Andy’s favorite to... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId cast \\\n",
"tmdbId \n",
"862 114709 Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal... \n",
"\n",
" director keywords \\\n",
"tmdbId \n",
"862 John Lasseter jealousy|toy|boy|friendship|friends \n",
"\n",
" overview \n",
"tmdbId \n",
"862 Woody the cowboy is young Andy’s favorite to... "
]
},
"execution_count": 547,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Spot-check the row with tmdbId 862 (the id linked to movieId 1 in `links`).\n",
"movies_data[movies_data.index == 862]"
]
},
{
"cell_type": "code",
"execution_count": 549,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" title | \n",
" genres | \n",
" movieId | \n",
"
\n",
" \n",
" tmdbId | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 862 | \n",
" Toy Story (1995) | \n",
" Adventure|Animation|Children|Comedy|Fantasy | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" title genres movieId\n",
"tmdbId \n",
"862 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1"
]
},
"execution_count": 549,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-key the merged movies/links frame by tmdbId and keep the old movieId\n",
"# index as a regular column. Written as one chain that returns a new frame,\n",
"# so nothing is assigned into a column slice or mutated in place.\n",
"result = (\n",
"    result[['title', 'genres', 'tmdbId']]\n",
"    .assign(movieId=lambda frame: frame.index)\n",
"    .set_index('tmdbId')\n",
")\n",
"result.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 550,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" cast | \n",
" director | \n",
" keywords | \n",
" overview | \n",
"
\n",
" \n",
" tmdbId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 135397 | \n",
" 369610 | \n",
" Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | \n",
" Colin Trevorrow | \n",
" monster|dna|tyrannosaurus rex|velociraptor|island | \n",
" Twenty-two years after the events of Jurassic ... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId cast \\\n",
"tmdbId \n",
"135397 369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n",
"\n",
" director keywords \\\n",
"tmdbId \n",
"135397 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n",
"\n",
" overview \n",
"tmdbId \n",
"135397 Twenty-two years after the events of Jurassic ... "
]
},
"execution_count": 550,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the metadata frame right before it is combined with `result`.\n",
"movies_data.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 551,
"metadata": {},
"outputs": [],
"source": [
"# Both frames are indexed by tmdbId; the inner concat keeps only the ids\n",
"# present in both and places their columns side by side.\n",
"movies_full_data = pd.concat([movies_data, result], axis=1, join='inner')"
]
},
{
"cell_type": "code",
"execution_count": 552,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" cast | \n",
" director | \n",
" keywords | \n",
" overview | \n",
" title | \n",
" genres | \n",
" movieId | \n",
"
\n",
" \n",
" tmdbId | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" 135397 | \n",
" 369610 | \n",
" Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... | \n",
" Colin Trevorrow | \n",
" monster|dna|tyrannosaurus rex|velociraptor|island | \n",
" Twenty-two years after the events of Jurassic ... | \n",
" Jurassic World (2015) | \n",
" Action|Adventure|Drama|Sci-Fi|Thriller | \n",
" 117529 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId cast \\\n",
"tmdbId \n",
"135397 369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n",
"\n",
" director keywords \\\n",
"tmdbId \n",
"135397 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n",
"\n",
" overview \\\n",
"tmdbId \n",
"135397 Twenty-two years after the events of Jurassic ... \n",
"\n",
" title genres movieId \n",
"tmdbId \n",
"135397 Jurassic World (2015) Action|Adventure|Drama|Sci-Fi|Thriller 117529 "
]
},
"execution_count": 552,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first row of the combined frame.\n",
"movies_full_data.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 554,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Index: 2026 entries, 135397 to 1714\n",
"Data columns (total 8 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 imdbId 2026 non-null int32 \n",
" 1 cast 2026 non-null object\n",
" 2 director 2026 non-null object\n",
" 3 keywords 2026 non-null object\n",
" 4 overview 2026 non-null object\n",
" 5 title 2026 non-null object\n",
" 6 genres 2026 non-null object\n",
" 7 movieId 2026 non-null int64 \n",
"dtypes: int32(1), int64(1), object(6)\n",
"memory usage: 134.5+ KB\n"
]
}
],
"source": [
"# Discard every row that still has a missing value, then report what is left.\n",
"movies_full_data = movies_full_data.dropna()\n",
"movies_full_data.info()"
]
},
{
"cell_type": "code",
"execution_count": 555,
"metadata": {},
"outputs": [],
"source": [
"# Write the combined table to disk; with to_csv's default index=True the\n",
"# tmdbId index is saved as the first column.\n",
"movies_full_data.to_csv('../data/reduced/movies_m10_rich.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}