{ "cells": [ { "cell_type": "code", "execution_count": 533, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 534, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenres
movieId
1Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy
\n", "
" ], "text/plain": [ " title genres\n", "movieId \n", "1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy" ] }, "execution_count": 534, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies = pd.read_csv('../data/reduced/movies_m10.csv', index_col='movieId')\n", "movies.head(1)" ] }, { "cell_type": "code", "execution_count": 535, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tmdbIdimdbIdpopularitybudgetrevenueoriginal_titlecasthomepagedirectortagline...overviewruntimegenresproduction_companiesrelease_datevote_countvote_averagerelease_yearbudget_adjrevenue_adj
0135397tt036961032.9857631500000001513528810Jurassic WorldChris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...http://www.jurassicworld.com/Colin TrevorrowThe park is open....Twenty-two years after the events of Jurassic ...124Action|Adventure|Science Fiction|ThrillerUniversal Studios|Amblin Entertainment|Legenda...6/9/201555626.52015137999939.31.392446e+09
\n", "

1 rows × 21 columns

\n", "
" ], "text/plain": [ " tmdbId imdbId popularity budget revenue original_title \\\n", "0 135397 tt0369610 32.985763 150000000 1513528810 Jurassic World \n", "\n", " cast \\\n", "0 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n", "\n", " homepage director tagline ... \\\n", "0 http://www.jurassicworld.com/ Colin Trevorrow The park is open. ... \n", "\n", " overview runtime \\\n", "0 Twenty-two years after the events of Jurassic ... 124 \n", "\n", " genres \\\n", "0 Action|Adventure|Science Fiction|Thriller \n", "\n", " production_companies release_date vote_count \\\n", "0 Universal Studios|Amblin Entertainment|Legenda... 6/9/2015 5562 \n", "\n", " vote_average release_year budget_adj revenue_adj \n", "0 6.5 2015 137999939.3 1.392446e+09 \n", "\n", "[1 rows x 21 columns]" ] }, "execution_count": 535, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_data = pd.read_csv('../data/auxiliar/tmdb_movies_data.csv')\n", "movies_data.head(1)" ] }, { "cell_type": "code", "execution_count": 536, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tmdbIdimdbIdcastdirectorkeywordsoverview
0135397tt0369610Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...Colin Trevorrowmonster|dna|tyrannosaurus rex|velociraptor|islandTwenty-two years after the events of Jurassic ...
\n", "
" ], "text/plain": [ " tmdbId imdbId cast \\\n", "0 135397 tt0369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n", "\n", " director keywords \\\n", "0 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n", "\n", " overview \n", "0 Twenty-two years after the events of Jurassic ... " ] }, "execution_count": 536, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_data = movies_data[['tmdbId', 'imdbId', 'cast', 'director', 'keywords', 'overview']]\n", "movies_data.head(1)" ] }, { "cell_type": "code", "execution_count": 537, "metadata": {}, "outputs": [], "source": [ "def remove_prefix(series):\n", " s = str(series).removeprefix('tt')\n", " s = str(s).removeprefix('0')\n", " return s" ] }, { "cell_type": "code", "execution_count": 538, "metadata": {}, "outputs": [], "source": [ "imdb_ids = movies_data['imdbId']\n", "\n", "imdb_ids = imdb_ids.apply(remove_prefix)\n", "\n", "movies_data['imdbId'] = imdb_ids" ] }, { "cell_type": "code", "execution_count": 539, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdcastdirectorkeywordsoverview
tmdbId
135397369610Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...Colin Trevorrowmonster|dna|tyrannosaurus rex|velociraptor|islandTwenty-two years after the events of Jurassic ...
\n", "
" ], "text/plain": [ " imdbId cast \\\n", "tmdbId \n", "135397 369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n", "\n", " director keywords \\\n", "tmdbId \n", "135397 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n", "\n", " overview \n", "tmdbId \n", "135397 Twenty-two years after the events of Jurassic ... " ] }, "execution_count": 539, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_data = movies_data.set_index('tmdbId')\n", "movies_data.head(1)" ] }, { "cell_type": "code", "execution_count": 540, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdtmdbId
movieId
1114709862.0
\n", "
" ], "text/plain": [ " imdbId tmdbId\n", "movieId \n", "1 114709 862.0" ] }, "execution_count": 540, "metadata": {}, "output_type": "execute_result" } ], "source": [ "links = pd.read_csv('../data/reduced/links_m10.csv', index_col='movieId')\n", "links.head(1)" ] }, { "cell_type": "code", "execution_count": 541, "metadata": {}, "outputs": [], "source": [ "links['tmdbId'] = links['tmdbId'].astype('int64')" ] }, { "cell_type": "code", "execution_count": 542, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 2269 entries, 1 to 187593\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype\n", "--- ------ -------------- -----\n", " 0 imdbId 2269 non-null int64\n", " 1 tmdbId 2269 non-null int64\n", "dtypes: int64(2)\n", "memory usage: 53.2 KB\n" ] } ], "source": [ "links.info()" ] }, { "cell_type": "code", "execution_count": 543, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenresimdbIdtmdbId
movieId
1Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy114709862
2Jumanji (1995)Adventure|Children|Fantasy1134978844
3Grumpier Old Men (1995)Comedy|Romance11322815602
5Father of the Bride Part II (1995)Comedy11304111862
6Heat (1995)Action|Crime|Thriller113277949
\n", "
" ], "text/plain": [ " title \\\n", "movieId \n", "1 Toy Story (1995) \n", "2 Jumanji (1995) \n", "3 Grumpier Old Men (1995) \n", "5 Father of the Bride Part II (1995) \n", "6 Heat (1995) \n", "\n", " genres imdbId tmdbId \n", "movieId \n", "1 Adventure|Animation|Children|Comedy|Fantasy 114709 862 \n", "2 Adventure|Children|Fantasy 113497 8844 \n", "3 Comedy|Romance 113228 15602 \n", "5 Comedy 113041 11862 \n", "6 Action|Crime|Thriller 113277 949 " ] }, "execution_count": 543, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = pd.concat([movies, links], axis=1, join='inner')\n", "result.head()" ] }, { "cell_type": "code", "execution_count": 544, "metadata": {}, "outputs": [], "source": [ "mask = movies_data['imdbId'] == 'nan'\n", "\n", "movies_data = movies_data[~mask]\n", "movies_data['imdbId'].dropna(inplace=True)\n", "movies_data.drop_duplicates(inplace=True)" ] }, { "cell_type": "code", "execution_count": 545, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 10855\n", "unique 10855\n", "top 369610\n", "freq 1\n", "Name: imdbId, dtype: object" ] }, "execution_count": 545, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_data['imdbId'].describe()" ] }, { "cell_type": "code", "execution_count": 546, "metadata": {}, "outputs": [], "source": [ "movies_data['imdbId'] = movies_data['imdbId'].astype(int)" ] }, { "cell_type": "code", "execution_count": 547, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdcastdirectorkeywordsoverview
tmdbId
862114709Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...John Lasseterjealousy|toy|boy|friendship|friendsWoody the cowboy is young Andy’s favorite to...
\n", "
" ], "text/plain": [ " imdbId cast \\\n", "tmdbId \n", "862 114709 Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal... \n", "\n", " director keywords \\\n", "tmdbId \n", "862 John Lasseter jealousy|toy|boy|friendship|friends \n", "\n", " overview \n", "tmdbId \n", "862 Woody the cowboy is young Andy’s favorite to... " ] }, "execution_count": 547, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_data[movies_data.index == 862]" ] }, { "cell_type": "code", "execution_count": 549, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenresmovieId
tmdbId
862Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy1
\n", "
" ], "text/plain": [ " title genres movieId\n", "tmdbId \n", "862 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 1" ] }, "execution_count": 549, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = result[['title', 'genres', 'tmdbId']]\n", "result['movieId'] = result.index\n", "result.set_index('tmdbId', inplace=True)\n", "result.head(1)" ] }, { "cell_type": "code", "execution_count": 550, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdcastdirectorkeywordsoverview
tmdbId
135397369610Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...Colin Trevorrowmonster|dna|tyrannosaurus rex|velociraptor|islandTwenty-two years after the events of Jurassic ...
\n", "
" ], "text/plain": [ " imdbId cast \\\n", "tmdbId \n", "135397 369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n", "\n", " director keywords \\\n", "tmdbId \n", "135397 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n", "\n", " overview \n", "tmdbId \n", "135397 Twenty-two years after the events of Jurassic ... " ] }, "execution_count": 550, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_data.head(1)" ] }, { "cell_type": "code", "execution_count": 551, "metadata": {}, "outputs": [], "source": [ "movies_full_data = pd.concat([movies_data, result], axis=1, join='inner')" ] }, { "cell_type": "code", "execution_count": 552, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdcastdirectorkeywordsoverviewtitlegenresmovieId
tmdbId
135397369610Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...Colin Trevorrowmonster|dna|tyrannosaurus rex|velociraptor|islandTwenty-two years after the events of Jurassic ...Jurassic World (2015)Action|Adventure|Drama|Sci-Fi|Thriller117529
\n", "
" ], "text/plain": [ " imdbId cast \\\n", "tmdbId \n", "135397 369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n", "\n", " director keywords \\\n", "tmdbId \n", "135397 Colin Trevorrow monster|dna|tyrannosaurus rex|velociraptor|island \n", "\n", " overview \\\n", "tmdbId \n", "135397 Twenty-two years after the events of Jurassic ... \n", "\n", " title genres movieId \n", "tmdbId \n", "135397 Jurassic World (2015) Action|Adventure|Drama|Sci-Fi|Thriller 117529 " ] }, "execution_count": 552, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_full_data.head(1)" ] }, { "cell_type": "code", "execution_count": 554, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 2026 entries, 135397 to 1714\n", "Data columns (total 8 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 imdbId 2026 non-null int32 \n", " 1 cast 2026 non-null object\n", " 2 director 2026 non-null object\n", " 3 keywords 2026 non-null object\n", " 4 overview 2026 non-null object\n", " 5 title 2026 non-null object\n", " 6 genres 2026 non-null object\n", " 7 movieId 2026 non-null int64 \n", "dtypes: int32(1), int64(1), object(6)\n", "memory usage: 134.5+ KB\n" ] } ], "source": [ "movies_full_data.dropna(inplace=True)\n", "movies_full_data.info()" ] }, { "cell_type": "code", "execution_count": 555, "metadata": {}, "outputs": [], "source": [ "movies_full_data.to_csv('../data/reduced/movies_m10_rich.csv')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }