{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "#! pip install -r ../requirements.txt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "movies_path = '../data/standard/movies_m10_rich'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "links = pd.read_csv('../data/standard/links.csv', index_col='movieId')  # IMDb and TMDB ids of each movie, useful for enriching the data\n", "movies = pd.read_csv(movies_path+'.csv', index_col='movieId')  # per-movie metadata\n", "ratings = pd.read_csv('../data/reduced/ratings_m10.csv', index_col='movieId')  # rating given by user x to movie y\n", "tags = pd.read_csv('../data/standard/tags.csv', index_col='movieId')  # does not seem very relevant" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## separação do ano de lançamento" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Split a raw title such as 'Toy Story (1995)' into a clean title and a release-year string.\n", "# One greedy group binds to the last '(<digits>' exactly like the previous pattern did, but without\n", "# the nested quantifier '(.+)+', which backtracks exponentially on titles that contain no year.\n", "title_year = movies['title'].str.extract(r'(.+)\\((\\d+)', expand=True)\n", "movies['year'] = title_year[1]\n", "# Titles without a year keep their text instead of becoming NaN; surrounding whitespace is trimmed.\n", "movies['title'] = title_year[0].fillna(movies['title']).str.strip()\n", "\n", "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tmdbIdimdbIdcastdirectorkeywordsoverviewtitlegenresyear
movieId
117529135397369610Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...Colin Trevorrowmonster|dna|tyrannosaurus rex|velociraptor|islandTwenty-two years after the events of Jurassic ...Jurassic WorldAction|Adventure|Drama|Sci-Fi|Thriller2015
122882763411392190Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...George Millerfuture|chase|post-apocalyptic|dystopia|australiaAn apocalyptic story set in the furthest reach...Mad Max: Fury RoadAction|Adventure|Sci-Fi|Thriller2015
1228861406072488496Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...J.J. Abramsandroid|spaceship|jedi|space opera|3dThirty years after defeating the Galactic Empi...Star Wars: Episode VII - The Force AwakensAction|Adventure|Fantasy|Sci-Fi|IMAX2015
1393852819571663202Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn...Alejandro González Iñárritufather-son relationship|rape|based on novel|mo...In the 1820s, a frontiersman, Hugh Glass, sets...The RevenantAdventure|Drama2015
1341302862173659388Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ...Ridley Scottbased on novel|mars|nasa|isolation|botanistDuring a manned mission to Mars, Astronaut Mar...The MartianAdventure|Drama|Sci-Fi2015
\n", "
" ], "text/plain": [ " tmdbId imdbId cast \\\n", "movieId \n", "117529 135397 369610 Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi... \n", "122882 76341 1392190 Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic... \n", "122886 140607 2488496 Harrison Ford|Mark Hamill|Carrie Fisher|Adam D... \n", "139385 281957 1663202 Leonardo DiCaprio|Tom Hardy|Will Poulter|Domhn... \n", "134130 286217 3659388 Matt Damon|Jessica Chastain|Kristen Wiig|Jeff ... \n", "\n", " director \\\n", "movieId \n", "117529 Colin Trevorrow \n", "122882 George Miller \n", "122886 J.J. Abrams \n", "139385 Alejandro González Iñárritu \n", "134130 Ridley Scott \n", "\n", " keywords \\\n", "movieId \n", "117529 monster|dna|tyrannosaurus rex|velociraptor|island \n", "122882 future|chase|post-apocalyptic|dystopia|australia \n", "122886 android|spaceship|jedi|space opera|3d \n", "139385 father-son relationship|rape|based on novel|mo... \n", "134130 based on novel|mars|nasa|isolation|botanist \n", "\n", " overview \\\n", "movieId \n", "117529 Twenty-two years after the events of Jurassic ... \n", "122882 An apocalyptic story set in the furthest reach... \n", "122886 Thirty years after defeating the Galactic Empi... \n", "139385 In the 1820s, a frontiersman, Hugh Glass, sets... \n", "134130 During a manned mission to Mars, Astronaut Mar... 
\n", "\n", " title \\\n", "movieId \n", "117529 Jurassic World \n", "122882 Mad Max: Fury Road \n", "122886 Star Wars: Episode VII - The Force Awakens \n", "139385 The Revenant \n", "134130 The Martian \n", "\n", " genres year \n", "movieId \n", "117529 Action|Adventure|Drama|Sci-Fi|Thriller 2015 \n", "122882 Action|Adventure|Sci-Fi|Thriller 2015 \n", "122886 Action|Adventure|Fantasy|Sci-Fi|IMAX 2015 \n", "139385 Adventure|Drama 2015 \n", "134130 Adventure|Drama|Sci-Fi 2015 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def split_into_list(series):\n", "    \"\"\"Split one pipe-delimited cell value into a list of tokens.\n", "\n", "    Despite its name, `series` is a single cell value (the function is used with Series.apply).\n", "    Missing values (NaN/None) give an empty list instead of ['nan'], and values that are\n", "    already lists are returned unchanged so the cell can be re-run safely.\n", "    \"\"\"\n", "    if isinstance(series, list):\n", "        return series\n", "    if pd.isna(series):\n", "        return []\n", "    return str(series).split('|')\n", "\n", "\n", "# Turn the pipe-delimited text columns into Python lists.\n", "for column in ('genres', 'cast', 'keywords'):\n", "    movies[column] = movies[column].apply(split_into_list)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tmdbIdimdbIdcastdirectorkeywordsoverviewtitlegenresyear
movieId
117529135397369610[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...Colin Trevorrow[monster, dna, tyrannosaurus rex, velociraptor...Twenty-two years after the events of Jurassic ...Jurassic World[Action, Adventure, Drama, Sci-Fi, Thriller]2015
122882763411392190[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...George Miller[future, chase, post-apocalyptic, dystopia, au...An apocalyptic story set in the furthest reach...Mad Max: Fury Road[Action, Adventure, Sci-Fi, Thriller]2015
1228861406072488496[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...J.J. Abrams[android, spaceship, jedi, space opera, 3d]Thirty years after defeating the Galactic Empi...Star Wars: Episode VII - The Force Awakens[Action, Adventure, Fantasy, Sci-Fi, IMAX]2015
1393852819571663202[Leonardo DiCaprio, Tom Hardy, Will Poulter, D...Alejandro González Iñárritu[father-son relationship, rape, based on novel...In the 1820s, a frontiersman, Hugh Glass, sets...The Revenant[Adventure, Drama]2015
1341302862173659388[Matt Damon, Jessica Chastain, Kristen Wiig, J...Ridley Scott[based on novel, mars, nasa, isolation, botanist]During a manned mission to Mars, Astronaut Mar...The Martian[Adventure, Drama, Sci-Fi]2015
\n", "
" ], "text/plain": [ " tmdbId imdbId cast \\\n", "movieId \n", "117529 135397 369610 [Chris Pratt, Bryce Dallas Howard, Irrfan Khan... \n", "122882 76341 1392190 [Tom Hardy, Charlize Theron, Hugh Keays-Byrne,... \n", "122886 140607 2488496 [Harrison Ford, Mark Hamill, Carrie Fisher, Ad... \n", "139385 281957 1663202 [Leonardo DiCaprio, Tom Hardy, Will Poulter, D... \n", "134130 286217 3659388 [Matt Damon, Jessica Chastain, Kristen Wiig, J... \n", "\n", " director \\\n", "movieId \n", "117529 Colin Trevorrow \n", "122882 George Miller \n", "122886 J.J. Abrams \n", "139385 Alejandro González Iñárritu \n", "134130 Ridley Scott \n", "\n", " keywords \\\n", "movieId \n", "117529 [monster, dna, tyrannosaurus rex, velociraptor... \n", "122882 [future, chase, post-apocalyptic, dystopia, au... \n", "122886 [android, spaceship, jedi, space opera, 3d] \n", "139385 [father-son relationship, rape, based on novel... \n", "134130 [based on novel, mars, nasa, isolation, botanist] \n", "\n", " overview \\\n", "movieId \n", "117529 Twenty-two years after the events of Jurassic ... \n", "122882 An apocalyptic story set in the furthest reach... \n", "122886 Thirty years after defeating the Galactic Empi... \n", "139385 In the 1820s, a frontiersman, Hugh Glass, sets... \n", "134130 During a manned mission to Mars, Astronaut Mar... 
\n", "\n", " title \\\n", "movieId \n", "117529 Jurassic World \n", "122882 Mad Max: Fury Road \n", "122886 Star Wars: Episode VII - The Force Awakens \n", "139385 The Revenant \n", "134130 The Martian \n", "\n", " genres year \n", "movieId \n", "117529 [Action, Adventure, Drama, Sci-Fi, Thriller] 2015 \n", "122882 [Action, Adventure, Sci-Fi, Thriller] 2015 \n", "122886 [Action, Adventure, Fantasy, Sci-Fi, IMAX] 2015 \n", "139385 [Adventure, Drama] 2015 \n", "134130 [Adventure, Drama, Sci-Fi] 2015 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Renumber the movies with consecutive ids 1..N, in ascending order of the current index.\n", "# NOTE(review): the previous movieId values are discarded here - confirm ratings are remapped the same way.\n", "movies = movies.sort_index()\n", "movies.index = list(range(1, len(movies) + 1))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Expose the new consecutive id as a regular column as well.\n", "movies = movies.assign(movieId=movies.index)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tmdbIdimdbIdcastdirectorkeywordsoverviewtitlegenresyearmovieId
1862114709[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...John Lasseter[jealousy, toy, boy, friendship, friends]Woody the cowboy is young Andy’s favorite to...Toy Story[Adventure, Animation, Children, Comedy, Fantasy]19951
28844113497[Robin Williams, Jonathan Hyde, Kirsten Dunst,...Joe Johnston[board game, disappearance, based on a childre...When siblings Judy and Peter discover an encha...Jumanji[Adventure, Children, Fantasy]19952
315602113228[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...Howard Deutch[fishing, best friend, duringcreditsstinger, o...A family wedding reignites the ancient feud be...Grumpier Old Men[Comedy, Romance]19953
411862113041[Steve Martin, Diane Keaton, Martin Short, Kim...Charles Shyer[baby, midlife crisis, confidence, aging, daug...Just when George Banks has recovered from his ...Father of the Bride Part II[Comedy]19954
5949113277[Al Pacino, Robert De Niro, Val Kilmer, Jon Vo...Michael Mann[gambling, suicide attempt, burglar, booties, ...Obsessive master thief, Neil McCauley leads a ...Heat[Action, Crime, Thriller]19955
.................................
20222819571663202[Leonardo DiCaprio, Tom Hardy, Will Poulter, D...Alejandro González Iñárritu[father-son relationship, rape, based on novel...In the 1820s, a frontiersman, Hugh Glass, sets...The Revenant[Adventure, Drama]20152022
20232734813397884[Emily Blunt, Benicio del Toro, Josh Brolin, V...Denis Villeneuve[mexico, cia, smoking, texas, fbi]A young female FBI agent joins a secret CIA op...Sicario[Crime, Drama, Mystery]20152023
20242572112361509[Robert De Niro, Anne Hathaway, Rene Russo, An...Nancy Meyers[intern, woman director]70-year-old widower Ben Whittaker has discover...The Intern[Comedy]20152024
20253143651895587[Mark Ruffalo, Michael Keaton, Rachel McAdams,...Tom McCarthy[child abuse, journalism, judge, florida, boston]The true story of how The Boston Globe uncover...Spotlight[Thriller]20152025
20263188461596363[Christian Bale, Steve Carell, Ryan Gosling, B...Adam McKay[bank, fraud, biography, wall street, finances]The men who made millions from a global econom...Big Short, The[Drama]20152026
\n", "

2026 rows × 10 columns

\n", "
" ], "text/plain": [ " tmdbId imdbId cast \\\n", "1 862 114709 [Tom Hanks, Tim Allen, Don Rickles, Jim Varney... \n", "2 8844 113497 [Robin Williams, Jonathan Hyde, Kirsten Dunst,... \n", "3 15602 113228 [Walter Matthau, Jack Lemmon, Ann-Margret, Sop... \n", "4 11862 113041 [Steve Martin, Diane Keaton, Martin Short, Kim... \n", "5 949 113277 [Al Pacino, Robert De Niro, Val Kilmer, Jon Vo... \n", "... ... ... ... \n", "2022 281957 1663202 [Leonardo DiCaprio, Tom Hardy, Will Poulter, D... \n", "2023 273481 3397884 [Emily Blunt, Benicio del Toro, Josh Brolin, V... \n", "2024 257211 2361509 [Robert De Niro, Anne Hathaway, Rene Russo, An... \n", "2025 314365 1895587 [Mark Ruffalo, Michael Keaton, Rachel McAdams,... \n", "2026 318846 1596363 [Christian Bale, Steve Carell, Ryan Gosling, B... \n", "\n", " director \\\n", "1 John Lasseter \n", "2 Joe Johnston \n", "3 Howard Deutch \n", "4 Charles Shyer \n", "5 Michael Mann \n", "... ... \n", "2022 Alejandro González Iñárritu \n", "2023 Denis Villeneuve \n", "2024 Nancy Meyers \n", "2025 Tom McCarthy \n", "2026 Adam McKay \n", "\n", " keywords \\\n", "1 [jealousy, toy, boy, friendship, friends] \n", "2 [board game, disappearance, based on a childre... \n", "3 [fishing, best friend, duringcreditsstinger, o... \n", "4 [baby, midlife crisis, confidence, aging, daug... \n", "5 [gambling, suicide attempt, burglar, booties, ... \n", "... ... \n", "2022 [father-son relationship, rape, based on novel... \n", "2023 [mexico, cia, smoking, texas, fbi] \n", "2024 [intern, woman director] \n", "2025 [child abuse, journalism, judge, florida, boston] \n", "2026 [bank, fraud, biography, wall street, finances] \n", "\n", " overview \\\n", "1 Woody the cowboy is young Andy’s favorite to... \n", "2 When siblings Judy and Peter discover an encha... \n", "3 A family wedding reignites the ancient feud be... \n", "4 Just when George Banks has recovered from his ... \n", "5 Obsessive master thief, Neil McCauley leads a ... \n", "... ... 
\n", "2022 In the 1820s, a frontiersman, Hugh Glass, sets... \n", "2023 A young female FBI agent joins a secret CIA op... \n", "2024 70-year-old widower Ben Whittaker has discover... \n", "2025 The true story of how The Boston Globe uncover... \n", "2026 The men who made millions from a global econom... \n", "\n", " title \\\n", "1 Toy Story \n", "2 Jumanji \n", "3 Grumpier Old Men \n", "4 Father of the Bride Part II \n", "5 Heat \n", "... ... \n", "2022 The Revenant \n", "2023 Sicario \n", "2024 The Intern \n", "2025 Spotlight \n", "2026 Big Short, The \n", "\n", " genres year movieId \n", "1 [Adventure, Animation, Children, Comedy, Fantasy] 1995 1 \n", "2 [Adventure, Children, Fantasy] 1995 2 \n", "3 [Comedy, Romance] 1995 3 \n", "4 [Comedy] 1995 4 \n", "5 [Action, Crime, Thriller] 1995 5 \n", "... ... ... ... \n", "2022 [Adventure, Drama] 2015 2022 \n", "2023 [Crime, Drama, Mystery] 2015 2023 \n", "2024 [Comedy] 2015 2024 \n", "2025 [Thriller] 2015 2025 \n", "2026 [Drama] 2015 2026 \n", "\n", "[2026 rows x 10 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Write the preprocessed table to the movies_path base name with a '_pre.csv' suffix.\n", "output_csv = movies_path + '_pre.csv'\n", "movies.to_csv(output_csv)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }