# %% [markdown]
# ## Movie recommendation

# %%
import os
import pickle
import re

import pandas as pd

path_data = r"data/movies"

# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at files produced by this project.
with open(os.path.join(path_data, 'movies_dict.pkl'), 'rb') as file:
    movies_data = pickle.load(file)

# %%
# drop_duplicates keeps the original row labels, so the index is no longer
# contiguous after this step.
movies = pd.DataFrame(movies_data).drop_duplicates()

# %%
# Compiled once at definition time instead of on every clean_tags() call.
_QUESTION_OR_EXCLAMATION = re.compile(r'[?!]')
# A period that is NOT followed by whitespace or the end of the string.
_INNER_PERIOD = re.compile(r'\.(?!\s|$)')


def has_capital(string):
    """Return True if any character after the first one is uppercase.

    An uppercase first character alone does not count, and the empty string
    yields False.
    """
    return any(char.isupper() for char in string[1:])


def clean_tags(text):
    """Extract the capitalised tag words from the last sentence of `text`.

    Steps:
      1. '?' and '!' are rewritten to '. ' so they also end a sentence.
      2. Periods not followed by whitespace / end of string are removed.
      3. A few hard-coded tokens are dropped or normalised
         ("ScienceFiction" becomes the single word "Sciencefiction").
      4. Words with an uppercase letter after their first character are
         discarded (see has_capital).
      5. Of what follows the last ". ", the first 8 words are considered and
         only those starting with an uppercase letter are kept.

    Returns the kept words sorted and joined by single spaces, with
    "Sciencefiction" expanded back to "Science Fiction". Returns "" when
    nothing qualifies.

    NOTE(review): presumably the text is "<overview>. <genres> <keywords> ..."
    so the kept words are genre names - confirm against the data source.
    """
    text_clean = _QUESTION_OR_EXCLAMATION.sub('. ', text)
    text_clean = _INNER_PERIOD.sub("", text_clean)
    # The former third pattern r'\.[a-zA-Z]\.' was removed: after the
    # substitution above no period is followed by a letter, so it could
    # never match.
    text_clean = (
        text_clean
        .replace("RobertDowneyJr.", "")
        .replace("SamuelL.", "")
        .replace("ScienceFiction", "Sciencefiction")
    )

    without_inner_capitals = " ".join(
        word for word in text_clean.split(" ") if not has_capital(word)
    )
    last_sentence = without_inner_capitals.split(". ")[-1].strip()
    # Slice to 8 words first, then drop empty strings left by repeated spaces.
    candidates = [word for word in last_sentence.split(" ")[:8] if word]
    tags_words = [word for word in candidates if word[0].isupper()]
    return " ".join(sorted(tags_words)).replace("Sciencefiction", "Science Fiction")


# %%
# NOTE(review): as written, the last replace() swaps a space for a space
# (a no-op); presumably the first argument was meant to be a non-breaking or
# double space - confirm against the original notebook.
movies["tags"] = movies["tags"].apply(
    lambda x: x.replace("…", ".").replace("—", "").replace(" ", " ")
)
# Keep everything up to and including the last period ("" when there is none).
movies["description"] = movies["tags"].apply(lambda x: x[: x.rfind(".") + 1])
# Store "Science Fiction" as one token so the tags can be split on spaces.
movies["tags_clean"] = (
    movies["tags"]
    .apply(clean_tags)
    .apply(lambda x: x.replace("Science Fiction", "Sciencefiction"))
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitletagsdescriptiontags_clean
019995AvatarIn the 22nd century, a paraplegic Marine is di...In the 22nd century, a paraplegic Marine is di...Action Adventure Fantasy Sciencefiction
1285Pirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...Captain Barbossa, long believed to be dead, ha...Action Adventure Fantasy
2206647SpectreA cryptic message from Bond’s past sends him o...A cryptic message from Bond’s past sends him o...M While
349026The Dark Knight RisesFollowing the death of District Attorney Harve...Following the death of District Attorney Harve...Action Crime Drama Thriller
449529John CarterJohn Carter is a war-weary, former military ca...John Carter is a war-weary, former military ca...Action Adventure Sciencefiction
..................
48049367El MariachiEl Mariachi just wants to play his guitar and ...El Mariachi just wants to play his guitar and ...Action Crime Thriller
480572766NewlywedsA newlywed couple's honeymoon is upended by th...A newlywed couple's honeymoon is upended by th...Comedy Romance
4806231617Signed, Sealed, Delivered\"Signed, Sealed, Delivered\" introduces a dedic...\"Signed, Sealed, Delivered\" introduces a dedic...Comedy Drama Romance
4807126186Shanghai CallingWhen ambitious New York attorney Sam is sent t...When ambitious New York attorney Sam is sent t...Anonymous Written
480825975My Date with DrewEver since the second grade when he first saw ...Ever since the second grade when he first saw ...Documentary
\n", "

4806 rows × 5 columns

\n", "
" ], "text/plain": [ " movie_id title \\\n", "0 19995 Avatar \n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "... ... ... \n", "4804 9367 El Mariachi \n", "4805 72766 Newlyweds \n", "4806 231617 Signed, Sealed, Delivered \n", "4807 126186 Shanghai Calling \n", "4808 25975 My Date with Drew \n", "\n", " tags \\\n", "0 In the 22nd century, a paraplegic Marine is di... \n", "1 Captain Barbossa, long believed to be dead, ha... \n", "2 A cryptic message from Bond’s past sends him o... \n", "3 Following the death of District Attorney Harve... \n", "4 John Carter is a war-weary, former military ca... \n", "... ... \n", "4804 El Mariachi just wants to play his guitar and ... \n", "4805 A newlywed couple's honeymoon is upended by th... \n", "4806 \"Signed, Sealed, Delivered\" introduces a dedic... \n", "4807 When ambitious New York attorney Sam is sent t... \n", "4808 Ever since the second grade when he first saw ... \n", "\n", " description \\\n", "0 In the 22nd century, a paraplegic Marine is di... \n", "1 Captain Barbossa, long believed to be dead, ha... \n", "2 A cryptic message from Bond’s past sends him o... \n", "3 Following the death of District Attorney Harve... \n", "4 John Carter is a war-weary, former military ca... \n", "... ... \n", "4804 El Mariachi just wants to play his guitar and ... \n", "4805 A newlywed couple's honeymoon is upended by th... \n", "4806 \"Signed, Sealed, Delivered\" introduces a dedic... \n", "4807 When ambitious New York attorney Sam is sent t... \n", "4808 Ever since the second grade when he first saw ... \n", "\n", " tags_clean \n", "0 Action Adventure Fantasy Sciencefiction \n", "1 Action Adventure Fantasy \n", "2 M While \n", "3 Action Crime Drama Thriller \n", "4 Action Adventure Sciencefiction \n", "... ... 
# %%
movies

# %%
import re
from collections import Counter
import numpy as np

# A tag must occur in more than this many movies to be accepted as a genre.
MIN_GENRE_COUNT = 75

# Frequency of every whitespace-separated token in "tags_clean".
# str.split() without arguments ignores empty strings, so rows with no tags
# do not add an empty "" token to the vocabulary. The explicit rename keeps
# the "count" column name independent of the pandas version.
count_genre = (
    movies["tags_clean"]
    .str.split()
    .explode()
    .value_counts()
    .rename("count")
    .to_frame()
)
list_genres = list(count_genre.loc[count_genre["count"] > MIN_GENRE_COUNT].index)

# %%
# Positions (not index labels) of movies with at least one tag outside
# list_genres.
known_genres = set(list_genres)
list_index = [
    position
    for position, tags in enumerate(movies["tags_clean"])
    if any(tag not in known_genres for tag in tags.split())
]

# %%
# Rebuild the tags of those movies by looking for each known genre in the raw
# "tags" text. Genres are matched as whole words (so "War" does not match
# "Warsaw"), and the raw spelling "ScienceFiction" is normalised first
# because list_genres stores it as "Sciencefiction".
genre_patterns = {
    genre: re.compile(rf"(?<!\w){re.escape(genre)}(?!\w)") for genre in list_genres
}

dict_tags = {}
for position, raw_tags in zip(list_index, movies["tags"].iloc[list_index]):
    searchable = raw_tags.replace("ScienceFiction", "Sciencefiction")
    recovered = [
        genre for genre, pattern in genre_patterns.items() if pattern.search(searchable)
    ]
    # Sorted to match the ordering clean_tags() produces for the other rows.
    dict_tags[position] = " ".join(sorted(recovered))

# %%
# A single positional .iloc[rows, column] assignment writes into `movies`
# itself; the chained form movies["tags_clean"].iloc[...] = ... assigns to a
# temporary and raises SettingWithCopyWarning.
movies.iloc[list_index, movies.columns.get_loc("tags_clean")] = list(dict_tags.values())

# %%
# Reassign instead of mutating in place so the cell does not depend on
# `movies` having been modified by an earlier partial run.
movies = movies.drop(columns="tags").rename(columns={"tags_clean": "genre"})

# %%
# Turn the space-separated tokens into a comma-separated list and restore the
# two-word spelling of "Science Fiction".
movies["genre"] = (
    movies["genre"]
    .str.replace(" ", ", ", regex=False)
    .str.replace("Sciencefiction", "Science Fiction", regex=False)
    .str.replace("–", " ", regex=False)
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitledescriptiongenre
799861Avengers: Age of UltronWhen Tony Stark tries to jumpstart a dormant p...Action, Adventure, Science Fiction
# %%
movies.loc[movies["title"] == "Avengers: Age of Ultron"]

# %%
movies["description"].iloc[7]

# %%
import re

# A single space that directly follows sentence-ending punctuation.
_SENTENCE_BREAK = re.compile(r"(?<=[.?!]) ")


def clean_description_v2(text, genres=None):
    """Strip the trailing tag sentence from a description.

    The text is split into sentences at a space following '.', '?' or '!'.
    If the last sentence contains any of `genres` (substring match), it is
    dropped and the preceding sentences are returned, stripped of surrounding
    whitespace; otherwise `text` is returned unchanged.

    Parameters
    ----------
    text : str
        Description, possibly followed by a run of genre/keyword tags.
    genres : iterable of str, optional
        Genre names to look for. Defaults to the module-level `list_genres`.

    Returns
    -------
    str
        The description without its last sentence when that sentence holds a
        genre name. This is "" when the text is a single sentence that
        contains a genre name.
    """
    if genres is None:
        genres = list_genres

    sentences = _SENTENCE_BREAK.split(text)
    # Genre names are stored as "Sciencefiction" while the raw text spells it
    # "ScienceFiction"; normalise before comparing.
    tail = sentences[-1].replace("ScienceFiction", "Sciencefiction")
    if any(genre in tail for genre in genres):
        return " ".join(sentences[:-1]).strip()
    return text


# %%
movies["description"] = movies["description"].apply(clean_description_v2)

# %%
movies.to_pickle(os.path.join(path_data, "movies_dict2.pkl"))

# %%
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point this at files produced by this project.
with open(os.path.join(path_data, "vote_info.pkl"), "rb") as file:
    vote_info = pickle.load(file)
vote = pd.DataFrame(vote_info)

# %%
# Rename the key column to "id", the column name used for the merge with
# `vote`.
movies = movies.rename(columns={"movie_id": "id"})
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitledescriptiongenrevote_averagevote_count
019995AvatarIn the 22nd century, a paraplegic Marine is di...Action, Adventure, Fantasy, Science Fiction7.211800
1285Pirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...Action, Adventure, Fantasy6.94500
2206647SpectreA cryptic message from Bond’s past sends him o...Action, Adventure, Crime6.34466
349026The Dark Knight RisesFollowing the death of District Attorney Harve...Action, Crime, Drama, Thriller7.69106
449529John CarterJohn Carter is a war-weary, former military ca...Action, Adventure, Science Fiction6.12124
.....................
48019367El MariachiEl Mariachi just wants to play his guitar and ...Action, Crime, Thriller6.6238
480272766NewlywedsA newlywed couple's honeymoon is upended by th...Comedy, Romance5.95
4803231617Signed, Sealed, Delivered\"Signed, Sealed, Delivered\" introduces a dedic...Comedy, Drama, Romance7.06
4804126186Shanghai CallingWhen ambitious New York attorney Sam is sent t...5.77
480525975My Date with DrewEver since the second grade when he first saw ...Documentary6.316
\n", "

4806 rows × 6 columns

\n", "
" ], "text/plain": [ " id title \\\n", "0 19995 Avatar \n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "... ... ... \n", "4801 9367 El Mariachi \n", "4802 72766 Newlyweds \n", "4803 231617 Signed, Sealed, Delivered \n", "4804 126186 Shanghai Calling \n", "4805 25975 My Date with Drew \n", "\n", " description \\\n", "0 In the 22nd century, a paraplegic Marine is di... \n", "1 Captain Barbossa, long believed to be dead, ha... \n", "2 A cryptic message from Bond’s past sends him o... \n", "3 Following the death of District Attorney Harve... \n", "4 John Carter is a war-weary, former military ca... \n", "... ... \n", "4801 El Mariachi just wants to play his guitar and ... \n", "4802 A newlywed couple's honeymoon is upended by th... \n", "4803 \"Signed, Sealed, Delivered\" introduces a dedic... \n", "4804 When ambitious New York attorney Sam is sent t... \n", "4805 Ever since the second grade when he first saw ... \n", "\n", " genre vote_average vote_count \n", "0 Action, Adventure, Fantasy, Science Fiction 7.2 11800 \n", "1 Action, Adventure, Fantasy 6.9 4500 \n", "2 Action, Adventure, Crime 6.3 4466 \n", "3 Action, Crime, Drama, Thriller 7.6 9106 \n", "4 Action, Adventure, Science Fiction 6.1 2124 \n", "... ... ... ... 
# %%
# Keep the merged frame under its own name instead of only displaying it, so
# later cells can use the vote columns.
movies_with_votes = movies.merge(vote, on="id", how="left")

# A left merge returns more rows than `movies` only when an id occurs several
# times in `vote`; fail loudly rather than silently duplicating movies.
assert len(movies_with_votes) == len(movies), "duplicate ids in `vote` multiplied rows during the merge"

movies_with_votes