{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Movie recommendation\n",
    "\n",
    "Loads the pickled movie table, then derives a `description` and a `tags_clean` column from each movie's `tags` string."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "path_data = r\"data/movies\"\n",
    "\n",
    "# NOTE: pickle.load can run arbitrary code while deserialising;\n",
    "# only open a movies_dict.pkl that comes from a trusted source.\n",
    "with open(os.path.join(path_data, 'movies_dict.pkl'), 'rb') as file:\n",
    "    movies_data = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 253,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Non-mutating drop keeps this cell idempotent on re-run; the original row index is kept as is.\n",
    "movies = pd.DataFrame(movies_data).drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 254,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compiled once at definition time instead of on every clean_tags call.\n",
    "_SENTENCE_END = re.compile(r'[?!]')\n",
    "_INNER_PERIOD = re.compile(r'\\.(?!\\s|$)')\n",
    "\n",
    "\n",
    "def has_capital(string):\n",
    "    \"\"\"Return True if any character after the first one is uppercase.\"\"\"\n",
    "    return any(char.isupper() for char in string[1:])\n",
    "\n",
    "\n",
    "def clean_tags(text, max_words=8):\n",
    "    \"\"\"Return the sorted capitalised words that open the last sentence-like segment of `text`.\n",
    "\n",
    "    Steps:\n",
    "      1. '?' and '!' are turned into '. ' so they count as sentence ends.\n",
    "      2. Periods not followed by whitespace or end of string are deleted.\n",
    "      3. Space-separated words with an uppercase letter after their first character are dropped.\n",
    "      4. The text after the last '. ' is split on spaces and cut to `max_words` pieces\n",
    "         (empty pieces produced by doubled spaces count toward that limit).\n",
    "      5. Of those pieces, the non-empty ones starting with an uppercase letter are kept.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        Raw tags string.\n",
    "    max_words : int, default 8\n",
    "        How many leading pieces of the last segment are inspected.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        Kept words, sorted and joined by single spaces, with 'Sciencefiction'\n",
    "        written as 'Science Fiction'. Empty string when nothing qualifies.\n",
    "    \"\"\"\n",
    "    text_clean = _SENTENCE_END.sub('. ', text)\n",
    "    text_clean = _INNER_PERIOD.sub(\"\", text_clean)\n",
    "    # No separate pass for r'\\.[a-zA-Z]\\.' is needed: _INNER_PERIOD has already\n",
    "    # removed every period that is directly followed by a letter.\n",
    "    text_clean = text_clean.replace(\"RobertDowneyJr.\",\"\").replace(\"SamuelL.\",\"\").replace(\"ScienceFiction\", \"Sciencefiction\")\n",
    "\n",
    "    # 'ScienceFiction' was lower-cased above so that it survives this filter.\n",
    "    kept_text = \" \".join(t for t in text_clean.split(\" \") if not has_capital(t))\n",
    "    tail_pieces = kept_text.split(\". \")[-1].strip().split(\" \")[:max_words]\n",
    "    tags_words = [t for t in tail_pieces if t != \"\" and t[0].isupper()]\n",
    "    return \" \".join(sorted(tags_words)).replace(\"Sciencefiction\",\"Science Fiction\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 255,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Normalise punctuation before splitting on periods.\n",
    "# NOTE(review): as stored, the third replace swaps a space for a space (a no-op); its first argument\n",
    "# was presumably a non-breaking or doubled space that was lost when the notebook was exported - confirm.\n",
    "movies[\"tags\"] = movies[\"tags\"].apply(lambda x: x.replace(\"…\",\".\").replace(\"—\",\"\").replace(\" \",\" \"))\n",
    "# description = everything up to and including the last period of tags ('' when tags has no period).\n",
    "movies[\"description\"] = movies[\"tags\"].apply(lambda x: \".\".join(x.split(\".\")[:-1] + [\"\"]))\n",
    "# clean_tags returns 'Science Fiction'; it is stored as the single token 'Sciencefiction' in this column.\n",
    "movies[\"tags_clean\"] = movies[\"tags\"].apply(clean_tags).apply(lambda x: x.replace(\"Science Fiction\",\"Sciencefiction\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 256,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | movie_id | \n", "title | \n", "tags | \n", "description | \n", "tags_clean | \n", "
---|---|---|---|---|---|
0 | \n", "19995 | \n", "Avatar | \n", "In the 22nd century, a paraplegic Marine is di... | \n", "In the 22nd century, a paraplegic Marine is di... | \n", "Action Adventure Fantasy Sciencefiction | \n", "
1 | \n", "285 | \n", "Pirates of the Caribbean: At World's End | \n", "Captain Barbossa, long believed to be dead, ha... | \n", "Captain Barbossa, long believed to be dead, ha... | \n", "Action Adventure Fantasy | \n", "
2 | \n", "206647 | \n", "Spectre | \n", "A cryptic message from Bond’s past sends him o... | \n", "A cryptic message from Bond’s past sends him o... | \n", "M While | \n", "
3 | \n", "49026 | \n", "The Dark Knight Rises | \n", "Following the death of District Attorney Harve... | \n", "Following the death of District Attorney Harve... | \n", "Action Crime Drama Thriller | \n", "
4 | \n", "49529 | \n", "John Carter | \n", "John Carter is a war-weary, former military ca... | \n", "John Carter is a war-weary, former military ca... | \n", "Action Adventure Sciencefiction | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4804 | \n", "9367 | \n", "El Mariachi | \n", "El Mariachi just wants to play his guitar and ... | \n", "El Mariachi just wants to play his guitar and ... | \n", "Action Crime Thriller | \n", "
4805 | \n", "72766 | \n", "Newlyweds | \n", "A newlywed couple's honeymoon is upended by th... | \n", "A newlywed couple's honeymoon is upended by th... | \n", "Comedy Romance | \n", "
4806 | \n", "231617 | \n", "Signed, Sealed, Delivered | \n", "\"Signed, Sealed, Delivered\" introduces a dedic... | \n", "\"Signed, Sealed, Delivered\" introduces a dedic... | \n", "Comedy Drama Romance | \n", "
4807 | \n", "126186 | \n", "Shanghai Calling | \n", "When ambitious New York attorney Sam is sent t... | \n", "When ambitious New York attorney Sam is sent t... | \n", "Anonymous Written | \n", "
4808 | \n", "25975 | \n", "My Date with Drew | \n", "Ever since the second grade when he first saw ... | \n", "Ever since the second grade when he first saw ... | \n", "Documentary | \n", "
4806 rows × 5 columns
\n", "\n", " | movie_id | \n", "title | \n", "description | \n", "genre | \n", "
---|---|---|---|---|
7 | \n", "99861 | \n", "Avengers: Age of Ultron | \n", "When Tony Stark tries to jumpstart a dormant p... | \n", "Action, Adventure, Science Fiction | \n", "
\n", " | id | \n", "title | \n", "description | \n", "genre | \n", "vote_average | \n", "vote_count | \n", "
---|---|---|---|---|---|---|
0 | \n", "19995 | \n", "Avatar | \n", "In the 22nd century, a paraplegic Marine is di... | \n", "Action, Adventure, Fantasy, Science Fiction | \n", "7.2 | \n", "11800 | \n", "
1 | \n", "285 | \n", "Pirates of the Caribbean: At World's End | \n", "Captain Barbossa, long believed to be dead, ha... | \n", "Action, Adventure, Fantasy | \n", "6.9 | \n", "4500 | \n", "
2 | \n", "206647 | \n", "Spectre | \n", "A cryptic message from Bond’s past sends him o... | \n", "Action, Adventure, Crime | \n", "6.3 | \n", "4466 | \n", "
3 | \n", "49026 | \n", "The Dark Knight Rises | \n", "Following the death of District Attorney Harve... | \n", "Action, Crime, Drama, Thriller | \n", "7.6 | \n", "9106 | \n", "
4 | \n", "49529 | \n", "John Carter | \n", "John Carter is a war-weary, former military ca... | \n", "Action, Adventure, Science Fiction | \n", "6.1 | \n", "2124 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4801 | \n", "9367 | \n", "El Mariachi | \n", "El Mariachi just wants to play his guitar and ... | \n", "Action, Crime, Thriller | \n", "6.6 | \n", "238 | \n", "
4802 | \n", "72766 | \n", "Newlyweds | \n", "A newlywed couple's honeymoon is upended by th... | \n", "Comedy, Romance | \n", "5.9 | \n", "5 | \n", "
4803 | \n", "231617 | \n", "Signed, Sealed, Delivered | \n", "\"Signed, Sealed, Delivered\" introduces a dedic... | \n", "Comedy, Drama, Romance | \n", "7.0 | \n", "6 | \n", "
4804 | \n", "126186 | \n", "Shanghai Calling | \n", "When ambitious New York attorney Sam is sent t... | \n", "\n", " | 5.7 | \n", "7 | \n", "
4805 | \n", "25975 | \n", "My Date with Drew | \n", "Ever since the second grade when he first saw ... | \n", "Documentary | \n", "6.3 | \n", "16 | \n", "
4806 rows × 6 columns
\n", "