{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import ast\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "from nltk.stem.porter import PorterStemmer\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "movies = pd.read_csv('tmdb_5000_movies.csv')\n", "credits = pd.read_csv('tmdb_5000_credits.csv')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companiesproduction_countriesrelease_daterevenueruntimespoken_languagesstatustaglinetitlevote_averagevote_count
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...2009-12-102787965087162.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.211800
\n", "
" ], "text/plain": [ " budget genres \n", "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \\\n", "\n", " homepage id \n", "0 http://www.avatarmovie.com/ 19995 \\\n", "\n", " keywords original_language \n", "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \\\n", "\n", " original_title overview \n", "0 Avatar In the 22nd century, a paraplegic Marine is di... \\\n", "\n", " popularity production_companies \n", "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... \\\n", "\n", " production_countries release_date revenue \n", "0 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 2009-12-10 2787965087 \\\n", "\n", " runtime spoken_languages status \n", "0 162.0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \\\n", "\n", " tagline title vote_average vote_count \n", "0 Enter the World of Pandora. Avatar 7.2 11800 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head(1)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitlecastcrew
019995Avatar[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...
\n", "
" ], "text/plain": [ " movie_id title cast \n", "0 19995 Avatar [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n", "\n", " crew \n", "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head(1)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "movies = movies.merge(credits,on='title')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companies...runtimespoken_languagesstatustaglinetitlevote_averagevote_countmovie_idcastcrew
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289......162.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.21180019995[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...
\n", "

1 rows × 23 columns

\n", "
" ], "text/plain": [ " budget genres \n", "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \\\n", "\n", " homepage id \n", "0 http://www.avatarmovie.com/ 19995 \\\n", "\n", " keywords original_language \n", "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \\\n", "\n", " original_title overview \n", "0 Avatar In the 22nd century, a paraplegic Marine is di... \\\n", "\n", " popularity production_companies ... runtime \n", "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... ... 162.0 \\\n", "\n", " spoken_languages status \n", "0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \\\n", "\n", " tagline title vote_average vote_count movie_id \n", "0 Enter the World of Pandora. Avatar 7.2 11800 19995 \\\n", "\n", " cast \n", "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n", "\n", " crew \n", "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... \n", "\n", "[1 rows x 23 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head(1)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',\n", " 'original_title', 'overview', 'popularity', 'production_companies',\n", " 'production_countries', 'release_date', 'revenue', 'runtime',\n", " 'spoken_languages', 'status', 'tagline', 'title', 'vote_average',\n", " 'vote_count', 'movie_id', 'cast', 'crew'],\n", " dtype='object')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.columns" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitleoverviewgenreskeywordscastcrew
019995AvatarIn the 22nd century, a paraplegic Marine is di...[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...
1285Pirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...[{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...[{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...[{\"cast_id\": 4, \"character\": \"Captain Jack Spa...[{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...
2206647SpectreA cryptic message from Bond’s past sends him o...[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...[{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...[{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...[{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...
349026The Dark Knight RisesFollowing the death of District Attorney Harve...[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...[{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...[{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...[{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...
449529John CarterJohn Carter is a war-weary, former military ca...[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...[{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...[{\"cast_id\": 5, \"character\": \"John Carter\", \"c...[{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...
\n", "
" ], "text/plain": [ " movie_id title \n", "0 19995 Avatar \\\n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "\n", " overview \n", "0 In the 22nd century, a paraplegic Marine is di... \\\n", "1 Captain Barbossa, long believed to be dead, ha... \n", "2 A cryptic message from Bond’s past sends him o... \n", "3 Following the death of District Attorney Harve... \n", "4 John Carter is a war-weary, former military ca... \n", "\n", " genres \n", "0 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \\\n", "1 [{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"... \n", "2 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "3 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam... \n", "4 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "\n", " keywords \n", "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... \\\n", "1 [{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na... \n", "2 [{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name... \n", "3 [{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,... \n", "4 [{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":... \n", "\n", " cast \n", "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n", "1 [{\"cast_id\": 4, \"character\": \"Captain Jack Spa... \n", "2 [{\"cast_id\": 1, \"character\": \"James Bond\", \"cr... \n", "3 [{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba... \n", "4 [{\"cast_id\": 5, \"character\": \"John Carter\", \"c... \n", "\n", " crew \n", "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... \n", "1 [{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de... \n", "2 [{\"credit_id\": \"54805967c3a36829b5002c41\", \"de... \n", "3 [{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de... \n", "4 [{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de... " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "title 0\n", "overview 3\n", "genres 0\n", "keywords 0\n", "cast 0\n", "crew 0\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "movies.dropna(inplace=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"name\": \"Fantasy\"}, {\"id\": 878, \"name\": \"Science Fiction\"}]'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.iloc[0].genres" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def convert(obj):\n", " l = []\n", " for i in ast.literal_eval(obj):\n", " l.append(i['name']) \n", " return l" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "movies['genres'] = movies['genres'].apply(convert)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitleoverviewgenreskeywordscastcrew
019995AvatarIn the 22nd century, a paraplegic Marine is di...[Action, Adventure, Fantasy, Science Fiction][{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...
\n", "
" ], "text/plain": [ " movie_id title overview \n", "0 19995 Avatar In the 22nd century, a paraplegic Marine is di... \\\n", "\n", " genres \n", "0 [Action, Adventure, Fantasy, Science Fiction] \\\n", "\n", " keywords \n", "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... \\\n", "\n", " cast \n", "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n", "\n", " crew \n", "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head(1)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "movies['keywords'] = movies['keywords'].apply(convert)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def convert3(obj):\n", " l = []\n", " c = 0\n", " for i in ast.literal_eval(obj):\n", " if c!=3:\n", " l.append(i['name']) \n", " c+=1\n", " else:\n", " break\n", " return l" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "movies['cast'] = movies['cast'].apply(convert3)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def fetch(obj):\n", " l = []\n", " for i in ast.literal_eval(obj):\n", " if i['job']=='Director':\n", " l.append(i['name']) \n", " break\n", " return l" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "movies['crew'] = movies['crew'].apply(fetch)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitleoverviewgenreskeywordscastcrew
019995AvatarIn the 22nd century, a paraplegic Marine is di...[Action, Adventure, Fantasy, Science Fiction][culture clash, future, space war, space colon...[Sam Worthington, Zoe Saldana, Sigourney Weaver][James Cameron]
1285Pirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...[Adventure, Fantasy, Action][ocean, drug abuse, exotic island, east india ...[Johnny Depp, Orlando Bloom, Keira Knightley][Gore Verbinski]
2206647SpectreA cryptic message from Bond’s past sends him o...[Action, Adventure, Crime][spy, based on novel, secret agent, sequel, mi...[Daniel Craig, Christoph Waltz, Léa Seydoux][Sam Mendes]
349026The Dark Knight RisesFollowing the death of District Attorney Harve...[Action, Crime, Drama, Thriller][dc comics, crime fighter, terrorist, secret i...[Christian Bale, Michael Caine, Gary Oldman][Christopher Nolan]
449529John CarterJohn Carter is a war-weary, former military ca...[Action, Adventure, Science Fiction][based on novel, mars, medallion, space travel...[Taylor Kitsch, Lynn Collins, Samantha Morton][Andrew Stanton]
\n", "
" ], "text/plain": [ " movie_id title \n", "0 19995 Avatar \\\n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "\n", " overview \n", "0 In the 22nd century, a paraplegic Marine is di... \\\n", "1 Captain Barbossa, long believed to be dead, ha... \n", "2 A cryptic message from Bond’s past sends him o... \n", "3 Following the death of District Attorney Harve... \n", "4 John Carter is a war-weary, former military ca... \n", "\n", " genres \n", "0 [Action, Adventure, Fantasy, Science Fiction] \\\n", "1 [Adventure, Fantasy, Action] \n", "2 [Action, Adventure, Crime] \n", "3 [Action, Crime, Drama, Thriller] \n", "4 [Action, Adventure, Science Fiction] \n", "\n", " keywords \n", "0 [culture clash, future, space war, space colon... \\\n", "1 [ocean, drug abuse, exotic island, east india ... \n", "2 [spy, based on novel, secret agent, sequel, mi... \n", "3 [dc comics, crime fighter, terrorist, secret i... \n", "4 [based on novel, mars, medallion, space travel... \n", "\n", " cast crew \n", "0 [Sam Worthington, Zoe Saldana, Sigourney Weaver] [James Cameron] \n", "1 [Johnny Depp, Orlando Bloom, Keira Knightley] [Gore Verbinski] \n", "2 [Daniel Craig, Christoph Waltz, Léa Seydoux] [Sam Mendes] \n", "3 [Christian Bale, Michael Caine, Gary Oldman] [Christopher Nolan] \n", "4 [Taylor Kitsch, Lynn Collins, Samantha Morton] [Andrew Stanton] " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "movies['overview'] = movies['overview'].apply(lambda x:x.split())" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitleoverviewgenreskeywordscastcrew
019995Avatar[In, the, 22nd, century,, a, paraplegic, Marin...[Action, Adventure, Fantasy, Science Fiction][culture clash, future, space war, space colon...[Sam Worthington, Zoe Saldana, Sigourney Weaver][James Cameron]
1285Pirates of the Caribbean: At World's End[Captain, Barbossa,, long, believed, to, be, d...[Adventure, Fantasy, Action][ocean, drug abuse, exotic island, east india ...[Johnny Depp, Orlando Bloom, Keira Knightley][Gore Verbinski]
2206647Spectre[A, cryptic, message, from, Bond’s, past, send...[Action, Adventure, Crime][spy, based on novel, secret agent, sequel, mi...[Daniel Craig, Christoph Waltz, Léa Seydoux][Sam Mendes]
349026The Dark Knight Rises[Following, the, death, of, District, Attorney...[Action, Crime, Drama, Thriller][dc comics, crime fighter, terrorist, secret i...[Christian Bale, Michael Caine, Gary Oldman][Christopher Nolan]
449529John Carter[John, Carter, is, a, war-weary,, former, mili...[Action, Adventure, Science Fiction][based on novel, mars, medallion, space travel...[Taylor Kitsch, Lynn Collins, Samantha Morton][Andrew Stanton]
\n", "
" ], "text/plain": [ " movie_id title \n", "0 19995 Avatar \\\n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "\n", " overview \n", "0 [In, the, 22nd, century,, a, paraplegic, Marin... \\\n", "1 [Captain, Barbossa,, long, believed, to, be, d... \n", "2 [A, cryptic, message, from, Bond’s, past, send... \n", "3 [Following, the, death, of, District, Attorney... \n", "4 [John, Carter, is, a, war-weary,, former, mili... \n", "\n", " genres \n", "0 [Action, Adventure, Fantasy, Science Fiction] \\\n", "1 [Adventure, Fantasy, Action] \n", "2 [Action, Adventure, Crime] \n", "3 [Action, Crime, Drama, Thriller] \n", "4 [Action, Adventure, Science Fiction] \n", "\n", " keywords \n", "0 [culture clash, future, space war, space colon... \\\n", "1 [ocean, drug abuse, exotic island, east india ... \n", "2 [spy, based on novel, secret agent, sequel, mi... \n", "3 [dc comics, crime fighter, terrorist, secret i... \n", "4 [based on novel, mars, medallion, space travel... \n", "\n", " cast crew \n", "0 [Sam Worthington, Zoe Saldana, Sigourney Weaver] [James Cameron] \n", "1 [Johnny Depp, Orlando Bloom, Keira Knightley] [Gore Verbinski] \n", "2 [Daniel Craig, Christoph Waltz, Léa Seydoux] [Sam Mendes] \n", "3 [Christian Bale, Michael Caine, Gary Oldman] [Christopher Nolan] \n", "4 [Taylor Kitsch, Lynn Collins, Samantha Morton] [Andrew Stanton] " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "movies['genres'] = movies['genres'].apply(lambda x:[i.replace(\" \",\"\") for i in x])\n", "movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(\" \",\"\") for i in x])\n", "movies['cast'] = movies['cast'].apply(lambda x:[i.replace(\" \",\"\") for i in x])\n", "movies['crew'] = movies['crew'].apply(lambda x:[i.replace(\" \",\"\") for i in x])" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitleoverviewgenreskeywordscastcrew
019995Avatar[In, the, 22nd, century,, a, paraplegic, Marin...[Action, Adventure, Fantasy, ScienceFiction][cultureclash, future, spacewar, spacecolony, ...[SamWorthington, ZoeSaldana, SigourneyWeaver][JamesCameron]
\n", "
" ], "text/plain": [ " movie_id title overview \n", "0 19995 Avatar [In, the, 22nd, century,, a, paraplegic, Marin... \\\n", "\n", " genres \n", "0 [Action, Adventure, Fantasy, ScienceFiction] \\\n", "\n", " keywords \n", "0 [cultureclash, future, spacewar, spacecolony, ... \\\n", "\n", " cast crew \n", "0 [SamWorthington, ZoeSaldana, SigourneyWeaver] [JamesCameron] " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head(1)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "movies['tags'] = movies['overview']+movies['keywords']+movies['cast']+movies['crew']" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitleoverviewgenreskeywordscastcrewtags
019995Avatar[In, the, 22nd, century,, a, paraplegic, Marin...[Action, Adventure, Fantasy, ScienceFiction][cultureclash, future, spacewar, spacecolony, ...[SamWorthington, ZoeSaldana, SigourneyWeaver][JamesCameron][In, the, 22nd, century,, a, paraplegic, Marin...
1285Pirates of the Caribbean: At World's End[Captain, Barbossa,, long, believed, to, be, d...[Adventure, Fantasy, Action][ocean, drugabuse, exoticisland, eastindiatrad...[JohnnyDepp, OrlandoBloom, KeiraKnightley][GoreVerbinski][Captain, Barbossa,, long, believed, to, be, d...
2206647Spectre[A, cryptic, message, from, Bond’s, past, send...[Action, Adventure, Crime][spy, basedonnovel, secretagent, sequel, mi6, ...[DanielCraig, ChristophWaltz, LéaSeydoux][SamMendes][A, cryptic, message, from, Bond’s, past, send...
349026The Dark Knight Rises[Following, the, death, of, District, Attorney...[Action, Crime, Drama, Thriller][dccomics, crimefighter, terrorist, secretiden...[ChristianBale, MichaelCaine, GaryOldman][ChristopherNolan][Following, the, death, of, District, Attorney...
449529John Carter[John, Carter, is, a, war-weary,, former, mili...[Action, Adventure, ScienceFiction][basedonnovel, mars, medallion, spacetravel, p...[TaylorKitsch, LynnCollins, SamanthaMorton][AndrewStanton][John, Carter, is, a, war-weary,, former, mili...
\n", "
" ], "text/plain": [ " movie_id title \n", "0 19995 Avatar \\\n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "\n", " overview \n", "0 [In, the, 22nd, century,, a, paraplegic, Marin... \\\n", "1 [Captain, Barbossa,, long, believed, to, be, d... \n", "2 [A, cryptic, message, from, Bond’s, past, send... \n", "3 [Following, the, death, of, District, Attorney... \n", "4 [John, Carter, is, a, war-weary,, former, mili... \n", "\n", " genres \n", "0 [Action, Adventure, Fantasy, ScienceFiction] \\\n", "1 [Adventure, Fantasy, Action] \n", "2 [Action, Adventure, Crime] \n", "3 [Action, Crime, Drama, Thriller] \n", "4 [Action, Adventure, ScienceFiction] \n", "\n", " keywords \n", "0 [cultureclash, future, spacewar, spacecolony, ... \\\n", "1 [ocean, drugabuse, exoticisland, eastindiatrad... \n", "2 [spy, basedonnovel, secretagent, sequel, mi6, ... \n", "3 [dccomics, crimefighter, terrorist, secretiden... \n", "4 [basedonnovel, mars, medallion, spacetravel, p... \n", "\n", " cast crew \n", "0 [SamWorthington, ZoeSaldana, SigourneyWeaver] [JamesCameron] \\\n", "1 [JohnnyDepp, OrlandoBloom, KeiraKnightley] [GoreVerbinski] \n", "2 [DanielCraig, ChristophWaltz, LéaSeydoux] [SamMendes] \n", "3 [ChristianBale, MichaelCaine, GaryOldman] [ChristopherNolan] \n", "4 [TaylorKitsch, LynnCollins, SamanthaMorton] [AndrewStanton] \n", "\n", " tags \n", "0 [In, the, 22nd, century,, a, paraplegic, Marin... \n", "1 [Captain, Barbossa,, long, believed, to, be, d... \n", "2 [A, cryptic, message, from, Bond’s, past, send... \n", "3 [Following, the, death, of, District, Attorney... \n", "4 [John, Carter, is, a, war-weary,, former, mili... " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "new_df = movies[['movie_id','title','tags']]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_11184\\3089450492.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " new_df['tags'] = new_df['tags'].apply(lambda x:\" \".join(x))\n" ] } ], "source": [ "new_df['tags'] = new_df['tags'].apply(lambda x:\" \".join(x))" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitletags
019995AvatarIn the 22nd century, a paraplegic Marine is di...
1285Pirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...
2206647SpectreA cryptic message from Bond’s past sends him o...
349026The Dark Knight RisesFollowing the death of District Attorney Harve...
449529John CarterJohn Carter is a war-weary, former military ca...
\n", "
" ], "text/plain": [ " movie_id title \n", "0 19995 Avatar \\\n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "\n", " tags \n", "0 In the 22nd century, a paraplegic Marine is di... \n", "1 Captain Barbossa, long believed to be dead, ha... \n", "2 A cryptic message from Bond’s past sends him o... \n", "3 Following the death of District Attorney Harve... \n", "4 John Carter is a war-weary, former military ca... " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_11184\\3214958533.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())\n" ] } ], "source": [ "new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "cv = CountVectorizer(max_features=5000,stop_words='english')" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "vectors = cv.fit_transform(new_df['tags']).toarray()" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " ...,\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectors" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", " warnings.warn(msg, category=FutureWarning)\n" ] }, { "data": { "text/plain": [ "5000" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(cv.get_feature_names())" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "ps = PorterStemmer()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "def stem(text):\n", " y = []\n", "\n", " for i in text.split():\n", " y.append(ps.stem(i))\n", "\n", " return \" \".join(y)\n", " " ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'danc'" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ps.stem('danc')" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_11184\\3213734980.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " new_df['tags'] = new_df['tags'].apply(stem)\n" ] } ], "source": [ "new_df['tags'] = new_df['tags'].apply(stem)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", " warnings.warn(msg, category=FutureWarning)\n" ] }, { "data": { "text/plain": [ "['000',\n", " '007',\n", " '10',\n", " '100',\n", " '11',\n", " '12',\n", " '13',\n", " '14',\n", " '15',\n", " '16',\n", " '17',\n", " '18',\n", " '18th',\n", " '19',\n", " '1930s',\n", " '1940s',\n", " '1944',\n", " '1950',\n", " '1950s',\n", " '1960s',\n", " '1970s',\n", " '1971',\n", " '1974',\n", " '1976',\n", " '1980',\n", " '1980s',\n", " '1985',\n", " '1990s',\n", " '19th',\n", " '19thcentury',\n", " '20',\n", " '200',\n", " '2009',\n", " '20th',\n", " '21st',\n", " '23',\n", " '24',\n", " '25',\n", " '30',\n", " '300',\n", " '3d',\n", " '40',\n", " '50',\n", " '500',\n", " '60',\n", " '60s',\n", " '70',\n", " '70s',\n", " 'aaron',\n", " 'aaroneckhart',\n", " 'abandoned',\n", " 'abducted',\n", " 'abigailbreslin',\n", " 'abilities',\n", " 'ability',\n", " 'able',\n", " 'aboard',\n", " 'abuse',\n", " 'abusive',\n", " 'academic',\n", " 'academy',\n", " 'accept',\n", " 'accepted',\n", " 'accepts',\n", " 'access',\n", " 'accident',\n", " 'accidental',\n", " 'accidentally',\n", " 'accompanied',\n", " 'accomplish',\n", " 'account',\n", " 'accountant',\n", " 'accused',\n", " 'ace',\n", " 'achieve',\n", " 'act',\n", " 'acting',\n", " 'action',\n", " 'actionhero',\n", " 'actions',\n", " 'activist',\n", " 'activities',\n", " 'activity',\n", " 'actor',\n", " 'actors',\n", " 'actress',\n", " 'acts',\n", " 'actual',\n", " 'actually',\n", " 'adam',\n", " 'adams',\n", " 'adamsandler',\n", " 'adamshankman',\n", " 'adaptation',\n", " 'adapted',\n", " 'addict',\n", " 'addicted',\n", " 'addiction',\n", " 'adolescence',\n", " 'adopt',\n", " 'adopted',\n", " 'adoption',\n", " 'adopts',\n", " 'adrienbrody',\n", " 'adult',\n", " 'adultery',\n", " 'adulthood',\n", " 'adults',\n", " 'advantage',\n", " 'adventure',\n", " 'adventures',\n", " 'advertising',\n", " 'advice',\n", " 'affair',\n", " 'affairs',\n", " 'affection',\n", " 'affections',\n", " 'afghanistan',\n", " 'africa',\n", " 'african',\n", " 'africanamerican',\n", " 'aftercreditsstinger',\n", " 'afterlife',\n", " 'aftermath',\n", " 'age',\n", " 'aged',\n", " 'agedifference',\n", " 'agency',\n", " 'agenda',\n", " 'agent',\n", " 'agents',\n", " 'aggressive',\n", " 'aging',\n", " 'ago',\n", " 'agree',\n", " 'agrees',\n", " 'ahead',\n", " 'aid',\n", " 'aided',\n", " 'aids',\n", " 'ailing',\n", " 'air',\n", " 'airplane',\n", " 'airplanecrash',\n", " 'airport',\n", " 'aka',\n", " 'al',\n", " 'alabama',\n", " 'alan',\n", " 'alaska',\n", " 'albert',\n", " 'alcohol',\n", " 'alcoholic',\n", " 'alcoholism',\n", " 'alecbaldwin',\n", " 'alex',\n", " 'alexkendrick',\n", " 'alfredhitchcock',\n", " 'ali',\n", " 'alice',\n", " 'alien',\n", " 'alieninvasion',\n", " 'alienlife',\n", " 'aliens',\n", " 'alike',\n", " 'alive',\n", " 'allen',\n", " 'alliance',\n", " 'allied',\n", " 'allies',\n", " 'allow',\n", " 'allowing',\n", " 'allows',\n", " 'ally',\n", " 'alongside',\n", " 'alpacino',\n", " 'alter',\n", " 'alternate',\n", " 'alternative',\n", " 'alzheimer',\n", " 'amanda',\n", " 'amandapeet',\n", " 'amandaseyfried',\n", " 'amateur',\n", " 'amazing',\n", " 'ambassador',\n", " 'ambition',\n", " 'ambitious',\n", " 'ambulance',\n", " 'ambush',\n", " 'america',\n", " 'american',\n", " 'americanabroad',\n", " 'americanfootball',\n", " 'americans',\n", " 'amid',\n", " 'amidst',\n", " 'amnesia',\n", " 'amp',\n", " 'amsterdam',\n", " 'amusement',\n", " 'amusementpark',\n", " 'amy',\n", " 'amyadams',\n", " 'amysmart',\n", " 'analyst',\n", " 'anarchiccomedy',\n", " 'ancient',\n", " 'ancientrome',\n", " 'ancientworld',\n", " 'anderson',\n", " 'andiemacdowell',\n", " 'andrew',\n", " 'android',\n", " 'andy',\n", " 'andygarcía',\n", " 'angel',\n", " 'angelabassett',\n", " 'angeles',\n", " 'angelinajolie',\n", " 'angels',\n", " 'anger',\n", " 'anglee',\n", " 'angry',\n", " 'animal',\n", " 'animalattack',\n", " 'animalhorror',\n", " 'animals',\n", " 'animated',\n", " 'animation',\n", " 'anna',\n", " 'annafaris',\n", " 'anne',\n", " 'annehathaway',\n", " 'annemoss',\n", " 'annettebening',\n", " 'annie',\n", " 'anniversary',\n", " 'annual',\n", " 'answer',\n", " 'answers',\n", " 'ant',\n", " 'anthology',\n", " 'anthony',\n", " 'anthonyanderson',\n", " 'anthonyhopkins',\n", " 'anthropomorphism',\n", " 'anti',\n", " 'antics',\n", " 'antihero',\n", " 'antoinefuqua',\n", " 'antoniobanderas',\n", " 'antonyelchin',\n", " 'apart',\n", " 'apartheid',\n", " 'apartment',\n", " 'ape',\n", " 'apes',\n", " 'apocalypse',\n", " 'apocalyptic',\n", " 'apparent',\n", " 'apparently',\n", " 'appear',\n", " 'appears',\n", " 'apple',\n", " 'appointed',\n", " 'apprentice',\n", " 'approach',\n", " 'approaches',\n", " 'approaching',\n", " 'april',\n", " 'aquarium',\n", " 'arab',\n", " 'arch',\n", " 'archaeologist',\n", " 'archeology',\n", " 'architect',\n", " 'arctic',\n", " 'area',\n", " 'aren',\n", " 'arena',\n", " 'argument',\n", " 'arise',\n", " 'aristocrat',\n", " 'armed',\n", " 'arms',\n", " 'army',\n", " 'arnold',\n", " 'arnoldschwarzenegger',\n", " 'arrangedmarriage',\n", " 'arrangement',\n", " 'arrest',\n", " 'arrested',\n", " 'arrival',\n", " 'arrive',\n", " 'arrives',\n", " 'arriving',\n", " 'arrogant',\n", " 'art',\n", " 'arthur',\n", " 'artificialintelligence',\n", " 'artist',\n", " 'artistic',\n", " 'artists',\n", " 'arts',\n", " 'ashley',\n", " 'ashleyjudd',\n", " 'ashtonkutcher',\n", " 'asia',\n", " 'aside',\n", " 'ask',\n", " 'asked',\n", " 'asking',\n", " 'asks',\n", " 'aspirations',\n", " 'aspiring',\n", " 'assassin',\n", " 'assassinate',\n", " 'assassination',\n", " 'assassins',\n", " 'assault',\n", " 'assigned',\n", " 'assignment',\n", " 'assistant',\n", " 'assumes',\n", " 'asteroid',\n", " 'astronaut',\n", " 'astronauts',\n", " 'asylum',\n", " 'athlete',\n", " 'atomicbomb',\n", " 'attack',\n", " 'attacked',\n", " 'attacks',\n", " 'attempt',\n", " 'attempting',\n", " 'attempts',\n", " 'attempttoescape',\n", " 'attending',\n", " 'attends',\n", " 'attention',\n", " 'attic',\n", " 'attitude',\n", " 'attorney',\n", " 'attracted',\n", " 'attraction',\n", " 'attractive',\n", " 'audience',\n", " 'audiences',\n", " 'audition',\n", " 'august',\n", " 'aunt',\n", " 'austin',\n", " 'australia',\n", " 'australian',\n", " 'author',\n", " 'authorities',\n", " 'authority',\n", " 'autism',\n", " 'auto',\n", " 'avenge',\n", " 'average',\n", " 'avoid',\n", " 'awaits',\n", " 'awakens',\n", " 'award',\n", " 'away',\n", " 'awry',\n", " 'ax',\n", " 'babe',\n", " 'baby',\n", " 'bachelor',\n", " 'backdrop',\n", " 'background',\n", " 'backgrounds',\n", " 'bad',\n", " 'bag',\n", " 'bahamas',\n", " 'bail',\n", " 'balance',\n", " 'ball',\n", " 'ballet',\n", " 'baltimore',\n", " 'band',\n", " 'bandits',\n", " 'bangkok',\n", " 'banished',\n", " 'bank',\n", " 'banker',\n", " 'bankrobber',\n", " 'bankrobbery',\n", " 'bar',\n", " 'barely',\n", " 'bargained',\n", " 'barn',\n", " 'barney',\n", " 'barry',\n", " 'barrylevinson',\n", " 'bars',\n", " 'base',\n", " 'baseball',\n", " 'based',\n", " 'basedoncomicbook',\n", " 'basedongraphicnovel',\n", " 'basedonnovel',\n", " 'basedonplay',\n", " 'basedonstagemusical',\n", " 'basedontrueevents',\n", " 'basedontruestory',\n", " 'basedontvseries',\n", " 'basedonvideogame',\n", " 'basedonyoungadultnovel',\n", " 'basement',\n", " 'basketball',\n", " 'batman',\n", " 'battle',\n", " 'battlefield',\n", " 'battles',\n", " 'battling',\n", " 'bay',\n", " 'beach',\n", " 'bear',\n", " 'bears',\n", " 'beast',\n", " 'beasts',\n", " 'beat',\n", " 'beating',\n", " 'beautiful',\n", " 'beautifulwoman',\n", " 'beauty',\n", " 'becky',\n", " 'becominganadult',\n", " 'bed',\n", " 'bedroom',\n", " 'bee',\n", " 'beer',\n", " 'befriends',\n", " 'began',\n", " 'begin',\n", " 'beginning',\n", " 'begins',\n", " 'behavior',\n", " 'beings',\n", " 'belief',\n", " 'beliefs',\n", " 'believe',\n", " 'believed',\n", " 'believes',\n", " 'believing',\n", " 'bell',\n", " 'belong',\n", " 'belongs',\n", " 'beloved',\n", " 'ben',\n", " 'benaffleck',\n", " 'beneath',\n", " 'benfoster',\n", " 'beniciodeltoro',\n", " 'benjamin',\n", " 'benjaminbratt',\n", " 'benkingsley',\n", " 'bennett',\n", " 'benstiller',\n", " 'bent',\n", " 'berlin',\n", " 'best',\n", " 'bestfriend',\n", " 'bet',\n", " 'beth',\n", " 'betrayal',\n", " 'betrayed',\n", " 'bettemidler',\n", " 'better',\n", " 'betty',\n", " 'beverly',\n", " 'bible',\n", " 'bid',\n", " 'big',\n", " 'bigger',\n", " 'biggest',\n", " 'biker',\n", " 'bikini',\n", " 'billhader',\n", " 'billionaire',\n", " 'billmurray',\n", " 'billnighy',\n", " 'billpaxton',\n", " 'billpullman',\n", " 'billy',\n", " 'billybobthornton',\n", " 'billycrudup',\n", " 'billycrystal',\n", " 'biography',\n", " 'bird',\n", " 'birth',\n", " 'birthday',\n", " 'bisexual',\n", " 'bishop',\n", " 'bit',\n", " 'bite',\n", " 'bitter',\n", " 'bizarre',\n", " 'black',\n", " 'blackmagic',\n", " 'blackmail',\n", " 'blackpeople',\n", " 'blacksmith',\n", " 'blade',\n", " 'blame',\n", " 'blind',\n", " 'bliss',\n", " 'block',\n", " 'blonde',\n", " 'blood',\n", " 'bloodsplatter',\n", " 'bloodthirsty',\n", " 'bloody',\n", " 'blow',\n", " 'blue',\n", " 'board',\n", " 'boarding',\n", " 'boardingschool',\n", " 'boat',\n", " 'bob',\n", " 'bobby',\n", " 'bobbyfarrelly',\n", " 'bobhoskins',\n", " 'bodies',\n", " 'body',\n", " 'bodyguard',\n", " 'bold',\n", " 'bollywood',\n", " 'bomb',\n", " 'bombing',\n", " 'bond',\n", " 'bonds',\n", " 'bone',\n", " 'book',\n", " 'books',\n", " 'border',\n", " 'bored',\n", " 'boredom',\n", " 'boring',\n", " 'born',\n", " 'boss',\n", " 'boston',\n", " 'botched',\n", " 'bound',\n", " 'boundaries',\n", " 'bounty',\n", " 'bountyhunter',\n", " 'bourne',\n", " 'box',\n", " 'boxer',\n", " 'boxing',\n", " 'boy',\n", " 'boyfriend',\n", " 'boys',\n", " 'bradleycooper',\n", " 'bradpitt',\n", " 'brain',\n", " 'brand',\n", " 'brave',\n", " 'bravery',\n", " 'brazil',\n", " 'brazilian',\n", " 'break',\n", " 'breakdown',\n", " 'breaking',\n", " 'breaks',\n", " 'brendanfraser',\n", " 'brendangleeson',\n", " 'brent',\n", " 'brettratner',\n", " 'brian',\n", " 'briandepalma',\n", " 'bride',\n", " 'bridge',\n", " 'brief',\n", " 'brien',\n", " 'bright',\n", " 'brilliant',\n", " 'bring',\n", " 'bringing',\n", " 'brings',\n", " 'brink',\n", " 'britain',\n", " 'british',\n", " 'britishsecretservice',\n", " 'brittanymurphy',\n", " 'broadway',\n", " 'broke',\n", " 'broken',\n", " 'broker',\n", " 'brooklyn',\n", " 'brooks',\n", " 'brothel',\n", " 'brother',\n", " 'brotherbrotherrelationship',\n", " 'brothers',\n", " 'brothersisterrelationship',\n", " 'brought',\n", " 'brown',\n", " 'bruce',\n", " 'brucewillis',\n", " 'brutal',\n", " 'brutality',\n", " 'brutally',\n", " 'bryansinger',\n", " 'buck',\n", " 'buddies',\n", " 'buddy',\n", " 'buddycomedy',\n", " 'budget',\n", " 'build',\n", " 'building',\n", " 'built',\n", " 'bully',\n", " 'bullying',\n", " 'bumbling',\n", " 'bunny',\n", " 'burglar',\n", " 'buried',\n", " 'burned',\n", " 'bus',\n", " 'bush',\n", " 'business',\n", " 'businessman',\n", " 'bust',\n", " 'busy',\n", " 'butler',\n", " 'buy',\n", " 'cabin',\n", " 'caesar',\n", " 'cage',\n", " 'cairo',\n", " 'cal',\n", " 'california',\n", " 'called',\n", " 'calls',\n", " 'calvin',\n", " 'camcorder',\n", " 'came',\n", " 'camera',\n", " 'cameraman',\n", " 'cameras',\n", " 'camerondiaz',\n", " 'camp',\n", " 'campaign',\n", " 'campbell',\n", " 'camping',\n", " 'campus',\n", " 'canada',\n", " 'canadian',\n", " 'cancer',\n", " 'candidate',\n", " 'candy',\n", " 'cannibal',\n", " 'capable',\n", " 'capital',\n", " 'capitalism',\n", " 'capt',\n", " 'captain',\n", " 'captive',\n", " 'capture',\n", " 'captured',\n", " 'captures',\n", " 'car',\n", " 'caraccident',\n", " 'carchase',\n", " 'carcrash',\n", " 'card',\n", " 'care',\n", " 'career',\n", " 'carefree',\n", " 'caretaker',\n", " 'caribbean',\n", " 'carjourney',\n", " 'carl',\n", " 'carlagugino',\n", " 'carmen',\n", " 'carol',\n", " 'carolina',\n", " 'carrace',\n", " 'carrie',\n", " 'carry',\n", " 'carrying',\n", " 'cars',\n", " 'cartel',\n", " 'carter',\n", " 'cartoon',\n", " 'caryelwes',\n", " 'case',\n", " 'caseyaffleck',\n", " 'cash',\n", " 'casino',\n", " 'cast',\n", " 'castle',\n", " 'cat',\n", " 'cataclysm',\n", " 'catastrophe',\n", " 'catch',\n", " 'catches',\n", " 'cateblanchett',\n", " 'catherinedeneuve',\n", " 'catherinekeener',\n", " 'catherinezeta',\n", " 'catholic',\n", " 'catholicism',\n", " 'cattle',\n", " 'caught',\n", " 'cause',\n", " 'caused',\n", " 'causes',\n", " 'causing',\n", " 'cavalry',\n", " 'cave',\n", " 'cavemen',\n", " 'celebrate',\n", " 'celebrated',\n", " 'celebration',\n", " 'celebrity',\n", " 'cell',\n", " 'cellphone',\n", " 'cemetery',\n", " 'center',\n", " 'centered',\n", " 'centers',\n", " 'central',\n", " 'centuries',\n", " 'century',\n", " 'ceo',\n", " 'ceremony',\n", " 'certain',\n", " 'chad',\n", " 'chain',\n", " 'chainsaw',\n", " 'challenge',\n", " 'challenged',\n", " 'challenges',\n", " 'champion',\n", " 'championship',\n", " 'chance',\n", " 'change',\n", " 'changed',\n", " 'changes',\n", " 'changing',\n", " 'channingtatum',\n", " 'chaos',\n", " 'chaotic',\n", " 'chapter',\n", " 'character',\n", " 'characters',\n", " 'charge',\n", " 'charged',\n", " 'charismatic',\n", " 'charles',\n", " 'charlie',\n", " 'charliesheen',\n", " 'charlizetheron',\n", " 'charlotte',\n", " 'charm',\n", " 'charming',\n", " 'chase',\n", " 'chased',\n", " 'chauffeur',\n", " 'cheating',\n", " 'cheerleader',\n", " 'chef',\n", " 'chemical',\n", " 'cher',\n", " 'chicago',\n", " 'chicken',\n", " 'chief',\n", " 'child',\n", " 'childabuse',\n", " 'childhero',\n", " 'childhood',\n", " 'childprodigy',\n", " 'children',\n", " 'chilling',\n", " 'china',\n", " 'chinese',\n", " 'chip',\n", " 'chiwetelejiofor',\n", " 'chloe',\n", " 'chloëgracemoretz',\n", " 'chloësevigny',\n", " 'chocolate',\n", " 'choice',\n", " 'choices',\n", " 'choose',\n", " 'chosen',\n", " 'chris',\n", " 'chriscolumbus',\n", " 'chriscooper',\n", " 'chrisevans',\n", " 'chrishemsworth',\n", " 'chrisklein',\n", " 'chrispine',\n", " 'chrisrock',\n", " 'christ',\n", " 'christian',\n", " 'christianbale',\n", " 'christianity',\n", " 'christianslater',\n", " 'christinaapplegate',\n", " 'christinaricci',\n", " 'christine',\n", " 'christmas',\n", " 'christmasparty',\n", " 'christmastree',\n", " 'christopher',\n", " 'christopherlloyd',\n", " 'christophernolan',\n", " 'christopherplummer',\n", " 'christopherwalken',\n", " 'christophwaltz',\n", " 'chrisweitz',\n", " 'chronicle',\n", " 'chronicles',\n", " 'chuck',\n", " 'church',\n", " 'cia',\n", " 'ciaránhinds',\n", " 'cigarettesmoking',\n", " 'cillianmurphy',\n", " 'cindy',\n", " 'cinema',\n", " 'circle',\n", " 'circuit',\n", " 'circumstances',\n", " 'circus',\n", " 'cities',\n", " 'citizens',\n", " 'city',\n", " 'civil',\n", " 'civilization',\n", " 'civilwar',\n", " 'claim',\n", " 'claims',\n", " 'claire',\n", " 'clairedanes',\n", " 'clan',\n", " 'clark',\n", " 'clash',\n", " 'class',\n", " 'classes',\n", " 'classic',\n", " 'classmate',\n", " 'classmates',\n", " 'classroom',\n", " 'claudevandamme',\n", " 'clay',\n", " 'clean',\n", " 'clear',\n", " 'clerk',\n", " 'client',\n", " 'clients',\n", " 'climate',\n", " 'climbing',\n", " 'clinteastwood',\n", " 'clique',\n", " 'cliveowen',\n", " 'clock',\n", " 'clone',\n", " 'cloning',\n", " 'close',\n", " 'closed',\n", " 'closer',\n", " 'club',\n", " 'clubs',\n", " 'clue',\n", " 'clueless',\n", " 'clues',\n", " 'clutches',\n", " 'coach',\n", " 'coast',\n", " 'cocaine',\n", " 'cocky',\n", " 'code',\n", " 'cody',\n", " 'coffin',\n", " 'cohen',\n", " 'col',\n", " 'cold',\n", " 'coldwar',\n", " 'cole',\n", " 'colin',\n", " 'colinfarrell',\n", " 'colinfirth',\n", " 'collapse',\n", " 'colleague',\n", " 'colleagues',\n", " 'collect',\n", " 'collection',\n", " 'collector',\n", " 'college',\n", " 'collide',\n", " 'collision',\n", " 'colonel',\n", " 'colony',\n", " 'color',\n", " 'colorado',\n", " 'colorful',\n", " 'coma',\n", " 'combat',\n", " 'combined',\n", " 'come',\n", " 'comeback',\n", " 'comedian',\n", " 'comedic',\n", " 'comedy',\n", " 'comes',\n", " 'comet',\n", " 'comfort',\n", " 'comic',\n", " 'comics',\n", " 'coming',\n", " 'comingofage',\n", " 'comingout',\n", " 'command',\n", " 'commander',\n", " 'commercial',\n", " 'commit',\n", " 'commitment',\n", " 'committed',\n", " 'common',\n", " 'communication',\n", " 'community',\n", " 'companion',\n", " 'company',\n", " 'compete',\n", " 'competing',\n", " 'competition',\n", " 'complete',\n", " 'completely',\n", " 'complex',\n", " 'complicated',\n", " 'complications',\n", " 'composer',\n", " 'computer',\n", " 'computervirus',\n", " 'conan',\n", " 'concert',\n", " 'conclusion',\n", " 'condition',\n", " 'confession',\n", " 'confidence',\n", " 'confident',\n", " 'conflict',\n", " 'confront',\n", " 'confronted',\n", " 'confused',\n", " 'congress',\n", " 'conman',\n", " 'connected',\n", " 'connection',\n", " 'connell',\n", " 'connor',\n", " 'conquer',\n", " 'conscience',\n", " 'consequences',\n", " 'conservative',\n", " 'considered',\n", " 'conspiracy',\n", " 'constant',\n", " 'constantly',\n", " 'construction',\n", " 'contact',\n", " 'contain',\n", " 'contemporary',\n", " 'contend',\n", " 'contest',\n", " 'continue',\n", " 'continues',\n", " 'continuing',\n", " 'contract',\n", " 'control',\n", " 'controlled',\n", " 'controlling',\n", " 'controversial',\n", " 'convention',\n", " 'converge',\n", " 'convict',\n", " 'convicted',\n", " 'convince',\n", " 'convinced',\n", " 'convinces',\n", " 'cook',\n", " 'cooking',\n", " 'cool',\n", " 'cooper',\n", " 'cop',\n", " 'cope',\n", " 'cops',\n", " 'core',\n", " 'corner',\n", " 'corners',\n", " 'corporate',\n", " ...]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cv.get_feature_names()" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0. , 0. , ..., 0. , 0.02752409,\n", " 0. ],\n", " [0. , 1. , 0. , ..., 0.02865341, 0. ,\n", " 0. ],\n", " [0. , 0. , 1. , ..., 0.02865341, 0. ,\n", " 0. ],\n", " ...,\n", " [0. , 0.02865341, 0.02865341, ..., 1. , 0.048795 ,\n", " 0.05006262],\n", " [0.02752409, 0. , 0. , ..., 0.048795 , 1. ,\n", " 0.05129892],\n", " [0. , 0. , 0. , ..., 0.05006262, 0.05129892,\n", " 1. ]])" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cosine_similarity(vectors)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "similarity = cosine_similarity(vectors)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "def recommend(movie):\n", " movie_index = new_df[new_df['title']== movie].index[0]\n", " distances = similarity[movie_index]\n", " movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]\n", "\n", " for i in movies_list:\n", " print(new_df.iloc[i[0]].title)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "The Dark Knight\n", "The Dark Knight Rises\n", "Batman\n", "Batman v Superman: Dawn of Justice\n", "Batman\n" ] } ], "source": [ "recommend('Batman Begins')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "fb4569285eef3a3450cb62085a5b1e0da4bce0af555edc33dcf29baf3acc1368" } } }, "nbformat": 4, "nbformat_minor": 2 }