Saaquib committed on
Commit
3424037
·
1 Parent(s): ce4462b

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
  *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  *.pkl filter=lfs diff=lfs merge=lfs -text
2
+ tmdb_5000_credits.csv filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pickle
3
+ import pandas as pd
4
+
5
+
6
def recommend(movie):
    """Return the titles of the movies most similar to ``movie``.

    Reads the module-level ``movies`` DataFrame (must have a ``title``
    column) and the module-level ``similarity`` matrix.

    Row ``k`` of ``similarity`` is treated as describing the ``k``-th row of
    ``movies`` by position, matching the positional ``iloc`` lookup used for
    the results.  The row is therefore located by position and not by index
    label: the two differ whenever the frame's index is not 0..n-1.
    NOTE(review): the notebook drops rows with ``dropna`` and no visible
    index reset, so gaps look possible -- confirm against the pickled data.

    Args:
        movie: Title to look up.  The first row whose ``title`` equals it
            exactly is used.

    Returns:
        A list of at most five titles, most similar first.  The queried row
        itself is never included.

    Raises:
        IndexError: If no row has the given title.
    """
    # Positions (not index labels) of the rows whose title matches.
    matches = (movies['title'] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie not found: {movie!r}")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    # sorted() is stable, so equal scores keep their original row order.
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Drop the queried row explicitly instead of assuming it sorts first;
    # another row with an identical score could otherwise displace it.
    nearest = [pos for pos, _ in ranked if pos != movie_pos][:5]

    titles = movies['title']
    return [titles.iloc[pos] for pos in nearest]
17
+
18
# Load the precomputed artifacts the app depends on.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickle files you produced yourself.
# Context managers close the file handles, which the bare open() calls left
# to the garbage collector.
with open('movie_dict.pkl', 'rb') as movie_file:
    movie_dict = pickle.load(movie_file)
movies = pd.DataFrame(movie_dict)

with open('similarity.pkl', 'rb') as similarity_file:
    similarity = pickle.load(similarity_file)

st.title('Movie Recommender System')

# The select box is populated with every title in the loaded frame.
option = st.selectbox(
    'Which Movie recommendation would you like to have?',
    movies['title'].values,
)

# Recommendations are computed only when the button is pressed.
if st.button('Recommend'):
    recommendations = recommend(option)
    for title in recommendations:
        st.write(title)
movie-recommender.ipynb ADDED
@@ -0,0 +1,2685 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import numpy as np\n",
10
+ "import pandas as pd\n",
11
+ "import ast\n",
12
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
13
+ "from nltk.stem.porter import PorterStemmer\n",
14
+ "from sklearn.metrics.pairwise import cosine_similarity"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "movies = pd.read_csv('tmdb_5000_movies.csv')\n",
24
+ "credits = pd.read_csv('tmdb_5000_credits.csv')"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 3,
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "data": {
34
+ "text/html": [
35
+ "<div>\n",
36
+ "<style scoped>\n",
37
+ " .dataframe tbody tr th:only-of-type {\n",
38
+ " vertical-align: middle;\n",
39
+ " }\n",
40
+ "\n",
41
+ " .dataframe tbody tr th {\n",
42
+ " vertical-align: top;\n",
43
+ " }\n",
44
+ "\n",
45
+ " .dataframe thead th {\n",
46
+ " text-align: right;\n",
47
+ " }\n",
48
+ "</style>\n",
49
+ "<table border=\"1\" class=\"dataframe\">\n",
50
+ " <thead>\n",
51
+ " <tr style=\"text-align: right;\">\n",
52
+ " <th></th>\n",
53
+ " <th>budget</th>\n",
54
+ " <th>genres</th>\n",
55
+ " <th>homepage</th>\n",
56
+ " <th>id</th>\n",
57
+ " <th>keywords</th>\n",
58
+ " <th>original_language</th>\n",
59
+ " <th>original_title</th>\n",
60
+ " <th>overview</th>\n",
61
+ " <th>popularity</th>\n",
62
+ " <th>production_companies</th>\n",
63
+ " <th>production_countries</th>\n",
64
+ " <th>release_date</th>\n",
65
+ " <th>revenue</th>\n",
66
+ " <th>runtime</th>\n",
67
+ " <th>spoken_languages</th>\n",
68
+ " <th>status</th>\n",
69
+ " <th>tagline</th>\n",
70
+ " <th>title</th>\n",
71
+ " <th>vote_average</th>\n",
72
+ " <th>vote_count</th>\n",
73
+ " </tr>\n",
74
+ " </thead>\n",
75
+ " <tbody>\n",
76
+ " <tr>\n",
77
+ " <th>0</th>\n",
78
+ " <td>237000000</td>\n",
79
+ " <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
80
+ " <td>http://www.avatarmovie.com/</td>\n",
81
+ " <td>19995</td>\n",
82
+ " <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
83
+ " <td>en</td>\n",
84
+ " <td>Avatar</td>\n",
85
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
86
+ " <td>150.437577</td>\n",
87
+ " <td>[{\"name\": \"Ingenious Film Partners\", \"id\": 289...</td>\n",
88
+ " <td>[{\"iso_3166_1\": \"US\", \"name\": \"United States o...</td>\n",
89
+ " <td>2009-12-10</td>\n",
90
+ " <td>2787965087</td>\n",
91
+ " <td>162.0</td>\n",
92
+ " <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...</td>\n",
93
+ " <td>Released</td>\n",
94
+ " <td>Enter the World of Pandora.</td>\n",
95
+ " <td>Avatar</td>\n",
96
+ " <td>7.2</td>\n",
97
+ " <td>11800</td>\n",
98
+ " </tr>\n",
99
+ " </tbody>\n",
100
+ "</table>\n",
101
+ "</div>"
102
+ ],
103
+ "text/plain": [
104
+ " budget genres \n",
105
+ "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \\\n",
106
+ "\n",
107
+ " homepage id \n",
108
+ "0 http://www.avatarmovie.com/ 19995 \\\n",
109
+ "\n",
110
+ " keywords original_language \n",
111
+ "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \\\n",
112
+ "\n",
113
+ " original_title overview \n",
114
+ "0 Avatar In the 22nd century, a paraplegic Marine is di... \\\n",
115
+ "\n",
116
+ " popularity production_companies \n",
117
+ "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... \\\n",
118
+ "\n",
119
+ " production_countries release_date revenue \n",
120
+ "0 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 2009-12-10 2787965087 \\\n",
121
+ "\n",
122
+ " runtime spoken_languages status \n",
123
+ "0 162.0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \\\n",
124
+ "\n",
125
+ " tagline title vote_average vote_count \n",
126
+ "0 Enter the World of Pandora. Avatar 7.2 11800 "
127
+ ]
128
+ },
129
+ "execution_count": 3,
130
+ "metadata": {},
131
+ "output_type": "execute_result"
132
+ }
133
+ ],
134
+ "source": [
135
+ "movies.head(1)"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": 4,
141
+ "metadata": {},
142
+ "outputs": [
143
+ {
144
+ "data": {
145
+ "text/html": [
146
+ "<div>\n",
147
+ "<style scoped>\n",
148
+ " .dataframe tbody tr th:only-of-type {\n",
149
+ " vertical-align: middle;\n",
150
+ " }\n",
151
+ "\n",
152
+ " .dataframe tbody tr th {\n",
153
+ " vertical-align: top;\n",
154
+ " }\n",
155
+ "\n",
156
+ " .dataframe thead th {\n",
157
+ " text-align: right;\n",
158
+ " }\n",
159
+ "</style>\n",
160
+ "<table border=\"1\" class=\"dataframe\">\n",
161
+ " <thead>\n",
162
+ " <tr style=\"text-align: right;\">\n",
163
+ " <th></th>\n",
164
+ " <th>movie_id</th>\n",
165
+ " <th>title</th>\n",
166
+ " <th>cast</th>\n",
167
+ " <th>crew</th>\n",
168
+ " </tr>\n",
169
+ " </thead>\n",
170
+ " <tbody>\n",
171
+ " <tr>\n",
172
+ " <th>0</th>\n",
173
+ " <td>19995</td>\n",
174
+ " <td>Avatar</td>\n",
175
+ " <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
176
+ " <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
177
+ " </tr>\n",
178
+ " </tbody>\n",
179
+ "</table>\n",
180
+ "</div>"
181
+ ],
182
+ "text/plain": [
183
+ " movie_id title cast \n",
184
+ "0 19995 Avatar [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n",
185
+ "\n",
186
+ " crew \n",
187
+ "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... "
188
+ ]
189
+ },
190
+ "execution_count": 4,
191
+ "metadata": {},
192
+ "output_type": "execute_result"
193
+ }
194
+ ],
195
+ "source": [
196
+ "credits.head(1)"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": 5,
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "movies = movies.merge(credits,on='title')"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": 6,
211
+ "metadata": {},
212
+ "outputs": [
213
+ {
214
+ "data": {
215
+ "text/html": [
216
+ "<div>\n",
217
+ "<style scoped>\n",
218
+ " .dataframe tbody tr th:only-of-type {\n",
219
+ " vertical-align: middle;\n",
220
+ " }\n",
221
+ "\n",
222
+ " .dataframe tbody tr th {\n",
223
+ " vertical-align: top;\n",
224
+ " }\n",
225
+ "\n",
226
+ " .dataframe thead th {\n",
227
+ " text-align: right;\n",
228
+ " }\n",
229
+ "</style>\n",
230
+ "<table border=\"1\" class=\"dataframe\">\n",
231
+ " <thead>\n",
232
+ " <tr style=\"text-align: right;\">\n",
233
+ " <th></th>\n",
234
+ " <th>budget</th>\n",
235
+ " <th>genres</th>\n",
236
+ " <th>homepage</th>\n",
237
+ " <th>id</th>\n",
238
+ " <th>keywords</th>\n",
239
+ " <th>original_language</th>\n",
240
+ " <th>original_title</th>\n",
241
+ " <th>overview</th>\n",
242
+ " <th>popularity</th>\n",
243
+ " <th>production_companies</th>\n",
244
+ " <th>...</th>\n",
245
+ " <th>runtime</th>\n",
246
+ " <th>spoken_languages</th>\n",
247
+ " <th>status</th>\n",
248
+ " <th>tagline</th>\n",
249
+ " <th>title</th>\n",
250
+ " <th>vote_average</th>\n",
251
+ " <th>vote_count</th>\n",
252
+ " <th>movie_id</th>\n",
253
+ " <th>cast</th>\n",
254
+ " <th>crew</th>\n",
255
+ " </tr>\n",
256
+ " </thead>\n",
257
+ " <tbody>\n",
258
+ " <tr>\n",
259
+ " <th>0</th>\n",
260
+ " <td>237000000</td>\n",
261
+ " <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
262
+ " <td>http://www.avatarmovie.com/</td>\n",
263
+ " <td>19995</td>\n",
264
+ " <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
265
+ " <td>en</td>\n",
266
+ " <td>Avatar</td>\n",
267
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
268
+ " <td>150.437577</td>\n",
269
+ " <td>[{\"name\": \"Ingenious Film Partners\", \"id\": 289...</td>\n",
270
+ " <td>...</td>\n",
271
+ " <td>162.0</td>\n",
272
+ " <td>[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...</td>\n",
273
+ " <td>Released</td>\n",
274
+ " <td>Enter the World of Pandora.</td>\n",
275
+ " <td>Avatar</td>\n",
276
+ " <td>7.2</td>\n",
277
+ " <td>11800</td>\n",
278
+ " <td>19995</td>\n",
279
+ " <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
280
+ " <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
281
+ " </tr>\n",
282
+ " </tbody>\n",
283
+ "</table>\n",
284
+ "<p>1 rows × 23 columns</p>\n",
285
+ "</div>"
286
+ ],
287
+ "text/plain": [
288
+ " budget genres \n",
289
+ "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \\\n",
290
+ "\n",
291
+ " homepage id \n",
292
+ "0 http://www.avatarmovie.com/ 19995 \\\n",
293
+ "\n",
294
+ " keywords original_language \n",
295
+ "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \\\n",
296
+ "\n",
297
+ " original_title overview \n",
298
+ "0 Avatar In the 22nd century, a paraplegic Marine is di... \\\n",
299
+ "\n",
300
+ " popularity production_companies ... runtime \n",
301
+ "0 150.437577 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... ... 162.0 \\\n",
302
+ "\n",
303
+ " spoken_languages status \n",
304
+ "0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \\\n",
305
+ "\n",
306
+ " tagline title vote_average vote_count movie_id \n",
307
+ "0 Enter the World of Pandora. Avatar 7.2 11800 19995 \\\n",
308
+ "\n",
309
+ " cast \n",
310
+ "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n",
311
+ "\n",
312
+ " crew \n",
313
+ "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... \n",
314
+ "\n",
315
+ "[1 rows x 23 columns]"
316
+ ]
317
+ },
318
+ "execution_count": 6,
319
+ "metadata": {},
320
+ "output_type": "execute_result"
321
+ }
322
+ ],
323
+ "source": [
324
+ "movies.head(1)"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 7,
330
+ "metadata": {},
331
+ "outputs": [
332
+ {
333
+ "data": {
334
+ "text/plain": [
335
+ "Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',\n",
336
+ " 'original_title', 'overview', 'popularity', 'production_companies',\n",
337
+ " 'production_countries', 'release_date', 'revenue', 'runtime',\n",
338
+ " 'spoken_languages', 'status', 'tagline', 'title', 'vote_average',\n",
339
+ " 'vote_count', 'movie_id', 'cast', 'crew'],\n",
340
+ " dtype='object')"
341
+ ]
342
+ },
343
+ "execution_count": 7,
344
+ "metadata": {},
345
+ "output_type": "execute_result"
346
+ }
347
+ ],
348
+ "source": [
349
+ "movies.columns"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": 8,
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 9,
364
+ "metadata": {},
365
+ "outputs": [
366
+ {
367
+ "data": {
368
+ "text/html": [
369
+ "<div>\n",
370
+ "<style scoped>\n",
371
+ " .dataframe tbody tr th:only-of-type {\n",
372
+ " vertical-align: middle;\n",
373
+ " }\n",
374
+ "\n",
375
+ " .dataframe tbody tr th {\n",
376
+ " vertical-align: top;\n",
377
+ " }\n",
378
+ "\n",
379
+ " .dataframe thead th {\n",
380
+ " text-align: right;\n",
381
+ " }\n",
382
+ "</style>\n",
383
+ "<table border=\"1\" class=\"dataframe\">\n",
384
+ " <thead>\n",
385
+ " <tr style=\"text-align: right;\">\n",
386
+ " <th></th>\n",
387
+ " <th>movie_id</th>\n",
388
+ " <th>title</th>\n",
389
+ " <th>overview</th>\n",
390
+ " <th>genres</th>\n",
391
+ " <th>keywords</th>\n",
392
+ " <th>cast</th>\n",
393
+ " <th>crew</th>\n",
394
+ " </tr>\n",
395
+ " </thead>\n",
396
+ " <tbody>\n",
397
+ " <tr>\n",
398
+ " <th>0</th>\n",
399
+ " <td>19995</td>\n",
400
+ " <td>Avatar</td>\n",
401
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
402
+ " <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
403
+ " <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
404
+ " <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
405
+ " <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
406
+ " </tr>\n",
407
+ " <tr>\n",
408
+ " <th>1</th>\n",
409
+ " <td>285</td>\n",
410
+ " <td>Pirates of the Caribbean: At World's End</td>\n",
411
+ " <td>Captain Barbossa, long believed to be dead, ha...</td>\n",
412
+ " <td>[{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...</td>\n",
413
+ " <td>[{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...</td>\n",
414
+ " <td>[{\"cast_id\": 4, \"character\": \"Captain Jack Spa...</td>\n",
415
+ " <td>[{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...</td>\n",
416
+ " </tr>\n",
417
+ " <tr>\n",
418
+ " <th>2</th>\n",
419
+ " <td>206647</td>\n",
420
+ " <td>Spectre</td>\n",
421
+ " <td>A cryptic message from Bond’s past sends him o...</td>\n",
422
+ " <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
423
+ " <td>[{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...</td>\n",
424
+ " <td>[{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...</td>\n",
425
+ " <td>[{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...</td>\n",
426
+ " </tr>\n",
427
+ " <tr>\n",
428
+ " <th>3</th>\n",
429
+ " <td>49026</td>\n",
430
+ " <td>The Dark Knight Rises</td>\n",
431
+ " <td>Following the death of District Attorney Harve...</td>\n",
432
+ " <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...</td>\n",
433
+ " <td>[{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...</td>\n",
434
+ " <td>[{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...</td>\n",
435
+ " <td>[{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...</td>\n",
436
+ " </tr>\n",
437
+ " <tr>\n",
438
+ " <th>4</th>\n",
439
+ " <td>49529</td>\n",
440
+ " <td>John Carter</td>\n",
441
+ " <td>John Carter is a war-weary, former military ca...</td>\n",
442
+ " <td>[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...</td>\n",
443
+ " <td>[{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...</td>\n",
444
+ " <td>[{\"cast_id\": 5, \"character\": \"John Carter\", \"c...</td>\n",
445
+ " <td>[{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...</td>\n",
446
+ " </tr>\n",
447
+ " </tbody>\n",
448
+ "</table>\n",
449
+ "</div>"
450
+ ],
451
+ "text/plain": [
452
+ " movie_id title \n",
453
+ "0 19995 Avatar \\\n",
454
+ "1 285 Pirates of the Caribbean: At World's End \n",
455
+ "2 206647 Spectre \n",
456
+ "3 49026 The Dark Knight Rises \n",
457
+ "4 49529 John Carter \n",
458
+ "\n",
459
+ " overview \n",
460
+ "0 In the 22nd century, a paraplegic Marine is di... \\\n",
461
+ "1 Captain Barbossa, long believed to be dead, ha... \n",
462
+ "2 A cryptic message from Bond’s past sends him o... \n",
463
+ "3 Following the death of District Attorney Harve... \n",
464
+ "4 John Carter is a war-weary, former military ca... \n",
465
+ "\n",
466
+ " genres \n",
467
+ "0 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \\\n",
468
+ "1 [{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"... \n",
469
+ "2 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n",
470
+ "3 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam... \n",
471
+ "4 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n",
472
+ "\n",
473
+ " keywords \n",
474
+ "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... \\\n",
475
+ "1 [{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na... \n",
476
+ "2 [{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name... \n",
477
+ "3 [{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,... \n",
478
+ "4 [{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":... \n",
479
+ "\n",
480
+ " cast \n",
481
+ "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n",
482
+ "1 [{\"cast_id\": 4, \"character\": \"Captain Jack Spa... \n",
483
+ "2 [{\"cast_id\": 1, \"character\": \"James Bond\", \"cr... \n",
484
+ "3 [{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba... \n",
485
+ "4 [{\"cast_id\": 5, \"character\": \"John Carter\", \"c... \n",
486
+ "\n",
487
+ " crew \n",
488
+ "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... \n",
489
+ "1 [{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de... \n",
490
+ "2 [{\"credit_id\": \"54805967c3a36829b5002c41\", \"de... \n",
491
+ "3 [{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de... \n",
492
+ "4 [{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de... "
493
+ ]
494
+ },
495
+ "execution_count": 9,
496
+ "metadata": {},
497
+ "output_type": "execute_result"
498
+ }
499
+ ],
500
+ "source": [
501
+ "movies.head()"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": 10,
507
+ "metadata": {},
508
+ "outputs": [
509
+ {
510
+ "data": {
511
+ "text/plain": [
512
+ "movie_id 0\n",
513
+ "title 0\n",
514
+ "overview 3\n",
515
+ "genres 0\n",
516
+ "keywords 0\n",
517
+ "cast 0\n",
518
+ "crew 0\n",
519
+ "dtype: int64"
520
+ ]
521
+ },
522
+ "execution_count": 10,
523
+ "metadata": {},
524
+ "output_type": "execute_result"
525
+ }
526
+ ],
527
+ "source": [
528
+ "movies.isnull().sum()"
529
+ ]
530
+ },
531
+ {
532
+ "cell_type": "code",
533
+ "execution_count": 11,
534
+ "metadata": {},
535
+ "outputs": [],
536
+ "source": [
537
+ "movies.dropna(inplace=True)"
538
+ ]
539
+ },
540
+ {
541
+ "cell_type": "code",
542
+ "execution_count": 12,
543
+ "metadata": {},
544
+ "outputs": [
545
+ {
546
+ "data": {
547
+ "text/plain": [
548
+ "0"
549
+ ]
550
+ },
551
+ "execution_count": 12,
552
+ "metadata": {},
553
+ "output_type": "execute_result"
554
+ }
555
+ ],
556
+ "source": [
557
+ "movies.duplicated().sum()"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 13,
563
+ "metadata": {},
564
+ "outputs": [
565
+ {
566
+ "data": {
567
+ "text/plain": [
568
+ "'[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"name\": \"Fantasy\"}, {\"id\": 878, \"name\": \"Science Fiction\"}]'"
569
+ ]
570
+ },
571
+ "execution_count": 13,
572
+ "metadata": {},
573
+ "output_type": "execute_result"
574
+ }
575
+ ],
576
+ "source": [
577
+ "movies.iloc[0].genres"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 14,
583
+ "metadata": {},
584
+ "outputs": [],
585
+ "source": [
586
+ "def convert(obj):\n",
587
+ " l = []\n",
588
+ " for i in ast.literal_eval(obj):\n",
589
+ " l.append(i['name']) \n",
590
+ " return l"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": 15,
596
+ "metadata": {},
597
+ "outputs": [],
598
+ "source": [
599
+ "movies['genres'] = movies['genres'].apply(convert)"
600
+ ]
601
+ },
602
+ {
603
+ "cell_type": "code",
604
+ "execution_count": 16,
605
+ "metadata": {},
606
+ "outputs": [
607
+ {
608
+ "data": {
609
+ "text/html": [
610
+ "<div>\n",
611
+ "<style scoped>\n",
612
+ " .dataframe tbody tr th:only-of-type {\n",
613
+ " vertical-align: middle;\n",
614
+ " }\n",
615
+ "\n",
616
+ " .dataframe tbody tr th {\n",
617
+ " vertical-align: top;\n",
618
+ " }\n",
619
+ "\n",
620
+ " .dataframe thead th {\n",
621
+ " text-align: right;\n",
622
+ " }\n",
623
+ "</style>\n",
624
+ "<table border=\"1\" class=\"dataframe\">\n",
625
+ " <thead>\n",
626
+ " <tr style=\"text-align: right;\">\n",
627
+ " <th></th>\n",
628
+ " <th>movie_id</th>\n",
629
+ " <th>title</th>\n",
630
+ " <th>overview</th>\n",
631
+ " <th>genres</th>\n",
632
+ " <th>keywords</th>\n",
633
+ " <th>cast</th>\n",
634
+ " <th>crew</th>\n",
635
+ " </tr>\n",
636
+ " </thead>\n",
637
+ " <tbody>\n",
638
+ " <tr>\n",
639
+ " <th>0</th>\n",
640
+ " <td>19995</td>\n",
641
+ " <td>Avatar</td>\n",
642
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
643
+ " <td>[Action, Adventure, Fantasy, Science Fiction]</td>\n",
644
+ " <td>[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...</td>\n",
645
+ " <td>[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...</td>\n",
646
+ " <td>[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...</td>\n",
647
+ " </tr>\n",
648
+ " </tbody>\n",
649
+ "</table>\n",
650
+ "</div>"
651
+ ],
652
+ "text/plain": [
653
+ " movie_id title overview \n",
654
+ "0 19995 Avatar In the 22nd century, a paraplegic Marine is di... \\\n",
655
+ "\n",
656
+ " genres \n",
657
+ "0 [Action, Adventure, Fantasy, Science Fiction] \\\n",
658
+ "\n",
659
+ " keywords \n",
660
+ "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... \\\n",
661
+ "\n",
662
+ " cast \n",
663
+ "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \\\n",
664
+ "\n",
665
+ " crew \n",
666
+ "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... "
667
+ ]
668
+ },
669
+ "execution_count": 16,
670
+ "metadata": {},
671
+ "output_type": "execute_result"
672
+ }
673
+ ],
674
+ "source": [
675
+ "movies.head(1)"
676
+ ]
677
+ },
678
+ {
679
+ "cell_type": "code",
680
+ "execution_count": 17,
681
+ "metadata": {},
682
+ "outputs": [],
683
+ "source": [
684
+ "movies['keywords'] = movies['keywords'].apply(convert)"
685
+ ]
686
+ },
687
+ {
688
+ "cell_type": "code",
689
+ "execution_count": 18,
690
+ "metadata": {},
691
+ "outputs": [],
692
+ "source": [
693
+ "def convert3(obj):\n",
694
+ " l = []\n",
695
+ " c = 0\n",
696
+ " for i in ast.literal_eval(obj):\n",
697
+ " if c!=3:\n",
698
+ " l.append(i['name']) \n",
699
+ " c+=1\n",
700
+ " else:\n",
701
+ " break\n",
702
+ " return l"
703
+ ]
704
+ },
705
+ {
706
+ "cell_type": "code",
707
+ "execution_count": 19,
708
+ "metadata": {},
709
+ "outputs": [],
710
+ "source": [
711
+ "movies['cast'] = movies['cast'].apply(convert3)"
712
+ ]
713
+ },
714
+ {
715
+ "cell_type": "code",
716
+ "execution_count": 20,
717
+ "metadata": {},
718
+ "outputs": [],
719
+ "source": [
720
+ "def fetch(obj):\n",
721
+ " l = []\n",
722
+ " for i in ast.literal_eval(obj):\n",
723
+ " if i['job']=='Director':\n",
724
+ " l.append(i['name']) \n",
725
+ " break\n",
726
+ " return l"
727
+ ]
728
+ },
729
+ {
730
+ "cell_type": "code",
731
+ "execution_count": 21,
732
+ "metadata": {},
733
+ "outputs": [],
734
+ "source": [
735
+ "movies['crew'] = movies['crew'].apply(fetch)"
736
+ ]
737
+ },
738
+ {
739
+ "cell_type": "code",
740
+ "execution_count": 22,
741
+ "metadata": {},
742
+ "outputs": [
743
+ {
744
+ "data": {
745
+ "text/html": [
746
+ "<div>\n",
747
+ "<style scoped>\n",
748
+ " .dataframe tbody tr th:only-of-type {\n",
749
+ " vertical-align: middle;\n",
750
+ " }\n",
751
+ "\n",
752
+ " .dataframe tbody tr th {\n",
753
+ " vertical-align: top;\n",
754
+ " }\n",
755
+ "\n",
756
+ " .dataframe thead th {\n",
757
+ " text-align: right;\n",
758
+ " }\n",
759
+ "</style>\n",
760
+ "<table border=\"1\" class=\"dataframe\">\n",
761
+ " <thead>\n",
762
+ " <tr style=\"text-align: right;\">\n",
763
+ " <th></th>\n",
764
+ " <th>movie_id</th>\n",
765
+ " <th>title</th>\n",
766
+ " <th>overview</th>\n",
767
+ " <th>genres</th>\n",
768
+ " <th>keywords</th>\n",
769
+ " <th>cast</th>\n",
770
+ " <th>crew</th>\n",
771
+ " </tr>\n",
772
+ " </thead>\n",
773
+ " <tbody>\n",
774
+ " <tr>\n",
775
+ " <th>0</th>\n",
776
+ " <td>19995</td>\n",
777
+ " <td>Avatar</td>\n",
778
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
779
+ " <td>[Action, Adventure, Fantasy, Science Fiction]</td>\n",
780
+ " <td>[culture clash, future, space war, space colon...</td>\n",
781
+ " <td>[Sam Worthington, Zoe Saldana, Sigourney Weaver]</td>\n",
782
+ " <td>[James Cameron]</td>\n",
783
+ " </tr>\n",
784
+ " <tr>\n",
785
+ " <th>1</th>\n",
786
+ " <td>285</td>\n",
787
+ " <td>Pirates of the Caribbean: At World's End</td>\n",
788
+ " <td>Captain Barbossa, long believed to be dead, ha...</td>\n",
789
+ " <td>[Adventure, Fantasy, Action]</td>\n",
790
+ " <td>[ocean, drug abuse, exotic island, east india ...</td>\n",
791
+ " <td>[Johnny Depp, Orlando Bloom, Keira Knightley]</td>\n",
792
+ " <td>[Gore Verbinski]</td>\n",
793
+ " </tr>\n",
794
+ " <tr>\n",
795
+ " <th>2</th>\n",
796
+ " <td>206647</td>\n",
797
+ " <td>Spectre</td>\n",
798
+ " <td>A cryptic message from Bond’s past sends him o...</td>\n",
799
+ " <td>[Action, Adventure, Crime]</td>\n",
800
+ " <td>[spy, based on novel, secret agent, sequel, mi...</td>\n",
801
+ " <td>[Daniel Craig, Christoph Waltz, Léa Seydoux]</td>\n",
802
+ " <td>[Sam Mendes]</td>\n",
803
+ " </tr>\n",
804
+ " <tr>\n",
805
+ " <th>3</th>\n",
806
+ " <td>49026</td>\n",
807
+ " <td>The Dark Knight Rises</td>\n",
808
+ " <td>Following the death of District Attorney Harve...</td>\n",
809
+ " <td>[Action, Crime, Drama, Thriller]</td>\n",
810
+ " <td>[dc comics, crime fighter, terrorist, secret i...</td>\n",
811
+ " <td>[Christian Bale, Michael Caine, Gary Oldman]</td>\n",
812
+ " <td>[Christopher Nolan]</td>\n",
813
+ " </tr>\n",
814
+ " <tr>\n",
815
+ " <th>4</th>\n",
816
+ " <td>49529</td>\n",
817
+ " <td>John Carter</td>\n",
818
+ " <td>John Carter is a war-weary, former military ca...</td>\n",
819
+ " <td>[Action, Adventure, Science Fiction]</td>\n",
820
+ " <td>[based on novel, mars, medallion, space travel...</td>\n",
821
+ " <td>[Taylor Kitsch, Lynn Collins, Samantha Morton]</td>\n",
822
+ " <td>[Andrew Stanton]</td>\n",
823
+ " </tr>\n",
824
+ " </tbody>\n",
825
+ "</table>\n",
826
+ "</div>"
827
+ ],
828
+ "text/plain": [
829
+ " movie_id title \n",
830
+ "0 19995 Avatar \\\n",
831
+ "1 285 Pirates of the Caribbean: At World's End \n",
832
+ "2 206647 Spectre \n",
833
+ "3 49026 The Dark Knight Rises \n",
834
+ "4 49529 John Carter \n",
835
+ "\n",
836
+ " overview \n",
837
+ "0 In the 22nd century, a paraplegic Marine is di... \\\n",
838
+ "1 Captain Barbossa, long believed to be dead, ha... \n",
839
+ "2 A cryptic message from Bond’s past sends him o... \n",
840
+ "3 Following the death of District Attorney Harve... \n",
841
+ "4 John Carter is a war-weary, former military ca... \n",
842
+ "\n",
843
+ " genres \n",
844
+ "0 [Action, Adventure, Fantasy, Science Fiction] \\\n",
845
+ "1 [Adventure, Fantasy, Action] \n",
846
+ "2 [Action, Adventure, Crime] \n",
847
+ "3 [Action, Crime, Drama, Thriller] \n",
848
+ "4 [Action, Adventure, Science Fiction] \n",
849
+ "\n",
850
+ " keywords \n",
851
+ "0 [culture clash, future, space war, space colon... \\\n",
852
+ "1 [ocean, drug abuse, exotic island, east india ... \n",
853
+ "2 [spy, based on novel, secret agent, sequel, mi... \n",
854
+ "3 [dc comics, crime fighter, terrorist, secret i... \n",
855
+ "4 [based on novel, mars, medallion, space travel... \n",
856
+ "\n",
857
+ " cast crew \n",
858
+ "0 [Sam Worthington, Zoe Saldana, Sigourney Weaver] [James Cameron] \n",
859
+ "1 [Johnny Depp, Orlando Bloom, Keira Knightley] [Gore Verbinski] \n",
860
+ "2 [Daniel Craig, Christoph Waltz, Léa Seydoux] [Sam Mendes] \n",
861
+ "3 [Christian Bale, Michael Caine, Gary Oldman] [Christopher Nolan] \n",
862
+ "4 [Taylor Kitsch, Lynn Collins, Samantha Morton] [Andrew Stanton] "
863
+ ]
864
+ },
865
+ "execution_count": 22,
866
+ "metadata": {},
867
+ "output_type": "execute_result"
868
+ }
869
+ ],
870
+ "source": [
871
+ "movies.head()"
872
+ ]
873
+ },
874
+ {
875
+ "cell_type": "code",
876
+ "execution_count": 23,
877
+ "metadata": {},
878
+ "outputs": [],
879
+ "source": [
880
+ "movies['overview'] = movies['overview'].apply(lambda x:x.split())"
881
+ ]
882
+ },
883
+ {
884
+ "cell_type": "code",
885
+ "execution_count": 24,
886
+ "metadata": {},
887
+ "outputs": [
888
+ {
889
+ "data": {
890
+ "text/html": [
891
+ "<div>\n",
892
+ "<style scoped>\n",
893
+ " .dataframe tbody tr th:only-of-type {\n",
894
+ " vertical-align: middle;\n",
895
+ " }\n",
896
+ "\n",
897
+ " .dataframe tbody tr th {\n",
898
+ " vertical-align: top;\n",
899
+ " }\n",
900
+ "\n",
901
+ " .dataframe thead th {\n",
902
+ " text-align: right;\n",
903
+ " }\n",
904
+ "</style>\n",
905
+ "<table border=\"1\" class=\"dataframe\">\n",
906
+ " <thead>\n",
907
+ " <tr style=\"text-align: right;\">\n",
908
+ " <th></th>\n",
909
+ " <th>movie_id</th>\n",
910
+ " <th>title</th>\n",
911
+ " <th>overview</th>\n",
912
+ " <th>genres</th>\n",
913
+ " <th>keywords</th>\n",
914
+ " <th>cast</th>\n",
915
+ " <th>crew</th>\n",
916
+ " </tr>\n",
917
+ " </thead>\n",
918
+ " <tbody>\n",
919
+ " <tr>\n",
920
+ " <th>0</th>\n",
921
+ " <td>19995</td>\n",
922
+ " <td>Avatar</td>\n",
923
+ " <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
924
+ " <td>[Action, Adventure, Fantasy, Science Fiction]</td>\n",
925
+ " <td>[culture clash, future, space war, space colon...</td>\n",
926
+ " <td>[Sam Worthington, Zoe Saldana, Sigourney Weaver]</td>\n",
927
+ " <td>[James Cameron]</td>\n",
928
+ " </tr>\n",
929
+ " <tr>\n",
930
+ " <th>1</th>\n",
931
+ " <td>285</td>\n",
932
+ " <td>Pirates of the Caribbean: At World's End</td>\n",
933
+ " <td>[Captain, Barbossa,, long, believed, to, be, d...</td>\n",
934
+ " <td>[Adventure, Fantasy, Action]</td>\n",
935
+ " <td>[ocean, drug abuse, exotic island, east india ...</td>\n",
936
+ " <td>[Johnny Depp, Orlando Bloom, Keira Knightley]</td>\n",
937
+ " <td>[Gore Verbinski]</td>\n",
938
+ " </tr>\n",
939
+ " <tr>\n",
940
+ " <th>2</th>\n",
941
+ " <td>206647</td>\n",
942
+ " <td>Spectre</td>\n",
943
+ " <td>[A, cryptic, message, from, Bond’s, past, send...</td>\n",
944
+ " <td>[Action, Adventure, Crime]</td>\n",
945
+ " <td>[spy, based on novel, secret agent, sequel, mi...</td>\n",
946
+ " <td>[Daniel Craig, Christoph Waltz, Léa Seydoux]</td>\n",
947
+ " <td>[Sam Mendes]</td>\n",
948
+ " </tr>\n",
949
+ " <tr>\n",
950
+ " <th>3</th>\n",
951
+ " <td>49026</td>\n",
952
+ " <td>The Dark Knight Rises</td>\n",
953
+ " <td>[Following, the, death, of, District, Attorney...</td>\n",
954
+ " <td>[Action, Crime, Drama, Thriller]</td>\n",
955
+ " <td>[dc comics, crime fighter, terrorist, secret i...</td>\n",
956
+ " <td>[Christian Bale, Michael Caine, Gary Oldman]</td>\n",
957
+ " <td>[Christopher Nolan]</td>\n",
958
+ " </tr>\n",
959
+ " <tr>\n",
960
+ " <th>4</th>\n",
961
+ " <td>49529</td>\n",
962
+ " <td>John Carter</td>\n",
963
+ " <td>[John, Carter, is, a, war-weary,, former, mili...</td>\n",
964
+ " <td>[Action, Adventure, Science Fiction]</td>\n",
965
+ " <td>[based on novel, mars, medallion, space travel...</td>\n",
966
+ " <td>[Taylor Kitsch, Lynn Collins, Samantha Morton]</td>\n",
967
+ " <td>[Andrew Stanton]</td>\n",
968
+ " </tr>\n",
969
+ " </tbody>\n",
970
+ "</table>\n",
971
+ "</div>"
972
+ ],
973
+ "text/plain": [
974
+ " movie_id title \n",
975
+ "0 19995 Avatar \\\n",
976
+ "1 285 Pirates of the Caribbean: At World's End \n",
977
+ "2 206647 Spectre \n",
978
+ "3 49026 The Dark Knight Rises \n",
979
+ "4 49529 John Carter \n",
980
+ "\n",
981
+ " overview \n",
982
+ "0 [In, the, 22nd, century,, a, paraplegic, Marin... \\\n",
983
+ "1 [Captain, Barbossa,, long, believed, to, be, d... \n",
984
+ "2 [A, cryptic, message, from, Bond’s, past, send... \n",
985
+ "3 [Following, the, death, of, District, Attorney... \n",
986
+ "4 [John, Carter, is, a, war-weary,, former, mili... \n",
987
+ "\n",
988
+ " genres \n",
989
+ "0 [Action, Adventure, Fantasy, Science Fiction] \\\n",
990
+ "1 [Adventure, Fantasy, Action] \n",
991
+ "2 [Action, Adventure, Crime] \n",
992
+ "3 [Action, Crime, Drama, Thriller] \n",
993
+ "4 [Action, Adventure, Science Fiction] \n",
994
+ "\n",
995
+ " keywords \n",
996
+ "0 [culture clash, future, space war, space colon... \\\n",
997
+ "1 [ocean, drug abuse, exotic island, east india ... \n",
998
+ "2 [spy, based on novel, secret agent, sequel, mi... \n",
999
+ "3 [dc comics, crime fighter, terrorist, secret i... \n",
1000
+ "4 [based on novel, mars, medallion, space travel... \n",
1001
+ "\n",
1002
+ " cast crew \n",
1003
+ "0 [Sam Worthington, Zoe Saldana, Sigourney Weaver] [James Cameron] \n",
1004
+ "1 [Johnny Depp, Orlando Bloom, Keira Knightley] [Gore Verbinski] \n",
1005
+ "2 [Daniel Craig, Christoph Waltz, Léa Seydoux] [Sam Mendes] \n",
1006
+ "3 [Christian Bale, Michael Caine, Gary Oldman] [Christopher Nolan] \n",
1007
+ "4 [Taylor Kitsch, Lynn Collins, Samantha Morton] [Andrew Stanton] "
1008
+ ]
1009
+ },
1010
+ "execution_count": 24,
1011
+ "metadata": {},
1012
+ "output_type": "execute_result"
1013
+ }
1014
+ ],
1015
+ "source": [
1016
+ "movies.head()"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "code",
1021
+ "execution_count": 25,
1022
+ "metadata": {},
1023
+ "outputs": [],
1024
+ "source": [
1025
+ "movies['genres'] = movies['genres'].apply(lambda x:[i.replace(\" \",\"\") for i in x])\n",
1026
+ "movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(\" \",\"\") for i in x])\n",
1027
+ "movies['cast'] = movies['cast'].apply(lambda x:[i.replace(\" \",\"\") for i in x])\n",
1028
+ "movies['crew'] = movies['crew'].apply(lambda x:[i.replace(\" \",\"\") for i in x])"
1029
+ ]
1030
+ },
1031
+ {
1032
+ "cell_type": "code",
1033
+ "execution_count": 26,
1034
+ "metadata": {},
1035
+ "outputs": [
1036
+ {
1037
+ "data": {
1038
+ "text/html": [
1039
+ "<div>\n",
1040
+ "<style scoped>\n",
1041
+ " .dataframe tbody tr th:only-of-type {\n",
1042
+ " vertical-align: middle;\n",
1043
+ " }\n",
1044
+ "\n",
1045
+ " .dataframe tbody tr th {\n",
1046
+ " vertical-align: top;\n",
1047
+ " }\n",
1048
+ "\n",
1049
+ " .dataframe thead th {\n",
1050
+ " text-align: right;\n",
1051
+ " }\n",
1052
+ "</style>\n",
1053
+ "<table border=\"1\" class=\"dataframe\">\n",
1054
+ " <thead>\n",
1055
+ " <tr style=\"text-align: right;\">\n",
1056
+ " <th></th>\n",
1057
+ " <th>movie_id</th>\n",
1058
+ " <th>title</th>\n",
1059
+ " <th>overview</th>\n",
1060
+ " <th>genres</th>\n",
1061
+ " <th>keywords</th>\n",
1062
+ " <th>cast</th>\n",
1063
+ " <th>crew</th>\n",
1064
+ " </tr>\n",
1065
+ " </thead>\n",
1066
+ " <tbody>\n",
1067
+ " <tr>\n",
1068
+ " <th>0</th>\n",
1069
+ " <td>19995</td>\n",
1070
+ " <td>Avatar</td>\n",
1071
+ " <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
1072
+ " <td>[Action, Adventure, Fantasy, ScienceFiction]</td>\n",
1073
+ " <td>[cultureclash, future, spacewar, spacecolony, ...</td>\n",
1074
+ " <td>[SamWorthington, ZoeSaldana, SigourneyWeaver]</td>\n",
1075
+ " <td>[JamesCameron]</td>\n",
1076
+ " </tr>\n",
1077
+ " </tbody>\n",
1078
+ "</table>\n",
1079
+ "</div>"
1080
+ ],
1081
+ "text/plain": [
1082
+ " movie_id title overview \n",
1083
+ "0 19995 Avatar [In, the, 22nd, century,, a, paraplegic, Marin... \\\n",
1084
+ "\n",
1085
+ " genres \n",
1086
+ "0 [Action, Adventure, Fantasy, ScienceFiction] \\\n",
1087
+ "\n",
1088
+ " keywords \n",
1089
+ "0 [cultureclash, future, spacewar, spacecolony, ... \\\n",
1090
+ "\n",
1091
+ " cast crew \n",
1092
+ "0 [SamWorthington, ZoeSaldana, SigourneyWeaver] [JamesCameron] "
1093
+ ]
1094
+ },
1095
+ "execution_count": 26,
1096
+ "metadata": {},
1097
+ "output_type": "execute_result"
1098
+ }
1099
+ ],
1100
+ "source": [
1101
+ "movies.head(1)"
1102
+ ]
1103
+ },
1104
+ {
1105
+ "cell_type": "code",
1106
+ "execution_count": 27,
1107
+ "metadata": {},
1108
+ "outputs": [],
1109
+ "source": [
1110
+ "movies['tags'] = movies['overview']+movies['keywords']+movies['cast']+movies['crew']"
1111
+ ]
1112
+ },
1113
+ {
1114
+ "cell_type": "code",
1115
+ "execution_count": 28,
1116
+ "metadata": {},
1117
+ "outputs": [
1118
+ {
1119
+ "data": {
1120
+ "text/html": [
1121
+ "<div>\n",
1122
+ "<style scoped>\n",
1123
+ " .dataframe tbody tr th:only-of-type {\n",
1124
+ " vertical-align: middle;\n",
1125
+ " }\n",
1126
+ "\n",
1127
+ " .dataframe tbody tr th {\n",
1128
+ " vertical-align: top;\n",
1129
+ " }\n",
1130
+ "\n",
1131
+ " .dataframe thead th {\n",
1132
+ " text-align: right;\n",
1133
+ " }\n",
1134
+ "</style>\n",
1135
+ "<table border=\"1\" class=\"dataframe\">\n",
1136
+ " <thead>\n",
1137
+ " <tr style=\"text-align: right;\">\n",
1138
+ " <th></th>\n",
1139
+ " <th>movie_id</th>\n",
1140
+ " <th>title</th>\n",
1141
+ " <th>overview</th>\n",
1142
+ " <th>genres</th>\n",
1143
+ " <th>keywords</th>\n",
1144
+ " <th>cast</th>\n",
1145
+ " <th>crew</th>\n",
1146
+ " <th>tags</th>\n",
1147
+ " </tr>\n",
1148
+ " </thead>\n",
1149
+ " <tbody>\n",
1150
+ " <tr>\n",
1151
+ " <th>0</th>\n",
1152
+ " <td>19995</td>\n",
1153
+ " <td>Avatar</td>\n",
1154
+ " <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
1155
+ " <td>[Action, Adventure, Fantasy, ScienceFiction]</td>\n",
1156
+ " <td>[cultureclash, future, spacewar, spacecolony, ...</td>\n",
1157
+ " <td>[SamWorthington, ZoeSaldana, SigourneyWeaver]</td>\n",
1158
+ " <td>[JamesCameron]</td>\n",
1159
+ " <td>[In, the, 22nd, century,, a, paraplegic, Marin...</td>\n",
1160
+ " </tr>\n",
1161
+ " <tr>\n",
1162
+ " <th>1</th>\n",
1163
+ " <td>285</td>\n",
1164
+ " <td>Pirates of the Caribbean: At World's End</td>\n",
1165
+ " <td>[Captain, Barbossa,, long, believed, to, be, d...</td>\n",
1166
+ " <td>[Adventure, Fantasy, Action]</td>\n",
1167
+ " <td>[ocean, drugabuse, exoticisland, eastindiatrad...</td>\n",
1168
+ " <td>[JohnnyDepp, OrlandoBloom, KeiraKnightley]</td>\n",
1169
+ " <td>[GoreVerbinski]</td>\n",
1170
+ " <td>[Captain, Barbossa,, long, believed, to, be, d...</td>\n",
1171
+ " </tr>\n",
1172
+ " <tr>\n",
1173
+ " <th>2</th>\n",
1174
+ " <td>206647</td>\n",
1175
+ " <td>Spectre</td>\n",
1176
+ " <td>[A, cryptic, message, from, Bond’s, past, send...</td>\n",
1177
+ " <td>[Action, Adventure, Crime]</td>\n",
1178
+ " <td>[spy, basedonnovel, secretagent, sequel, mi6, ...</td>\n",
1179
+ " <td>[DanielCraig, ChristophWaltz, LéaSeydoux]</td>\n",
1180
+ " <td>[SamMendes]</td>\n",
1181
+ " <td>[A, cryptic, message, from, Bond’s, past, send...</td>\n",
1182
+ " </tr>\n",
1183
+ " <tr>\n",
1184
+ " <th>3</th>\n",
1185
+ " <td>49026</td>\n",
1186
+ " <td>The Dark Knight Rises</td>\n",
1187
+ " <td>[Following, the, death, of, District, Attorney...</td>\n",
1188
+ " <td>[Action, Crime, Drama, Thriller]</td>\n",
1189
+ " <td>[dccomics, crimefighter, terrorist, secretiden...</td>\n",
1190
+ " <td>[ChristianBale, MichaelCaine, GaryOldman]</td>\n",
1191
+ " <td>[ChristopherNolan]</td>\n",
1192
+ " <td>[Following, the, death, of, District, Attorney...</td>\n",
1193
+ " </tr>\n",
1194
+ " <tr>\n",
1195
+ " <th>4</th>\n",
1196
+ " <td>49529</td>\n",
1197
+ " <td>John Carter</td>\n",
1198
+ " <td>[John, Carter, is, a, war-weary,, former, mili...</td>\n",
1199
+ " <td>[Action, Adventure, ScienceFiction]</td>\n",
1200
+ " <td>[basedonnovel, mars, medallion, spacetravel, p...</td>\n",
1201
+ " <td>[TaylorKitsch, LynnCollins, SamanthaMorton]</td>\n",
1202
+ " <td>[AndrewStanton]</td>\n",
1203
+ " <td>[John, Carter, is, a, war-weary,, former, mili...</td>\n",
1204
+ " </tr>\n",
1205
+ " </tbody>\n",
1206
+ "</table>\n",
1207
+ "</div>"
1208
+ ],
1209
+ "text/plain": [
1210
+ " movie_id title \n",
1211
+ "0 19995 Avatar \\\n",
1212
+ "1 285 Pirates of the Caribbean: At World's End \n",
1213
+ "2 206647 Spectre \n",
1214
+ "3 49026 The Dark Knight Rises \n",
1215
+ "4 49529 John Carter \n",
1216
+ "\n",
1217
+ " overview \n",
1218
+ "0 [In, the, 22nd, century,, a, paraplegic, Marin... \\\n",
1219
+ "1 [Captain, Barbossa,, long, believed, to, be, d... \n",
1220
+ "2 [A, cryptic, message, from, Bond’s, past, send... \n",
1221
+ "3 [Following, the, death, of, District, Attorney... \n",
1222
+ "4 [John, Carter, is, a, war-weary,, former, mili... \n",
1223
+ "\n",
1224
+ " genres \n",
1225
+ "0 [Action, Adventure, Fantasy, ScienceFiction] \\\n",
1226
+ "1 [Adventure, Fantasy, Action] \n",
1227
+ "2 [Action, Adventure, Crime] \n",
1228
+ "3 [Action, Crime, Drama, Thriller] \n",
1229
+ "4 [Action, Adventure, ScienceFiction] \n",
1230
+ "\n",
1231
+ " keywords \n",
1232
+ "0 [cultureclash, future, spacewar, spacecolony, ... \\\n",
1233
+ "1 [ocean, drugabuse, exoticisland, eastindiatrad... \n",
1234
+ "2 [spy, basedonnovel, secretagent, sequel, mi6, ... \n",
1235
+ "3 [dccomics, crimefighter, terrorist, secretiden... \n",
1236
+ "4 [basedonnovel, mars, medallion, spacetravel, p... \n",
1237
+ "\n",
1238
+ " cast crew \n",
1239
+ "0 [SamWorthington, ZoeSaldana, SigourneyWeaver] [JamesCameron] \\\n",
1240
+ "1 [JohnnyDepp, OrlandoBloom, KeiraKnightley] [GoreVerbinski] \n",
1241
+ "2 [DanielCraig, ChristophWaltz, LéaSeydoux] [SamMendes] \n",
1242
+ "3 [ChristianBale, MichaelCaine, GaryOldman] [ChristopherNolan] \n",
1243
+ "4 [TaylorKitsch, LynnCollins, SamanthaMorton] [AndrewStanton] \n",
1244
+ "\n",
1245
+ " tags \n",
1246
+ "0 [In, the, 22nd, century,, a, paraplegic, Marin... \n",
1247
+ "1 [Captain, Barbossa,, long, believed, to, be, d... \n",
1248
+ "2 [A, cryptic, message, from, Bond’s, past, send... \n",
1249
+ "3 [Following, the, death, of, District, Attorney... \n",
1250
+ "4 [John, Carter, is, a, war-weary,, former, mili... "
1251
+ ]
1252
+ },
1253
+ "execution_count": 28,
1254
+ "metadata": {},
1255
+ "output_type": "execute_result"
1256
+ }
1257
+ ],
1258
+ "source": [
1259
+ "movies.head()"
1260
+ ]
1261
+ },
1262
+ {
1263
+ "cell_type": "code",
1264
+ "execution_count": 29,
1265
+ "metadata": {},
1266
+ "outputs": [],
1267
+ "source": [
1268
+ "new_df = movies[['movie_id','title','tags']]"
1269
+ ]
1270
+ },
1271
+ {
1272
+ "cell_type": "code",
1273
+ "execution_count": 30,
1274
+ "metadata": {},
1275
+ "outputs": [
1276
+ {
1277
+ "name": "stderr",
1278
+ "output_type": "stream",
1279
+ "text": [
1280
+ "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_11184\\3089450492.py:1: SettingWithCopyWarning: \n",
1281
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
1282
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
1283
+ "\n",
1284
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
1285
+ " new_df['tags'] = new_df['tags'].apply(lambda x:\" \".join(x))\n"
1286
+ ]
1287
+ }
1288
+ ],
1289
+ "source": [
1290
+ "new_df['tags'] = new_df['tags'].apply(lambda x:\" \".join(x))"
1291
+ ]
1292
+ },
1293
+ {
1294
+ "cell_type": "code",
1295
+ "execution_count": 31,
1296
+ "metadata": {},
1297
+ "outputs": [
1298
+ {
1299
+ "data": {
1300
+ "text/html": [
1301
+ "<div>\n",
1302
+ "<style scoped>\n",
1303
+ " .dataframe tbody tr th:only-of-type {\n",
1304
+ " vertical-align: middle;\n",
1305
+ " }\n",
1306
+ "\n",
1307
+ " .dataframe tbody tr th {\n",
1308
+ " vertical-align: top;\n",
1309
+ " }\n",
1310
+ "\n",
1311
+ " .dataframe thead th {\n",
1312
+ " text-align: right;\n",
1313
+ " }\n",
1314
+ "</style>\n",
1315
+ "<table border=\"1\" class=\"dataframe\">\n",
1316
+ " <thead>\n",
1317
+ " <tr style=\"text-align: right;\">\n",
1318
+ " <th></th>\n",
1319
+ " <th>movie_id</th>\n",
1320
+ " <th>title</th>\n",
1321
+ " <th>tags</th>\n",
1322
+ " </tr>\n",
1323
+ " </thead>\n",
1324
+ " <tbody>\n",
1325
+ " <tr>\n",
1326
+ " <th>0</th>\n",
1327
+ " <td>19995</td>\n",
1328
+ " <td>Avatar</td>\n",
1329
+ " <td>In the 22nd century, a paraplegic Marine is di...</td>\n",
1330
+ " </tr>\n",
1331
+ " <tr>\n",
1332
+ " <th>1</th>\n",
1333
+ " <td>285</td>\n",
1334
+ " <td>Pirates of the Caribbean: At World's End</td>\n",
1335
+ " <td>Captain Barbossa, long believed to be dead, ha...</td>\n",
1336
+ " </tr>\n",
1337
+ " <tr>\n",
1338
+ " <th>2</th>\n",
1339
+ " <td>206647</td>\n",
1340
+ " <td>Spectre</td>\n",
1341
+ " <td>A cryptic message from Bond’s past sends him o...</td>\n",
1342
+ " </tr>\n",
1343
+ " <tr>\n",
1344
+ " <th>3</th>\n",
1345
+ " <td>49026</td>\n",
1346
+ " <td>The Dark Knight Rises</td>\n",
1347
+ " <td>Following the death of District Attorney Harve...</td>\n",
1348
+ " </tr>\n",
1349
+ " <tr>\n",
1350
+ " <th>4</th>\n",
1351
+ " <td>49529</td>\n",
1352
+ " <td>John Carter</td>\n",
1353
+ " <td>John Carter is a war-weary, former military ca...</td>\n",
1354
+ " </tr>\n",
1355
+ " </tbody>\n",
1356
+ "</table>\n",
1357
+ "</div>"
1358
+ ],
1359
+ "text/plain": [
1360
+ " movie_id title \n",
1361
+ "0 19995 Avatar \\\n",
1362
+ "1 285 Pirates of the Caribbean: At World's End \n",
1363
+ "2 206647 Spectre \n",
1364
+ "3 49026 The Dark Knight Rises \n",
1365
+ "4 49529 John Carter \n",
1366
+ "\n",
1367
+ " tags \n",
1368
+ "0 In the 22nd century, a paraplegic Marine is di... \n",
1369
+ "1 Captain Barbossa, long believed to be dead, ha... \n",
1370
+ "2 A cryptic message from Bond’s past sends him o... \n",
1371
+ "3 Following the death of District Attorney Harve... \n",
1372
+ "4 John Carter is a war-weary, former military ca... "
1373
+ ]
1374
+ },
1375
+ "execution_count": 31,
1376
+ "metadata": {},
1377
+ "output_type": "execute_result"
1378
+ }
1379
+ ],
1380
+ "source": [
1381
+ "new_df.head()"
1382
+ ]
1383
+ },
1384
+ {
1385
+ "cell_type": "code",
1386
+ "execution_count": 32,
1387
+ "metadata": {},
1388
+ "outputs": [
1389
+ {
1390
+ "name": "stderr",
1391
+ "output_type": "stream",
1392
+ "text": [
1393
+ "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_11184\\3214958533.py:1: SettingWithCopyWarning: \n",
1394
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
1395
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
1396
+ "\n",
1397
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
1398
+ " new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())\n"
1399
+ ]
1400
+ }
1401
+ ],
1402
+ "source": [
1403
+ "new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())"
1404
+ ]
1405
+ },
1406
+ {
1407
+ "cell_type": "code",
1408
+ "execution_count": 33,
1409
+ "metadata": {},
1410
+ "outputs": [],
1411
+ "source": [
1412
+ "cv = CountVectorizer(max_features=5000,stop_words='english')"
1413
+ ]
1414
+ },
1415
+ {
1416
+ "cell_type": "code",
1417
+ "execution_count": 34,
1418
+ "metadata": {},
1419
+ "outputs": [],
1420
+ "source": [
1421
+ "vectors = cv.fit_transform(new_df['tags']).toarray()"
1422
+ ]
1423
+ },
1424
+ {
1425
+ "cell_type": "code",
1426
+ "execution_count": 35,
1427
+ "metadata": {},
1428
+ "outputs": [
1429
+ {
1430
+ "data": {
1431
+ "text/plain": [
1432
+ "array([[0, 0, 0, ..., 0, 0, 0],\n",
1433
+ " [0, 0, 0, ..., 0, 0, 0],\n",
1434
+ " [0, 0, 0, ..., 0, 0, 0],\n",
1435
+ " ...,\n",
1436
+ " [0, 0, 0, ..., 0, 0, 0],\n",
1437
+ " [0, 0, 0, ..., 0, 0, 0],\n",
1438
+ " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)"
1439
+ ]
1440
+ },
1441
+ "execution_count": 35,
1442
+ "metadata": {},
1443
+ "output_type": "execute_result"
1444
+ }
1445
+ ],
1446
+ "source": [
1447
+ "vectors"
1448
+ ]
1449
+ },
1450
+ {
1451
+ "cell_type": "code",
1452
+ "execution_count": 36,
1453
+ "metadata": {},
1454
+ "outputs": [
1455
+ {
1456
+ "name": "stderr",
1457
+ "output_type": "stream",
1458
+ "text": [
1459
+ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n",
1460
+ " warnings.warn(msg, category=FutureWarning)\n"
1461
+ ]
1462
+ },
1463
+ {
1464
+ "data": {
1465
+ "text/plain": [
1466
+ "5000"
1467
+ ]
1468
+ },
1469
+ "execution_count": 36,
1470
+ "metadata": {},
1471
+ "output_type": "execute_result"
1472
+ }
1473
+ ],
1474
+ "source": [
1475
+ "len(cv.get_feature_names())"
1476
+ ]
1477
+ },
1478
+ {
1479
+ "cell_type": "code",
1480
+ "execution_count": 37,
1481
+ "metadata": {},
1482
+ "outputs": [],
1483
+ "source": [
1484
+ "ps = PorterStemmer()"
1485
+ ]
1486
+ },
1487
+ {
1488
+ "cell_type": "code",
1489
+ "execution_count": 38,
1490
+ "metadata": {},
1491
+ "outputs": [],
1492
+ "source": [
1493
+ "def stem(text):\n",
1494
+ " y = []\n",
1495
+ "\n",
1496
+ " for i in text.split():\n",
1497
+ " y.append(ps.stem(i))\n",
1498
+ "\n",
1499
+ " return \" \".join(y)\n",
1500
+ " "
1501
+ ]
1502
+ },
1503
+ {
1504
+ "cell_type": "code",
1505
+ "execution_count": 39,
1506
+ "metadata": {},
1507
+ "outputs": [
1508
+ {
1509
+ "data": {
1510
+ "text/plain": [
1511
+ "'danc'"
1512
+ ]
1513
+ },
1514
+ "execution_count": 39,
1515
+ "metadata": {},
1516
+ "output_type": "execute_result"
1517
+ }
1518
+ ],
1519
+ "source": [
1520
+ "ps.stem('danc')"
1521
+ ]
1522
+ },
1523
+ {
1524
+ "cell_type": "code",
1525
+ "execution_count": 40,
1526
+ "metadata": {},
1527
+ "outputs": [
1528
+ {
1529
+ "name": "stderr",
1530
+ "output_type": "stream",
1531
+ "text": [
1532
+ "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_11184\\3213734980.py:1: SettingWithCopyWarning: \n",
1533
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
1534
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
1535
+ "\n",
1536
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
1537
+ " new_df['tags'] = new_df['tags'].apply(stem)\n"
1538
+ ]
1539
+ }
1540
+ ],
1541
+ "source": [
1542
+ "new_df['tags'] = new_df['tags'].apply(stem)"
1543
+ ]
1544
+ },
1545
+ {
1546
+ "cell_type": "code",
1547
+ "execution_count": 41,
1548
+ "metadata": {},
1549
+ "outputs": [
1550
+ {
1551
+ "name": "stderr",
1552
+ "output_type": "stream",
1553
+ "text": [
1554
+ "c:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n",
1555
+ " warnings.warn(msg, category=FutureWarning)\n"
1556
+ ]
1557
+ },
1558
+ {
1559
+ "data": {
1560
+ "text/plain": [
1561
+ "['000',\n",
1562
+ " '007',\n",
1563
+ " '10',\n",
1564
+ " '100',\n",
1565
+ " '11',\n",
1566
+ " '12',\n",
1567
+ " '13',\n",
1568
+ " '14',\n",
1569
+ " '15',\n",
1570
+ " '16',\n",
1571
+ " '17',\n",
1572
+ " '18',\n",
1573
+ " '18th',\n",
1574
+ " '19',\n",
1575
+ " '1930s',\n",
1576
+ " '1940s',\n",
1577
+ " '1944',\n",
1578
+ " '1950',\n",
1579
+ " '1950s',\n",
1580
+ " '1960s',\n",
1581
+ " '1970s',\n",
1582
+ " '1971',\n",
1583
+ " '1974',\n",
1584
+ " '1976',\n",
1585
+ " '1980',\n",
1586
+ " '1980s',\n",
1587
+ " '1985',\n",
1588
+ " '1990s',\n",
1589
+ " '19th',\n",
1590
+ " '19thcentury',\n",
1591
+ " '20',\n",
1592
+ " '200',\n",
1593
+ " '2009',\n",
1594
+ " '20th',\n",
1595
+ " '21st',\n",
1596
+ " '23',\n",
1597
+ " '24',\n",
1598
+ " '25',\n",
1599
+ " '30',\n",
1600
+ " '300',\n",
1601
+ " '3d',\n",
1602
+ " '40',\n",
1603
+ " '50',\n",
1604
+ " '500',\n",
1605
+ " '60',\n",
1606
+ " '60s',\n",
1607
+ " '70',\n",
1608
+ " '70s',\n",
1609
+ " 'aaron',\n",
1610
+ " 'aaroneckhart',\n",
1611
+ " 'abandoned',\n",
1612
+ " 'abducted',\n",
1613
+ " 'abigailbreslin',\n",
1614
+ " 'abilities',\n",
1615
+ " 'ability',\n",
1616
+ " 'able',\n",
1617
+ " 'aboard',\n",
1618
+ " 'abuse',\n",
1619
+ " 'abusive',\n",
1620
+ " 'academic',\n",
1621
+ " 'academy',\n",
1622
+ " 'accept',\n",
1623
+ " 'accepted',\n",
1624
+ " 'accepts',\n",
1625
+ " 'access',\n",
1626
+ " 'accident',\n",
1627
+ " 'accidental',\n",
1628
+ " 'accidentally',\n",
1629
+ " 'accompanied',\n",
1630
+ " 'accomplish',\n",
1631
+ " 'account',\n",
1632
+ " 'accountant',\n",
1633
+ " 'accused',\n",
1634
+ " 'ace',\n",
1635
+ " 'achieve',\n",
1636
+ " 'act',\n",
1637
+ " 'acting',\n",
1638
+ " 'action',\n",
1639
+ " 'actionhero',\n",
1640
+ " 'actions',\n",
1641
+ " 'activist',\n",
1642
+ " 'activities',\n",
1643
+ " 'activity',\n",
1644
+ " 'actor',\n",
1645
+ " 'actors',\n",
1646
+ " 'actress',\n",
1647
+ " 'acts',\n",
1648
+ " 'actual',\n",
1649
+ " 'actually',\n",
1650
+ " 'adam',\n",
1651
+ " 'adams',\n",
1652
+ " 'adamsandler',\n",
1653
+ " 'adamshankman',\n",
1654
+ " 'adaptation',\n",
1655
+ " 'adapted',\n",
1656
+ " 'addict',\n",
1657
+ " 'addicted',\n",
1658
+ " 'addiction',\n",
1659
+ " 'adolescence',\n",
1660
+ " 'adopt',\n",
1661
+ " 'adopted',\n",
1662
+ " 'adoption',\n",
1663
+ " 'adopts',\n",
1664
+ " 'adrienbrody',\n",
1665
+ " 'adult',\n",
1666
+ " 'adultery',\n",
1667
+ " 'adulthood',\n",
1668
+ " 'adults',\n",
1669
+ " 'advantage',\n",
1670
+ " 'adventure',\n",
1671
+ " 'adventures',\n",
1672
+ " 'advertising',\n",
1673
+ " 'advice',\n",
1674
+ " 'affair',\n",
1675
+ " 'affairs',\n",
1676
+ " 'affection',\n",
1677
+ " 'affections',\n",
1678
+ " 'afghanistan',\n",
1679
+ " 'africa',\n",
1680
+ " 'african',\n",
1681
+ " 'africanamerican',\n",
1682
+ " 'aftercreditsstinger',\n",
1683
+ " 'afterlife',\n",
1684
+ " 'aftermath',\n",
1685
+ " 'age',\n",
1686
+ " 'aged',\n",
1687
+ " 'agedifference',\n",
1688
+ " 'agency',\n",
1689
+ " 'agenda',\n",
1690
+ " 'agent',\n",
1691
+ " 'agents',\n",
1692
+ " 'aggressive',\n",
1693
+ " 'aging',\n",
1694
+ " 'ago',\n",
1695
+ " 'agree',\n",
1696
+ " 'agrees',\n",
1697
+ " 'ahead',\n",
1698
+ " 'aid',\n",
1699
+ " 'aided',\n",
1700
+ " 'aids',\n",
1701
+ " 'ailing',\n",
1702
+ " 'air',\n",
1703
+ " 'airplane',\n",
1704
+ " 'airplanecrash',\n",
1705
+ " 'airport',\n",
1706
+ " 'aka',\n",
1707
+ " 'al',\n",
1708
+ " 'alabama',\n",
1709
+ " 'alan',\n",
1710
+ " 'alaska',\n",
1711
+ " 'albert',\n",
1712
+ " 'alcohol',\n",
1713
+ " 'alcoholic',\n",
1714
+ " 'alcoholism',\n",
1715
+ " 'alecbaldwin',\n",
1716
+ " 'alex',\n",
1717
+ " 'alexkendrick',\n",
1718
+ " 'alfredhitchcock',\n",
1719
+ " 'ali',\n",
1720
+ " 'alice',\n",
1721
+ " 'alien',\n",
1722
+ " 'alieninvasion',\n",
1723
+ " 'alienlife',\n",
1724
+ " 'aliens',\n",
1725
+ " 'alike',\n",
1726
+ " 'alive',\n",
1727
+ " 'allen',\n",
1728
+ " 'alliance',\n",
1729
+ " 'allied',\n",
1730
+ " 'allies',\n",
1731
+ " 'allow',\n",
1732
+ " 'allowing',\n",
1733
+ " 'allows',\n",
1734
+ " 'ally',\n",
1735
+ " 'alongside',\n",
1736
+ " 'alpacino',\n",
1737
+ " 'alter',\n",
1738
+ " 'alternate',\n",
1739
+ " 'alternative',\n",
1740
+ " 'alzheimer',\n",
1741
+ " 'amanda',\n",
1742
+ " 'amandapeet',\n",
1743
+ " 'amandaseyfried',\n",
1744
+ " 'amateur',\n",
1745
+ " 'amazing',\n",
1746
+ " 'ambassador',\n",
1747
+ " 'ambition',\n",
1748
+ " 'ambitious',\n",
1749
+ " 'ambulance',\n",
1750
+ " 'ambush',\n",
1751
+ " 'america',\n",
1752
+ " 'american',\n",
1753
+ " 'americanabroad',\n",
1754
+ " 'americanfootball',\n",
1755
+ " 'americans',\n",
1756
+ " 'amid',\n",
1757
+ " 'amidst',\n",
1758
+ " 'amnesia',\n",
1759
+ " 'amp',\n",
1760
+ " 'amsterdam',\n",
1761
+ " 'amusement',\n",
1762
+ " 'amusementpark',\n",
1763
+ " 'amy',\n",
1764
+ " 'amyadams',\n",
1765
+ " 'amysmart',\n",
1766
+ " 'analyst',\n",
1767
+ " 'anarchiccomedy',\n",
1768
+ " 'ancient',\n",
1769
+ " 'ancientrome',\n",
1770
+ " 'ancientworld',\n",
1771
+ " 'anderson',\n",
1772
+ " 'andiemacdowell',\n",
1773
+ " 'andrew',\n",
1774
+ " 'android',\n",
1775
+ " 'andy',\n",
1776
+ " 'andygarcía',\n",
1777
+ " 'angel',\n",
1778
+ " 'angelabassett',\n",
1779
+ " 'angeles',\n",
1780
+ " 'angelinajolie',\n",
1781
+ " 'angels',\n",
1782
+ " 'anger',\n",
1783
+ " 'anglee',\n",
1784
+ " 'angry',\n",
1785
+ " 'animal',\n",
1786
+ " 'animalattack',\n",
1787
+ " 'animalhorror',\n",
1788
+ " 'animals',\n",
1789
+ " 'animated',\n",
1790
+ " 'animation',\n",
1791
+ " 'anna',\n",
1792
+ " 'annafaris',\n",
1793
+ " 'anne',\n",
1794
+ " 'annehathaway',\n",
1795
+ " 'annemoss',\n",
1796
+ " 'annettebening',\n",
1797
+ " 'annie',\n",
1798
+ " 'anniversary',\n",
1799
+ " 'annual',\n",
1800
+ " 'answer',\n",
1801
+ " 'answers',\n",
1802
+ " 'ant',\n",
1803
+ " 'anthology',\n",
1804
+ " 'anthony',\n",
1805
+ " 'anthonyanderson',\n",
1806
+ " 'anthonyhopkins',\n",
1807
+ " 'anthropomorphism',\n",
1808
+ " 'anti',\n",
1809
+ " 'antics',\n",
1810
+ " 'antihero',\n",
1811
+ " 'antoinefuqua',\n",
1812
+ " 'antoniobanderas',\n",
1813
+ " 'antonyelchin',\n",
1814
+ " 'apart',\n",
1815
+ " 'apartheid',\n",
1816
+ " 'apartment',\n",
1817
+ " 'ape',\n",
1818
+ " 'apes',\n",
1819
+ " 'apocalypse',\n",
1820
+ " 'apocalyptic',\n",
1821
+ " 'apparent',\n",
1822
+ " 'apparently',\n",
1823
+ " 'appear',\n",
1824
+ " 'appears',\n",
1825
+ " 'apple',\n",
1826
+ " 'appointed',\n",
1827
+ " 'apprentice',\n",
1828
+ " 'approach',\n",
1829
+ " 'approaches',\n",
1830
+ " 'approaching',\n",
1831
+ " 'april',\n",
1832
+ " 'aquarium',\n",
1833
+ " 'arab',\n",
1834
+ " 'arch',\n",
1835
+ " 'archaeologist',\n",
1836
+ " 'archeology',\n",
1837
+ " 'architect',\n",
1838
+ " 'arctic',\n",
1839
+ " 'area',\n",
1840
+ " 'aren',\n",
1841
+ " 'arena',\n",
1842
+ " 'argument',\n",
1843
+ " 'arise',\n",
1844
+ " 'aristocrat',\n",
1845
+ " 'armed',\n",
1846
+ " 'arms',\n",
1847
+ " 'army',\n",
1848
+ " 'arnold',\n",
1849
+ " 'arnoldschwarzenegger',\n",
1850
+ " 'arrangedmarriage',\n",
1851
+ " 'arrangement',\n",
1852
+ " 'arrest',\n",
1853
+ " 'arrested',\n",
1854
+ " 'arrival',\n",
1855
+ " 'arrive',\n",
1856
+ " 'arrives',\n",
1857
+ " 'arriving',\n",
1858
+ " 'arrogant',\n",
1859
+ " 'art',\n",
1860
+ " 'arthur',\n",
1861
+ " 'artificialintelligence',\n",
1862
+ " 'artist',\n",
1863
+ " 'artistic',\n",
1864
+ " 'artists',\n",
1865
+ " 'arts',\n",
1866
+ " 'ashley',\n",
1867
+ " 'ashleyjudd',\n",
1868
+ " 'ashtonkutcher',\n",
1869
+ " 'asia',\n",
1870
+ " 'aside',\n",
1871
+ " 'ask',\n",
1872
+ " 'asked',\n",
1873
+ " 'asking',\n",
1874
+ " 'asks',\n",
1875
+ " 'aspirations',\n",
1876
+ " 'aspiring',\n",
1877
+ " 'assassin',\n",
1878
+ " 'assassinate',\n",
1879
+ " 'assassination',\n",
1880
+ " 'assassins',\n",
1881
+ " 'assault',\n",
1882
+ " 'assigned',\n",
1883
+ " 'assignment',\n",
1884
+ " 'assistant',\n",
1885
+ " 'assumes',\n",
1886
+ " 'asteroid',\n",
1887
+ " 'astronaut',\n",
1888
+ " 'astronauts',\n",
1889
+ " 'asylum',\n",
1890
+ " 'athlete',\n",
1891
+ " 'atomicbomb',\n",
1892
+ " 'attack',\n",
1893
+ " 'attacked',\n",
1894
+ " 'attacks',\n",
1895
+ " 'attempt',\n",
1896
+ " 'attempting',\n",
1897
+ " 'attempts',\n",
1898
+ " 'attempttoescape',\n",
1899
+ " 'attending',\n",
1900
+ " 'attends',\n",
1901
+ " 'attention',\n",
1902
+ " 'attic',\n",
1903
+ " 'attitude',\n",
1904
+ " 'attorney',\n",
1905
+ " 'attracted',\n",
1906
+ " 'attraction',\n",
1907
+ " 'attractive',\n",
1908
+ " 'audience',\n",
1909
+ " 'audiences',\n",
1910
+ " 'audition',\n",
1911
+ " 'august',\n",
1912
+ " 'aunt',\n",
1913
+ " 'austin',\n",
1914
+ " 'australia',\n",
1915
+ " 'australian',\n",
1916
+ " 'author',\n",
1917
+ " 'authorities',\n",
1918
+ " 'authority',\n",
1919
+ " 'autism',\n",
1920
+ " 'auto',\n",
1921
+ " 'avenge',\n",
1922
+ " 'average',\n",
1923
+ " 'avoid',\n",
1924
+ " 'awaits',\n",
1925
+ " 'awakens',\n",
1926
+ " 'award',\n",
1927
+ " 'away',\n",
1928
+ " 'awry',\n",
1929
+ " 'ax',\n",
1930
+ " 'babe',\n",
1931
+ " 'baby',\n",
1932
+ " 'bachelor',\n",
1933
+ " 'backdrop',\n",
1934
+ " 'background',\n",
1935
+ " 'backgrounds',\n",
1936
+ " 'bad',\n",
1937
+ " 'bag',\n",
1938
+ " 'bahamas',\n",
1939
+ " 'bail',\n",
1940
+ " 'balance',\n",
1941
+ " 'ball',\n",
1942
+ " 'ballet',\n",
1943
+ " 'baltimore',\n",
1944
+ " 'band',\n",
1945
+ " 'bandits',\n",
1946
+ " 'bangkok',\n",
1947
+ " 'banished',\n",
1948
+ " 'bank',\n",
1949
+ " 'banker',\n",
1950
+ " 'bankrobber',\n",
1951
+ " 'bankrobbery',\n",
1952
+ " 'bar',\n",
1953
+ " 'barely',\n",
1954
+ " 'bargained',\n",
1955
+ " 'barn',\n",
1956
+ " 'barney',\n",
1957
+ " 'barry',\n",
1958
+ " 'barrylevinson',\n",
1959
+ " 'bars',\n",
1960
+ " 'base',\n",
1961
+ " 'baseball',\n",
1962
+ " 'based',\n",
1963
+ " 'basedoncomicbook',\n",
1964
+ " 'basedongraphicnovel',\n",
1965
+ " 'basedonnovel',\n",
1966
+ " 'basedonplay',\n",
1967
+ " 'basedonstagemusical',\n",
1968
+ " 'basedontrueevents',\n",
1969
+ " 'basedontruestory',\n",
1970
+ " 'basedontvseries',\n",
1971
+ " 'basedonvideogame',\n",
1972
+ " 'basedonyoungadultnovel',\n",
1973
+ " 'basement',\n",
1974
+ " 'basketball',\n",
1975
+ " 'batman',\n",
1976
+ " 'battle',\n",
1977
+ " 'battlefield',\n",
1978
+ " 'battles',\n",
1979
+ " 'battling',\n",
1980
+ " 'bay',\n",
1981
+ " 'beach',\n",
1982
+ " 'bear',\n",
1983
+ " 'bears',\n",
1984
+ " 'beast',\n",
1985
+ " 'beasts',\n",
1986
+ " 'beat',\n",
1987
+ " 'beating',\n",
1988
+ " 'beautiful',\n",
1989
+ " 'beautifulwoman',\n",
1990
+ " 'beauty',\n",
1991
+ " 'becky',\n",
1992
+ " 'becominganadult',\n",
1993
+ " 'bed',\n",
1994
+ " 'bedroom',\n",
1995
+ " 'bee',\n",
1996
+ " 'beer',\n",
1997
+ " 'befriends',\n",
1998
+ " 'began',\n",
1999
+ " 'begin',\n",
2000
+ " 'beginning',\n",
2001
+ " 'begins',\n",
2002
+ " 'behavior',\n",
2003
+ " 'beings',\n",
2004
+ " 'belief',\n",
2005
+ " 'beliefs',\n",
2006
+ " 'believe',\n",
2007
+ " 'believed',\n",
2008
+ " 'believes',\n",
2009
+ " 'believing',\n",
2010
+ " 'bell',\n",
2011
+ " 'belong',\n",
2012
+ " 'belongs',\n",
2013
+ " 'beloved',\n",
2014
+ " 'ben',\n",
2015
+ " 'benaffleck',\n",
2016
+ " 'beneath',\n",
2017
+ " 'benfoster',\n",
2018
+ " 'beniciodeltoro',\n",
2019
+ " 'benjamin',\n",
2020
+ " 'benjaminbratt',\n",
2021
+ " 'benkingsley',\n",
2022
+ " 'bennett',\n",
2023
+ " 'benstiller',\n",
2024
+ " 'bent',\n",
2025
+ " 'berlin',\n",
2026
+ " 'best',\n",
2027
+ " 'bestfriend',\n",
2028
+ " 'bet',\n",
2029
+ " 'beth',\n",
2030
+ " 'betrayal',\n",
2031
+ " 'betrayed',\n",
2032
+ " 'bettemidler',\n",
2033
+ " 'better',\n",
2034
+ " 'betty',\n",
2035
+ " 'beverly',\n",
2036
+ " 'bible',\n",
2037
+ " 'bid',\n",
2038
+ " 'big',\n",
2039
+ " 'bigger',\n",
2040
+ " 'biggest',\n",
2041
+ " 'biker',\n",
2042
+ " 'bikini',\n",
2043
+ " 'billhader',\n",
2044
+ " 'billionaire',\n",
2045
+ " 'billmurray',\n",
2046
+ " 'billnighy',\n",
2047
+ " 'billpaxton',\n",
2048
+ " 'billpullman',\n",
2049
+ " 'billy',\n",
2050
+ " 'billybobthornton',\n",
2051
+ " 'billycrudup',\n",
2052
+ " 'billycrystal',\n",
2053
+ " 'biography',\n",
2054
+ " 'bird',\n",
2055
+ " 'birth',\n",
2056
+ " 'birthday',\n",
2057
+ " 'bisexual',\n",
2058
+ " 'bishop',\n",
2059
+ " 'bit',\n",
2060
+ " 'bite',\n",
2061
+ " 'bitter',\n",
2062
+ " 'bizarre',\n",
2063
+ " 'black',\n",
2064
+ " 'blackmagic',\n",
2065
+ " 'blackmail',\n",
2066
+ " 'blackpeople',\n",
2067
+ " 'blacksmith',\n",
2068
+ " 'blade',\n",
2069
+ " 'blame',\n",
2070
+ " 'blind',\n",
2071
+ " 'bliss',\n",
2072
+ " 'block',\n",
2073
+ " 'blonde',\n",
2074
+ " 'blood',\n",
2075
+ " 'bloodsplatter',\n",
2076
+ " 'bloodthirsty',\n",
2077
+ " 'bloody',\n",
2078
+ " 'blow',\n",
2079
+ " 'blue',\n",
2080
+ " 'board',\n",
2081
+ " 'boarding',\n",
2082
+ " 'boardingschool',\n",
2083
+ " 'boat',\n",
2084
+ " 'bob',\n",
2085
+ " 'bobby',\n",
2086
+ " 'bobbyfarrelly',\n",
2087
+ " 'bobhoskins',\n",
2088
+ " 'bodies',\n",
2089
+ " 'body',\n",
2090
+ " 'bodyguard',\n",
2091
+ " 'bold',\n",
2092
+ " 'bollywood',\n",
2093
+ " 'bomb',\n",
2094
+ " 'bombing',\n",
2095
+ " 'bond',\n",
2096
+ " 'bonds',\n",
2097
+ " 'bone',\n",
2098
+ " 'book',\n",
2099
+ " 'books',\n",
2100
+ " 'border',\n",
2101
+ " 'bored',\n",
2102
+ " 'boredom',\n",
2103
+ " 'boring',\n",
2104
+ " 'born',\n",
2105
+ " 'boss',\n",
2106
+ " 'boston',\n",
2107
+ " 'botched',\n",
2108
+ " 'bound',\n",
2109
+ " 'boundaries',\n",
2110
+ " 'bounty',\n",
2111
+ " 'bountyhunter',\n",
2112
+ " 'bourne',\n",
2113
+ " 'box',\n",
2114
+ " 'boxer',\n",
2115
+ " 'boxing',\n",
2116
+ " 'boy',\n",
2117
+ " 'boyfriend',\n",
2118
+ " 'boys',\n",
2119
+ " 'bradleycooper',\n",
2120
+ " 'bradpitt',\n",
2121
+ " 'brain',\n",
2122
+ " 'brand',\n",
2123
+ " 'brave',\n",
2124
+ " 'bravery',\n",
2125
+ " 'brazil',\n",
2126
+ " 'brazilian',\n",
2127
+ " 'break',\n",
2128
+ " 'breakdown',\n",
2129
+ " 'breaking',\n",
2130
+ " 'breaks',\n",
2131
+ " 'brendanfraser',\n",
2132
+ " 'brendangleeson',\n",
2133
+ " 'brent',\n",
2134
+ " 'brettratner',\n",
2135
+ " 'brian',\n",
2136
+ " 'briandepalma',\n",
2137
+ " 'bride',\n",
2138
+ " 'bridge',\n",
2139
+ " 'brief',\n",
2140
+ " 'brien',\n",
2141
+ " 'bright',\n",
2142
+ " 'brilliant',\n",
2143
+ " 'bring',\n",
2144
+ " 'bringing',\n",
2145
+ " 'brings',\n",
2146
+ " 'brink',\n",
2147
+ " 'britain',\n",
2148
+ " 'british',\n",
2149
+ " 'britishsecretservice',\n",
2150
+ " 'brittanymurphy',\n",
2151
+ " 'broadway',\n",
2152
+ " 'broke',\n",
2153
+ " 'broken',\n",
2154
+ " 'broker',\n",
2155
+ " 'brooklyn',\n",
2156
+ " 'brooks',\n",
2157
+ " 'brothel',\n",
2158
+ " 'brother',\n",
2159
+ " 'brotherbrotherrelationship',\n",
2160
+ " 'brothers',\n",
2161
+ " 'brothersisterrelationship',\n",
2162
+ " 'brought',\n",
2163
+ " 'brown',\n",
2164
+ " 'bruce',\n",
2165
+ " 'brucewillis',\n",
2166
+ " 'brutal',\n",
2167
+ " 'brutality',\n",
2168
+ " 'brutally',\n",
2169
+ " 'bryansinger',\n",
2170
+ " 'buck',\n",
2171
+ " 'buddies',\n",
2172
+ " 'buddy',\n",
2173
+ " 'buddycomedy',\n",
2174
+ " 'budget',\n",
2175
+ " 'build',\n",
2176
+ " 'building',\n",
2177
+ " 'built',\n",
2178
+ " 'bully',\n",
2179
+ " 'bullying',\n",
2180
+ " 'bumbling',\n",
2181
+ " 'bunny',\n",
2182
+ " 'burglar',\n",
2183
+ " 'buried',\n",
2184
+ " 'burned',\n",
2185
+ " 'bus',\n",
2186
+ " 'bush',\n",
2187
+ " 'business',\n",
2188
+ " 'businessman',\n",
2189
+ " 'bust',\n",
2190
+ " 'busy',\n",
2191
+ " 'butler',\n",
2192
+ " 'buy',\n",
2193
+ " 'cabin',\n",
2194
+ " 'caesar',\n",
2195
+ " 'cage',\n",
2196
+ " 'cairo',\n",
2197
+ " 'cal',\n",
2198
+ " 'california',\n",
2199
+ " 'called',\n",
2200
+ " 'calls',\n",
2201
+ " 'calvin',\n",
2202
+ " 'camcorder',\n",
2203
+ " 'came',\n",
2204
+ " 'camera',\n",
2205
+ " 'cameraman',\n",
2206
+ " 'cameras',\n",
2207
+ " 'camerondiaz',\n",
2208
+ " 'camp',\n",
2209
+ " 'campaign',\n",
2210
+ " 'campbell',\n",
2211
+ " 'camping',\n",
2212
+ " 'campus',\n",
2213
+ " 'canada',\n",
2214
+ " 'canadian',\n",
2215
+ " 'cancer',\n",
2216
+ " 'candidate',\n",
2217
+ " 'candy',\n",
2218
+ " 'cannibal',\n",
2219
+ " 'capable',\n",
2220
+ " 'capital',\n",
2221
+ " 'capitalism',\n",
2222
+ " 'capt',\n",
2223
+ " 'captain',\n",
2224
+ " 'captive',\n",
2225
+ " 'capture',\n",
2226
+ " 'captured',\n",
2227
+ " 'captures',\n",
2228
+ " 'car',\n",
2229
+ " 'caraccident',\n",
2230
+ " 'carchase',\n",
2231
+ " 'carcrash',\n",
2232
+ " 'card',\n",
2233
+ " 'care',\n",
2234
+ " 'career',\n",
2235
+ " 'carefree',\n",
2236
+ " 'caretaker',\n",
2237
+ " 'caribbean',\n",
2238
+ " 'carjourney',\n",
2239
+ " 'carl',\n",
2240
+ " 'carlagugino',\n",
2241
+ " 'carmen',\n",
2242
+ " 'carol',\n",
2243
+ " 'carolina',\n",
2244
+ " 'carrace',\n",
2245
+ " 'carrie',\n",
2246
+ " 'carry',\n",
2247
+ " 'carrying',\n",
2248
+ " 'cars',\n",
2249
+ " 'cartel',\n",
2250
+ " 'carter',\n",
2251
+ " 'cartoon',\n",
2252
+ " 'caryelwes',\n",
2253
+ " 'case',\n",
2254
+ " 'caseyaffleck',\n",
2255
+ " 'cash',\n",
2256
+ " 'casino',\n",
2257
+ " 'cast',\n",
2258
+ " 'castle',\n",
2259
+ " 'cat',\n",
2260
+ " 'cataclysm',\n",
2261
+ " 'catastrophe',\n",
2262
+ " 'catch',\n",
2263
+ " 'catches',\n",
2264
+ " 'cateblanchett',\n",
2265
+ " 'catherinedeneuve',\n",
2266
+ " 'catherinekeener',\n",
2267
+ " 'catherinezeta',\n",
2268
+ " 'catholic',\n",
2269
+ " 'catholicism',\n",
2270
+ " 'cattle',\n",
2271
+ " 'caught',\n",
2272
+ " 'cause',\n",
2273
+ " 'caused',\n",
2274
+ " 'causes',\n",
2275
+ " 'causing',\n",
2276
+ " 'cavalry',\n",
2277
+ " 'cave',\n",
2278
+ " 'cavemen',\n",
2279
+ " 'celebrate',\n",
2280
+ " 'celebrated',\n",
2281
+ " 'celebration',\n",
2282
+ " 'celebrity',\n",
2283
+ " 'cell',\n",
2284
+ " 'cellphone',\n",
2285
+ " 'cemetery',\n",
2286
+ " 'center',\n",
2287
+ " 'centered',\n",
2288
+ " 'centers',\n",
2289
+ " 'central',\n",
2290
+ " 'centuries',\n",
2291
+ " 'century',\n",
2292
+ " 'ceo',\n",
2293
+ " 'ceremony',\n",
2294
+ " 'certain',\n",
2295
+ " 'chad',\n",
2296
+ " 'chain',\n",
2297
+ " 'chainsaw',\n",
2298
+ " 'challenge',\n",
2299
+ " 'challenged',\n",
2300
+ " 'challenges',\n",
2301
+ " 'champion',\n",
2302
+ " 'championship',\n",
2303
+ " 'chance',\n",
2304
+ " 'change',\n",
2305
+ " 'changed',\n",
2306
+ " 'changes',\n",
2307
+ " 'changing',\n",
2308
+ " 'channingtatum',\n",
2309
+ " 'chaos',\n",
2310
+ " 'chaotic',\n",
2311
+ " 'chapter',\n",
2312
+ " 'character',\n",
2313
+ " 'characters',\n",
2314
+ " 'charge',\n",
2315
+ " 'charged',\n",
2316
+ " 'charismatic',\n",
2317
+ " 'charles',\n",
2318
+ " 'charlie',\n",
2319
+ " 'charliesheen',\n",
2320
+ " 'charlizetheron',\n",
2321
+ " 'charlotte',\n",
2322
+ " 'charm',\n",
2323
+ " 'charming',\n",
2324
+ " 'chase',\n",
2325
+ " 'chased',\n",
2326
+ " 'chauffeur',\n",
2327
+ " 'cheating',\n",
2328
+ " 'cheerleader',\n",
2329
+ " 'chef',\n",
2330
+ " 'chemical',\n",
2331
+ " 'cher',\n",
2332
+ " 'chicago',\n",
2333
+ " 'chicken',\n",
2334
+ " 'chief',\n",
2335
+ " 'child',\n",
2336
+ " 'childabuse',\n",
2337
+ " 'childhero',\n",
2338
+ " 'childhood',\n",
2339
+ " 'childprodigy',\n",
2340
+ " 'children',\n",
2341
+ " 'chilling',\n",
2342
+ " 'china',\n",
2343
+ " 'chinese',\n",
2344
+ " 'chip',\n",
2345
+ " 'chiwetelejiofor',\n",
2346
+ " 'chloe',\n",
2347
+ " 'chloëgracemoretz',\n",
2348
+ " 'chloësevigny',\n",
2349
+ " 'chocolate',\n",
2350
+ " 'choice',\n",
2351
+ " 'choices',\n",
2352
+ " 'choose',\n",
2353
+ " 'chosen',\n",
2354
+ " 'chris',\n",
2355
+ " 'chriscolumbus',\n",
2356
+ " 'chriscooper',\n",
2357
+ " 'chrisevans',\n",
2358
+ " 'chrishemsworth',\n",
2359
+ " 'chrisklein',\n",
2360
+ " 'chrispine',\n",
2361
+ " 'chrisrock',\n",
2362
+ " 'christ',\n",
2363
+ " 'christian',\n",
2364
+ " 'christianbale',\n",
2365
+ " 'christianity',\n",
2366
+ " 'christianslater',\n",
2367
+ " 'christinaapplegate',\n",
2368
+ " 'christinaricci',\n",
2369
+ " 'christine',\n",
2370
+ " 'christmas',\n",
2371
+ " 'christmasparty',\n",
2372
+ " 'christmastree',\n",
2373
+ " 'christopher',\n",
2374
+ " 'christopherlloyd',\n",
2375
+ " 'christophernolan',\n",
2376
+ " 'christopherplummer',\n",
2377
+ " 'christopherwalken',\n",
2378
+ " 'christophwaltz',\n",
2379
+ " 'chrisweitz',\n",
2380
+ " 'chronicle',\n",
2381
+ " 'chronicles',\n",
2382
+ " 'chuck',\n",
2383
+ " 'church',\n",
2384
+ " 'cia',\n",
2385
+ " 'ciaránhinds',\n",
2386
+ " 'cigarettesmoking',\n",
2387
+ " 'cillianmurphy',\n",
2388
+ " 'cindy',\n",
2389
+ " 'cinema',\n",
2390
+ " 'circle',\n",
2391
+ " 'circuit',\n",
2392
+ " 'circumstances',\n",
2393
+ " 'circus',\n",
2394
+ " 'cities',\n",
2395
+ " 'citizens',\n",
2396
+ " 'city',\n",
2397
+ " 'civil',\n",
2398
+ " 'civilization',\n",
2399
+ " 'civilwar',\n",
2400
+ " 'claim',\n",
2401
+ " 'claims',\n",
2402
+ " 'claire',\n",
2403
+ " 'clairedanes',\n",
2404
+ " 'clan',\n",
2405
+ " 'clark',\n",
2406
+ " 'clash',\n",
2407
+ " 'class',\n",
2408
+ " 'classes',\n",
2409
+ " 'classic',\n",
2410
+ " 'classmate',\n",
2411
+ " 'classmates',\n",
2412
+ " 'classroom',\n",
2413
+ " 'claudevandamme',\n",
2414
+ " 'clay',\n",
2415
+ " 'clean',\n",
2416
+ " 'clear',\n",
2417
+ " 'clerk',\n",
2418
+ " 'client',\n",
2419
+ " 'clients',\n",
2420
+ " 'climate',\n",
2421
+ " 'climbing',\n",
2422
+ " 'clinteastwood',\n",
2423
+ " 'clique',\n",
2424
+ " 'cliveowen',\n",
2425
+ " 'clock',\n",
2426
+ " 'clone',\n",
2427
+ " 'cloning',\n",
2428
+ " 'close',\n",
2429
+ " 'closed',\n",
2430
+ " 'closer',\n",
2431
+ " 'club',\n",
2432
+ " 'clubs',\n",
2433
+ " 'clue',\n",
2434
+ " 'clueless',\n",
2435
+ " 'clues',\n",
2436
+ " 'clutches',\n",
2437
+ " 'coach',\n",
2438
+ " 'coast',\n",
2439
+ " 'cocaine',\n",
2440
+ " 'cocky',\n",
2441
+ " 'code',\n",
2442
+ " 'cody',\n",
2443
+ " 'coffin',\n",
2444
+ " 'cohen',\n",
2445
+ " 'col',\n",
2446
+ " 'cold',\n",
2447
+ " 'coldwar',\n",
2448
+ " 'cole',\n",
2449
+ " 'colin',\n",
2450
+ " 'colinfarrell',\n",
2451
+ " 'colinfirth',\n",
2452
+ " 'collapse',\n",
2453
+ " 'colleague',\n",
2454
+ " 'colleagues',\n",
2455
+ " 'collect',\n",
2456
+ " 'collection',\n",
2457
+ " 'collector',\n",
2458
+ " 'college',\n",
2459
+ " 'collide',\n",
2460
+ " 'collision',\n",
2461
+ " 'colonel',\n",
2462
+ " 'colony',\n",
2463
+ " 'color',\n",
2464
+ " 'colorado',\n",
2465
+ " 'colorful',\n",
2466
+ " 'coma',\n",
2467
+ " 'combat',\n",
2468
+ " 'combined',\n",
2469
+ " 'come',\n",
2470
+ " 'comeback',\n",
2471
+ " 'comedian',\n",
2472
+ " 'comedic',\n",
2473
+ " 'comedy',\n",
2474
+ " 'comes',\n",
2475
+ " 'comet',\n",
2476
+ " 'comfort',\n",
2477
+ " 'comic',\n",
2478
+ " 'comics',\n",
2479
+ " 'coming',\n",
2480
+ " 'comingofage',\n",
2481
+ " 'comingout',\n",
2482
+ " 'command',\n",
2483
+ " 'commander',\n",
2484
+ " 'commercial',\n",
2485
+ " 'commit',\n",
2486
+ " 'commitment',\n",
2487
+ " 'committed',\n",
2488
+ " 'common',\n",
2489
+ " 'communication',\n",
2490
+ " 'community',\n",
2491
+ " 'companion',\n",
2492
+ " 'company',\n",
2493
+ " 'compete',\n",
2494
+ " 'competing',\n",
2495
+ " 'competition',\n",
2496
+ " 'complete',\n",
2497
+ " 'completely',\n",
2498
+ " 'complex',\n",
2499
+ " 'complicated',\n",
2500
+ " 'complications',\n",
2501
+ " 'composer',\n",
2502
+ " 'computer',\n",
2503
+ " 'computervirus',\n",
2504
+ " 'conan',\n",
2505
+ " 'concert',\n",
2506
+ " 'conclusion',\n",
2507
+ " 'condition',\n",
2508
+ " 'confession',\n",
2509
+ " 'confidence',\n",
2510
+ " 'confident',\n",
2511
+ " 'conflict',\n",
2512
+ " 'confront',\n",
2513
+ " 'confronted',\n",
2514
+ " 'confused',\n",
2515
+ " 'congress',\n",
2516
+ " 'conman',\n",
2517
+ " 'connected',\n",
2518
+ " 'connection',\n",
2519
+ " 'connell',\n",
2520
+ " 'connor',\n",
2521
+ " 'conquer',\n",
2522
+ " 'conscience',\n",
2523
+ " 'consequences',\n",
2524
+ " 'conservative',\n",
2525
+ " 'considered',\n",
2526
+ " 'conspiracy',\n",
2527
+ " 'constant',\n",
2528
+ " 'constantly',\n",
2529
+ " 'construction',\n",
2530
+ " 'contact',\n",
2531
+ " 'contain',\n",
2532
+ " 'contemporary',\n",
2533
+ " 'contend',\n",
2534
+ " 'contest',\n",
2535
+ " 'continue',\n",
2536
+ " 'continues',\n",
2537
+ " 'continuing',\n",
2538
+ " 'contract',\n",
2539
+ " 'control',\n",
2540
+ " 'controlled',\n",
2541
+ " 'controlling',\n",
2542
+ " 'controversial',\n",
2543
+ " 'convention',\n",
2544
+ " 'converge',\n",
2545
+ " 'convict',\n",
2546
+ " 'convicted',\n",
2547
+ " 'convince',\n",
2548
+ " 'convinced',\n",
2549
+ " 'convinces',\n",
2550
+ " 'cook',\n",
2551
+ " 'cooking',\n",
2552
+ " 'cool',\n",
2553
+ " 'cooper',\n",
2554
+ " 'cop',\n",
2555
+ " 'cope',\n",
2556
+ " 'cops',\n",
2557
+ " 'core',\n",
2558
+ " 'corner',\n",
2559
+ " 'corners',\n",
2560
+ " 'corporate',\n",
2561
+ " ...]"
2562
+ ]
2563
+ },
2564
+ "execution_count": 41,
2565
+ "metadata": {},
2566
+ "output_type": "execute_result"
2567
+ }
2568
+ ],
2569
+ "source": [
2570
+ "cv.get_feature_names()"
2571
+ ]
2572
+ },
2573
+ {
2574
+ "cell_type": "code",
2575
+ "execution_count": 42,
2576
+ "metadata": {},
2577
+ "outputs": [
2578
+ {
2579
+ "data": {
2580
+ "text/plain": [
2581
+ "array([[1. , 0. , 0. , ..., 0. , 0.02752409,\n",
2582
+ " 0. ],\n",
2583
+ " [0. , 1. , 0. , ..., 0.02865341, 0. ,\n",
2584
+ " 0. ],\n",
2585
+ " [0. , 0. , 1. , ..., 0.02865341, 0. ,\n",
2586
+ " 0. ],\n",
2587
+ " ...,\n",
2588
+ " [0. , 0.02865341, 0.02865341, ..., 1. , 0.048795 ,\n",
2589
+ " 0.05006262],\n",
2590
+ " [0.02752409, 0. , 0. , ..., 0.048795 , 1. ,\n",
2591
+ " 0.05129892],\n",
2592
+ " [0. , 0. , 0. , ..., 0.05006262, 0.05129892,\n",
2593
+ " 1. ]])"
2594
+ ]
2595
+ },
2596
+ "execution_count": 42,
2597
+ "metadata": {},
2598
+ "output_type": "execute_result"
2599
+ }
2600
+ ],
2601
+ "source": [
2602
+ "cosine_similarity(vectors)"
2603
+ ]
2604
+ },
2605
+ {
2606
+ "cell_type": "code",
2607
+ "execution_count": 43,
2608
+ "metadata": {},
2609
+ "outputs": [],
2610
+ "source": [
2611
+ "similarity = cosine_similarity(vectors)"
2612
+ ]
2613
+ },
2614
+ {
2615
+ "cell_type": "code",
2616
+ "execution_count": 44,
2617
+ "metadata": {},
2618
+ "outputs": [],
2619
+ "source": [
2620
+ "def recommend(movie):\n",
2621
+ " movie_index = new_df[new_df['title']== movie].index[0]\n",
2622
+ " distances = similarity[movie_index]\n",
2623
+ " movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]\n",
2624
+ "\n",
2625
+ " for i in movies_list:\n",
2626
+ " print(new_df.iloc[i[0]].title)"
2627
+ ]
2628
+ },
2629
+ {
2630
+ "cell_type": "code",
2631
+ "execution_count": 45,
2632
+ "metadata": {},
2633
+ "outputs": [
2634
+ {
2635
+ "name": "stdout",
2636
+ "output_type": "stream",
2637
+ "text": [
2638
+ "The Dark Knight\n",
2639
+ "The Dark Knight Rises\n",
2640
+ "Batman\n",
2641
+ "Batman v Superman: Dawn of Justice\n",
2642
+ "Batman\n"
2643
+ ]
2644
+ }
2645
+ ],
2646
+ "source": [
2647
+ "recommend('Batman Begins')"
2648
+ ]
2649
+ },
2650
+ {
2651
+ "cell_type": "code",
2652
+ "execution_count": null,
2653
+ "metadata": {},
2654
+ "outputs": [],
2655
+ "source": []
2656
+ }
2657
+ ],
2658
+ "metadata": {
2659
+ "kernelspec": {
2660
+ "display_name": "Python 3",
2661
+ "language": "python",
2662
+ "name": "python3"
2663
+ },
2664
+ "language_info": {
2665
+ "codemirror_mode": {
2666
+ "name": "ipython",
2667
+ "version": 3
2668
+ },
2669
+ "file_extension": ".py",
2670
+ "mimetype": "text/x-python",
2671
+ "name": "python",
2672
+ "nbconvert_exporter": "python",
2673
+ "pygments_lexer": "ipython3",
2674
+ "version": "3.10.4"
2675
+ },
2676
+ "orig_nbformat": 4,
2677
+ "vscode": {
2678
+ "interpreter": {
2679
+ "hash": "fb4569285eef3a3450cb62085a5b1e0da4bce0af555edc33dcf29baf3acc1368"
2680
+ }
2681
+ }
2682
+ },
2683
+ "nbformat": 4,
2684
+ "nbformat_minor": 2
2685
+ }
movie_dict.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8827151b0fa96483828c5f0a0dae75aa1f36b12522868e5b6b85eee9c4c51e07
3
+ size 2126498
movies.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b03ea838f9e4b56f3b8f00a49bdad1cf0f3aecaa884cf915b2bee2b05ebc1f
3
+ size 2145071
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pandas
2
+ streamlit
3
+
similarity.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17c9793a20ac603ddff7e3ff7b0c16c630013099918bcf8dc285c4cb5a5f6469
3
+ size 184781251
test2_database ADDED
Binary file (16.4 kB). View file
 
tmdb_5000_credits.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d0050599ff88d40366c4841204b1489862bca346bfa46c20b05a65d14508435
3
+ size 40044293
tmdb_5000_movies.csv ADDED
The diff for this file is too large to render. See raw diff