\n",
" "
]
},
"metadata": {},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "kYA5KUWDFcZ2",
"outputId": "0d7f210e-341e-4070-f2a6-ac4993658e96"
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" adult belongs_to_collection budget \\\n",
"0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
"1 False NaN 65000000 \n",
"2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
"3 False NaN 16000000 \n",
"4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
"... ... ... ... \n",
"45461 False NaN 0 \n",
"45462 False NaN 0 \n",
"45463 False NaN 0 \n",
"45464 False NaN 0 \n",
"45465 False NaN 0 \n",
"\n",
" genres \\\n",
"0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
"1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
"2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
"3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
"4 [{'id': 35, 'name': 'Comedy'}] \n",
"... ... \n",
"45461 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n",
"45462 [{'id': 18, 'name': 'Drama'}] \n",
"45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n",
"45464 [] \n",
"45465 [] \n",
"\n",
" homepage id imdb_id \\\n",
"0 http://toystory.disney.com/toy-story 862 tt0114709 \n",
"1 NaN 8844 tt0113497 \n",
"2 NaN 15602 tt0113228 \n",
"3 NaN 31357 tt0114885 \n",
"4 NaN 11862 tt0113041 \n",
"... ... ... ... \n",
"45461 http://www.imdb.com/title/tt6209470/ 439050 tt6209470 \n",
"45462 NaN 111109 tt2028550 \n",
"45463 NaN 67758 tt0303758 \n",
"45464 NaN 227506 tt0008536 \n",
"45465 NaN 461257 tt6980792 \n",
"\n",
" original_language original_title \\\n",
"0 en Toy Story \n",
"1 en Jumanji \n",
"2 en Grumpier Old Men \n",
"3 en Waiting to Exhale \n",
"4 en Father of the Bride Part II \n",
"... ... ... \n",
"45461 fa رگ خواب \n",
"45462 tl Siglo ng Pagluluwal \n",
"45463 en Betrayal \n",
"45464 en Satana likuyushchiy \n",
"45465 en Queerama \n",
"\n",
" overview ... release_date \\\n",
"0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
"1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
"2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
"3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
"4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
"... ... ... ... \n",
"45461 Rising and falling between a man and woman. ... NaN \n",
"45462 An artist struggles to finish his work while a... ... 2011-11-17 \n",
"45463 When one of her hits goes wrong, a professiona... ... 2003-08-01 \n",
"45464 In a small town live two brothers, one a minis... ... 1917-10-21 \n",
"45465 50 years after decriminalisation of homosexual... ... 2017-06-09 \n",
"\n",
" revenue runtime spoken_languages \\\n",
"0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
"2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"... ... ... ... \n",
"45461 0.0 90.0 [{'iso_639_1': 'fa', 'name': 'فارسی'}] \n",
"45462 0.0 360.0 [{'iso_639_1': 'tl', 'name': ''}] \n",
"45463 0.0 90.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"45464 0.0 87.0 [] \n",
"45465 0.0 75.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"\n",
" status tagline \\\n",
"0 Released NaN \n",
"1 Released Roll the dice and unleash the excitement! \n",
"2 Released Still Yelling. Still Fighting. Still Ready for... \n",
"3 Released Friends are the people who let you be yourself... \n",
"4 Released Just When His World Is Back To Normal... He's ... \n",
"... ... ... \n",
"45461 Released Rising and falling between a man and woman \n",
"45462 Released NaN \n",
"45463 Released A deadly game of wits. \n",
"45464 Released NaN \n",
"45465 Released NaN \n",
"\n",
" title video vote_average vote_count \n",
"0 Toy Story False 7.7 5415.0 \n",
"1 Jumanji False 6.9 2413.0 \n",
"2 Grumpier Old Men False 6.5 92.0 \n",
"3 Waiting to Exhale False 6.1 34.0 \n",
"4 Father of the Bride Part II False 5.7 173.0 \n",
"... ... ... ... ... \n",
"45461 Subdue False 4.0 1.0 \n",
"45462 Century of Birthing False 9.0 3.0 \n",
"45463 Betrayal False 3.8 6.0 \n",
"45464 Satan Triumphant False 0.0 0.0 \n",
"45465 Queerama False 0.0 0.0 \n",
"\n",
"[45466 rows x 24 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
adult
\n",
"
belongs_to_collection
\n",
"
budget
\n",
"
genres
\n",
"
homepage
\n",
"
id
\n",
"
imdb_id
\n",
"
original_language
\n",
"
original_title
\n",
"
overview
\n",
"
...
\n",
"
release_date
\n",
"
revenue
\n",
"
runtime
\n",
"
spoken_languages
\n",
"
status
\n",
"
tagline
\n",
"
title
\n",
"
video
\n",
"
vote_average
\n",
"
vote_count
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
False
\n",
"
{'id': 10194, 'name': 'Toy Story Collection', ...
\n",
"
30000000
\n",
"
[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
\n",
"
http://toystory.disney.com/toy-story
\n",
"
862
\n",
"
tt0114709
\n",
"
en
\n",
"
Toy Story
\n",
"
Led by Woody, Andy's toys live happily in his ...
\n",
"
...
\n",
"
1995-10-30
\n",
"
373554033.0
\n",
"
81.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
NaN
\n",
"
Toy Story
\n",
"
False
\n",
"
7.7
\n",
"
5415.0
\n",
"
\n",
"
\n",
"
1
\n",
"
False
\n",
"
NaN
\n",
"
65000000
\n",
"
[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
\n",
"
NaN
\n",
"
8844
\n",
"
tt0113497
\n",
"
en
\n",
"
Jumanji
\n",
"
When siblings Judy and Peter discover an encha...
\n",
"
...
\n",
"
1995-12-15
\n",
"
262797249.0
\n",
"
104.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}, {'iso...
\n",
"
Released
\n",
"
Roll the dice and unleash the excitement!
\n",
"
Jumanji
\n",
"
False
\n",
"
6.9
\n",
"
2413.0
\n",
"
\n",
"
\n",
"
2
\n",
"
False
\n",
"
{'id': 119050, 'name': 'Grumpy Old Men Collect...
\n",
"
0
\n",
"
[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
\n",
"
NaN
\n",
"
15602
\n",
"
tt0113228
\n",
"
en
\n",
"
Grumpier Old Men
\n",
"
A family wedding reignites the ancient feud be...
\n",
"
...
\n",
"
1995-12-22
\n",
"
0.0
\n",
"
101.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Still Yelling. Still Fighting. Still Ready for...
\n",
"
Grumpier Old Men
\n",
"
False
\n",
"
6.5
\n",
"
92.0
\n",
"
\n",
"
\n",
"
3
\n",
"
False
\n",
"
NaN
\n",
"
16000000
\n",
"
[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
\n",
"
NaN
\n",
"
31357
\n",
"
tt0114885
\n",
"
en
\n",
"
Waiting to Exhale
\n",
"
Cheated on, mistreated and stepped on, the wom...
\n",
"
...
\n",
"
1995-12-22
\n",
"
81452156.0
\n",
"
127.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Friends are the people who let you be yourself...
\n",
"
Waiting to Exhale
\n",
"
False
\n",
"
6.1
\n",
"
34.0
\n",
"
\n",
"
\n",
"
4
\n",
"
False
\n",
"
{'id': 96871, 'name': 'Father of the Bride Col...
\n",
"
0
\n",
"
[{'id': 35, 'name': 'Comedy'}]
\n",
"
NaN
\n",
"
11862
\n",
"
tt0113041
\n",
"
en
\n",
"
Father of the Bride Part II
\n",
"
Just when George Banks has recovered from his ...
\n",
"
...
\n",
"
1995-02-10
\n",
"
76578911.0
\n",
"
106.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Just When His World Is Back To Normal... He's ...
\n",
"
Father of the Bride Part II
\n",
"
False
\n",
"
5.7
\n",
"
173.0
\n",
"
\n",
"
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
...
\n",
"
\n",
"
\n",
"
45461
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...
\n",
"
http://www.imdb.com/title/tt6209470/
\n",
"
439050
\n",
"
tt6209470
\n",
"
fa
\n",
"
رگ خواب
\n",
"
Rising and falling between a man and woman.
\n",
"
...
\n",
"
NaN
\n",
"
0.0
\n",
"
90.0
\n",
"
[{'iso_639_1': 'fa', 'name': 'فارسی'}]
\n",
"
Released
\n",
"
Rising and falling between a man and woman
\n",
"
Subdue
\n",
"
False
\n",
"
4.0
\n",
"
1.0
\n",
"
\n",
"
\n",
"
45462
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
[{'id': 18, 'name': 'Drama'}]
\n",
"
NaN
\n",
"
111109
\n",
"
tt2028550
\n",
"
tl
\n",
"
Siglo ng Pagluluwal
\n",
"
An artist struggles to finish his work while a...
\n",
"
...
\n",
"
2011-11-17
\n",
"
0.0
\n",
"
360.0
\n",
"
[{'iso_639_1': 'tl', 'name': ''}]
\n",
"
Released
\n",
"
NaN
\n",
"
Century of Birthing
\n",
"
False
\n",
"
9.0
\n",
"
3.0
\n",
"
\n",
"
\n",
"
45463
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
\n",
"
NaN
\n",
"
67758
\n",
"
tt0303758
\n",
"
en
\n",
"
Betrayal
\n",
"
When one of her hits goes wrong, a professiona...
\n",
"
...
\n",
"
2003-08-01
\n",
"
0.0
\n",
"
90.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
A deadly game of wits.
\n",
"
Betrayal
\n",
"
False
\n",
"
3.8
\n",
"
6.0
\n",
"
\n",
"
\n",
"
45464
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
[]
\n",
"
NaN
\n",
"
227506
\n",
"
tt0008536
\n",
"
en
\n",
"
Satana likuyushchiy
\n",
"
In a small town live two brothers, one a minis...
\n",
"
...
\n",
"
1917-10-21
\n",
"
0.0
\n",
"
87.0
\n",
"
[]
\n",
"
Released
\n",
"
NaN
\n",
"
Satan Triumphant
\n",
"
False
\n",
"
0.0
\n",
"
0.0
\n",
"
\n",
"
\n",
"
45465
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
[]
\n",
"
NaN
\n",
"
461257
\n",
"
tt6980792
\n",
"
en
\n",
"
Queerama
\n",
"
50 years after decriminalisation of homosexual...
\n",
"
...
\n",
"
2017-06-09
\n",
"
0.0
\n",
"
75.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
NaN
\n",
"
Queerama
\n",
"
False
\n",
"
0.0
\n",
"
0.0
\n",
"
\n",
" \n",
"
\n",
"
45466 rows × 24 columns
\n",
"
\n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 9
}
],
"source": [
"metadata = pd.read_csv('/content/IMDB/movies_metadata.csv')\n",
"metadata"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7Y3S-BFBTTCY"
},
"source": [
"Keep only released movies, and only the columns that are relevant to the analysis:"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "3zJqU8dUTTtY"
},
"outputs": [],
"source": [
"# Columns kept for the rest of the analysis; every other column is dropped.\n",
"cols = ['adult', 'belongs_to_collection', 'genres', 'id', 'original_language',\n",
"        'title', 'production_countries', 'production_companies', 'video']\n",
"\n",
"# Select the released movies and the wanted columns in a single step. The explicit\n",
"# copy makes `metadata` an independent frame, so the column assignments in the\n",
"# following cells do not act on a slice of the raw frame.\n",
"metadata = metadata.loc[metadata['status'] == 'Released', cols].copy()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WiV8m-zP7dxy",
"outputId": "ec13b127-a699-47a5-ce02-3dda6c354267"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"adult False\n",
"belongs_to_collection NaN\n",
"genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n",
"id 8844\n",
"original_language en\n",
"title Jumanji\n",
"production_countries [{'iso_3166_1': 'US', 'name': 'United States o...\n",
"production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...\n",
"video False\n",
"Name: 1, dtype: object"
]
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"metadata.iloc[1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bBdpvpIl4vqJ",
"outputId": "a118ca77-0276-4408-f1df-6896507e8131"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"adult False\n",
"belongs_to_collection \n",
"genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n",
"id 8844\n",
"original_language en\n",
"title Jumanji\n",
"production_countries [{'iso_3166_1': 'US', 'name': 'United States o...\n",
"production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...\n",
"video False\n",
"Name: 1, dtype: object"
]
},
"metadata": {},
"execution_count": 12
}
],
"source": [
"import ast\n",
"\n",
"\n",
"def find_collection(x):\n",
"    \"\"\"Return the collection name stored in one `belongs_to_collection` value.\n",
"\n",
"    Args:\n",
"        x: '' for a movie without a collection, otherwise the text of a Python\n",
"            dict literal that has a 'name' key.\n",
"\n",
"    Returns:\n",
"        The value stored under 'name', or '' when `x` is ''.\n",
"\n",
"    Raises:\n",
"        ValueError or SyntaxError: if `x` is not a valid Python literal.\n",
"    \"\"\"\n",
"    if x == '':\n",
"        return ''\n",
"    # literal_eval only parses literals, so text read from the CSV can never be\n",
"    # executed as code (which eval would do).\n",
"    return ast.literal_eval(str(x))['name']\n",
"\n",
"\n",
"# Missing collections become '' first, which find_collection passes through unchanged.\n",
"metadata['belongs_to_collection'] = (\n",
"    metadata['belongs_to_collection'].fillna('').apply(find_collection)\n",
")\n",
"metadata.iloc[1]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "VRqAmj5aABKi",
"outputId": "8aff7bec-5c5c-4829-8422-f19d264542e6"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"adult False\n",
"belongs_to_collection \n",
"genres Adventure,Fantasy,Family\n",
"id 8844\n",
"original_language en\n",
"title Jumanji\n",
"production_countries United States of America\n",
"production_companies TriStar Pictures,Teitler Film,Interscope Commu...\n",
"video False\n",
"Name: 1, dtype: object"
]
},
"metadata": {},
"execution_count": 13
}
],
"source": [
"import ast\n",
"\n",
"\n",
"def find_names(x):\n",
"    \"\"\"Join the 'name' of every record in a serialized list into one string.\n",
"\n",
"    Args:\n",
"        x: '' for a missing value, otherwise the text of a Python list literal\n",
"            whose items are dicts with a 'name' key.\n",
"\n",
"    Returns:\n",
"        The names joined with ',' in their original order; '' when `x` is ''\n",
"        or the list is empty.\n",
"\n",
"    Raises:\n",
"        ValueError or SyntaxError: if `x` is not a valid Python literal.\n",
"    \"\"\"\n",
"    if x == '':\n",
"        return ''\n",
"    # Parse once, and only as a literal: text read from the CSV must never be\n",
"    # executed as code (which eval would do).\n",
"    records = ast.literal_eval(str(x))\n",
"    return ','.join(record['name'] for record in records)\n",
"\n",
"\n",
"# Missing values become '' first; otherwise find_names would try to parse the text 'nan'.\n",
"for list_col in ['genres', 'production_countries', 'production_companies']:\n",
"    metadata[list_col] = metadata[list_col].fillna('').apply(find_names)\n",
"credits['cast'] = credits['cast'].apply(find_names)\n",
"metadata.iloc[1]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "GfHP8lcEzi6c",
"outputId": "4be60bff-1d92-4333-bfe8-4d5105e9858a"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"adult False\n",
"belongs_to_collection \n",
"genres Adventure,Fantasy,Family\n",
"id 8844\n",
"original_language en\n",
"title Jumanji\n",
"production_countries United States of America\n",
"production_companies TriStar Pictures,Teitler Film,Interscope Commu...\n",
"video False\n",
"keywords board game,disappearance,based on children's b...\n",
"Name: 1, dtype: object"
]
},
"metadata": {},
"execution_count": 14
}
],
"source": [
"keywords['keywords'] = keywords['keywords'].apply(find_names)\n",
"metadata['id'] = metadata['id'].astype(int)\n",
"# Duplicate ids in either table make an inner merge emit one row per matching pair,\n",
"# which inflates the row count beyond the number of movies. Keep one row per id\n",
"# (the first occurrence) on both sides before merging.\n",
"metadata = pd.merge(\n",
"    metadata.drop_duplicates(subset='id'),\n",
"    keywords.drop_duplicates(subset='id'),\n",
"    how='inner',\n",
"    on='id',\n",
")\n",
"metadata.iloc[1]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "_sNely8jO2Co"
},
"outputs": [],
"source": [
"def to_int(x):\n",
"    \"\"\"Map a truth flag to 1 or 0.\n",
"\n",
"    Args:\n",
"        x: a flag given either as the text 'True'/'False' or as a real boolean.\n",
"\n",
"    Returns:\n",
"        1 when `x` is the string 'True' or the boolean True, otherwise 0.\n",
"    \"\"\"\n",
"    # Compare the text form: a real boolean True is not equal to the string 'True',\n",
"    # so testing `x == 'True'` alone would turn every boolean flag into 0.\n",
"    return 1 if str(x) == 'True' else 0"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iUHJJHwyHcz-",
"outputId": "798f1640-33c5-42a5-9d2c-d5c757bdc539"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array(['False', 'True'], dtype=object)"
]
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"metadata['adult'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kB8thP2fJ9Af"
},
"source": [
"Three rows have a value other than `True` or `False` in the `adult` column. They were entered by mistake, so we remove those rows."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "U1d3Z-88KPYW",
"outputId": "a6b4bfec-73fa-488e-d004-1fc5f1c3c461"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([False, True], dtype=object)"
]
},
"metadata": {},
"execution_count": 17
}
],
"source": [
"# Keep the rows whose `adult` value is exactly the text 'True' or 'False',\n",
"# then encode that flag as 1/0.\n",
"has_valid_adult = metadata['adult'].isin(['True', 'False'])\n",
"metadata = metadata[has_valid_adult]\n",
"metadata['adult'] = metadata['adult'].apply(to_int)\n",
"# Show the distinct `video` values before that column is cleaned.\n",
"metadata['video'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "ifUvKXYbQi2I"
},
"source": [
"Remove the rows whose `video` value is NaN, then replace `True` and `False` with 1 and 0:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"id": "72DHRHQLLIxQ"
},
"outputs": [],
"source": [
"# Drop the rows that have no `video` value.\n",
"metadata = metadata[metadata['video'].notna()]\n",
"# `video` holds real booleans rather than the text 'True'/'False'. Convert the values\n",
"# to text before calling to_int so that a boolean True is encoded as 1, not 0.\n",
"metadata['video'] = metadata['video'].astype(str).apply(to_int)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3XUewQIcKkv_"
},
"source": [
"## Vectorize string features"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 661
},
"id": "P0n1lJnUKj_-",
"outputId": "f72d7774-db3d-4de7-bd56-8a7316b6dcc1"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" adult belongs_to_collection genres \\\n",
"0 0 Toy Story Collection Animation,Comedy,Family \n",
"1 0 Adventure,Fantasy,Family \n",
"2 0 Grumpy Old Men Collection Romance,Comedy \n",
"3 0 Comedy,Drama,Romance \n",
"4 0 Father of the Bride Collection Comedy \n",
"... ... ... ... \n",
"46017 0 Drama,Family \n",
"46018 0 Drama \n",
"46019 0 Action,Drama,Thriller \n",
"46020 0 \n",
"46021 0 \n",
"\n",
" id original_language title \\\n",
"0 862 en Toy Story \n",
"1 8844 en Jumanji \n",
"2 15602 en Grumpier Old Men \n",
"3 31357 en Waiting to Exhale \n",
"4 11862 en Father of the Bride Part II \n",
"... ... ... ... \n",
"46017 439050 fa Subdue \n",
"46018 111109 tl Century of Birthing \n",
"46019 67758 en Betrayal \n",
"46020 227506 en Satan Triumphant \n",
"46021 461257 en Queerama \n",
"\n",
" production_countries \\\n",
"0 United States of America \n",
"1 United States of America \n",
"2 United States of America \n",
"3 United States of America \n",
"4 United States of America \n",
"... ... \n",
"46017 Iran \n",
"46018 Philippines \n",
"46019 United States of America \n",
"46020 Russia \n",
"46021 United Kingdom \n",
"\n",
" production_companies video \\\n",
"0 Pixar Animation Studios 0 \n",
"1 TriStar Pictures,Teitler Film,Interscope Commu... 0 \n",
"2 Warner Bros.,Lancaster Gate 0 \n",
"3 Twentieth Century Fox Film Corporation 0 \n",
"4 Sandollar Productions,Touchstone Pictures 0 \n",
"... ... ... \n",
"46017 0 \n",
"46018 Sine Olivia 0 \n",
"46019 American World Pictures 0 \n",
"46020 Yermoliev 0 \n",
"46021 0 \n",
"\n",
" keywords \n",
"0 jealousy,toy,boy,friendship,friends,rivalry,bo... \n",
"1 board game,disappearance,based on children's b... \n",
"2 fishing,best friend,duringcreditsstinger,old men \n",
"3 based on novel,interracial relationship,single... \n",
"4 baby,midlife crisis,confidence,aging,daughter,... \n",
"... ... \n",
"46017 tragic love \n",
"46018 artist,play,pinoy \n",
"46019 \n",
"46020 \n",
"46021 \n",
"\n",
"[46022 rows x 10 columns]"
],
"text/html": [
"\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 46
}
]
},
{
"cell_type": "code",
"source": [
"movie = pd.read_csv('/content/IMDB/movies_metadata.csv')\n",
"movie.head()"
],
"metadata": {
"id": "oIeGPmPI1tAk",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 787
},
"outputId": "4ab975fd-8432-418d-c826-0e85e14b6704"
},
"execution_count": 47,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n",
" exec(code_obj, self.user_global_ns, self.user_ns)\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
" adult belongs_to_collection budget \\\n",
"0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
"1 False NaN 65000000 \n",
"2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
"3 False NaN 16000000 \n",
"4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
"\n",
" genres \\\n",
"0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
"1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
"2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
"3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
"4 [{'id': 35, 'name': 'Comedy'}] \n",
"\n",
" homepage id imdb_id original_language \\\n",
"0 http://toystory.disney.com/toy-story 862 tt0114709 en \n",
"1 NaN 8844 tt0113497 en \n",
"2 NaN 15602 tt0113228 en \n",
"3 NaN 31357 tt0114885 en \n",
"4 NaN 11862 tt0113041 en \n",
"\n",
" original_title \\\n",
"0 Toy Story \n",
"1 Jumanji \n",
"2 Grumpier Old Men \n",
"3 Waiting to Exhale \n",
"4 Father of the Bride Part II \n",
"\n",
" overview ... release_date \\\n",
"0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
"1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
"2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
"3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
"4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
"\n",
" revenue runtime spoken_languages \\\n",
"0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
"2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"\n",
" status tagline \\\n",
"0 Released NaN \n",
"1 Released Roll the dice and unleash the excitement! \n",
"2 Released Still Yelling. Still Fighting. Still Ready for... \n",
"3 Released Friends are the people who let you be yourself... \n",
"4 Released Just When His World Is Back To Normal... He's ... \n",
"\n",
" title video vote_average vote_count \n",
"0 Toy Story False 7.7 5415.0 \n",
"1 Jumanji False 6.9 2413.0 \n",
"2 Grumpier Old Men False 6.5 92.0 \n",
"3 Waiting to Exhale False 6.1 34.0 \n",
"4 Father of the Bride Part II False 5.7 173.0 \n",
"\n",
"[5 rows x 24 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
adult
\n",
"
belongs_to_collection
\n",
"
budget
\n",
"
genres
\n",
"
homepage
\n",
"
id
\n",
"
imdb_id
\n",
"
original_language
\n",
"
original_title
\n",
"
overview
\n",
"
...
\n",
"
release_date
\n",
"
revenue
\n",
"
runtime
\n",
"
spoken_languages
\n",
"
status
\n",
"
tagline
\n",
"
title
\n",
"
video
\n",
"
vote_average
\n",
"
vote_count
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
False
\n",
"
{'id': 10194, 'name': 'Toy Story Collection', ...
\n",
"
30000000
\n",
"
[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
\n",
"
http://toystory.disney.com/toy-story
\n",
"
862
\n",
"
tt0114709
\n",
"
en
\n",
"
Toy Story
\n",
"
Led by Woody, Andy's toys live happily in his ...
\n",
"
...
\n",
"
1995-10-30
\n",
"
373554033.0
\n",
"
81.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
NaN
\n",
"
Toy Story
\n",
"
False
\n",
"
7.7
\n",
"
5415.0
\n",
"
\n",
"
\n",
"
1
\n",
"
False
\n",
"
NaN
\n",
"
65000000
\n",
"
[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
\n",
"
NaN
\n",
"
8844
\n",
"
tt0113497
\n",
"
en
\n",
"
Jumanji
\n",
"
When siblings Judy and Peter discover an encha...
\n",
"
...
\n",
"
1995-12-15
\n",
"
262797249.0
\n",
"
104.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}, {'iso...
\n",
"
Released
\n",
"
Roll the dice and unleash the excitement!
\n",
"
Jumanji
\n",
"
False
\n",
"
6.9
\n",
"
2413.0
\n",
"
\n",
"
\n",
"
2
\n",
"
False
\n",
"
{'id': 119050, 'name': 'Grumpy Old Men Collect...
\n",
"
0
\n",
"
[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
\n",
"
NaN
\n",
"
15602
\n",
"
tt0113228
\n",
"
en
\n",
"
Grumpier Old Men
\n",
"
A family wedding reignites the ancient feud be...
\n",
"
...
\n",
"
1995-12-22
\n",
"
0.0
\n",
"
101.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Still Yelling. Still Fighting. Still Ready for...
\n",
"
Grumpier Old Men
\n",
"
False
\n",
"
6.5
\n",
"
92.0
\n",
"
\n",
"
\n",
"
3
\n",
"
False
\n",
"
NaN
\n",
"
16000000
\n",
"
[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
\n",
"
NaN
\n",
"
31357
\n",
"
tt0114885
\n",
"
en
\n",
"
Waiting to Exhale
\n",
"
Cheated on, mistreated and stepped on, the wom...
\n",
"
...
\n",
"
1995-12-22
\n",
"
81452156.0
\n",
"
127.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Friends are the people who let you be yourself...
\n",
"
Waiting to Exhale
\n",
"
False
\n",
"
6.1
\n",
"
34.0
\n",
"
\n",
"
\n",
"
4
\n",
"
False
\n",
"
{'id': 96871, 'name': 'Father of the Bride Col...
\n",
"
0
\n",
"
[{'id': 35, 'name': 'Comedy'}]
\n",
"
NaN
\n",
"
11862
\n",
"
tt0113041
\n",
"
en
\n",
"
Father of the Bride Part II
\n",
"
Just when George Banks has recovered from his ...
\n",
"
...
\n",
"
1995-02-10
\n",
"
76578911.0
\n",
"
106.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Just When His World Is Back To Normal... He's ...
\n",
"
Father of the Bride Part II
\n",
"
False
\n",
"
5.7
\n",
"
173.0
\n",
"
\n",
" \n",
"
\n",
"
5 rows × 24 columns
\n",
"
\n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 47
}
]
},
{
"cell_type": "code",
"source": [
"movie = movie.rename(columns={'id': 'movieId'})"
],
"metadata": {
"id": "rmDYxAgOgRNj"
},
"execution_count": 48,
"outputs": []
},
{
"cell_type": "code",
"source": [
"movie.shape"
],
"metadata": {
"id": "DoSsZcRpjo7Y",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "2c9f7194-d614-443b-d091-5f68f0e90655"
},
"execution_count": 49,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(45466, 24)"
]
},
"metadata": {},
"execution_count": 49
}
]
},
{
"cell_type": "code",
"source": [
"movie.head()"
],
"metadata": {
"id": "6XmWaDvFgeGU",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 750
},
"outputId": "8be37316-480a-43d3-82fa-40b236c09c26"
},
"execution_count": 50,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" adult belongs_to_collection budget \\\n",
"0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
"1 False NaN 65000000 \n",
"2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
"3 False NaN 16000000 \n",
"4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
"\n",
" genres \\\n",
"0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
"1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
"2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
"3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
"4 [{'id': 35, 'name': 'Comedy'}] \n",
"\n",
" homepage movieId imdb_id original_language \\\n",
"0 http://toystory.disney.com/toy-story 862 tt0114709 en \n",
"1 NaN 8844 tt0113497 en \n",
"2 NaN 15602 tt0113228 en \n",
"3 NaN 31357 tt0114885 en \n",
"4 NaN 11862 tt0113041 en \n",
"\n",
" original_title \\\n",
"0 Toy Story \n",
"1 Jumanji \n",
"2 Grumpier Old Men \n",
"3 Waiting to Exhale \n",
"4 Father of the Bride Part II \n",
"\n",
" overview ... release_date \\\n",
"0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
"1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
"2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
"3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
"4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
"\n",
" revenue runtime spoken_languages \\\n",
"0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
"2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"\n",
" status tagline \\\n",
"0 Released NaN \n",
"1 Released Roll the dice and unleash the excitement! \n",
"2 Released Still Yelling. Still Fighting. Still Ready for... \n",
"3 Released Friends are the people who let you be yourself... \n",
"4 Released Just When His World Is Back To Normal... He's ... \n",
"\n",
" title video vote_average vote_count \n",
"0 Toy Story False 7.7 5415.0 \n",
"1 Jumanji False 6.9 2413.0 \n",
"2 Grumpier Old Men False 6.5 92.0 \n",
"3 Waiting to Exhale False 6.1 34.0 \n",
"4 Father of the Bride Part II False 5.7 173.0 \n",
"\n",
"[5 rows x 24 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
adult
\n",
"
belongs_to_collection
\n",
"
budget
\n",
"
genres
\n",
"
homepage
\n",
"
movieId
\n",
"
imdb_id
\n",
"
original_language
\n",
"
original_title
\n",
"
overview
\n",
"
...
\n",
"
release_date
\n",
"
revenue
\n",
"
runtime
\n",
"
spoken_languages
\n",
"
status
\n",
"
tagline
\n",
"
title
\n",
"
video
\n",
"
vote_average
\n",
"
vote_count
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
False
\n",
"
{'id': 10194, 'name': 'Toy Story Collection', ...
\n",
"
30000000
\n",
"
[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
\n",
"
http://toystory.disney.com/toy-story
\n",
"
862
\n",
"
tt0114709
\n",
"
en
\n",
"
Toy Story
\n",
"
Led by Woody, Andy's toys live happily in his ...
\n",
"
...
\n",
"
1995-10-30
\n",
"
373554033.0
\n",
"
81.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
NaN
\n",
"
Toy Story
\n",
"
False
\n",
"
7.7
\n",
"
5415.0
\n",
"
\n",
"
\n",
"
1
\n",
"
False
\n",
"
NaN
\n",
"
65000000
\n",
"
[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
\n",
"
NaN
\n",
"
8844
\n",
"
tt0113497
\n",
"
en
\n",
"
Jumanji
\n",
"
When siblings Judy and Peter discover an encha...
\n",
"
...
\n",
"
1995-12-15
\n",
"
262797249.0
\n",
"
104.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}, {'iso...
\n",
"
Released
\n",
"
Roll the dice and unleash the excitement!
\n",
"
Jumanji
\n",
"
False
\n",
"
6.9
\n",
"
2413.0
\n",
"
\n",
"
\n",
"
2
\n",
"
False
\n",
"
{'id': 119050, 'name': 'Grumpy Old Men Collect...
\n",
"
0
\n",
"
[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
\n",
"
NaN
\n",
"
15602
\n",
"
tt0113228
\n",
"
en
\n",
"
Grumpier Old Men
\n",
"
A family wedding reignites the ancient feud be...
\n",
"
...
\n",
"
1995-12-22
\n",
"
0.0
\n",
"
101.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Still Yelling. Still Fighting. Still Ready for...
\n",
"
Grumpier Old Men
\n",
"
False
\n",
"
6.5
\n",
"
92.0
\n",
"
\n",
"
\n",
"
3
\n",
"
False
\n",
"
NaN
\n",
"
16000000
\n",
"
[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
\n",
"
NaN
\n",
"
31357
\n",
"
tt0114885
\n",
"
en
\n",
"
Waiting to Exhale
\n",
"
Cheated on, mistreated and stepped on, the wom...
\n",
"
...
\n",
"
1995-12-22
\n",
"
81452156.0
\n",
"
127.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Friends are the people who let you be yourself...
\n",
"
Waiting to Exhale
\n",
"
False
\n",
"
6.1
\n",
"
34.0
\n",
"
\n",
"
\n",
"
4
\n",
"
False
\n",
"
{'id': 96871, 'name': 'Father of the Bride Col...
\n",
"
0
\n",
"
[{'id': 35, 'name': 'Comedy'}]
\n",
"
NaN
\n",
"
11862
\n",
"
tt0113041
\n",
"
en
\n",
"
Father of the Bride Part II
\n",
"
Just when George Banks has recovered from his ...
\n",
"
...
\n",
"
1995-02-10
\n",
"
76578911.0
\n",
"
106.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Just When His World Is Back To Normal... He's ...
\n",
"
Father of the Bride Part II
\n",
"
False
\n",
"
5.7
\n",
"
173.0
\n",
"
\n",
" \n",
"
\n",
"
5 rows × 24 columns
\n",
"
\n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 50
}
]
},
{
"cell_type": "markdown",
"source": [
"### Data preprocessing"
],
"metadata": {
"id": "oD9RMwahqemy"
}
},
{
"cell_type": "markdown",
"source": [
"Three rows were entered by mistake (their `movieId` field holds a date instead of an id), so we remove those rows."
],
"metadata": {
"id": "Wy2LqLxnklN1"
}
},
{
"cell_type": "code",
"source": [
"# `movieId` values of the three malformed rows (a date where an id is expected).\n",
"bad_movie_ids = ['1997-08-20', '2012-09-29', '2014-01-01']\n",
"# Copy after filtering so that `movie` is an independent frame: assigning columns to\n",
"# the bare filtered slice is what makes pandas emit SettingWithCopyWarning later on.\n",
"movie = movie[~movie['movieId'].isin(bad_movie_ids)].copy()"
],
"metadata": {
"id": "OnIMWw3Nj3Dp"
},
"execution_count": 51,
"outputs": []
},
{
"cell_type": "code",
"source": [
"def find_names(x):\n",
" if x == '':\n",
" return ''\n",
" genre_arr = eval(str(x))\n",
" return ','.join(i['name'] for i in eval(str(x)))\n",
" \n",
"# Rebind `movie` to a new frame instead of assigning a column into it:\n",
"# `movie` comes from boolean filtering, and in-place column assignment on\n",
"# such a frame is what raises pandas' SettingWithCopyWarning.\n",
"# Missing genres become '' so find_names can short-circuit on them.\n",
"movie = movie.assign(genres=movie['genres'].fillna(''))"
],
"metadata": {
"id": "kO8m6SsepBIg",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "ff9d7d36-89d2-438e-f1d3-0b8f6cd8ea24"
},
"execution_count": 52,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
":7: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" movie['genres'] = movie['genres'].fillna('')\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Replace each stringified genre list with its comma-separated genre names.\n",
"movie['genres'] = movie['genres'].map(find_names)"
],
"metadata": {
"id": "vOfKcOQ-pBIg"
},
"execution_count": 53,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Cast the id column to an unsigned 64-bit integer dtype.\n",
"movie['movieId'] = movie['movieId'].astype('uint64')"
],
"metadata": {
"id": "0p-yhNZ3iRsl"
},
"execution_count": 54,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"Only keep ratings for movies that have metadata in the movie dataset."
],
"metadata": {
"id": "DWgywXEKuq2O"
}
},
{
"cell_type": "code",
"source": [
"# Inner-join on movieId to keep only ratings whose movie has metadata.\n",
"# Only the key column of `movie` is joined: the metadata columns are not\n",
"# needed on the ratings table, and copying them onto every rating row would\n",
"# multiply memory use for no benefit.\n",
"new_rating = pd.merge(rating, movie[['movieId']], how='inner', on='movieId')"
],
"metadata": {
"id": "psgzmBFLtcmx"
},
"execution_count": 55,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Keep only the user, movie and rating columns.\n",
"new_rating = new_rating[[\"userId\", \"movieId\", \"rating\"]]"
],
"metadata": {
"id": "z9DjgdvYuhOW"
},
"execution_count": 56,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Inspect the first rows of `movie` after the preprocessing steps.\n",
"movie.head()"
],
"metadata": {
"id": "gQ4VSPNUuFOc",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 750
},
"outputId": "d4245b96-1958-4704-e3eb-cc9b64effca3"
},
"execution_count": 57,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" adult belongs_to_collection budget \\\n",
"0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
"1 False NaN 65000000 \n",
"2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
"3 False NaN 16000000 \n",
"4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
"\n",
" genres homepage movieId \\\n",
"0 Animation,Comedy,Family http://toystory.disney.com/toy-story 862 \n",
"1 Adventure,Fantasy,Family NaN 8844 \n",
"2 Romance,Comedy NaN 15602 \n",
"3 Comedy,Drama,Romance NaN 31357 \n",
"4 Comedy NaN 11862 \n",
"\n",
" imdb_id original_language original_title \\\n",
"0 tt0114709 en Toy Story \n",
"1 tt0113497 en Jumanji \n",
"2 tt0113228 en Grumpier Old Men \n",
"3 tt0114885 en Waiting to Exhale \n",
"4 tt0113041 en Father of the Bride Part II \n",
"\n",
" overview ... release_date \\\n",
"0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
"1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
"2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
"3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
"4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
"\n",
" revenue runtime spoken_languages \\\n",
"0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
"2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
"\n",
" status tagline \\\n",
"0 Released NaN \n",
"1 Released Roll the dice and unleash the excitement! \n",
"2 Released Still Yelling. Still Fighting. Still Ready for... \n",
"3 Released Friends are the people who let you be yourself... \n",
"4 Released Just When His World Is Back To Normal... He's ... \n",
"\n",
" title video vote_average vote_count \n",
"0 Toy Story False 7.7 5415.0 \n",
"1 Jumanji False 6.9 2413.0 \n",
"2 Grumpier Old Men False 6.5 92.0 \n",
"3 Waiting to Exhale False 6.1 34.0 \n",
"4 Father of the Bride Part II False 5.7 173.0 \n",
"\n",
"[5 rows x 24 columns]"
],
"text/html": [
"\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 67
}
]
},
{
"cell_type": "markdown",
"source": [
"### Making recommendations"
],
"metadata": {
"id": "IDO7q6EjZ8q1"
}
},
{
"cell_type": "code",
"source": [
"def recommend_top_k(preds_df, ratings_df, movie, userId, k=10, exclude_rated=True):\n",
"    \"\"\"Return a user's rated movies and the top-k movies by predicted rating.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    preds_df : pandas.DataFrame\n",
"        Predicted ratings. The row at position ``userId - 1`` is read for the\n",
"        user, and its column labels are merged against ``movie`` on 'movieId'.\n",
"        NOTE(review): this assumes rows are ordered by userId starting at 1 and\n",
"        that the column index is named 'movieId' -- confirm where it is built.\n",
"    ratings_df : pandas.DataFrame\n",
"        Observed ratings with 'userId', 'movieId' and 'rating' columns.\n",
"    movie : pandas.DataFrame\n",
"        Movie metadata with a 'movieId' column.\n",
"    userId : int\n",
"        1-based user id.\n",
"    k : int, default 10\n",
"        Maximum number of recommendations to return.\n",
"    exclude_rated : bool, default True\n",
"        Drop movies the user has already rated from the recommendations.\n",
"        Pass False to rank every movie, rated or not.\n",
"\n",
"    Returns\n",
"    -------\n",
"    user_rated : pandas.DataFrame\n",
"        The user's ratings left-joined with ``movie``, highest rating first.\n",
"    user_preds : pandas.DataFrame\n",
"        Up to ``k`` rows of ``movie`` plus a 'prediction' column, highest\n",
"        prediction first.\n",
"\n",
"    Raises\n",
"    ------\n",
"    IndexError\n",
"        If ``userId`` does not map to a row of ``preds_df``.\n",
"    \"\"\"\n",
"    if not 1 <= userId <= len(preds_df):\n",
"        # Without this guard userId=0 would silently read the last row (iloc[-1]).\n",
"        raise IndexError(f'userId {userId} is outside 1..{len(preds_df)}')\n",
"    user_row = userId - 1\n",
"\n",
"    user_data = ratings_df[ratings_df.userId == userId]\n",
"    user_rated = (\n",
"        user_data\n",
"        .merge(movie, how='left', on='movieId')\n",
"        .sort_values(['rating'], ascending=False)\n",
"    )\n",
"\n",
"    # Name the score column explicitly instead of relying on the row label\n",
"    # of preds_df happening to equal user_row.\n",
"    user_scores = preds_df.iloc[user_row].rename('prediction').reset_index()\n",
"    user_preds = movie.merge(user_scores, how='left', on='movieId')\n",
"    # A movie without a predicted score cannot be ranked.\n",
"    user_preds = user_preds.dropna(subset=['prediction'])\n",
"    if exclude_rated:\n",
"        # Recommending a movie the user has already rated is not useful.\n",
"        already_rated = user_preds['movieId'].isin(user_data['movieId'])\n",
"        user_preds = user_preds[~already_rated]\n",
"    user_preds = user_preds.sort_values('prediction', ascending=False).iloc[:k, :]\n",
"    return user_rated, user_preds"
],
"metadata": {
"id": "0sZanc2nV7ot"
},
"execution_count": 68,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# Number of recommendations to request; also logged as a run parameter.\n",
"collaborative_k = 100\n",
"user_rated, user_preds = recommend_top_k(\n",
"    preds_df=weights_df,\n",
"    ratings_df=new_rating,\n",
"    movie=movie,\n",
"    userId=220,\n",
"    k=collaborative_k,\n",
")\n",
"mf.log_param('collaborative k', collaborative_k)"
],
"metadata": {
"id": "RWZBPk3QX4Hg",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "609849d1-ca57-4831-b2d8-10ae2d0504d4"
},
"execution_count": 69,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"100"
]
},
"metadata": {},
"execution_count": 69
}
]
},
{
"cell_type": "code",
"source": [
"# Peek at the highest-ranked predictions returned by recommend_top_k.\n",
"user_preds.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 698
},
"id": "xmk5e3xln_Xk",
"outputId": "e92bff6a-7b20-427a-fa05-1743c8e4a166"
},
"execution_count": 70,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" adult belongs_to_collection budget \\\n",
"6388 False {'id': 528, 'name': 'The Terminator Collection... 200000000 \n",
"3382 False NaN 0 \n",
"5325 False {'id': 86055, 'name': 'Men In Black Collection... 140000000 \n",
"4020 False NaN 8000000 \n",
"286 False {'id': 300546, 'name': 'Once were Warriors Col... 0 \n",
"\n",
" genres \\\n",
"6388 Action,Thriller,Science Fiction \n",
"3382 Drama,Science Fiction,Adventure,Mystery \n",
"5325 Action,Adventure,Comedy,Science Fiction \n",
"4020 Drama,Thriller \n",
"286 Drama \n",
"\n",
" homepage movieId imdb_id \\\n",
"6388 NaN 296 tt0181852 \n",
"3382 NaN 593 tt0069293 \n",
"5325 http://www.sonypictures.com/homevideo/meninbla... 608 tt0120912 \n",
"4020 NaN 318 tt0120753 \n",
"286 NaN 527 tt0110729 \n",
"\n",
" original_language original_title \\\n",
"6388 en Terminator 3: Rise of the Machines \n",
"3382 ru Солярис \n",
"5325 en Men in Black II \n",
"4020 en The Million Dollar Hotel \n",
"286 en Once Were Warriors \n",
"\n",
" overview ... revenue \\\n",
"6388 It's been 10 years since John Connor saved Ear... ... 435000000.0 \n",
"3382 Ground control has been receiving strange tran... ... 0.0 \n",
"5325 Kay and Jay reunite to provide our best, last ... ... 441818803.0 \n",
"4020 The Million Dollar Hotel starts with a jump fr... ... 0.0 \n",
"286 A drama about a Maori family lving in Auckland... ... 2201126.0 \n",
"\n",
" runtime spoken_languages status \\\n",
"6388 109.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
"3382 167.0 [{'iso_639_1': 'ru', 'name': 'Pусский'}] Released \n",
"5325 88.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
"4020 122.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
"286 99.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released \n",
"\n",
" tagline \\\n",
"6388 The Machines Will Rise. \n",
"3382 NaN \n",
"5325 Same Planet. New Scum. \n",
"4020 NaN \n",
"286 A family in crisis, a life in chaos... Nothing... \n",
"\n",
" title video vote_average vote_count \\\n",
"6388 Terminator 3: Rise of the Machines False 5.9 2177.0 \n",
"3382 Solaris False 7.7 364.0 \n",
"5325 Men in Black II False 6.1 3188.0 \n",
"4020 The Million Dollar Hotel False 5.9 76.0 \n",
"286 Once Were Warriors False 7.6 106.0 \n",
"\n",
" prediction \n",
"6388 4.792743 \n",
"3382 4.742942 \n",
"5325 4.647800 \n",
"4020 4.469385 \n",
"286 4.236960 \n",
"\n",
"[5 rows x 25 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
adult
\n",
"
belongs_to_collection
\n",
"
budget
\n",
"
genres
\n",
"
homepage
\n",
"
movieId
\n",
"
imdb_id
\n",
"
original_language
\n",
"
original_title
\n",
"
overview
\n",
"
...
\n",
"
revenue
\n",
"
runtime
\n",
"
spoken_languages
\n",
"
status
\n",
"
tagline
\n",
"
title
\n",
"
video
\n",
"
vote_average
\n",
"
vote_count
\n",
"
prediction
\n",
"
\n",
" \n",
" \n",
"
\n",
"
6388
\n",
"
False
\n",
"
{'id': 528, 'name': 'The Terminator Collection...
\n",
"
200000000
\n",
"
Action,Thriller,Science Fiction
\n",
"
NaN
\n",
"
296
\n",
"
tt0181852
\n",
"
en
\n",
"
Terminator 3: Rise of the Machines
\n",
"
It's been 10 years since John Connor saved Ear...
\n",
"
...
\n",
"
435000000.0
\n",
"
109.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
The Machines Will Rise.
\n",
"
Terminator 3: Rise of the Machines
\n",
"
False
\n",
"
5.9
\n",
"
2177.0
\n",
"
4.792743
\n",
"
\n",
"
\n",
"
3382
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
Drama,Science Fiction,Adventure,Mystery
\n",
"
NaN
\n",
"
593
\n",
"
tt0069293
\n",
"
ru
\n",
"
Солярис
\n",
"
Ground control has been receiving strange tran...
\n",
"
...
\n",
"
0.0
\n",
"
167.0
\n",
"
[{'iso_639_1': 'ru', 'name': 'Pусский'}]
\n",
"
Released
\n",
"
NaN
\n",
"
Solaris
\n",
"
False
\n",
"
7.7
\n",
"
364.0
\n",
"
4.742942
\n",
"
\n",
"
\n",
"
5325
\n",
"
False
\n",
"
{'id': 86055, 'name': 'Men In Black Collection...
\n",
"
140000000
\n",
"
Action,Adventure,Comedy,Science Fiction
\n",
"
http://www.sonypictures.com/homevideo/meninbla...
\n",
"
608
\n",
"
tt0120912
\n",
"
en
\n",
"
Men in Black II
\n",
"
Kay and Jay reunite to provide our best, last ...
\n",
"
...
\n",
"
441818803.0
\n",
"
88.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Same Planet. New Scum.
\n",
"
Men in Black II
\n",
"
False
\n",
"
6.1
\n",
"
3188.0
\n",
"
4.647800
\n",
"
\n",
"
\n",
"
4020
\n",
"
False
\n",
"
NaN
\n",
"
8000000
\n",
"
Drama,Thriller
\n",
"
NaN
\n",
"
318
\n",
"
tt0120753
\n",
"
en
\n",
"
The Million Dollar Hotel
\n",
"
The Million Dollar Hotel starts with a jump fr...
\n",
"
...
\n",
"
0.0
\n",
"
122.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
NaN
\n",
"
The Million Dollar Hotel
\n",
"
False
\n",
"
5.9
\n",
"
76.0
\n",
"
4.469385
\n",
"
\n",
"
\n",
"
286
\n",
"
False
\n",
"
{'id': 300546, 'name': 'Once were Warriors Col...
\n",
"
0
\n",
"
Drama
\n",
"
NaN
\n",
"
527
\n",
"
tt0110729
\n",
"
en
\n",
"
Once Were Warriors
\n",
"
A drama about a Maori family lving in Auckland...
\n",
"
...
\n",
"
2201126.0
\n",
"
99.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}, {'iso...
\n",
"
Released
\n",
"
A family in crisis, a life in chaos... Nothing...
\n",
"
Once Were Warriors
\n",
"
False
\n",
"
7.6
\n",
"
106.0
\n",
"
4.236960
\n",
"
\n",
" \n",
"
\n",
"
5 rows × 25 columns
\n",
"
\n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 70
}
]
},
{
"cell_type": "code",
"source": [
"# Peek at the user's rated movies, ordered by rating (highest first).\n",
"user_rated.head()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 577
},
"id": "x6ohhdH0sF0H",
"outputId": "5f8a17f7-3b50-48ad-9d10-809357834a3b"
},
"execution_count": 71,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" userId movieId rating adult belongs_to_collection budget \\\n",
"0 220 2294 5.0 False NaN 22000000 \n",
"46 220 1247 5.0 False NaN 85000000 \n",
"25 220 2762 5.0 False NaN 0 \n",
"27 220 260 5.0 False NaN 0 \n",
"59 220 2324 5.0 False NaN 3250000 \n",
"\n",
" genres homepage imdb_id \\\n",
"0 Comedy NaN tt0261392 \n",
"46 Drama,Thriller,History http://www.thegoodshepherdmovie.com/ tt0343737 \n",
"25 Drama,Crime NaN tt0029811 \n",
"27 Action,Thriller,Mystery NaN tt0026029 \n",
"59 Drama http://www.localcolormovie.com/ tt0472126 \n",
"\n",
" original_language ... release_date revenue runtime \\\n",
"0 en ... 2001-08-22 33788161.0 104.0 \n",
"46 en ... 2006-12-11 59908565.0 167.0 \n",
"25 en ... 1937-11-01 0.0 83.0 \n",
"27 en ... 1935-06-01 0.0 86.0 \n",
"59 en ... 2006-09-19 32788.0 107.0 \n",
"\n",
" spoken_languages status \\\n",
"0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
"46 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released \n",
"25 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
"27 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
"59 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
"\n",
" tagline \\\n",
"0 Hollywood had it coming \n",
"46 The untold story of the most powerful covert a... \n",
"25 A Brilliant Melodrama \n",
"27 Handcuffed to the girl who double-crossed him \n",
"59 NaN \n",
"\n",
" title video vote_average vote_count \n",
"0 Jay and Silent Bob Strike Back False 6.4 491.0 \n",
"46 The Good Shepherd False 6.3 342.0 \n",
"25 Young and Innocent False 6.8 42.0 \n",
"27 The 39 Steps False 7.4 217.0 \n",
"59 Local Color False 6.1 8.0 \n",
"\n",
"[5 rows x 26 columns]"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
userId
\n",
"
movieId
\n",
"
rating
\n",
"
adult
\n",
"
belongs_to_collection
\n",
"
budget
\n",
"
genres
\n",
"
homepage
\n",
"
imdb_id
\n",
"
original_language
\n",
"
...
\n",
"
release_date
\n",
"
revenue
\n",
"
runtime
\n",
"
spoken_languages
\n",
"
status
\n",
"
tagline
\n",
"
title
\n",
"
video
\n",
"
vote_average
\n",
"
vote_count
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
220
\n",
"
2294
\n",
"
5.0
\n",
"
False
\n",
"
NaN
\n",
"
22000000
\n",
"
Comedy
\n",
"
NaN
\n",
"
tt0261392
\n",
"
en
\n",
"
...
\n",
"
2001-08-22
\n",
"
33788161.0
\n",
"
104.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Hollywood had it coming
\n",
"
Jay and Silent Bob Strike Back
\n",
"
False
\n",
"
6.4
\n",
"
491.0
\n",
"
\n",
"
\n",
"
46
\n",
"
220
\n",
"
1247
\n",
"
5.0
\n",
"
False
\n",
"
NaN
\n",
"
85000000
\n",
"
Drama,Thriller,History
\n",
"
http://www.thegoodshepherdmovie.com/
\n",
"
tt0343737
\n",
"
en
\n",
"
...
\n",
"
2006-12-11
\n",
"
59908565.0
\n",
"
167.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}, {'iso...
\n",
"
Released
\n",
"
The untold story of the most powerful covert a...
\n",
"
The Good Shepherd
\n",
"
False
\n",
"
6.3
\n",
"
342.0
\n",
"
\n",
"
\n",
"
25
\n",
"
220
\n",
"
2762
\n",
"
5.0
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
Drama,Crime
\n",
"
NaN
\n",
"
tt0029811
\n",
"
en
\n",
"
...
\n",
"
1937-11-01
\n",
"
0.0
\n",
"
83.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
A Brilliant Melodrama
\n",
"
Young and Innocent
\n",
"
False
\n",
"
6.8
\n",
"
42.0
\n",
"
\n",
"
\n",
"
27
\n",
"
220
\n",
"
260
\n",
"
5.0
\n",
"
False
\n",
"
NaN
\n",
"
0
\n",
"
Action,Thriller,Mystery
\n",
"
NaN
\n",
"
tt0026029
\n",
"
en
\n",
"
...
\n",
"
1935-06-01
\n",
"
0.0
\n",
"
86.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
Handcuffed to the girl who double-crossed him
\n",
"
The 39 Steps
\n",
"
False
\n",
"
7.4
\n",
"
217.0
\n",
"
\n",
"
\n",
"
59
\n",
"
220
\n",
"
2324
\n",
"
5.0
\n",
"
False
\n",
"
NaN
\n",
"
3250000
\n",
"
Drama
\n",
"
http://www.localcolormovie.com/
\n",
"
tt0472126
\n",
"
en
\n",
"
...
\n",
"
2006-09-19
\n",
"
32788.0
\n",
"
107.0
\n",
"
[{'iso_639_1': 'en', 'name': 'English'}]
\n",
"
Released
\n",
"
NaN
\n",
"
Local Color
\n",
"
False
\n",
"
6.1
\n",
"
8.0
\n",
"
\n",
" \n",
"
\n",
"
5 rows × 26 columns
\n",
"
\n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 71
}
]
},
{
"cell_type": "code",
"source": [
"# Titles and genres of the first ten rows of the user's rated movies.\n",
"user_rated[[\"title\", \"genres\"]].head(10)"
],
"metadata": {
"id": "18grZyJYmG5q",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"outputId": "c064b2c7-9b04-4aae-b76f-58c1df0da2dc"
},
"execution_count": 72,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title genres\n",
"0 Jay and Silent Bob Strike Back Comedy\n",
"46 The Good Shepherd Drama,Thriller,History\n",
"25 Young and Innocent Drama,Crime\n",
"27 The 39 Steps Action,Thriller,Mystery\n",
"59 Local Color Drama\n",
"31 The Big Sleep Crime,Drama,Mystery,Thriller\n",
"33 The Talented Mr. Ripley Thriller,Crime,Drama\n",
"42 The Big Parade Drama,Romance,War\n",
"73 Dancer in the Dark Drama,Crime,Music\n",
"110 Birdman of Alcatraz Drama"
],
"text/html": [
"\n",
"
\n",
"
\n",
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
title
\n",
"
genres
\n",
"
\n",
" \n",
" \n",
"
\n",
"
0
\n",
"
Jay and Silent Bob Strike Back
\n",
"
Comedy
\n",
"
\n",
"
\n",
"
46
\n",
"
The Good Shepherd
\n",
"
Drama,Thriller,History
\n",
"
\n",
"
\n",
"
25
\n",
"
Young and Innocent
\n",
"
Drama,Crime
\n",
"
\n",
"
\n",
"
27
\n",
"
The 39 Steps
\n",
"
Action,Thriller,Mystery
\n",
"
\n",
"
\n",
"
59
\n",
"
Local Color
\n",
"
Drama
\n",
"
\n",
"
\n",
"
31
\n",
"
The Big Sleep
\n",
"
Crime,Drama,Mystery,Thriller
\n",
"
\n",
"
\n",
"
33
\n",
"
The Talented Mr. Ripley
\n",
"
Thriller,Crime,Drama
\n",
"
\n",
"
\n",
"
42
\n",
"
The Big Parade
\n",
"
Drama,Romance,War
\n",
"
\n",
"
\n",
"
73
\n",
"
Dancer in the Dark
\n",
"
Drama,Crime,Music
\n",
"
\n",
"
\n",
"
110
\n",
"
Birdman of Alcatraz
\n",
"
Drama
\n",
"
\n",
" \n",
"
\n",
"
\n",
" \n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
" "
]
},
"metadata": {},
"execution_count": 72
}
]
},
{
"cell_type": "code",
"source": [
"# Titles and genres of the first ten rows of the predicted recommendations.\n",
"user_preds[[\"title\", \"genres\"]].head(10)"
],
"metadata": {
"id": "Mtq_XCEFmLee",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 363
},
"outputId": "0cd7dd8d-2fde-4d76-f8c0-2131239e5738"
},
"execution_count": 73,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
" title \\\n",
"6388 Terminator 3: Rise of the Machines \n",
"3382 Solaris \n",
"5325 Men in Black II \n",
"4020 The Million Dollar Hotel \n",
"286 Once Were Warriors \n",
"2100 Young and Innocent \n",
"534 Sleepless in Seattle \n",
"2137 Say Anything... \n",
"11922 License to Wed \n",
"33911 The Tunnel \n",
"\n",
" genres \n",
"6388 Action,Thriller,Science Fiction \n",
"3382 Drama,Science Fiction,Adventure,Mystery \n",
"5325 Action,Adventure,Comedy,Science Fiction \n",
"4020 Drama,Thriller \n",
"286 Drama \n",
"2100 Drama,Crime \n",
"534 Comedy,Drama,Romance \n",
"2137 Comedy,Drama,Romance \n",
"11922 Comedy \n",
"33911 Science Fiction "
],
"text/html": [
"\n",
"