{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "be7a2809", "metadata": {}, "outputs": [], "source": [ "import ast\n", "import itertools\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "markdown", "id": "2666c747", "metadata": {}, "source": [ "# Loading dataset's" ] }, { "cell_type": "code", "execution_count": 2, "id": "860834e9", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[{'cast_id': 14, 'character': 'Woody (voice)',...[{'credit_id': '52fe4284c3a36847f8024f49', 'de...862
1[{'cast_id': 1, 'character': 'Alan Parrish', '...[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...8844
2[{'cast_id': 2, 'character': 'Max Goldman', 'c...[{'credit_id': '52fe466a9251416c75077a89', 'de...15602
3[{'cast_id': 1, 'character': \"Savannah 'Vannah...[{'credit_id': '52fe44779251416c91011acb', 'de...31357
4[{'cast_id': 1, 'character': 'George Banks', '...[{'credit_id': '52fe44959251416c75039ed7', 'de...11862
............
45471[{'cast_id': 0, 'character': '', 'credit_id': ...[{'credit_id': '5894a97d925141426c00818c', 'de...439050
45472[{'cast_id': 1002, 'character': 'Sister Angela...[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...111109
45473[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...[{'credit_id': '52fe4776c3a368484e0c8387', 'de...67758
45474[{'cast_id': 2, 'character': '', 'credit_id': ...[{'credit_id': '533bccebc3a36844cf0011a7', 'de...227506
45475[][{'credit_id': '593e676c92514105b702e68e', 'de...461257
\n", "

45476 rows × 3 columns

\n", "
" ], "text/plain": [ " cast \\\n", "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n", "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n", "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n", "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n", "4 [{'cast_id': 1, 'character': 'George Banks', '... \n", "... ... \n", "45471 [{'cast_id': 0, 'character': '', 'credit_id': ... \n", "45472 [{'cast_id': 1002, 'character': 'Sister Angela... \n", "45473 [{'cast_id': 6, 'character': 'Emily Shaw', 'cr... \n", "45474 [{'cast_id': 2, 'character': '', 'credit_id': ... \n", "45475 [] \n", "\n", " crew id \n", "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n", "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n", "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n", "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n", "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 \n", "... ... ... \n", "45471 [{'credit_id': '5894a97d925141426c00818c', 'de... 439050 \n", "45472 [{'credit_id': '52fe4af1c3a36847f81e9b15', 'de... 111109 \n", "45473 [{'credit_id': '52fe4776c3a368484e0c8387', 'de... 67758 \n", "45474 [{'credit_id': '533bccebc3a36844cf0011a7', 'de... 227506 \n", "45475 [{'credit_id': '593e676c92514105b702e68e', 'de... 461257 \n", "\n", "[45476 rows x 3 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits_ds = pd.read_csv('./dataset/IMDB/credits.csv')\n", "credits_ds" ] }, { "cell_type": "code", "execution_count": 3, "id": "0bf91af6", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdimdbIdtmdbId
01114709862.0
121134978844.0
2311322815602.0
3411488531357.0
4511304111862.0
............
91201626723859980402672.0
91211630564262980315011.0
91221639492531318391698.0
912316497727660137608.0
91241649793447228410803.0
\n", "

9125 rows × 3 columns

\n", "
" ], "text/plain": [ " movieId imdbId tmdbId\n", "0 1 114709 862.0\n", "1 2 113497 8844.0\n", "2 3 113228 15602.0\n", "3 4 114885 31357.0\n", "4 5 113041 11862.0\n", "... ... ... ...\n", "9120 162672 3859980 402672.0\n", "9121 163056 4262980 315011.0\n", "9122 163949 2531318 391698.0\n", "9123 164977 27660 137608.0\n", "9124 164979 3447228 410803.0\n", "\n", "[9125 rows x 3 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "links_ds = pd.read_csv('./dataset/IMDB/links_small.csv')\n", "links_ds" ] }, { "cell_type": "code", "execution_count": 4, "id": "3693a9e8", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01312.51260759144
1110293.01260759179
2110613.01260759182
3111292.01260759185
4111724.01260759205
...............
9999967162682.51065579370
10000067162694.01065149201
10000167163654.01070940363
10000267163852.51070979663
10000367165653.51074784724
\n", "

100004 rows × 4 columns

\n", "
" ], "text/plain": [ " userId movieId rating timestamp\n", "0 1 31 2.5 1260759144\n", "1 1 1029 3.0 1260759179\n", "2 1 1061 3.0 1260759182\n", "3 1 1129 2.0 1260759185\n", "4 1 1172 4.0 1260759205\n", "... ... ... ... ...\n", "99999 671 6268 2.5 1065579370\n", "100000 671 6269 4.0 1065149201\n", "100001 671 6365 4.0 1070940363\n", "100002 671 6385 2.5 1070979663\n", "100003 671 6565 3.5 1074784724\n", "\n", "[100004 rows x 4 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ratings_ds = pd.read_csv('./dataset/IMDB/ratings_small.csv')\n", "ratings_ds" ] }, { "cell_type": "code", "execution_count": 5, "id": "3d009e72", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...
18844[{'id': 10090, 'name': 'board game'}, {'id': 1...
215602[{'id': 1495, 'name': 'fishing'}, {'id': 12392...
331357[{'id': 818, 'name': 'based on novel'}, {'id':...
411862[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...
.........
46414439050[{'id': 10703, 'name': 'tragic love'}]
46415111109[{'id': 2679, 'name': 'artist'}, {'id': 14531,...
4641667758[]
46417227506[]
46418461257[]
\n", "

46419 rows × 2 columns

\n", "
" ], "text/plain": [ " id keywords\n", "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n", "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n", "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n", "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n", "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...\n", "... ... ...\n", "46414 439050 [{'id': 10703, 'name': 'tragic love'}]\n", "46415 111109 [{'id': 2679, 'name': 'artist'}, {'id': 14531,...\n", "46416 67758 []\n", "46417 227506 []\n", "46418 461257 []\n", "\n", "[46419 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords_ds = pd.read_csv('./dataset/IMDB/keywords.csv')\n", "keywords_ds" ] }, { "cell_type": "code", "execution_count": 6, "id": "f94cfd30", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\parsa\\AppData\\Local\\Temp\\ipykernel_1804\\4219932325.py:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n", " movies_metadata_ds = pd.read_csv('./dataset/IMDB/movies_metadata.csv')\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepageidimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
..................................................................
45461FalseNaN0[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...http://www.imdb.com/title/tt6209470/439050tt6209470faرگ خوابRising and falling between a man and woman....NaN0.090.0[{'iso_639_1': 'fa', 'name': 'فارسی'}]ReleasedRising and falling between a man and womanSubdueFalse4.01.0
45462FalseNaN0[{'id': 18, 'name': 'Drama'}]NaN111109tt2028550tlSiglo ng PagluluwalAn artist struggles to finish his work while a......2011-11-170.0360.0[{'iso_639_1': 'tl', 'name': ''}]ReleasedNaNCentury of BirthingFalse9.03.0
45463FalseNaN0[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...NaN67758tt0303758enBetrayalWhen one of her hits goes wrong, a professiona......2003-08-010.090.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedA deadly game of wits.BetrayalFalse3.86.0
45464FalseNaN0[]NaN227506tt0008536enSatana likuyushchiyIn a small town live two brothers, one a minis......1917-10-210.087.0[]ReleasedNaNSatan TriumphantFalse0.00.0
45465FalseNaN0[]NaN461257tt6980792enQueerama50 years after decriminalisation of homosexual......2017-06-090.075.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNQueeramaFalse0.00.0
\n", "

45466 rows × 24 columns

\n", "
" ], "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "... ... ... ... \n", "45461 False NaN 0 \n", "45462 False NaN 0 \n", "45463 False NaN 0 \n", "45464 False NaN 0 \n", "45465 False NaN 0 \n", "\n", " genres \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "... ... \n", "45461 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n", "45462 [{'id': 18, 'name': 'Drama'}] \n", "45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n", "45464 [] \n", "45465 [] \n", "\n", " homepage id imdb_id \\\n", "0 http://toystory.disney.com/toy-story 862 tt0114709 \n", "1 NaN 8844 tt0113497 \n", "2 NaN 15602 tt0113228 \n", "3 NaN 31357 tt0114885 \n", "4 NaN 11862 tt0113041 \n", "... ... ... ... \n", "45461 http://www.imdb.com/title/tt6209470/ 439050 tt6209470 \n", "45462 NaN 111109 tt2028550 \n", "45463 NaN 67758 tt0303758 \n", "45464 NaN 227506 tt0008536 \n", "45465 NaN 461257 tt6980792 \n", "\n", " original_language original_title \\\n", "0 en Toy Story \n", "1 en Jumanji \n", "2 en Grumpier Old Men \n", "3 en Waiting to Exhale \n", "4 en Father of the Bride Part II \n", "... ... ... \n", "45461 fa رگ خواب \n", "45462 tl Siglo ng Pagluluwal \n", "45463 en Betrayal \n", "45464 en Satana likuyushchiy \n", "45465 en Queerama \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 
1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "... ... ... ... \n", "45461 Rising and falling between a man and woman. ... NaN \n", "45462 An artist struggles to finish his work while a... ... 2011-11-17 \n", "45463 When one of her hits goes wrong, a professiona... ... 2003-08-01 \n", "45464 In a small town live two brothers, one a minis... ... 1917-10-21 \n", "45465 50 years after decriminalisation of homosexual... ... 2017-06-09 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "... ... ... ... \n", "45461 0.0 90.0 [{'iso_639_1': 'fa', 'name': 'فارسی'}] \n", "45462 0.0 360.0 [{'iso_639_1': 'tl', 'name': ''}] \n", "45463 0.0 90.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "45464 0.0 87.0 [] \n", "45465 0.0 75.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... \n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "... ... ... \n", "45461 Released Rising and falling between a man and woman \n", "45462 Released NaN \n", "45463 Released A deadly game of wits. \n", "45464 Released NaN \n", "45465 Released NaN \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "... ... ... 
... ... \n", "45461 Subdue False 4.0 1.0 \n", "45462 Century of Birthing False 9.0 3.0 \n", "45463 Betrayal False 3.8 6.0 \n", "45464 Satan Triumphant False 0.0 0.0 \n", "45465 Queerama False 0.0 0.0 \n", "\n", "[45466 rows x 24 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# low_memory=False parses each column in a single pass, so dtype inference is consistent\n", "# and the mixed-types DtypeWarning raised for column 10 goes away.\n", "movies_metadata_ds = pd.read_csv('./dataset/IMDB/movies_metadata.csv', low_memory=False)\n", "movies_metadata_ds" ] }, { "cell_type": "code", "execution_count": 7, "id": "510a1b9d", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenresovervieworiginal_titlebelongs_to_collection
0862animation comedy familyLed by Woody, Andy's toys live happily in his ...Toy Storytoystorycollection
18844adventure fantasy familyWhen siblings Judy and Peter discover an encha...Jumanji
215602romance comedyA family wedding reignites the ancient feud be...Grumpier Old Mengrumpyoldmencollection
311862comedyJust when George Banks has recovered from his ...Father of the Bride Part IIfatherofthebridecollection
4949action crime drama thrillerObsessive master thief, Neil McCauley leads a ...Heat
..................
9613265189comedy dramaWhile holidaying in the French Alps, a Swedish...Turist
9614277839comedyFranky and Krimo dream of leaving the grey gri...Pattaya
9615430365comedyJean-Étienne Fougerole is an intellectual bohe...À bras ouverts
9616455661family animation romance comedyA closeted boy runs the risk of being outed by...In a Heartbeat
961714008comedyHyperactive teenager Kelly is enrolled into a ...Cadet Kelly
\n", "

9618 rows × 5 columns

\n", "
" ], "text/plain": [ " id genres \\\n", "0 862 animation comedy family \n", "1 8844 adventure fantasy family \n", "2 15602 romance comedy \n", "3 11862 comedy \n", "4 949 action crime drama thriller \n", "... ... ... \n", "9613 265189 comedy drama \n", "9614 277839 comedy \n", "9615 430365 comedy \n", "9616 455661 family animation romance comedy \n", "9617 14008 comedy \n", "\n", " overview \\\n", "0 Led by Woody, Andy's toys live happily in his ... \n", "1 When siblings Judy and Peter discover an encha... \n", "2 A family wedding reignites the ancient feud be... \n", "3 Just when George Banks has recovered from his ... \n", "4 Obsessive master thief, Neil McCauley leads a ... \n", "... ... \n", "9613 While holidaying in the French Alps, a Swedish... \n", "9614 Franky and Krimo dream of leaving the grey gri... \n", "9615 Jean-Étienne Fougerole is an intellectual bohe... \n", "9616 A closeted boy runs the risk of being outed by... \n", "9617 Hyperactive teenager Kelly is enrolled into a ... \n", "\n", " original_title belongs_to_collection \n", "0 Toy Story toystorycollection \n", "1 Jumanji \n", "2 Grumpier Old Men grumpyoldmencollection \n", "3 Father of the Bride Part II fatherofthebridecollection \n", "4 Heat \n", "... ... ... 
\n", "9613 Turist \n", "9614 Pattaya \n", "9615 À bras ouverts \n", "9616 In a Heartbeat \n", "9617 Cadet Kelly \n", "\n", "[9618 rows x 5 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep released titles with more than 40 votes and an average rating of at least 5,\n", "# restricted to the columns used for the text features.\n", "important_col = ['id', 'genres', 'overview', 'original_title', 'belongs_to_collection']\n", "keep_mask = (\n", "    (movies_metadata_ds['status'] == 'Released')\n", "    & (movies_metadata_ds['vote_count'] > 40)\n", "    & (movies_metadata_ds['vote_average'] >= 5)\n", ")\n", "movies_metadata_ds = movies_metadata_ds.loc[keep_mask, important_col].reset_index(drop=True)\n", "\n", "# genres / belongs_to_collection are stored as Python-literal strings; ast.literal_eval\n", "# parses them without executing arbitrary code the way eval() would.\n", "movies_metadata_ds['genres'] = movies_metadata_ds['genres'].apply(\n", "    lambda x: ' '.join(i['name'].lower().replace(' ', '') for i in ast.literal_eval(x))\n", ")\n", "# Movies without a collection hold NaN here; map those to an empty string.\n", "movies_metadata_ds['belongs_to_collection'] = movies_metadata_ds['belongs_to_collection'].apply(\n", "    lambda x: ast.literal_eval(str(x))['name'].lower().replace(' ', '') if pd.notna(x) else ''\n", ")\n", "\n", "movies_metadata_ds" ] }, { "cell_type": "code", "execution_count": 8, "id": "00f602df", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcast
0862tomhanks timallen donrickles jimvarney wallace...
18844robinwilliams jonathanhyde kirstendunst bradle...
215602waltermatthau jacklemmon ann-margret sophialor...
331357whitneyhouston angelabassett lorettadevine lel...
411862stevemartin dianekeaton martinshort kimberlywi...
.........
45471439050leilahatami kouroshtahami elhamkorda hamidnema...
45472111109angelaquino perrydizon hazelorencio joeltorre ...
4547367758erikaeleniak adambaldwin juliedupage jamesrema...
45474227506iwanmosschuchin nathalielissenko pavelpavlov a...
45475461257daisyasquith
\n", "

45476 rows × 2 columns

\n", "
" ], "text/plain": [ " id cast\n", "0 862 tomhanks timallen donrickles jimvarney wallace...\n", "1 8844 robinwilliams jonathanhyde kirstendunst bradle...\n", "2 15602 waltermatthau jacklemmon ann-margret sophialor...\n", "3 31357 whitneyhouston angelabassett lorettadevine lel...\n", "4 11862 stevemartin dianekeaton martinshort kimberlywi...\n", "... ... ...\n", "45471 439050 leilahatami kouroshtahami elhamkorda hamidnema...\n", "45472 111109 angelaquino perrydizon hazelorencio joeltorre ...\n", "45473 67758 erikaeleniak adambaldwin juliedupage jamesrema...\n", "45474 227506 iwanmosschuchin nathalielissenko pavelpavlov a...\n", "45475 461257 daisyasquith\n", "\n", "[45476 rows x 2 columns]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# cast and crew are stored as Python-literal strings; ast.literal_eval parses them\n", "# without executing arbitrary code the way eval() would.\n", "cast_names = credits_ds['cast'].apply(\n", "    lambda x: ' '.join(i['name'].lower().replace(' ', '') for i in ast.literal_eval(x))\n", ")\n", "\n", "# From the crew keep only directors, dropping any whose name collapses to an empty string.\n", "director_names = credits_ds['crew'].apply(\n", "    lambda x: ' '.join(\n", "        name\n", "        for name in (i['name'].lower().replace(' ', '') for i in ast.literal_eval(x) if i['job'] == 'Director')\n", "        if name\n", "    )\n", ")\n", "\n", "# Element-wise string concatenation replaces the row-wise DataFrame.apply(axis=1);\n", "# the resulting 'cast' column holds the actor names followed by the director names.\n", "credits_ds = credits_ds[['id']].assign(cast=cast_names + ' ' + director_names).reset_index(drop=True)\n", "credits_ds" ] }, { "cell_type": "code", "execution_count": 9, "id": "e8381582", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862jealousy toy boy friendship friends rivalry bo...
18844boardgame disappearance basedonchildren'sbook ...
215602fishing bestfriend duringcreditsstinger oldmen
331357basedonnovel interracialrelationship singlemot...
411862baby midlifecrisis confidence aging daughter m...
.........
46414439050tragiclove
46415111109artist play pinoy
4641667758
46417227506
46418461257
\n", "

46419 rows × 2 columns

\n", "
" ], "text/plain": [ " id keywords\n", "0 862 jealousy toy boy friendship friends rivalry bo...\n", "1 8844 boardgame disappearance basedonchildren'sbook ...\n", "2 15602 fishing bestfriend duringcreditsstinger oldmen\n", "3 31357 basedonnovel interracialrelationship singlemot...\n", "4 11862 baby midlifecrisis confidence aging daughter m...\n", "... ... ...\n", "46414 439050 tragiclove\n", "46415 111109 artist play pinoy\n", "46416 67758 \n", "46417 227506 \n", "46418 461257 \n", "\n", "[46419 rows x 2 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# keywords is stored as a Python-literal string (a list of {'id', 'name'} dicts);\n", "# ast.literal_eval parses it without executing arbitrary code the way eval() would.\n", "# Each keyword is lower-cased and stripped of spaces so it becomes a single token.\n", "keywords_ds['keywords'] = keywords_ds['keywords'].apply(\n", "    lambda x: ' '.join(i['name'].lower().replace(' ', '') for i in ast.literal_eval(x))\n", ")\n", "keywords_ds" ] }, { "cell_type": "code", "execution_count": 10, "id": "e830b22b", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idgenresovervieworiginal_titlebelongs_to_collectionkeywordscast
0862animation comedy familyLed by Woody, Andy's toys live happily in his ...Toy Storytoystorycollectionjealousy toy boy friendship friends rivalry bo...tomhanks timallen donrickles jimvarney wallace...
18844adventure fantasy familyWhen siblings Judy and Peter discover an encha...Jumanjiboardgame disappearance basedonchildren'sbook ...robinwilliams jonathanhyde kirstendunst bradle...
215602romance comedyA family wedding reignites the ancient feud be...Grumpier Old Mengrumpyoldmencollectionfishing bestfriend duringcreditsstinger oldmenwaltermatthau jacklemmon ann-margret sophialor...
311862comedyJust when George Banks has recovered from his ...Father of the Bride Part IIfatherofthebridecollectionbaby midlifecrisis confidence aging daughter m...stevemartin dianekeaton martinshort kimberlywi...
4949action crime drama thrillerObsessive master thief, Neil McCauley leads a ...Heatrobbery detective bank obsession chase shootin...alpacino robertdeniro valkilmer jonvoight toms...
........................
9752265189comedy dramaWhile holidaying in the French Alps, a Swedish...Turistfemalenudity darkcomedy familyvacation avalanc...lisalovenkongsli johannesbahkuhnke clarawetter...
9753277839comedyFranky and Krimo dream of leaving the grey gri...Pattayaramzybedia malikbentalha franckgastambide gade...
9754430365comedyJean-Étienne Fougerole is an intellectual bohe...À bras ouvertschristianclavier aryabittan elsazylberstein cy...
9755455661family animation romance comedyA closeted boy runs the risk of being outed by...In a Heartbeatlove teenager lgbt shortbethdavid estebanbravo
975614008comedyHyperactive teenager Kelly is enrolled into a ...Cadet Kellymilitaryschoolhilaryduff christycarlsonromano garycole shawn...
\n", "

9757 rows × 7 columns

\n", "
" ], "text/plain": [ " id genres \\\n", "0 862 animation comedy family \n", "1 8844 adventure fantasy family \n", "2 15602 romance comedy \n", "3 11862 comedy \n", "4 949 action crime drama thriller \n", "... ... ... \n", "9752 265189 comedy drama \n", "9753 277839 comedy \n", "9754 430365 comedy \n", "9755 455661 family animation romance comedy \n", "9756 14008 comedy \n", "\n", " overview \\\n", "0 Led by Woody, Andy's toys live happily in his ... \n", "1 When siblings Judy and Peter discover an encha... \n", "2 A family wedding reignites the ancient feud be... \n", "3 Just when George Banks has recovered from his ... \n", "4 Obsessive master thief, Neil McCauley leads a ... \n", "... ... \n", "9752 While holidaying in the French Alps, a Swedish... \n", "9753 Franky and Krimo dream of leaving the grey gri... \n", "9754 Jean-Étienne Fougerole is an intellectual bohe... \n", "9755 A closeted boy runs the risk of being outed by... \n", "9756 Hyperactive teenager Kelly is enrolled into a ... \n", "\n", " original_title belongs_to_collection \\\n", "0 Toy Story toystorycollection \n", "1 Jumanji \n", "2 Grumpier Old Men grumpyoldmencollection \n", "3 Father of the Bride Part II fatherofthebridecollection \n", "4 Heat \n", "... ... ... \n", "9752 Turist \n", "9753 Pattaya \n", "9754 À bras ouverts \n", "9755 In a Heartbeat \n", "9756 Cadet Kelly \n", "\n", " keywords \\\n", "0 jealousy toy boy friendship friends rivalry bo... \n", "1 boardgame disappearance basedonchildren'sbook ... \n", "2 fishing bestfriend duringcreditsstinger oldmen \n", "3 baby midlifecrisis confidence aging daughter m... \n", "4 robbery detective bank obsession chase shootin... \n", "... ... \n", "9752 femalenudity darkcomedy familyvacation avalanc... \n", "9753 \n", "9754 \n", "9755 love teenager lgbt short \n", "9756 militaryschool \n", "\n", " cast \n", "0 tomhanks timallen donrickles jimvarney wallace... \n", "1 robinwilliams jonathanhyde kirstendunst bradle... 
# %%
# Keep only the rows whose id is made of digits, then cast the id to int so it
# can be used as the join key below.
# - .astype(str) keeps this cell re-runnable: after the first run 'id' is
#   already an integer column, on which the .str accessor is not available.
# - .copy() detaches the filtered frame, so the assignment on the next line
#   does not write into a slice of the original frame.
has_numeric_id = movies_metadata_ds['id'].astype(str).str.isnumeric()
movies_metadata_ds = movies_metadata_ds.loc[has_numeric_id].copy()
movies_metadata_ds['id'] = movies_metadata_ds['id'].astype(int)

# Left joins: every remaining movie is kept, even when keywords_ds or
# credits_ds has no row for its id. The result is renumbered 0..n-1.
df = (
    movies_metadata_ds
    .merge(keywords_ds, on='id', how='left')
    .merge(credits_ds, on='id', how='left')
    .reset_index(drop=True)
)
df
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitletoken
0862Toy Storyanimation comedy family Led by Woody, Andy's t...
18844Jumanjiadventure fantasy family When siblings Judy an...
215602Grumpier Old Menromance comedy A family wedding reignites the ...
311862Father of the Bride Part IIcomedy Just when George Banks has recovered fr...
4949Heataction crime drama thriller Obsessive master t...
............
9752265189Turistcomedy drama While holidaying in the French Al...
9753277839Pattayacomedy Franky and Krimo dream of leaving the g...
9754430365À bras ouvertscomedy Jean-Étienne Fougerole is an intellectu...
9755455661In a Heartbeatfamily animation romance comedy A closeted boy...
975614008Cadet Kellycomedy Hyperactive teenager Kelly is enrolled ...
\n", "

9757 rows × 3 columns

# %%
# Build one free-text "token" per movie: the genres first, then every other
# column except the id and the title, joined with single spaces.
text_columns = [name for name in df.columns if name not in ('id', 'genres', 'original_title')]
token = df['genres']
for name in text_columns:
    # A missing value in any column turns the whole token into NaN
    # (str + NaN -> NaN); such rows are filtered out by the null-token drop
    # that follows this cell.
    token = token + ' ' + df[name]

# Assemble a brand-new frame rather than slicing the merged one, so that later
# in-place edits of df do not trigger SettingWithCopyWarning.
df = pd.DataFrame({'id': df['id'], 'title': df['original_title'], 'token': token})
df
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitletoken
0862Toy Storyanimation comedy family Led by Woody, Andy's t...
18844Jumanjiadventure fantasy family When siblings Judy an...
215602Grumpier Old Menromance comedy A family wedding reignites the ...
311862Father of the Bride Part IIcomedy Just when George Banks has recovered fr...
4949Heataction crime drama thriller Obsessive master t...
............
9752265189Turistcomedy drama While holidaying in the French Al...
9753277839Pattayacomedy Franky and Krimo dream of leaving the g...
9754430365À bras ouvertscomedy Jean-Étienne Fougerole is an intellectu...
9755455661In a Heartbeatfamily animation romance comedy A closeted boy...
975614008Cadet Kellycomedy Hyperactive teenager Kelly is enrolled ...
\n", "

9716 rows × 3 columns

# %%
# Remove movies without a token and renumber the rows 0..n-1.
# The renumbering matters: the TF-IDF matrix built below has one row per
# remaining movie in df order, so a df index label is only usable as a matrix
# row number when the index has no gaps. Reassigning (instead of an in-place
# drop) also avoids SettingWithCopyWarning.
df = df.dropna(subset=['token']).reset_index(drop=True)
df

# %%
# One TF-IDF row per movie, restricted to the 5000 highest-ranked terms.
tfidf = TfidfVectorizer(max_features=5000)
vectorized_data = tfidf.fit_transform(df['token'].values)
vectorized_data
# %%
# Pairwise cosine similarity between the TF-IDF rows: similarity[a, b] compares
# the movie at df position a with the movie at df position b.
similarity = cosine_similarity(vectorized_data)
similarity

# %%
def recommend_by_movie(title, number=20):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df['title']``. When several rows share the
        title, the first one is used.
    number : int, default 20
        Maximum number of titles to return.

    Returns
    -------
    list
        Titles ordered by decreasing cosine similarity (ties keep df order).
        The queried movie is not filtered out, so it normally comes first.
        Empty when ``title`` is not present in ``df``.
    """
    # Work with row *positions*, not index labels: `similarity` is indexed by
    # position, and df's labels stop matching positions as soon as rows have
    # been dropped without resetting the index.
    positions = np.flatnonzero((df['title'] == title).to_numpy())
    if positions.size == 0:
        return []
    scores = np.asarray(similarity[positions[0]])
    # Stable sort on the negated scores == descending order with ties left in
    # their original order, which is what sorted(..., reverse=True) produces.
    ranked = np.argsort(-scores, kind='stable')[:number]
    return df['title'].iloc[ranked].tolist()

print(recommend_by_movie('Toy Story', 10))

print('***************************************************************************')

print(recommend_by_movie('Jumanji', 10))

print('***************************************************************************')

print(recommend_by_movie('Rocky III', 10))
# %%
def recommen_by_user(user_id, number=20):
    """Content-based recommendations derived from everything a user has rated.

    The user's rated movies are visited from the highest to the lowest rating;
    for each one that can be found in ``df`` the similar movies returned by
    ``recommend_by_movie`` are appended to the result.

    Parameters
    ----------
    user_id : int
        Value matched against ``ratings_ds['userId']``.
    number : int, default 20
        Number of similar movies requested for *each* rated movie (forwarded
        to ``recommend_by_movie``). It was previously accepted but ignored;
        the default reproduces the old behaviour exactly.

    Returns
    -------
    list
        Concatenated per-movie recommendation lists. Titles may repeat when
        two rated movies share neighbours. Empty when the user has no rating
        that maps to a row of ``df``.
    """
    # NOTE(review): ratings 'movieId' is compared directly with df['id'].
    # If the two columns come from different id schemes, a mapping table is
    # needed here - confirm against the dataset documentation.
    user_ratings = ratings_ds[ratings_ds['userId'] == user_id]
    rated_ids = user_ratings.sort_values(by='rating', ascending=False)['movieId']

    # One lookup table (first title per id) instead of two full scans of df
    # for every rated movie.
    title_by_id = df.drop_duplicates(subset='id').set_index('id')['title']
    known_ids = rated_ids[rated_ids.isin(title_by_id.index)]
    rated_titles = title_by_id.loc[known_ids].tolist()

    per_movie = (recommend_by_movie(str(title), number) for title in rated_titles)
    return list(itertools.chain.from_iterable(per_movie))

recommen_by_user(1)

# %% [markdown]
# # Collaborative

# %%
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepageidimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
..................................................................
45461FalseNaN0[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...http://www.imdb.com/title/tt6209470/439050tt6209470faرگ خوابRising and falling between a man and woman....NaN0.090.0[{'iso_639_1': 'fa', 'name': 'فارسی'}]ReleasedRising and falling between a man and womanSubdueFalse4.01.0
45462FalseNaN0[{'id': 18, 'name': 'Drama'}]NaN111109tt2028550tlSiglo ng PagluluwalAn artist struggles to finish his work while a......2011-11-170.0360.0[{'iso_639_1': 'tl', 'name': ''}]ReleasedNaNCentury of BirthingFalse9.03.0
45463FalseNaN0[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...NaN67758tt0303758enBetrayalWhen one of her hits goes wrong, a professiona......2003-08-010.090.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedA deadly game of wits.BetrayalFalse3.86.0
45464FalseNaN0[]NaN227506tt0008536enSatana likuyushchiyIn a small town live two brothers, one a minis......1917-10-210.087.0[]ReleasedNaNSatan TriumphantFalse0.00.0
45465FalseNaN0[]NaN461257tt6980792enQueerama50 years after decriminalisation of homosexual......2017-06-090.075.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNQueeramaFalse0.00.0
\n", "

45466 rows × 24 columns

\n", "
" ], "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "... ... ... ... \n", "45461 False NaN 0 \n", "45462 False NaN 0 \n", "45463 False NaN 0 \n", "45464 False NaN 0 \n", "45465 False NaN 0 \n", "\n", " genres \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "... ... \n", "45461 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n", "45462 [{'id': 18, 'name': 'Drama'}] \n", "45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n", "45464 [] \n", "45465 [] \n", "\n", " homepage id imdb_id \\\n", "0 http://toystory.disney.com/toy-story 862 tt0114709 \n", "1 NaN 8844 tt0113497 \n", "2 NaN 15602 tt0113228 \n", "3 NaN 31357 tt0114885 \n", "4 NaN 11862 tt0113041 \n", "... ... ... ... \n", "45461 http://www.imdb.com/title/tt6209470/ 439050 tt6209470 \n", "45462 NaN 111109 tt2028550 \n", "45463 NaN 67758 tt0303758 \n", "45464 NaN 227506 tt0008536 \n", "45465 NaN 461257 tt6980792 \n", "\n", " original_language original_title \\\n", "0 en Toy Story \n", "1 en Jumanji \n", "2 en Grumpier Old Men \n", "3 en Waiting to Exhale \n", "4 en Father of the Bride Part II \n", "... ... ... \n", "45461 fa رگ خواب \n", "45462 tl Siglo ng Pagluluwal \n", "45463 en Betrayal \n", "45464 en Satana likuyushchiy \n", "45465 en Queerama \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 
1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "... ... ... ... \n", "45461 Rising and falling between a man and woman. ... NaN \n", "45462 An artist struggles to finish his work while a... ... 2011-11-17 \n", "45463 When one of her hits goes wrong, a professiona... ... 2003-08-01 \n", "45464 In a small town live two brothers, one a minis... ... 1917-10-21 \n", "45465 50 years after decriminalisation of homosexual... ... 2017-06-09 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "... ... ... ... \n", "45461 0.0 90.0 [{'iso_639_1': 'fa', 'name': 'فارسی'}] \n", "45462 0.0 360.0 [{'iso_639_1': 'tl', 'name': ''}] \n", "45463 0.0 90.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "45464 0.0 87.0 [] \n", "45465 0.0 75.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... \n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "... ... ... \n", "45461 Released Rising and falling between a man and woman \n", "45462 Released NaN \n", "45463 Released A deadly game of wits. \n", "45464 Released NaN \n", "45465 Released NaN \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "... ... ... 
# %%
# - dtype={'id': str}: the collaborative recommender looks movies up with
#   `movie_df['id'] == str(movie_id)`, so the id column must hold text.
# - low_memory=False: infer each column's dtype from the whole file instead of
#   chunk by chunk, which removes the "Columns (10) have mixed types"
#   DtypeWarning this load used to emit.
movie_df = pd.read_csv(
    './dataset/IMDB/movies_metadata.csv',
    dtype={'id': str},
    low_memory=False,
)
movie_df
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01312.51260759144
1110293.01260759179
2110613.01260759182
3111292.01260759185
4111724.01260759205
...............
9999967162682.51065579370
10000067162694.01065149201
10000167163654.01070940363
10000267163852.51070979663
10000367165653.51074784724
\n", "

100004 rows × 4 columns

# %%
rate_df = pd.read_csv('./dataset/IMDB/ratings_small.csv')
rate_df

# %%
user_max = rate_df['userId'].max()
movie_max = rate_df['movieId'].max()

# COO layout: a (2, n_ratings) index tensor - first row userId, second row
# movieId - and the matching float32 rating values.
rating_indices = torch.tensor(rate_df[['userId', 'movieId']].to_numpy(), dtype=torch.long).t()
rating_values = torch.tensor(rate_df['rating'].to_numpy(), dtype=torch.float32)

# The ids themselves are used as row / column numbers, hence the "+ 1" on both
# maxima. torch.sparse_coo_tensor replaces the deprecated
# torch.sparse.FloatTensor constructor.
# Despite its name, `sparse_matrix` is the *dense* user x movie rating matrix
# (0 where a user gave no rating); the name is kept for the cells that use it.
sparse_matrix = torch.sparse_coo_tensor(
    rating_indices,
    rating_values,
    size=(int(user_max) + 1, int(movie_max) + 1),
).to_dense()

print(sparse_matrix.shape)
sparse_matrix
# %%
# User-to-user cosine similarity over the rating rows: similarities_sparse[a, b]
# compares the ratings of user a with those of user b.
similarities_sparse = cosine_similarity(sparse_matrix, dense_output=False)
print(similarities_sparse.shape)
similarities_sparse

# %%
def top_n_idx_sparse(user_id, number=20):
    """Return the row numbers of the users most similar to ``user_id``.

    Parameters
    ----------
    user_id : int
        Row of ``similarities_sparse`` to rank.
    number : int, default 20
        Maximum number of neighbours to return.

    Returns
    -------
    list of int
        Neighbour rows by decreasing similarity (ties keep row order),
        never containing ``user_id`` itself.
    """
    scores = np.asarray(similarities_sparse[user_id]).ravel()
    ranked = np.argsort(-scores, kind='stable')
    # Drop the user explicitly instead of assuming it is ranked first: rounding
    # (self-similarity of 0.99999994) or an all-zero rating row can put another
    # user in front.
    ranked = ranked[ranked != user_id]
    return ranked[:number].tolist()

user_user_similar = top_n_idx_sparse(1)
user_user_similar

# %%
def user_top_movies(user_id, number=10):
    """Return the ``movieId`` values a user rated highest.

    Parameters
    ----------
    user_id : int
        Value matched against ``rate_df['userId']``.
    number : int, default 10
        Maximum number of ids to return; fewer come back when the user has
        rated fewer movies.

    Returns
    -------
    numpy.ndarray
        Movie ids ordered by decreasing rating.
    """
    user_ratings = rate_df[rate_df['userId'] == user_id]
    best_first = user_ratings.sort_values(by='rating', ascending=False)
    # Slicing already clamps to the available length.
    return best_first['movieId'].values[:number]

user_325_top_movies = user_top_movies(325)
user_325_top_movies

# %%
def recommendation_for_user(user_id, number=20):
    """Collaborative recommendations: favourites of the most similar users.

    Parameters
    ----------
    user_id : int
        User to recommend for (row of ``similarities_sparse`` and value of
        ``rate_df['userId']``).
    number : int, default 20
        Maximum number of titles to return.

    Returns
    -------
    list
        Titles from ``movie_df``, in the order the candidates were collected
        (closest neighbour first, each neighbour's best-rated movie first).
        Movies the user already rated, repeated candidates and ids without a
        row in ``movie_df`` are left out.
    """
    # NOTE(review): rate_df 'movieId' is matched against movie_df 'id' as text.
    # If the two columns come from different id schemes, a mapping table is
    # needed here - confirm against the dataset documentation.
    seen_ids = set(rate_df.loc[rate_df['userId'] == user_id, 'movieId'])
    candidate_ids = []
    for neighbour in top_n_idx_sparse(user_id):
        for movie_id in user_top_movies(neighbour):
            # A seen-set filters both the movies the user already rated and
            # candidates proposed by an earlier neighbour. (Calling
            # list.remove() while iterating the same list skips elements.)
            if movie_id not in seen_ids:
                seen_ids.add(movie_id)
                candidate_ids.append(movie_id)

    # First title per id, built once instead of scanning movie_df twice per
    # candidate.
    title_by_id = movie_df.drop_duplicates(subset='id').set_index('id')['title']
    titles = [
        title_by_id.loc[str(movie_id)]
        for movie_id in candidate_ids
        if str(movie_id) in title_by_id.index
    ]
    return titles[:number]

recommendation_for_user(1)

# %% [markdown]
# # Ensemble

# %%
def ensemble_recommendation_intersection_based(user_id, number=10):
    """Blend collaborative and content-based recommendations.

    Titles proposed by both recommenders come first (in collaborative order);
    the remaining slots are filled by alternating between the two lists,
    starting with the collaborative one.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    number : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        At most ``number`` distinct titles. Fewer are returned when the two
        recommenders together do not provide enough (previously this raised
        IndexError).
    """
    # dict.fromkeys removes repeats while keeping first-seen order.
    collaborative = list(dict.fromkeys(recommendation_for_user(user_id)))
    content_based = list(dict.fromkeys(recommen_by_user(user_id)))

    content_titles = set(content_based)
    shared = [title for title in collaborative if title in content_titles]
    shared_titles = set(shared)
    collaborative_only = [title for title in collaborative if title not in shared_titles]
    content_only = [title for title in content_based if title not in shared_titles]

    # Interleave the two remainders; when one runs out the other continues.
    exhausted = object()
    alternating = [
        title
        for pair in itertools.zip_longest(collaborative_only, content_only, fillvalue=exhausted)
        for title in pair
        if title is not exhausted
    ]
    return (shared + alternating)[:number]

def ensemble_recommendation_collaborative_based(user_id, number=10):
    """Expand each collaborative recommendation with its content-based neighbours.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    number : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        Distinct titles in first-seen order, at most ``number`` of them.
    """
    results = []
    seen_titles = set()
    for movie in recommendation_for_user(user_id):
        # Enough titles collected: skip the remaining similarity look-ups.
        if number >= 0 and len(results) >= number:
            break
        for title in recommend_by_movie(movie):
            if title not in seen_titles:
                seen_titles.add(title)
                results.append(title)
    return results[:number]

def ensemble_recommendation(user_id, number=10, intersection_base=True):
    """Entry point of the ensemble recommender.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    number : int, default 10
        Maximum number of titles to return.
    intersection_base : bool, default True
        True selects ``ensemble_recommendation_intersection_based``, False
        selects ``ensemble_recommendation_collaborative_based``.

    Returns
    -------
    list
        Recommended titles from the selected strategy.
    """
    if intersection_base:
        return ensemble_recommendation_intersection_based(user_id, number)
    # `number` is forwarded here as well; it used to be hard-coded to 10.
    return ensemble_recommendation_collaborative_based(user_id, number)

# %%
recommendation = ensemble_recommendation(1, intersection_base=True)
recommendation

# %%
recommendation = ensemble_recommendation(1, intersection_base=False)
recommendation

# %% [markdown]
# # MLOps

# %%