{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "sdXLxyndTymr" }, "source": [ "### install dependencies" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "y3Mq4BwgTMTY", "outputId": "3d127f2f-7c24-4b24-8afa-3444755b7606" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading...\n", "From: https://drive.google.com/uc?id=1W3-WEplVSztLR3lvkyYdiKZGMT4y0cNi&confirm=t\n", "To: /content/IMDB.zip\n", "100% 61.4M/61.4M [00:00<00:00, 224MB/s]\n" ] } ], "source": [ "!gdown \"1W3-WEplVSztLR3lvkyYdiKZGMT4y0cNi&confirm=t\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "oU3KUmsiGtQU" }, "outputs": [], "source": [ "#!unzip IMDB.zip" ] }, { "cell_type": "code", "source": [ "#!pip install mlflow" ], "metadata": { "id": "a7swi2rM37ie" }, "execution_count": 3, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "XAq9DVtTvPim" }, "source": [ "# Content-based filtering" ] }, { "cell_type": "markdown", "metadata": { "id": "yxKpdNo1UFvc" }, "source": [ "### import libraries" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "A_qqPpWqUGJm" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import mlflow as mf" ] }, { "cell_type": "code", "source": [ "#mf.log_artifacts({'rating':'/content/rating_small.csv', 'rating':'/content/rating_small.csv', 'movies':'/content/movies_metadata.csv','keywords':'/content/keywords.csv', 'credits':'/content/credits.csv'})" ], "metadata": { "id": "LuqMSMvrEg1H" }, "execution_count": 5, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "gaQ7KyStURHR" }, "source": [ "### read data from file" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "0E7xGgKWqofc", "outputId": "b233ab4c-aef3-4997-a27f-acdeaf5dd900" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id keywords\n", "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n", "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n", "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n", "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n", "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...\n", "... ... ...\n", "46414 439050 [{'id': 10703, 'name': 'tragic love'}]\n", "46415 111109 [{'id': 2679, 'name': 'artist'}, {'id': 14531,...\n", "46416 67758 []\n", "46417 227506 []\n", "46418 461257 []\n", "\n", "[46419 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...
18844[{'id': 10090, 'name': 'board game'}, {'id': 1...
215602[{'id': 1495, 'name': 'fishing'}, {'id': 12392...
331357[{'id': 818, 'name': 'based on novel'}, {'id':...
411862[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...
.........
46414439050[{'id': 10703, 'name': 'tragic love'}]
46415111109[{'id': 2679, 'name': 'artist'}, {'id': 14531,...
4641667758[]
46417227506[]
46418461257[]
\n", "

46419 rows × 2 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 6 } ], "source": [ "keywords = pd.read_csv('/content/IMDB/keywords.csv')\n", "keywords" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "AeT9NJibvkW4", "outputId": "abfc4537-ad26-4e7b-8120-84b83994df17" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating timestamp\n", "0 1 31 2.5 1260759144\n", "1 1 1029 3.0 1260759179\n", "2 1 1061 3.0 1260759182\n", "3 1 1129 2.0 1260759185\n", "4 1 1172 4.0 1260759205\n", "... ... ... ... ...\n", "99999 671 6268 2.5 1065579370\n", "100000 671 6269 4.0 1065149201\n", "100001 671 6365 4.0 1070940363\n", "100002 671 6385 2.5 1070979663\n", "100003 671 6565 3.5 1074784724\n", "\n", "[100004 rows x 4 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01312.51260759144
1110293.01260759179
2110613.01260759182
3111292.01260759185
4111724.01260759205
...............
9999967162682.51065579370
10000067162694.01065149201
10000167163654.01070940363
10000267163852.51070979663
10000367165653.51074784724
\n", "

100004 rows × 4 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 7 } ], "source": [ "rating = pd.read_csv('/content/IMDB/ratings_small.csv')\n", "rating" ] }, { "cell_type": "code", "source": [ "credits = pd.read_csv('/content/IMDB/credits.csv')\n", "credits" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "jMaH5Yrs72sK", "outputId": "b7b0cc27-6c5f-4b6d-b719-7aa148adfa2e" }, "execution_count": 8, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " cast \\\n", "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n", "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n", "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n", "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n", "4 [{'cast_id': 1, 'character': 'George Banks', '... \n", "... ... \n", "45471 [{'cast_id': 0, 'character': '', 'credit_id': ... \n", "45472 [{'cast_id': 1002, 'character': 'Sister Angela... \n", "45473 [{'cast_id': 6, 'character': 'Emily Shaw', 'cr... \n", "45474 [{'cast_id': 2, 'character': '', 'credit_id': ... \n", "45475 [] \n", "\n", " crew id \n", "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n", "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n", "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n", "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n", "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 \n", "... ... ... \n", "45471 [{'credit_id': '5894a97d925141426c00818c', 'de... 439050 \n", "45472 [{'credit_id': '52fe4af1c3a36847f81e9b15', 'de... 111109 \n", "45473 [{'credit_id': '52fe4776c3a368484e0c8387', 'de... 67758 \n", "45474 [{'credit_id': '533bccebc3a36844cf0011a7', 'de... 227506 \n", "45475 [{'credit_id': '593e676c92514105b702e68e', 'de... 461257 \n", "\n", "[45476 rows x 3 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[{'cast_id': 14, 'character': 'Woody (voice)',...[{'credit_id': '52fe4284c3a36847f8024f49', 'de...862
1[{'cast_id': 1, 'character': 'Alan Parrish', '...[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...8844
2[{'cast_id': 2, 'character': 'Max Goldman', 'c...[{'credit_id': '52fe466a9251416c75077a89', 'de...15602
3[{'cast_id': 1, 'character': \"Savannah 'Vannah...[{'credit_id': '52fe44779251416c91011acb', 'de...31357
4[{'cast_id': 1, 'character': 'George Banks', '...[{'credit_id': '52fe44959251416c75039ed7', 'de...11862
............
45471[{'cast_id': 0, 'character': '', 'credit_id': ...[{'credit_id': '5894a97d925141426c00818c', 'de...439050
45472[{'cast_id': 1002, 'character': 'Sister Angela...[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...111109
45473[{'cast_id': 6, 'character': 'Emily Shaw', 'cr...[{'credit_id': '52fe4776c3a368484e0c8387', 'de...67758
45474[{'cast_id': 2, 'character': '', 'credit_id': ...[{'credit_id': '533bccebc3a36844cf0011a7', 'de...227506
45475[][{'credit_id': '593e676c92514105b702e68e', 'de...461257
\n", "

45476 rows × 3 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "kYA5KUWDFcZ2", "outputId": "0d7f210e-341e-4070-f2a6-ac4993658e96" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "... ... ... ... \n", "45461 False NaN 0 \n", "45462 False NaN 0 \n", "45463 False NaN 0 \n", "45464 False NaN 0 \n", "45465 False NaN 0 \n", "\n", " genres \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "... ... \n", "45461 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n", "45462 [{'id': 18, 'name': 'Drama'}] \n", "45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n", "45464 [] \n", "45465 [] \n", "\n", " homepage id imdb_id \\\n", "0 http://toystory.disney.com/toy-story 862 tt0114709 \n", "1 NaN 8844 tt0113497 \n", "2 NaN 15602 tt0113228 \n", "3 NaN 31357 tt0114885 \n", "4 NaN 11862 tt0113041 \n", "... ... ... ... \n", "45461 http://www.imdb.com/title/tt6209470/ 439050 tt6209470 \n", "45462 NaN 111109 tt2028550 \n", "45463 NaN 67758 tt0303758 \n", "45464 NaN 227506 tt0008536 \n", "45465 NaN 461257 tt6980792 \n", "\n", " original_language original_title \\\n", "0 en Toy Story \n", "1 en Jumanji \n", "2 en Grumpier Old Men \n", "3 en Waiting to Exhale \n", "4 en Father of the Bride Part II \n", "... ... ... \n", "45461 fa رگ خواب \n", "45462 tl Siglo ng Pagluluwal \n", "45463 en Betrayal \n", "45464 en Satana likuyushchiy \n", "45465 en Queerama \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "... ... ... ... \n", "45461 Rising and falling between a man and woman. ... NaN \n", "45462 An artist struggles to finish his work while a... ... 2011-11-17 \n", "45463 When one of her hits goes wrong, a professiona... ... 2003-08-01 \n", "45464 In a small town live two brothers, one a minis... ... 1917-10-21 \n", "45465 50 years after decriminalisation of homosexual... ... 2017-06-09 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "... ... ... ... \n", "45461 0.0 90.0 [{'iso_639_1': 'fa', 'name': 'فارسی'}] \n", "45462 0.0 360.0 [{'iso_639_1': 'tl', 'name': ''}] \n", "45463 0.0 90.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "45464 0.0 87.0 [] \n", "45465 0.0 75.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... \n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "... ... ... \n", "45461 Released Rising and falling between a man and woman \n", "45462 Released NaN \n", "45463 Released A deadly game of wits. \n", "45464 Released NaN \n", "45465 Released NaN \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "... ... ... ... ... \n", "45461 Subdue False 4.0 1.0 \n", "45462 Century of Birthing False 9.0 3.0 \n", "45463 Betrayal False 3.8 6.0 \n", "45464 Satan Triumphant False 0.0 0.0 \n", "45465 Queerama False 0.0 0.0 \n", "\n", "[45466 rows x 24 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepageidimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
..................................................................
45461FalseNaN0[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...http://www.imdb.com/title/tt6209470/439050tt6209470faرگ خوابRising and falling between a man and woman....NaN0.090.0[{'iso_639_1': 'fa', 'name': 'فارسی'}]ReleasedRising and falling between a man and womanSubdueFalse4.01.0
45462FalseNaN0[{'id': 18, 'name': 'Drama'}]NaN111109tt2028550tlSiglo ng PagluluwalAn artist struggles to finish his work while a......2011-11-170.0360.0[{'iso_639_1': 'tl', 'name': ''}]ReleasedNaNCentury of BirthingFalse9.03.0
45463FalseNaN0[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...NaN67758tt0303758enBetrayalWhen one of her hits goes wrong, a professiona......2003-08-010.090.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedA deadly game of wits.BetrayalFalse3.86.0
45464FalseNaN0[]NaN227506tt0008536enSatana likuyushchiyIn a small town live two brothers, one a minis......1917-10-210.087.0[]ReleasedNaNSatan TriumphantFalse0.00.0
45465FalseNaN0[]NaN461257tt6980792enQueerama50 years after decriminalisation of homosexual......2017-06-090.075.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNQueeramaFalse0.00.0
\n", "

45466 rows × 24 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 9 } ], "source": [ "metadata = pd.read_csv('/content/IMDB/movies_metadata.csv')\n", "metadata" ] }, { "cell_type": "markdown", "metadata": { "id": "7Y3S-BFBTTCY" }, "source": [ "keep only related columns from released movies:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "3zJqU8dUTTtY" }, "outputs": [], "source": [ "metadata = metadata[metadata['status'] == 'Released']\n", "cols = np.array(['adult', 'belongs_to_collection', 'genres', 'id', 'original_language', 'title', 'production_countries', 'production_companies', 'video']) \n", "metadata = metadata[cols]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WiV8m-zP7dxy", "outputId": "ec13b127-a699-47a5-ce02-3dda6c354267" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "adult False\n", "belongs_to_collection NaN\n", "genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n", "id 8844\n", "original_language en\n", "title Jumanji\n", "production_countries [{'iso_3166_1': 'US', 'name': 'United States o...\n", "production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...\n", "video False\n", "Name: 1, dtype: object" ] }, "metadata": {}, "execution_count": 11 } ], "source": [ "metadata.iloc[1]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bBdpvpIl4vqJ", "outputId": "a118ca77-0276-4408-f1df-6896507e8131" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "adult False\n", "belongs_to_collection \n", "genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n", "id 8844\n", "original_language en\n", "title Jumanji\n", "production_countries [{'iso_3166_1': 'US', 'name': 'United States o...\n", "production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...\n", "video False\n", "Name: 1, dtype: object" ] }, "metadata": {}, "execution_count": 12 } ], "source": [ "def find_collection(x):\n", " if x == '':\n", " return ''\n", " return eval(str(x))['name']\n", "\n", "metadata['belongs_to_collection'] = metadata['belongs_to_collection'].fillna('')\n", "metadata['belongs_to_collection'] = metadata['belongs_to_collection'].apply(find_collection)\n", "metadata.iloc[1]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VRqAmj5aABKi", "outputId": "8aff7bec-5c5c-4829-8422-f19d264542e6" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "adult False\n", "belongs_to_collection \n", "genres Adventure,Fantasy,Family\n", "id 8844\n", "original_language en\n", "title Jumanji\n", "production_countries United States of America\n", "production_companies TriStar Pictures,Teitler Film,Interscope Commu...\n", "video False\n", "Name: 1, dtype: object" ] }, "metadata": {}, "execution_count": 13 } ], "source": [ "def find_names(x):\n", " if x == '':\n", " return ''\n", " genre_arr = eval(str(x))\n", " return ','.join(i['name'] for i in eval(str(x)))\n", " \n", "metadata['genres'] = metadata['genres'].fillna('')\n", "metadata['genres']=metadata['genres'].apply(find_names)\n", "metadata['production_countries']=metadata['production_countries'].apply(find_names)\n", "metadata['production_companies']=metadata['production_companies'].apply(find_names)\n", "credits['cast'] = credits['cast'].apply(find_names)\n", "metadata.iloc[1]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GfHP8lcEzi6c", "outputId": "4be60bff-1d92-4333-bfe8-4d5105e9858a" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "adult False\n", "belongs_to_collection \n", "genres Adventure,Fantasy,Family\n", "id 8844\n", "original_language en\n", "title Jumanji\n", "production_countries United States of America\n", "production_companies TriStar Pictures,Teitler Film,Interscope Commu...\n", "video False\n", "keywords board game,disappearance,based on children's b...\n", "Name: 1, dtype: object" ] }, "metadata": {}, "execution_count": 14 } ], "source": [ "keywords['keywords'] = keywords['keywords'].apply(find_names)\n", "metadata['id'] = metadata['id'].astype(int)\n", "metadata = pd.merge(metadata,keywords,how='inner',on='id')\n", "metadata.iloc[1]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "_sNely8jO2Co" }, "outputs": [], "source": [ "def to_int(x):\n", " if x == 'True':\n", " return 1\n", " return 0" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iUHJJHwyHcz-", "outputId": "798f1640-33c5-42a5-9d2c-d5c757bdc539" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['False', 'True'], dtype=object)" ] }, "metadata": {}, "execution_count": 16 } ], "source": [ "metadata['adult'].unique()" ] }, { "cell_type": "markdown", "metadata": { "id": "kB8thP2fJ9Af" }, "source": [ "there are 3 values other than True or False in adult column. there are entered by mistake so we remove those rows." ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "U1d3Z-88KPYW", "outputId": "a6b4bfec-73fa-488e-d004-1fc5f1c3c461" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([False, True], dtype=object)" ] }, "metadata": {}, "execution_count": 17 } ], "source": [ "metadata = metadata[(metadata['adult'] == 'True') | (metadata['adult'] == 'False')]\n", "metadata['adult'] = metadata['adult'].apply(to_int)\n", "metadata['video'].unique()" ] }, { "cell_type": "markdown", "metadata": { "id": "ifUvKXYbQi2I" }, "source": [ "removing nan values from dataset and replacing 'True' and 'False' with 1 and 0:" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "72DHRHQLLIxQ" }, "outputs": [], "source": [ "metadata = metadata[~metadata['video'].isna()]\n", "metadata['video'] = metadata['video'].apply(to_int)" ] }, { "cell_type": "markdown", "metadata": { "id": "3XUewQIcKkv_" }, "source": [ "## Vectorize string features" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 661 }, "id": "P0n1lJnUKj_-", "outputId": "f72d7774-db3d-4de7-bd56-8a7316b6dcc1" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " adult belongs_to_collection genres \\\n", "0 0 Toy Story Collection Animation,Comedy,Family \n", "1 0 Adventure,Fantasy,Family \n", "2 0 Grumpy Old Men Collection Romance,Comedy \n", "3 0 Comedy,Drama,Romance \n", "4 0 Father of the Bride Collection Comedy \n", "... ... ... ... \n", "46017 0 Drama,Family \n", "46018 0 Drama \n", "46019 0 Action,Drama,Thriller \n", "46020 0 \n", "46021 0 \n", "\n", " id original_language title \\\n", "0 862 en Toy Story \n", "1 8844 en Jumanji \n", "2 15602 en Grumpier Old Men \n", "3 31357 en Waiting to Exhale \n", "4 11862 en Father of the Bride Part II \n", "... ... ... ... \n", "46017 439050 fa Subdue \n", "46018 111109 tl Century of Birthing \n", "46019 67758 en Betrayal \n", "46020 227506 en Satan Triumphant \n", "46021 461257 en Queerama \n", "\n", " production_countries \\\n", "0 United States of America \n", "1 United States of America \n", "2 United States of America \n", "3 United States of America \n", "4 United States of America \n", "... ... \n", "46017 Iran \n", "46018 Philippines \n", "46019 United States of America \n", "46020 Russia \n", "46021 United Kingdom \n", "\n", " production_companies video \\\n", "0 Pixar Animation Studios 0 \n", "1 TriStar Pictures,Teitler Film,Interscope Commu... 0 \n", "2 Warner Bros.,Lancaster Gate 0 \n", "3 Twentieth Century Fox Film Corporation 0 \n", "4 Sandollar Productions,Touchstone Pictures 0 \n", "... ... ... \n", "46017 0 \n", "46018 Sine Olivia 0 \n", "46019 American World Pictures 0 \n", "46020 Yermoliev 0 \n", "46021 0 \n", "\n", " keywords \n", "0 jealousy,toy,boy,friendship,friends,rivalry,bo... \n", "1 board game,disappearance,based on children's b... \n", "2 fishing,best friend,duringcreditsstinger,old men \n", "3 based on novel,interracial relationship,single... \n", "4 baby,midlife crisis,confidence,aging,daughter,... \n", "... ... \n", "46017 tragic love \n", "46018 artist,play,pinoy \n", "46019 \n", "46020 \n", "46021 \n", "\n", "[46022 rows x 10 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectiongenresidoriginal_languagetitleproduction_countriesproduction_companiesvideokeywords
00Toy Story CollectionAnimation,Comedy,Family862enToy StoryUnited States of AmericaPixar Animation Studios0jealousy,toy,boy,friendship,friends,rivalry,bo...
10Adventure,Fantasy,Family8844enJumanjiUnited States of AmericaTriStar Pictures,Teitler Film,Interscope Commu...0board game,disappearance,based on children's b...
20Grumpy Old Men CollectionRomance,Comedy15602enGrumpier Old MenUnited States of AmericaWarner Bros.,Lancaster Gate0fishing,best friend,duringcreditsstinger,old men
30Comedy,Drama,Romance31357enWaiting to ExhaleUnited States of AmericaTwentieth Century Fox Film Corporation0based on novel,interracial relationship,single...
40Father of the Bride CollectionComedy11862enFather of the Bride Part IIUnited States of AmericaSandollar Productions,Touchstone Pictures0baby,midlife crisis,confidence,aging,daughter,...
.................................
460170Drama,Family439050faSubdueIran0tragic love
460180Drama111109tlCentury of BirthingPhilippinesSine Olivia0artist,play,pinoy
460190Action,Drama,Thriller67758enBetrayalUnited States of AmericaAmerican World Pictures0
460200227506enSatan TriumphantRussiaYermoliev0
460210461257enQueeramaUnited Kingdom0
\n", "

46022 rows × 10 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 19 } ], "source": [ "metadata" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "2wnOtfe1m4aq" }, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "def my_tok(text):\n", " return text.split(\",\")\n", "\n", "def vectorize_string(col_name, feature_name, limit=None, df=metadata):\n", " vectorizer = CountVectorizer(tokenizer=my_tok, max_features=limit, min_df=2)\n", " X = vectorizer.fit_transform(df[col_name])\n", " vec_cols = vectorizer.get_feature_names_out()\n", " vec_data = X.toarray()\n", " #vec_cols = np.char.add(feature_name+':', vec_cols)\n", " vec_cols = feature_name+':'+vec_cols\n", " return vec_data, vec_cols\n", "\n", "def tfidf(col_name, feature_name, limit=None, df=metadata):\n", " vectorizer = TfidfVectorizer(tokenizer=my_tok, max_features=limit, min_df=2)\n", " X = vectorizer.fit_transform(df[col_name])\n", " vec_cols = vectorizer.get_feature_names_out()\n", " vec_data = X.toarray()\n", " #vec_cols = np.char.add(feature_name+':', vec_cols)\n", " vec_cols = feature_name+':'+vec_cols\n", " return vec_data, vec_cols" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "owM0Q_INPDyD", "outputId": "8975c88b-bbe0-43b4-ad2e-a6ee8eb326d8" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['genre:', 'genre:action', 'genre:adventure', 'genre:animation',\n", " 'genre:comedy', 'genre:crime', 'genre:documentary', 'genre:drama',\n", " 'genre:family', 'genre:fantasy', 'genre:foreign', 'genre:history',\n", " 'genre:horror', 'genre:music', 'genre:mystery', 'genre:romance',\n", " 'genre:science fiction', 'genre:thriller', 'genre:tv movie',\n", " 'genre:war', 'genre:western'], dtype=object)" ] }, "metadata": {}, "execution_count": 21 } ], "source": [ "genre_data, genre_cols = vectorize_string('genres', 'genre')\n", "genre_cols" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0N15-vMrswuZ", "outputId": "dbe96ed0-bff9-441c-afd2-a05fa5ab60ed" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['company:', 'company: the', 'company:amblin entertainment',\n", " 'company:american international pictures (aip)', 'company:arte',\n", " 'company:arte france cinéma', 'company:bbc', 'company:bbc films',\n", " 'company:blumhouse productions',\n", " 'company:british broadcasting corporation (bbc)', 'company:canal+',\n", " 'company:canal+ españa', 'company:castle rock entertainment',\n", " 'company:centre national de la cinématographie (cnc)',\n", " 'company:channel four films', 'company:ciné+',\n", " 'company:cinécinéma', 'company:cj entertainment',\n", " 'company:columbia pictures',\n", " 'company:columbia pictures corporation', 'company:dc comics',\n", " 'company:dimension films', 'company:dreamworks skg',\n", " 'company:dune entertainment', 'company:eurimages',\n", " 'company:europacorp', 'company:film i väst', 'company:film4',\n", " 'company:first national pictures', 'company:focus features',\n", " 'company:fox 2000 pictures', 'company:fox film corporation',\n", " 'company:fox searchlight pictures', 'company:france 2 cinéma',\n", " 'company:france 3 cinéma', 'company:gaumont',\n", " 'company:hallmark entertainment',\n", " 'company:hammer film productions', 'company:hbo films',\n", " 'company:hollywood pictures', 'company:home box office (hbo)',\n", " 'company:imagine entertainment', 'company:lakeshore entertainment',\n", " 'company:lenfilm', 'company:lions gate films', 'company:lionsgate',\n", " 'company:m6 films', 'company:metro-goldwyn-mayer (mgm)',\n", " 'company:millennium films', 'company:miramax films',\n", " 'company:monogram pictures', 'company:morgan creek productions',\n", " 'company:mosfilm', 'company:netflix', 'company:new line cinema',\n", " 'company:new regency pictures', 'company:new world pictures',\n", " 'company:nikkatsu', 'company:nordisk film',\n", " 'company:nu image films', 'company:orion pictures',\n", " 'company:paramount pictures', 'company:pathé',\n", " 'company:pixar animation studios',\n", " 'company:polygram filmed entertainment', 'company:rai cinema',\n", " 'company:regency enterprises', 'company:relativity media',\n", " 'company:rko radio pictures', 'company:samuel goldwyn company',\n", " 'company:screen gems', 'company:shaw brothers',\n", " 'company:shôchiku eiga', 'company:studiocanal',\n", " 'company:summit entertainment', 'company:svensk filmindustri (sf)',\n", " 'company:televisión española (tve)',\n", " 'company:tf1 films production', 'company:the rank organisation',\n", " 'company:the weinstein company', 'company:tla releasing',\n", " 'company:toho company', 'company:touchstone pictures',\n", " 'company:tristar pictures',\n", " 'company:twentieth century fox film corporation',\n", " 'company:téléfilm canada', 'company:uk film council',\n", " 'company:united artists',\n", " 'company:universal international pictures (ui)',\n", " 'company:universal pictures', 'company:village roadshow pictures',\n", " 'company:walt disney pictures', 'company:walt disney productions',\n", " 'company:warner bros.', 'company:warner bros. animation',\n", " 'company:westdeutscher rundfunk (wdr)', 'company:wild bunch',\n", " 'company:working title films', 'company:zentropa entertainments',\n", " 'company:zweites deutsches fernsehen (zdf)'], dtype=object)" ] }, "metadata": {}, "execution_count": 22 } ], "source": [ "companies_data, companies_cols = vectorize_string('production_companies', 'company', 100)\n", "companies_cols" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "zB0rmWDG7ktg", "outputId": "f162c8e1-00af-43c3-8d91-a47365b2e2be" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['country:', 'country:afghanistan', 'country:albania',\n", " 'country:algeria', 'country:angola', 'country:antarctica',\n", " 'country:argentina', 'country:armenia', 'country:aruba',\n", " 'country:australia', 'country:austria', 'country:azerbaijan',\n", " 'country:bahamas', 'country:bangladesh', 'country:belarus',\n", " 'country:belgium', 'country:bhutan', 'country:bolivia',\n", " 'country:bosnia and herzegovina', 'country:botswana',\n", " 'country:brazil', 'country:bulgaria', 'country:burkina faso',\n", " 'country:cambodia', 'country:cameroon', 'country:canada',\n", " 'country:chad', 'country:chile', 'country:china',\n", " 'country:colombia', 'country:congo', 'country:costa rica',\n", " \"country:cote d'ivoire\", 'country:croatia', 'country:cuba',\n", " 'country:cyprus', 'country:czech republic',\n", " 'country:czechoslovakia', 'country:denmark',\n", " 'country:dominican republic', 'country:east germany',\n", " 'country:ecuador', 'country:egypt', 'country:el salvador',\n", " 'country:estonia', 'country:ethiopia', 'country:finland',\n", " 'country:france', 'country:georgia', 'country:germany',\n", " 'country:ghana', 'country:greece', 'country:guatemala',\n", " 'country:hong kong', 'country:hungary', 'country:iceland',\n", " 'country:india', 'country:indonesia', 'country:iran',\n", " 'country:iraq', 'country:ireland', 'country:israel',\n", " 'country:italy', 'country:jamaica', 'country:japan',\n", " 'country:jordan', 'country:kazakhstan', 'country:kenya',\n", " 'country:kyrgyz republic',\n", " \"country:lao people's democratic republic\", 'country:latvia',\n", " 'country:lebanon', 'country:liberia',\n", " 'country:libyan arab jamahiriya', 'country:liechtenstein',\n", " 'country:lithuania', 'country:luxembourg', 'country:macao',\n", " 'country:macedonia', 'country:malaysia', 'country:mali',\n", " 'country:malta', 'country:mauritania', 'country:mexico',\n", " 'country:monaco', 'country:mongolia', 'country:montenegro',\n", " 'country:morocco', 'country:namibia', 'country:nepal',\n", " 'country:netherlands', 'country:new zealand', 'country:nicaragua',\n", " 'country:nigeria', 'country:north korea', 'country:norway',\n", " 'country:pakistan', 'country:palestinian territory',\n", " 'country:panama', 'country:papua new guinea', 'country:paraguay',\n", " 'country:peru', 'country:philippines', 'country:poland',\n", " 'country:portugal', 'country:puerto rico', 'country:qatar',\n", " 'country:romania', 'country:russia', 'country:rwanda',\n", " 'country:saudi arabia', 'country:senegal', 'country:serbia',\n", " 'country:serbia and montenegro', 'country:singapore',\n", " 'country:slovakia', 'country:slovenia', 'country:south africa',\n", " 'country:south korea', 'country:soviet union', 'country:spain',\n", " 'country:sri lanka', 'country:sweden', 'country:switzerland',\n", " 'country:syrian arab republic', 'country:taiwan',\n", " 'country:tajikistan', 'country:tanzania', 'country:thailand',\n", " 'country:trinidad and tobago', 'country:tunisia', 'country:turkey',\n", " 'country:uganda', 'country:ukraine',\n", " 'country:united arab emirates', 'country:united kingdom',\n", " 'country:united states of america', 'country:uruguay',\n", " 'country:uzbekistan', 'country:venezuela', 'country:vietnam',\n", " 'country:yugoslavia', 'country:zimbabwe'], dtype=object)" ] }, "metadata": {}, "execution_count": 23 } ], "source": [ "countries_data, countries_cols = vectorize_string('production_countries', 'country')\n", "countries_cols" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HgG4hhTf8vHI", "outputId": "24440e58-e2d0-4373-f2f9-6f5428b0da52" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['collection:', 'collection: amore e… - collezione',\n", " 'collection: band of assassins collection', ...,\n", " 'collection:что творят мужчины! (коллекция)',\n", " 'collection:男はつらいよ シリーズ', 'collection:식객 시리즈'], dtype=object)" ] }, "metadata": {}, "execution_count": 24 } ], "source": [ "collection_data, collection_cols = vectorize_string('belongs_to_collection', 'collection')\n", "collection_cols" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qOywVvXeRi-O", "outputId": "a9af8bdd-9e91-414d-d005-8a3b0f2de3c0" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['lang:', 'lang:ab', 'lang:af', 'lang:am', 'lang:ar', 'lang:bg',\n", " 'lang:bm', 'lang:bn', 'lang:bo', 'lang:bs', 'lang:ca', 'lang:cn',\n", " 'lang:cs', 'lang:da', 'lang:de', 'lang:el', 'lang:en', 'lang:es',\n", " 'lang:et', 'lang:eu', 'lang:fa', 'lang:fi', 'lang:fr', 'lang:he',\n", " 'lang:hi', 'lang:hr', 'lang:hu', 'lang:id', 'lang:is', 'lang:it',\n", " 'lang:iu', 'lang:ja', 'lang:ka', 'lang:kk', 'lang:kn', 'lang:ko',\n", " 'lang:ku', 'lang:ky', 'lang:lo', 'lang:lt', 'lang:lv', 'lang:mk',\n", " 'lang:ml', 'lang:mn', 'lang:mr', 'lang:ms', 'lang:nb', 'lang:ne',\n", " 'lang:nl', 'lang:no', 'lang:pa', 'lang:pl', 'lang:ps', 'lang:pt',\n", " 'lang:ro', 'lang:ru', 'lang:sh', 'lang:sk', 'lang:sl', 'lang:sq',\n", " 'lang:sr', 'lang:sv', 'lang:ta', 'lang:te', 'lang:th', 'lang:tl',\n", " 'lang:tr', 'lang:uk', 'lang:ur', 'lang:vi', 'lang:wo', 'lang:xx',\n", " 'lang:zh'], dtype=object)" ] }, "metadata": {}, "execution_count": 25 } ], "source": [ "metadata['original_language']= metadata['original_language'].fillna('')\n", "lang_data, lang_cols = vectorize_string('original_language', 'lang')\n", "lang_cols" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HXQOBJj67v0R", "outputId": "37f1f697-4f00-461d-8bef-8af66913895e" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(1333,)" ] }, "metadata": {}, "execution_count": 26 } ], "source": [ "collection_cols.shape" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cbSfr6HHE3Od", "outputId": "472739c5-8a0e-4855-c546-e653e8cab5a9" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['keyword:', 'keyword: new york city', 'keyword:1930s',\n", " 'keyword:1940s', 'keyword:1950s', 'keyword:1960s', 'keyword:1970s',\n", " 'keyword:1980s', 'keyword:19th century', 'keyword:3d',\n", " 'keyword:accident', 'keyword:actor', 'keyword:actress',\n", " 'keyword:addiction', 'keyword:adolescence', 'keyword:adoption',\n", " 'keyword:adult animation', 'keyword:adultery', 'keyword:adventure',\n", " 'keyword:africa', 'keyword:african american',\n", " 'keyword:aftercreditsstinger', 'keyword:afterlife',\n", " 'keyword:age difference', 'keyword:agent', 'keyword:aging',\n", " 'keyword:aids', 'keyword:airplane', 'keyword:airplane crash',\n", " 'keyword:airport', 'keyword:alcohol', 'keyword:alcoholic',\n", " 'keyword:alcoholism', 'keyword:alien', 'keyword:alien invasion',\n", " 'keyword:alien life-form', 'keyword:ambition', 'keyword:ambush',\n", " 'keyword:american', 'keyword:american football', 'keyword:amnesia',\n", " 'keyword:amusement park', 'keyword:anarchic comedy',\n", " 'keyword:android', 'keyword:angel', 'keyword:animal',\n", " 'keyword:animal attack', 'keyword:animal horror',\n", " 'keyword:animation', 'keyword:anime', 'keyword:anthology',\n", " 'keyword:anthropomorphism', 'keyword:apache', 'keyword:apartment',\n", " 'keyword:apocalypse', 'keyword:archaeologist', 'keyword:architect',\n", " 'keyword:argentina', 'keyword:arizona', 'keyword:army',\n", " 'keyword:arranged marriage', 'keyword:art',\n", " 'keyword:artificial intelligence', 'keyword:artist',\n", " 'keyword:assassin', 'keyword:assassination', 'keyword:astronaut',\n", " 'keyword:asylum', 'keyword:atomic bomb', 'keyword:attack',\n", " 'keyword:australia', 'keyword:author', 'keyword:autism',\n", " 'keyword:aviation', 'keyword:axe murder', 'keyword:b movie',\n", " 'keyword:baby', 'keyword:babysitter', 'keyword:ballet',\n", " 'keyword:bank', 'keyword:bank robber', 'keyword:bank robbery',\n", " 'keyword:bar', 'keyword:baseball',\n", " \"keyword:based on children's book\", 'keyword:based on comic',\n", " 'keyword:based on manga', 'keyword:based on novel',\n", " 'keyword:based on play or musical', 'keyword:based on true events',\n", " 'keyword:based on true story', 'keyword:based on tv series',\n", " 'keyword:based on video game',\n", " 'keyword:based on young adult novel', 'keyword:basketball',\n", " 'keyword:battle', 'keyword:beach', 'keyword:bear',\n", " 'keyword:beautiful woman', 'keyword:becoming an adult',\n", " 'keyword:beer', 'keyword:berlin', 'keyword:best friend',\n", " 'keyword:betrayal', 'keyword:bible', 'keyword:biker',\n", " 'keyword:bikini', 'keyword:biography', 'keyword:birthday',\n", " 'keyword:black and white', 'keyword:black magic',\n", " 'keyword:black people', 'keyword:blackmail',\n", " 'keyword:blaxploitation', 'keyword:blindness', 'keyword:blood',\n", " 'keyword:blood splatter', 'keyword:boarding school',\n", " 'keyword:boat', 'keyword:bodyguard', 'keyword:bollywood',\n", " 'keyword:bomb', 'keyword:bondage', 'keyword:book',\n", " 'keyword:boston', 'keyword:bounty hunter', 'keyword:boxer',\n", " 'keyword:boy', 'keyword:boyfriend', 'keyword:brazil',\n", " 'keyword:brazilian', 'keyword:break-up', 'keyword:bride',\n", " 'keyword:bridge', 'keyword:brit noir', 'keyword:british',\n", " 'keyword:brothel', 'keyword:brother',\n", " 'keyword:brother brother relationship',\n", " 'keyword:brother sister relationship', 'keyword:brutality',\n", " 'keyword:bully', 'keyword:bullying', 'keyword:bus',\n", " 'keyword:business', 'keyword:business man', 'keyword:cabin',\n", " 'keyword:california', 'keyword:camp', 'keyword:camping',\n", " 'keyword:canada', 'keyword:cancer', 'keyword:cannibal',\n", " 'keyword:cannibalism', 'keyword:canuxploitation',\n", " 'keyword:capitalism', 'keyword:captain', 'keyword:car',\n", " 'keyword:car accident', 'keyword:car chase', 'keyword:car crash',\n", " 'keyword:car race', 'keyword:career', 'keyword:carnival',\n", " 'keyword:casino', 'keyword:castle', 'keyword:cat',\n", " 'keyword:catholic', 'keyword:catholicism', 'keyword:cattle',\n", " 'keyword:cave', 'keyword:celebration', 'keyword:celebrity',\n", " 'keyword:cell phone', 'keyword:cemetery', 'keyword:chainsaw',\n", " 'keyword:chaos', 'keyword:charlie chan', 'keyword:chase',\n", " 'keyword:cheating', 'keyword:cheerleader', 'keyword:chicago',\n", " 'keyword:child', 'keyword:child abuse', 'keyword:childhood',\n", " 'keyword:childhood friends', 'keyword:children', 'keyword:china',\n", " 'keyword:chinese', 'keyword:christian', 'keyword:christianity',\n", " 'keyword:christmas', 'keyword:church', 'keyword:cia',\n", " 'keyword:cigarette smoking', 'keyword:cinema', 'keyword:circus',\n", " 'keyword:city', 'keyword:civil war', 'keyword:classic noir',\n", " 'keyword:climbing', 'keyword:cocaine', 'keyword:coffin',\n", " 'keyword:cold war', 'keyword:college', 'keyword:coma',\n", " 'keyword:combat', 'keyword:comedian', 'keyword:comedy',\n", " 'keyword:comic book', 'keyword:coming of age',\n", " 'keyword:coming out', 'keyword:communism', 'keyword:communist',\n", " 'keyword:competition', 'keyword:composer', 'keyword:computer',\n", " 'keyword:con man', 'keyword:concert', 'keyword:conspiracy',\n", " 'keyword:cop', 'keyword:corporation', 'keyword:corpse',\n", " 'keyword:corruption', 'keyword:countryside', 'keyword:couple',\n", " 'keyword:court', 'keyword:court case', 'keyword:courtroom',\n", " 'keyword:cover-up', 'keyword:cowardliness', 'keyword:cowboy',\n", " 'keyword:creature', 'keyword:crime', 'keyword:criminal',\n", " 'keyword:cruelty', 'keyword:crush', 'keyword:cuba', 'keyword:cult',\n", " 'keyword:cult film', 'keyword:curse', 'keyword:cyberpunk',\n", " 'keyword:cyborg', 'keyword:dance', 'keyword:dancer',\n", " 'keyword:dancing', 'keyword:danger', 'keyword:dark comedy',\n", " 'keyword:date', 'keyword:dating', 'keyword:daughter',\n", " 'keyword:dc comics', 'keyword:death', 'keyword:death of a friend',\n", " 'keyword:debt', 'keyword:decapitation', 'keyword:deception',\n", " 'keyword:delusion', 'keyword:demon', 'keyword:department store',\n", " 'keyword:depression', 'keyword:desert', 'keyword:desire',\n", " 'keyword:detective', 'keyword:devil', 'keyword:diamond',\n", " 'keyword:diary', 'keyword:diner', 'keyword:dinosaur',\n", " 'keyword:dirty cop', 'keyword:disabled', 'keyword:disappearance',\n", " 'keyword:disaster', 'keyword:disguise', 'keyword:disney short',\n", " 'keyword:divorce', 'keyword:doctor', 'keyword:documentary',\n", " 'keyword:dog', 'keyword:doppelganger', 'keyword:double life',\n", " 'keyword:dracula', 'keyword:dragon', 'keyword:drama',\n", " 'keyword:dream', 'keyword:drinking', 'keyword:drowning',\n", " 'keyword:drug', 'keyword:drug abuse', 'keyword:drug addiction',\n", " 'keyword:drug dealer', 'keyword:drug lord', 'keyword:drug traffic',\n", " 'keyword:drug use', 'keyword:drunk', 'keyword:drunkenness',\n", " 'keyword:duel', 'keyword:duringcreditsstinger',\n", " 'keyword:dying and death', 'keyword:dysfunctional family',\n", " 'keyword:dystopia', 'keyword:dystopic future',\n", " 'keyword:earthquake', 'keyword:economics', 'keyword:education',\n", " 'keyword:egypt', 'keyword:england', 'keyword:epic',\n", " 'keyword:erotic movie', 'keyword:eroticism', 'keyword:escape',\n", " 'keyword:escape from prison', 'keyword:espionage',\n", " 'keyword:europe', 'keyword:evil', 'keyword:ex-con',\n", " 'keyword:exorcism', 'keyword:exotic island', 'keyword:expedition',\n", " 'keyword:experiment', 'keyword:experimental film',\n", " 'keyword:exploitation', 'keyword:explosion',\n", " 'keyword:extramarital affair', 'keyword:extreme violence',\n", " 'keyword:factory', 'keyword:fairy tale', 'keyword:faith',\n", " 'keyword:falling in love', 'keyword:false identity',\n", " 'keyword:falsely accused', 'keyword:family',\n", " 'keyword:family relationships', 'keyword:fantasy', 'keyword:farm',\n", " 'keyword:farmer', 'keyword:fashion', 'keyword:fate',\n", " 'keyword:father', 'keyword:father daughter relationship',\n", " 'keyword:father son relationship', 'keyword:fbi',\n", " 'keyword:fbi agent', 'keyword:fear', 'keyword:female friendship',\n", " 'keyword:female homosexuality', 'keyword:female nudity',\n", " 'keyword:female protagonist', 'keyword:feminism',\n", " 'keyword:femme fatale', 'keyword:fight', 'keyword:fighter',\n", " 'keyword:film director', 'keyword:film making',\n", " 'keyword:film noir', 'keyword:filmmaker', 'keyword:filmmaking',\n", " 'keyword:fire', 'keyword:fisherman', 'keyword:fistfight',\n", " 'keyword:flashback', 'keyword:florida', 'keyword:flying',\n", " 'keyword:flying saucer', 'keyword:food', 'keyword:forbidden love',\n", " 'keyword:forest', 'keyword:found footage', 'keyword:france',\n", " 'keyword:frankenstein', 'keyword:fraud', 'keyword:freedom',\n", " 'keyword:french', 'keyword:french noir', 'keyword:friends',\n", " 'keyword:friendship', 'keyword:fugitive', 'keyword:funeral',\n", " 'keyword:future', 'keyword:gambler', 'keyword:gambling',\n", " 'keyword:gang', 'keyword:gangster', 'keyword:gas station',\n", " 'keyword:gay', 'keyword:gay interest', 'keyword:gay man',\n", " 'keyword:gay relationship', 'keyword:general', 'keyword:german',\n", " 'keyword:germany', 'keyword:ghost', 'keyword:giallo',\n", " 'keyword:giant monster', 'keyword:girl', 'keyword:girlfriend',\n", " 'keyword:god', 'keyword:gold', 'keyword:good vs evil',\n", " 'keyword:gore', 'keyword:gothic', 'keyword:gothic horror',\n", " 'keyword:government', 'keyword:greece', 'keyword:greed',\n", " 'keyword:grief', 'keyword:guilt', 'keyword:gun',\n", " 'keyword:gunfight', 'keyword:gunslinger', 'keyword:gypsy',\n", " 'keyword:hacker', 'keyword:halloween', 'keyword:hallucination',\n", " 'keyword:hammer horror', 'keyword:haunted house',\n", " 'keyword:haunting', 'keyword:hawaii', 'keyword:heavy metal',\n", " 'keyword:heist', 'keyword:helicopter', 'keyword:hell',\n", " 'keyword:hero', 'keyword:heroin', 'keyword:high school',\n", " 'keyword:highway', 'keyword:hip-hop', 'keyword:hippie',\n", " 'keyword:historical figure', 'keyword:history',\n", " 'keyword:hitchhiker', 'keyword:hitman', 'keyword:holiday',\n", " 'keyword:hollywood', 'keyword:holocaust', 'keyword:home invasion',\n", " 'keyword:homeless person', 'keyword:homophobia',\n", " 'keyword:homosexuality', 'keyword:honeymoon', 'keyword:hong kong',\n", " 'keyword:hoodlum', 'keyword:horror', 'keyword:horse',\n", " 'keyword:hospital', 'keyword:hostage', 'keyword:hotel',\n", " 'keyword:hotel room', 'keyword:house',\n", " 'keyword:human experimentation', 'keyword:humiliation',\n", " 'keyword:hustler', 'keyword:hypnosis', 'keyword:identity',\n", " 'keyword:illegal drugs', 'keyword:illegal prostitution',\n", " 'keyword:illness', 'keyword:imax', 'keyword:immigrant',\n", " 'keyword:immigration', 'keyword:immortality', 'keyword:incest',\n", " 'keyword:independent film', 'keyword:india', 'keyword:indian lead',\n", " 'keyword:individual', 'keyword:infection', 'keyword:infidelity',\n", " 'keyword:inheritance', 'keyword:insanity', 'keyword:intelligence',\n", " 'keyword:internet', 'keyword:interracial relationship',\n", " 'keyword:interview', 'keyword:invasion', 'keyword:inventor',\n", " 'keyword:investigation', 'keyword:ireland', 'keyword:island',\n", " 'keyword:isolation', 'keyword:israel', 'keyword:italian',\n", " 'keyword:italy', 'keyword:jail', 'keyword:japan',\n", " 'keyword:japanese', 'keyword:jazz', 'keyword:jealousy',\n", " 'keyword:jesus christ', 'keyword:jew', 'keyword:jewish',\n", " 'keyword:journalism', 'keyword:journalist', 'keyword:journey',\n", " 'keyword:judge', 'keyword:jungle', 'keyword:justice',\n", " 'keyword:juvenile delinquent', 'keyword:kaiju',\n", " 'keyword:kidnapping', 'keyword:kids', 'keyword:kids and family',\n", " 'keyword:killer', 'keyword:king', 'keyword:kingdom',\n", " 'keyword:kiss', 'keyword:knife', 'keyword:knight', 'keyword:korea',\n", " 'keyword:korean movie', 'keyword:kung fu', 'keyword:laboratory',\n", " 'keyword:ladykiller', 'keyword:lake', 'keyword:las vegas',\n", " 'keyword:lawyer', 'keyword:legend', 'keyword:lesbian',\n", " 'keyword:lesbian relationship', 'keyword:lesbian sex',\n", " 'keyword:letter', 'keyword:lgbt', 'keyword:lie', 'keyword:lion',\n", " 'keyword:little boy', 'keyword:little girl',\n", " 'keyword:london england', 'keyword:loneliness',\n", " 'keyword:los angeles', 'keyword:loss of father',\n", " 'keyword:loss of lover', 'keyword:loss of mother',\n", " 'keyword:loss of virginity', 'keyword:love', 'keyword:love affair',\n", " 'keyword:love at first sight', \"keyword:love of one's life\",\n", " 'keyword:love triangle', 'keyword:lover', 'keyword:lovers',\n", " 'keyword:lovesickness', 'keyword:lust', 'keyword:mad scientist',\n", " 'keyword:madness', 'keyword:madrid', 'keyword:mafia',\n", " 'keyword:magic', 'keyword:maid', 'keyword:malayalam',\n", " 'keyword:male female relationship', 'keyword:male friendship',\n", " 'keyword:male nudity', 'keyword:manhattan', 'keyword:maniac',\n", " 'keyword:manipulation', 'keyword:mansion', 'keyword:marijuana',\n", " 'keyword:marriage', 'keyword:marriage crisis',\n", " 'keyword:marriage proposal', 'keyword:married couple',\n", " 'keyword:martial arts', 'keyword:marvel comic', 'keyword:mask',\n", " 'keyword:mass murder', 'keyword:massacre', 'keyword:masturbation',\n", " 'keyword:mayor', 'keyword:melodrama', 'keyword:memory',\n", " 'keyword:memory loss', 'keyword:mental illness',\n", " 'keyword:mercenary', 'keyword:mexican', 'keyword:mexico',\n", " 'keyword:midlife crisis', 'keyword:military',\n", " 'keyword:millionaire', 'keyword:mind control',\n", " 'keyword:miniseries', 'keyword:missing person', 'keyword:mission',\n", " 'keyword:mission of murder', 'keyword:mistaken identity',\n", " 'keyword:mobster', 'keyword:mockumentary', 'keyword:model',\n", " 'keyword:money', 'keyword:monk', 'keyword:monkey',\n", " 'keyword:monster', 'keyword:moon', 'keyword:motel',\n", " 'keyword:mother', 'keyword:mother daughter relationship',\n", " 'keyword:mother son relationship', 'keyword:motorcycle',\n", " 'keyword:mountain', 'keyword:movie star', 'keyword:mumblegore',\n", " 'keyword:mummy', 'keyword:murder', 'keyword:murderer',\n", " 'keyword:museum', 'keyword:music', 'keyword:music band',\n", " 'keyword:musical', 'keyword:musician', 'keyword:muslim',\n", " 'keyword:mutant', 'keyword:mutation', 'keyword:mystery',\n", " 'keyword:mythology', 'keyword:nanny', 'keyword:narration',\n", " 'keyword:nasa', 'keyword:native american', 'keyword:nature',\n", " 'keyword:navy', 'keyword:nazi germany', 'keyword:nazis',\n", " 'keyword:neighbor', 'keyword:neo-noir', 'keyword:nerd',\n", " 'keyword:new england', 'keyword:new love', 'keyword:new orleans',\n", " 'keyword:new york', 'keyword:new york city', 'keyword:new zealand',\n", " 'keyword:newspaper', 'keyword:nightclub', 'keyword:nightmare',\n", " 'keyword:ninja', 'keyword:nudity', 'keyword:nun', 'keyword:nurse',\n", " 'keyword:obsession', 'keyword:occult', 'keyword:ocean',\n", " 'keyword:older man younger woman relationship',\n", " 'keyword:older woman younger man relationship',\n", " 'keyword:olympic games', 'keyword:on the run',\n", " 'keyword:one-night stand', 'keyword:opera',\n", " 'keyword:organized crime', 'keyword:orphan', 'keyword:orphanage',\n", " 'keyword:outer space', 'keyword:outlaw', 'keyword:painter',\n", " 'keyword:painting', 'keyword:parallel world', 'keyword:paranoia',\n", " 'keyword:parent child relationship', 'keyword:paris',\n", " 'keyword:parody', 'keyword:party', 'keyword:passion',\n", " 'keyword:peasant', 'keyword:period drama', 'keyword:philippines',\n", " 'keyword:philosophy', 'keyword:photographer',\n", " 'keyword:photography', 'keyword:pig', 'keyword:pilot',\n", " 'keyword:pimp', 'keyword:pirate', 'keyword:pistol',\n", " 'keyword:planned murder', 'keyword:playboy', 'keyword:poet',\n", " 'keyword:poetry', 'keyword:poison', 'keyword:poker',\n", " 'keyword:police', 'keyword:police brutality',\n", " 'keyword:police corruption', 'keyword:police detective',\n", " 'keyword:police officer', 'keyword:police operation',\n", " 'keyword:policeman', 'keyword:political', 'keyword:politician',\n", " 'keyword:politics', 'keyword:pornography', 'keyword:possession',\n", " 'keyword:post-apocalyptic', 'keyword:poverty', 'keyword:power',\n", " 'keyword:pre-code', 'keyword:pregnancy', 'keyword:pregnant',\n", " 'keyword:president', 'keyword:priest', 'keyword:prince',\n", " 'keyword:princess', 'keyword:prison', 'keyword:prisoner',\n", " 'keyword:prisoners of war', 'keyword:private detective',\n", " 'keyword:professor', 'keyword:propaganda', 'keyword:prophecy',\n", " 'keyword:prostitute', 'keyword:prostitution', 'keyword:protest',\n", " 'keyword:proto-slasher', 'keyword:psychiatrist', 'keyword:psychic',\n", " 'keyword:psychological thriller', 'keyword:psychologist',\n", " 'keyword:psychology', 'keyword:psychopath', 'keyword:puberty',\n", " 'keyword:punk', 'keyword:puppet', 'keyword:queen',\n", " 'keyword:racism', 'keyword:radio', 'keyword:rain', 'keyword:ranch',\n", " 'keyword:ransom', 'keyword:rape', 'keyword:rebel',\n", " 'keyword:redemption', 'keyword:relationship',\n", " 'keyword:relationship problems', 'keyword:religion',\n", " 'keyword:remake', 'keyword:reporter', 'keyword:rescue',\n", " 'keyword:resistance', 'keyword:restaurant', 'keyword:resurrection',\n", " 'keyword:revenge', 'keyword:revolution', 'keyword:rifle',\n", " 'keyword:ritual', 'keyword:rivalry', 'keyword:river',\n", " 'keyword:road movie', 'keyword:road trip', 'keyword:robbery',\n", " 'keyword:robot', 'keyword:rock', 'keyword:rock and roll',\n", " 'keyword:rock band', 'keyword:rock music', 'keyword:rock star',\n", " 'keyword:romance', 'keyword:romantic comedy', 'keyword:rome',\n", " 'keyword:roommate', 'keyword:royalty', 'keyword:runaway',\n", " 'keyword:rural setting', 'keyword:russia', 'keyword:russian',\n", " 'keyword:sacrifice', 'keyword:sadism', 'keyword:sadness',\n", " 'keyword:sailor', 'keyword:salesman', 'keyword:saloon',\n", " 'keyword:samurai', 'keyword:san francisco', 'keyword:santa claus',\n", " 'keyword:satire', 'keyword:saving the world', 'keyword:scandal',\n", " 'keyword:schizophrenia', 'keyword:school', 'keyword:science',\n", " 'keyword:science fiction', 'keyword:scientist', 'keyword:scotland',\n", " 'keyword:sea', 'keyword:search', 'keyword:secret',\n", " 'keyword:secret agent', 'keyword:secret identity',\n", " 'keyword:secret love', 'keyword:secretary', 'keyword:seduction',\n", " 'keyword:sequel', 'keyword:serial killer',\n", " 'keyword:series of murders', 'keyword:sex',\n", " 'keyword:sexploitation', 'keyword:sexual abuse',\n", " 'keyword:sexuality', 'keyword:shakespeare', 'keyword:shark',\n", " 'keyword:sheriff', 'keyword:sherlock holmes', 'keyword:ship',\n", " 'keyword:shipwreck', 'keyword:shooting', 'keyword:shootout',\n", " 'keyword:short', 'keyword:shotgun', 'keyword:showdown',\n", " 'keyword:shower', 'keyword:silent film', 'keyword:singer',\n", " 'keyword:singing', 'keyword:single', 'keyword:single mother',\n", " 'keyword:single parent', 'keyword:sister',\n", " 'keyword:sister sister relationship', 'keyword:slapstick',\n", " 'keyword:slasher', 'keyword:slavery', 'keyword:small town',\n", " 'keyword:smuggling', 'keyword:snake', 'keyword:sniper',\n", " 'keyword:snow', 'keyword:soccer', 'keyword:society',\n", " 'keyword:soldier', 'keyword:son', 'keyword:song',\n", " 'keyword:south africa', 'keyword:south korea',\n", " 'keyword:southern usa', 'keyword:soviet union', 'keyword:space',\n", " 'keyword:space marine', 'keyword:space opera',\n", " 'keyword:space travel', 'keyword:spacecraft', 'keyword:spaceship',\n", " 'keyword:spaghetti western', 'keyword:spain', 'keyword:spider',\n", " 'keyword:spirit', 'keyword:spoof', 'keyword:sport', 'keyword:spy',\n", " 'keyword:stalker', 'keyword:stalking', 'keyword:stand-up comedy',\n", " 'keyword:stop motion', 'keyword:storm', 'keyword:stranded',\n", " 'keyword:stranger', 'keyword:street gang', 'keyword:strip club',\n", " 'keyword:stripper', 'keyword:student', 'keyword:submarine',\n", " 'keyword:subway', 'keyword:success', 'keyword:suicide',\n", " 'keyword:suicide attempt', 'keyword:summer', 'keyword:summer camp',\n", " 'keyword:summer vacation', 'keyword:super powers',\n", " 'keyword:superhero', 'keyword:supernatural',\n", " 'keyword:supernatural powers', 'keyword:surfing',\n", " 'keyword:surreal', 'keyword:surrealism', 'keyword:surveillance',\n", " 'keyword:survival', 'keyword:survivor', 'keyword:suspense',\n", " 'keyword:suspicion', 'keyword:swamp', 'keyword:sweden',\n", " 'keyword:swimming pool', 'keyword:sword',\n", " 'keyword:sword and sorcery', 'keyword:sword fight',\n", " 'keyword:swordplay', 'keyword:talking animal', 'keyword:tattoo',\n", " 'keyword:taxi', 'keyword:taxi driver', 'keyword:teacher',\n", " 'keyword:technology', 'keyword:teen comedy', 'keyword:teen movie',\n", " 'keyword:teenage boy', 'keyword:teenage crush',\n", " 'keyword:teenage girl', 'keyword:teenager', 'keyword:telekinesis',\n", " 'keyword:television', 'keyword:terminal illness', 'keyword:terror',\n", " 'keyword:terrorism', 'keyword:terrorist', 'keyword:texas',\n", " 'keyword:thailand', 'keyword:theater', 'keyword:theft',\n", " 'keyword:therapist', 'keyword:thief', 'keyword:thriller',\n", " 'keyword:time travel', 'keyword:tokyo japan', 'keyword:torture',\n", " 'keyword:tourist', 'keyword:tragedy', 'keyword:train',\n", " 'keyword:training', 'keyword:traitor', 'keyword:transformation',\n", " 'keyword:transvestism', 'keyword:trapped', 'keyword:trauma',\n", " 'keyword:travel', 'keyword:treasure', 'keyword:treasure hunt',\n", " 'keyword:trial', 'keyword:truck', 'keyword:turkey',\n", " 'keyword:tv movie', 'keyword:tv show', 'keyword:twins',\n", " 'keyword:u.s. army', 'keyword:u.s. navy', 'keyword:ufo',\n", " 'keyword:uncle', 'keyword:undead', 'keyword:undercover',\n", " 'keyword:undercover agent', 'keyword:undercover cop',\n", " 'keyword:underdog', 'keyword:underwater', 'keyword:underwear',\n", " 'keyword:unemployment', 'keyword:university',\n", " 'keyword:unrequited love', 'keyword:unsimulated sex',\n", " 'keyword:unsociability', 'keyword:upper class', 'keyword:usa',\n", " 'keyword:usa president', 'keyword:vacation', 'keyword:vampire',\n", " 'keyword:venice', 'keyword:victim', 'keyword:video game',\n", " 'keyword:video nasty', 'keyword:vietnam',\n", " 'keyword:vietnam veteran', 'keyword:vietnam war',\n", " 'keyword:vigilante', 'keyword:village', 'keyword:violence',\n", " 'keyword:virgin', 'keyword:virtual reality', 'keyword:virus',\n", " 'keyword:vision', 'keyword:volcano', 'keyword:voodoo',\n", " 'keyword:voyeur', 'keyword:voyeurism', 'keyword:waitress',\n", " 'keyword:war', 'keyword:war crimes', 'keyword:war veteran',\n", " 'keyword:washington d.c.', 'keyword:water', 'keyword:wealth',\n", " 'keyword:weapon', 'keyword:wedding', 'keyword:werewolf',\n", " 'keyword:wheelchair', 'keyword:widow', 'keyword:widower',\n", " 'keyword:wife', 'keyword:wife husband relationship',\n", " 'keyword:wilderness', 'keyword:winter', 'keyword:wish',\n", " 'keyword:witch', 'keyword:witchcraft', 'keyword:wolf',\n", " 'keyword:woman director', 'keyword:women', 'keyword:woods',\n", " 'keyword:world war i', 'keyword:world war ii', 'keyword:wrestling',\n", " 'keyword:writer', 'keyword:xenophobia', 'keyword:yakuza',\n", " 'keyword:young adult', 'keyword:youth', 'keyword:zombie'],\n", " dtype=object)" ] }, "metadata": {}, "execution_count": 27 } ], "source": [ "keyword_data, keyword_cols = tfidf('keywords', 'keyword', 1000)\n", "keyword_cols" ] }, { "cell_type": "code", "source": [ "credits.drop(columns=['crew'], inplace=True)\n", "credit_data, credit_cols = vectorize_string('cast','cast', 1000, df=credits)\n", "credit_cols" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "EGKfKnjM-qYH", "outputId": "53ca2cee-2879-4b83-cb05-5382bad526a1" }, "execution_count": 28, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['cast:', 'cast: jr.', 'cast:adam sandler', 'cast:adam scott',\n", " 'cast:addison richards', 'cast:adolfo celi', 'cast:adrien brody',\n", " 'cast:agnes moorehead', 'cast:aidan quinn', 'cast:ajay devgn',\n", " 'cast:akim tamiroff', 'cast:akshay kumar', 'cast:al pacino',\n", " 'cast:alain delon', 'cast:alan arkin', 'cast:alan bates',\n", " 'cast:alan cumming', 'cast:alan hale', 'cast:alan mowbray',\n", " 'cast:alan napier', 'cast:alan rickman', 'cast:alan tudyk',\n", " 'cast:alberto sordi', 'cast:alec baldwin', 'cast:alec guinness',\n", " 'cast:alfre woodard', 'cast:alfred molina', 'cast:allen jenkins',\n", " 'cast:allison janney', 'cast:amitabh bachchan', 'cast:amy adams',\n", " 'cast:amy poehler', 'cast:andré dussollier', 'cast:andy garcía',\n", " 'cast:andy lau', 'cast:andy serkis', 'cast:angela lansbury',\n", " 'cast:angelina jolie', 'cast:anjelica huston', 'cast:ann doran',\n", " 'cast:ann-margret', 'cast:anne bancroft', 'cast:anne heche',\n", " 'cast:anthony hopkins', 'cast:anthony lapaglia',\n", " 'cast:anthony mackie', 'cast:anthony quinn', 'cast:anthony wong',\n", " 'cast:antonio banderas', 'cast:anupam kher', 'cast:armand assante',\n", " 'cast:arnold schwarzenegger', 'cast:arthur kennedy',\n", " 'cast:ava gardner', 'cast:barbara hale', 'cast:barbara hershey',\n", " 'cast:barbara stanwyck', 'cast:barry corbin',\n", " 'cast:barry sullivan', 'cast:barton maclane',\n", " 'cast:basil rathbone', 'cast:beau bridges', 'cast:bela lugosi',\n", " 'cast:ben affleck', 'cast:ben johnson', 'cast:ben kingsley',\n", " 'cast:ben stiller', 'cast:benicio del toro', 'cast:bernard blier',\n", " 'cast:bernard lee', 'cast:bert moorhouse', 'cast:bess flowers',\n", " 'cast:beth grant', 'cast:bette davis', 'cast:beulah bondi',\n", " \"cast:beverly d'angelo\", 'cast:bill hader', 'cast:bill moseley',\n", " 'cast:bill murray', 'cast:bill nighy', 'cast:bill paxton',\n", " 'cast:bill pullman', 'cast:billy bevan', 'cast:billy bob thornton',\n", " 'cast:billy connolly', 'cast:billy crystal', 'cast:billy gilbert',\n", " 'cast:billy zane', 'cast:bing crosby', 'cast:blythe danner',\n", " 'cast:bob balaban', 'cast:bob gunton', 'cast:bob hope',\n", " 'cast:bob hoskins', 'cast:bobby cannavale', 'cast:boris karloff',\n", " 'cast:brad dourif', 'cast:brad pitt', 'cast:brendan fraser',\n", " 'cast:brendan gleeson', 'cast:brian cox', 'cast:brian dennehy',\n", " 'cast:brian donlevy', 'cast:brian keith', 'cast:brion james',\n", " 'cast:brooks benedict', 'cast:bruce campbell',\n", " 'cast:bruce davison', 'cast:bruce dern', 'cast:bruce greenwood',\n", " 'cast:bruce mcgill', 'cast:bruce willis', 'cast:bud spencer',\n", " 'cast:burgess meredith', 'cast:burt lancaster',\n", " 'cast:burt reynolds', 'cast:burt young', 'cast:buster keaton',\n", " 'cast:byron foulger', 'cast:c. aubrey smith',\n", " 'cast:c. thomas howell', 'cast:callum keith rennie',\n", " 'cast:cameron diaz', 'cast:cameron mitchell', 'cast:carla gugino',\n", " 'cast:carol kane', 'cast:caroline aaron', 'cast:carrie fisher',\n", " 'cast:cary elwes', 'cast:cary grant', 'cast:cary-hiroyuki tagawa',\n", " 'cast:cate blanchett', 'cast:catherine deneuve',\n", " 'cast:catherine keener', \"cast:catherine o'hara\",\n", " 'cast:cecil kellaway', 'cast:cedric hardwicke',\n", " 'cast:charles boyer', 'cast:charles bronson', 'cast:charles dance',\n", " 'cast:charles durning', 'cast:charles halton', 'cast:charles lane',\n", " 'cast:charles laughton', 'cast:charles mcgraw',\n", " 'cast:charles middleton', 'cast:charles s. dutton',\n", " 'cast:charles trowbridge', 'cast:charlie chaplin',\n", " 'cast:charlie sheen', 'cast:charlize theron',\n", " 'cast:charlotte rampling', 'cast:charlton heston',\n", " 'cast:cheech marin', 'cast:chevy chase', 'cast:chill wills',\n", " 'cast:chishu ryu', 'cast:chloë sevigny', 'cast:chris cooper',\n", " 'cast:chris rock', 'cast:christian bale', 'cast:christian slater',\n", " 'cast:christina ricci', 'cast:christopher lee',\n", " 'cast:christopher lloyd', 'cast:christopher mcdonald',\n", " 'cast:christopher plummer', 'cast:christopher walken',\n", " 'cast:ciarán hinds', 'cast:clancy brown', 'cast:clarence muse',\n", " 'cast:clark gable', 'cast:claude rains', 'cast:claudia cardinale',\n", " 'cast:clifton collins jr', 'cast:clint eastwood',\n", " 'cast:clint howard', 'cast:clive owen', 'cast:cloris leachman',\n", " 'cast:colin farrell', 'cast:colin firth', 'cast:colleen camp',\n", " 'cast:colm meaney', 'cast:corbin bernsen', 'cast:crispin glover',\n", " 'cast:cuba gooding jr.', 'cast:cyril cusack', 'cast:cyril ring',\n", " 'cast:dabney coleman', 'cast:dakota fanning', 'cast:dan aykroyd',\n", " 'cast:dan hedaya', 'cast:dana andrews', 'cast:daniel brühl',\n", " 'cast:daniel craig', 'cast:danny aiello', 'cast:danny devito',\n", " 'cast:danny glover', 'cast:danny huston', 'cast:danny trejo',\n", " 'cast:daryl hannah', 'cast:david arquette', 'cast:david bradley',\n", " 'cast:david carradine', 'cast:david cross', 'cast:david keith',\n", " 'cast:david koechner', 'cast:david morse', 'cast:david niven',\n", " 'cast:david ogden stiers', 'cast:david paymer',\n", " 'cast:david strathairn', 'cast:david thewlis', 'cast:david warner',\n", " 'cast:dean jagger', 'cast:dean martin', 'cast:dean stockwell',\n", " 'cast:debbie reynolds', 'cast:dee wallace', 'cast:demi moore',\n", " 'cast:denholm elliott', 'cast:denis leary', 'cast:dennis haysbert',\n", " 'cast:dennis hopper', \"cast:dennis o'keefe\", 'cast:dennis quaid',\n", " 'cast:denzel washington', 'cast:derek jacobi',\n", " 'cast:dermot mulroney', 'cast:diane keaton', 'cast:diane lane',\n", " 'cast:dianne wiest', 'cast:dick miller', 'cast:diego abatantuono',\n", " 'cast:dolph lundgren', 'cast:dom deluise', 'cast:don beddoe',\n", " 'cast:don cheadle', 'cast:donal logue', 'cast:donald crisp',\n", " 'cast:donald meek', 'cast:donald pleasence',\n", " 'cast:donald sutherland', 'cast:doris lloyd',\n", " 'cast:douglas fowley', 'cast:douglass dumbrille',\n", " 'cast:drew barrymore', 'cast:dub taylor', 'cast:dustin hoffman',\n", " 'cast:dylan baker', 'cast:e.e. clive', 'cast:ed asner',\n", " 'cast:ed begley jr.', 'cast:ed harris', 'cast:ed lauter',\n", " 'cast:eddie albert', 'cast:eddie izzard', 'cast:eddie marsan',\n", " 'cast:eddie murphy', 'cast:edgar buchanan', \"cast:edmond o'brien\",\n", " 'cast:edmund mortimer', 'cast:edward arnold', 'cast:edward brophy',\n", " 'cast:edward g. robinson', 'cast:edward herrmann',\n", " 'cast:eleanor parker', 'cast:eli wallach', 'cast:elias koteas',\n", " 'cast:elijah wood', 'cast:elisha cook jr.', 'cast:elizabeth banks',\n", " 'cast:elizabeth taylor', 'cast:ellen barkin', 'cast:ellen burstyn',\n", " 'cast:ellen corby', 'cast:elliott gould', 'cast:emily watson',\n", " 'cast:emma thompson', 'cast:emmett vogan', 'cast:emory parnell',\n", " 'cast:eric idle', 'cast:eric roberts', 'cast:eric stoltz',\n", " 'cast:eric tsang', 'cast:ernest borgnine', 'cast:ernie hudson',\n", " 'cast:ethan hawke', 'cast:eugene levy', 'cast:eugene pallette',\n", " 'cast:ewan mcgregor', 'cast:f. murray abraham',\n", " 'cast:famke janssen', 'cast:faye dunaway', 'cast:fernando rey',\n", " 'cast:forest whitaker', 'cast:frances mcdormand',\n", " 'cast:franco nero', 'cast:frank faylen', 'cast:frank ferguson',\n", " 'cast:frank langella', 'cast:frank mayo', 'cast:frank mchugh',\n", " 'cast:frank mills', 'cast:frank morgan', 'cast:frank puglia',\n", " 'cast:frank reicher', 'cast:frank sinatra', 'cast:frank welker',\n", " 'cast:frankie faison', 'cast:fred astaire', 'cast:fred macmurray',\n", " 'cast:fred tatasciore', 'cast:fred ward', 'cast:fred willard',\n", " 'cast:fredric march', 'cast:gabriel byrne', 'cast:gary busey',\n", " 'cast:gary cole', 'cast:gary cooper', 'cast:gary lewis',\n", " 'cast:gary oldman', 'cast:gene hackman', 'cast:gene lockhart',\n", " 'cast:geoffrey lewis', 'cast:geoffrey rush',\n", " 'cast:george buck flower', 'cast:george c. scott',\n", " 'cast:george chandler', 'cast:george clooney', 'cast:george davis',\n", " 'cast:george irving', 'cast:george kennedy', 'cast:george sanders',\n", " 'cast:george segal', 'cast:george tobias',\n", " 'cast:geraldine chaplin', 'cast:gian maria volonté',\n", " 'cast:giancarlo esposito', 'cast:gina gershon',\n", " 'cast:ginger rogers', 'cast:gino corrado', 'cast:giovanni ribisi',\n", " 'cast:giuliano gemma', 'cast:glenn close', 'cast:glenn ford',\n", " 'cast:grace zabriskie', 'cast:grady sutton', 'cast:grant mitchell',\n", " 'cast:greg kinnear', 'cast:gregory peck', 'cast:grey griffin',\n", " 'cast:griffin dunne', 'cast:guinn williams', 'cast:guy kibbee',\n", " 'cast:guy pearce', 'cast:gwyneth paltrow', 'cast:gérard depardieu',\n", " 'cast:hal holbrook', 'cast:hank azaria', 'cast:harold miller',\n", " 'cast:harrison ford', 'cast:harry andrews', 'cast:harry carey',\n", " 'cast:harry cording', 'cast:harry davenport',\n", " 'cast:harry dean stanton', 'cast:harry hayden',\n", " 'cast:harry morgan', 'cast:harry strang', 'cast:harvey keitel',\n", " 'cast:heather graham', 'cast:helen mirren',\n", " 'cast:helena bonham carter', 'cast:henry fonda',\n", " \"cast:henry o'neill\", 'cast:henry silva', 'cast:henry stephenson',\n", " 'cast:herbert lom', 'cast:herbert marshall', 'cast:holmes herbert',\n", " 'cast:hugh jackman', 'cast:hugo weaving', 'cast:humphrey bogart',\n", " 'cast:huntz hall', 'cast:héctor elizondo', 'cast:ian holm',\n", " 'cast:ian mckellen', 'cast:ian mcshane', 'cast:ian wolfe',\n", " 'cast:ingrid bergman', 'cast:irving bacon',\n", " 'cast:isabella rossellini', 'cast:isabelle huppert',\n", " 'cast:j. carrol naish', 'cast:j. farrell macdonald',\n", " 'cast:j.k. simmons', 'cast:jack black', 'cast:jack elam',\n", " 'cast:jack lemmon', 'cast:jack nicholson', 'cast:jack palance',\n", " 'cast:jack warden', 'cast:jackie chan', 'cast:jacqueline bisset',\n", " 'cast:james caan', 'cast:james cagney', 'cast:james coburn',\n", " 'cast:james cosmo', 'cast:james cromwell', 'cast:james earl jones',\n", " 'cast:james flavin', 'cast:james fox', 'cast:james franco',\n", " 'cast:james gandolfini', 'cast:james garner', 'cast:james gleason',\n", " 'cast:james hong', 'cast:james le gros', 'cast:james marsden',\n", " 'cast:james mason', 'cast:james rebhorn', 'cast:james remar',\n", " 'cast:james russo', 'cast:james stewart', 'cast:james whitmore',\n", " 'cast:james woods', 'cast:jamie lee curtis', 'cast:jane darwell',\n", " 'cast:jane fonda', 'cast:jane lynch', 'cast:janeane garofalo',\n", " 'cast:janet leigh', 'cast:jared harris', 'cast:jason alexander',\n", " 'cast:jason bateman', 'cast:jason flemyng', 'cast:jason isaacs',\n", " 'cast:jason lee', 'cast:jason robards', 'cast:jason statham',\n", " 'cast:jean reno', 'cast:jean rochefort', 'cast:jean-claude brialy',\n", " 'cast:jean-claude van damme', 'cast:jean-louis trintignant',\n", " 'cast:jean-paul belmondo', 'cast:jeanne moreau',\n", " 'cast:jeff bennett', 'cast:jeff bridges', 'cast:jeff corey',\n", " 'cast:jeff daniels', 'cast:jeff goldblum', 'cast:jeffrey combs',\n", " 'cast:jeffrey tambor', 'cast:jennifer jason leigh',\n", " 'cast:jennifer tilly', 'cast:jeremy irons', 'cast:jeremy piven',\n", " 'cast:jerome cowan', 'cast:jerry lewis', 'cast:jessica lange',\n", " 'cast:jet li', 'cast:jim backus', 'cast:jim belushi',\n", " 'cast:jim broadbent', 'cast:jim carrey', 'cast:jim cummings',\n", " 'cast:joan blondell', 'cast:joan crawford', 'cast:joan cusack',\n", " 'cast:jodie foster', 'cast:joe mantegna', 'cast:joe pantoliano',\n", " 'cast:joe sawyer', 'cast:joel mccrea', 'cast:john c. mcginley',\n", " 'cast:john c. reilly', 'cast:john candy', 'cast:john carradine',\n", " 'cast:john carroll lynch', 'cast:john cleese', 'cast:john cusack',\n", " 'cast:john diehl', 'cast:john dimaggio', 'cast:john george',\n", " 'cast:john gielgud', 'cast:john goodman', 'cast:john hawkes',\n", " 'cast:john heard', 'cast:john hoyt', 'cast:john hurt',\n", " 'cast:john ireland', 'cast:john leguizamo', 'cast:john litel',\n", " 'cast:john lithgow', 'cast:john malkovich', 'cast:john mcintire',\n", " 'cast:john michael higgins', 'cast:john miljan', 'cast:john mills',\n", " 'cast:john qualen', 'cast:john ratzenberger',\n", " 'cast:john rhys-davies', 'cast:john ridgely', 'cast:john savage',\n", " 'cast:john saxon', 'cast:john travolta', 'cast:john turturro',\n", " 'cast:john wayne', 'cast:johnny depp', 'cast:johnny lever',\n", " 'cast:jon gries', 'cast:jon lovitz', 'cast:jon polito',\n", " 'cast:jon voight', 'cast:jonah hill', 'cast:jonathan hale',\n", " 'cast:jonathan pryce', 'cast:joseph cotten', 'cast:joseph crehan',\n", " 'cast:joseph gordon-levitt', 'cast:josh brolin', 'cast:josh lucas',\n", " 'cast:joss ackland', 'cast:jude law', 'cast:judi dench',\n", " 'cast:judy greer', 'cast:julia roberts', 'cast:julianne moore',\n", " 'cast:juliette binoche', 'cast:juliette lewis', 'cast:justin long',\n", " 'cast:jürgen prochnow', 'cast:kane hodder', 'cast:kareena kapoor',\n", " 'cast:karen black', 'cast:karl malden', 'cast:kate winslet',\n", " 'cast:katharine hepburn', 'cast:kathleen freeman',\n", " 'cast:kathy baker', 'cast:kathy bates', 'cast:keanu reeves',\n", " 'cast:keenan wynn', 'cast:keith carradine', 'cast:keith david',\n", " 'cast:kenneth branagh', 'cast:kevin bacon', 'cast:kevin corrigan',\n", " 'cast:kevin costner', 'cast:kevin dunn', 'cast:kevin kline',\n", " 'cast:kevin mccarthy', 'cast:kevin michael richardson',\n", " 'cast:kevin pollak', 'cast:kevin smith', 'cast:kevin spacey',\n", " 'cast:kiefer sutherland', 'cast:kim basinger', 'cast:kirk douglas',\n", " 'cast:kirsten dunst', 'cast:klaus kinski',\n", " 'cast:kris kristofferson', 'cast:kristen stewart',\n", " 'cast:kristen wiig', 'cast:kristin scott thomas', 'cast:ku feng',\n", " 'cast:kurt russell', 'cast:l.q. jones', 'cast:lam suet',\n", " 'cast:lambert wilson', 'cast:lance henriksen',\n", " 'cast:lane chandler', 'cast:larry miller', 'cast:larry steers',\n", " 'cast:laura dern', 'cast:laura linney', 'cast:lauren bacall',\n", " 'cast:laurence fishburne', 'cast:laurence olivier',\n", " 'cast:lee j. cobb', 'cast:lee marvin', 'cast:lee phelps',\n", " 'cast:lee van cleef', 'cast:lena headey', 'cast:leo gorcey',\n", " 'cast:leo white', 'cast:leoda richards', 'cast:leon ames',\n", " 'cast:leslie nielsen', 'cast:leslie phillips', 'cast:lewis stone',\n", " 'cast:leyland hodgson', 'cast:liam neeson', 'cast:liev schreiber',\n", " 'cast:lili taylor', 'cast:lin shaye', 'cast:lionel barrymore',\n", " 'cast:lionel stander', 'cast:lloyd bridges', 'cast:lochlyn munro',\n", " 'cast:lon chaney jr.', 'cast:lou diamond phillips',\n", " 'cast:louis calhern', 'cast:louis gossett', 'cast:louise beavers',\n", " 'cast:lucille ball', 'cast:lucy liu', 'cast:luis guzmán',\n", " 'cast:lukas haas', 'cast:luke wilson', 'cast:lyle talbot',\n", " 'cast:m. emmet walsh', 'cast:mae marsh', 'cast:maggie smith',\n", " 'cast:malcolm mcdowell', 'cast:marc lawrence', 'cast:marcel dalio',\n", " 'cast:marcello mastroianni', 'cast:marcia gay harden',\n", " 'cast:margo martindale', 'cast:maria bello',\n", " 'cast:marion cotillard', 'cast:marisa tomei',\n", " 'cast:mark boone junior', 'cast:mark hamill', 'cast:mark ruffalo',\n", " 'cast:mark strong', 'cast:mark wahlberg', 'cast:marlon brando',\n", " 'cast:martin balsam', 'cast:martin landau', 'cast:martin scorsese',\n", " 'cast:martin sheen', 'cast:mary astor', 'cast:mary field',\n", " 'cast:mary gordon', 'cast:mary kay place', 'cast:mary steenburgen',\n", " 'cast:masako nozawa', 'cast:mathieu amalric', 'cast:matt damon',\n", " 'cast:matt dillon', 'cast:matt frewer', 'cast:matthew broderick',\n", " 'cast:matthew lillard', 'cast:matthew mcconaughey',\n", " 'cast:matthew modine', 'cast:maury chaykin', 'cast:max von sydow',\n", " 'cast:mel blanc', 'cast:mel gibson', 'cast:melanie griffith',\n", " 'cast:melissa leo', 'cast:melvyn douglas', 'cast:meryl streep',\n", " 'cast:mia farrow', 'cast:michael biehn', 'cast:michael caine',\n", " 'cast:michael clarke duncan', 'cast:michael douglas',\n", " 'cast:michael gambon', 'cast:michael gough',\n", " 'cast:michael hordern', 'cast:michael ironside',\n", " 'cast:michael j. fox', 'cast:michael keaton',\n", " 'cast:michael lerner', 'cast:michael lonsdale',\n", " 'cast:michael madsen', 'cast:michael mckean',\n", " 'cast:michael murphy', 'cast:michael nyqvist', 'cast:michael paré',\n", " 'cast:michael peña', 'cast:michael rapaport',\n", " 'cast:michael rooker', 'cast:michael shannon',\n", " 'cast:michael sheen', 'cast:michael york', 'cast:michel piccoli',\n", " 'cast:michelle pfeiffer', 'cast:mickey rooney',\n", " 'cast:mickey rourke', 'cast:mike epps', 'cast:mike mazurki',\n", " 'cast:mike starr', 'cast:milton kibbee', 'cast:mira sorvino',\n", " 'cast:miranda richardson', 'cast:miriam margolyes',\n", " 'cast:missi pyle', 'cast:molly shannon', 'cast:monte blue',\n", " 'cast:morgan freeman', 'cast:moroni olsen', 'cast:morris ankrum',\n", " 'cast:myrna loy', 'cast:naomi watts', 'cast:naseeruddin shah',\n", " 'cast:nastassja kinski', 'cast:natalie portman',\n", " 'cast:natasha lyonne', 'cast:ned beatty', 'cast:nestor paiva',\n", " 'cast:nick nolte', 'cast:nicolas cage', 'cast:nicole kidman',\n", " 'cast:nigel bruce', 'cast:noah beery', 'cast:octavia spencer',\n", " 'cast:olin howland', 'cast:oliver platt', 'cast:oliver reed',\n", " 'cast:olivier gourmet', 'cast:olympia dukakis', 'cast:om puri',\n", " 'cast:omar sharif', 'cast:orson welles', 'cast:owen wilson',\n", " 'cast:pam grier', 'cast:paolo villaggio', 'cast:paresh rawal',\n", " 'cast:parker posey', 'cast:pat flaherty', 'cast:pat hingle',\n", " 'cast:patricia clarkson', 'cast:patrick bauchau',\n", " 'cast:patrick stewart', 'cast:patrick warburton',\n", " 'cast:patton oswalt', 'cast:paul dooley', 'cast:paul fix',\n", " 'cast:paul giamatti', 'cast:paul guilfoyle', 'cast:paul harvey',\n", " 'cast:paul newman', 'cast:paul rudd', 'cast:paul sorvino',\n", " 'cast:penélope cruz', 'cast:pete postlethwaite',\n", " 'cast:peter boyle', 'cast:peter coyote', 'cast:peter cushing',\n", " 'cast:peter falk', 'cast:peter fonda', 'cast:peter gallagher',\n", " 'cast:peter lawford', 'cast:peter lorre', \"cast:peter o'toole\",\n", " 'cast:peter sarsgaard', 'cast:peter sellers',\n", " 'cast:peter stormare', 'cast:peter ustinov',\n", " 'cast:philip baker hall', 'cast:philip ettington',\n", " 'cast:philip seymour hoffman', 'cast:philippe noiret',\n", " 'cast:pierce brosnan', 'cast:pierre watkin',\n", " 'cast:priyanka chopra', 'cast:pruitt taylor vince',\n", " 'cast:queen latifah', 'cast:r. lee ermey', 'cast:rachel weisz',\n", " 'cast:rade serbedzija', 'cast:ralph bellamy', 'cast:ralph fiennes',\n", " 'cast:ralph richardson', 'cast:randolph scott', 'cast:randy quaid',\n", " 'cast:ray liotta', 'cast:ray milland', 'cast:ray teal',\n", " 'cast:ray winstone', 'cast:ray wise', 'cast:raymond burr',\n", " 'cast:reese witherspoon', 'cast:reginald denny',\n", " 'cast:reginald owen', 'cast:regis toomey', 'cast:ren osugi',\n", " 'cast:rhys ifans', 'cast:rhys williams', 'cast:richard anderson',\n", " 'cast:richard burton', 'cast:richard dreyfuss',\n", " 'cast:richard e. grant', 'cast:richard gere',\n", " 'cast:richard harris', 'cast:richard jaeckel',\n", " 'cast:richard jenkins', 'cast:richard kind', 'cast:richard masur',\n", " 'cast:richard pryor', 'cast:richard riehle', 'cast:richard schiff',\n", " 'cast:richard widmark', 'cast:rip torn', 'cast:rob lowe',\n", " 'cast:rob paulsen', 'cast:rob schneider', 'cast:robbie coltrane',\n", " 'cast:robert barrat', 'cast:robert de niro',\n", " 'cast:robert downey jr.', 'cast:robert duvall',\n", " 'cast:robert englund', 'cast:robert forster', 'cast:robert loggia',\n", " 'cast:robert mitchum', 'cast:robert morley', 'cast:robert patrick',\n", " 'cast:robert redford', 'cast:robert ryan', 'cast:robert taylor',\n", " 'cast:robert vaughn', 'cast:robert wagner', 'cast:robert warwick',\n", " 'cast:robert young', 'cast:robin williams', 'cast:robin wright',\n", " 'cast:rock hudson', 'cast:rod steiger', 'cast:roddy mcdowall',\n", " 'cast:ron jeremy', 'cast:ron livingston', 'cast:ron perlman',\n", " 'cast:rosanna arquette', 'cast:rosario dawson', 'cast:rose byrne',\n", " 'cast:roy scheider', 'cast:royal dano', 'cast:russell crowe',\n", " 'cast:russell hicks', 'cast:rutger hauer', 'cast:ryan reynolds',\n", " 'cast:salma hayek', 'cast:salman khan', 'cast:sam elliott',\n", " 'cast:sam harris', 'cast:sam neill', 'cast:sam rockwell',\n", " 'cast:sam shepard', 'cast:sammo hung', 'cast:samuel l. jackson',\n", " 'cast:samuel s. hinds', 'cast:sandra bullock',\n", " 'cast:sarah silverman', 'cast:scarlett johansson',\n", " 'cast:scott glenn', 'cast:scott wilson', 'cast:sean astin',\n", " 'cast:sean bean', 'cast:sean connery', 'cast:sean penn',\n", " 'cast:sean young', 'cast:selmer jackson', 'cast:seth green',\n", " 'cast:seth rogen', 'cast:seymour cassel', 'cast:shah rukh khan',\n", " 'cast:sharon stone', 'cast:shelley winters',\n", " 'cast:shirley henderson', 'cast:shirley maclaine', 'cast:sid haig',\n", " 'cast:sid james', 'cast:sidney poitier', 'cast:sig ruman',\n", " 'cast:sigourney weaver', 'cast:simon pegg', 'cast:simon yam',\n", " 'cast:sissy spacek', 'cast:sophia loren', 'cast:spencer charters',\n", " 'cast:spencer tracy', 'cast:stacy keach', 'cast:stan lee',\n", " 'cast:stanley tucci', 'cast:stellan skarsgård',\n", " 'cast:stephen dorff', 'cast:stephen fry', 'cast:stephen lang',\n", " 'cast:stephen mchattie', 'cast:stephen rea', 'cast:stephen root',\n", " 'cast:stephen tobolowsky', 'cast:sterling holloway',\n", " 'cast:steve buscemi', 'cast:steve coogan', 'cast:steve guttenberg',\n", " 'cast:steve martin', 'cast:steve zahn', 'cast:steven geray',\n", " 'cast:steven seagal', 'cast:stockard channing',\n", " 'cast:strother martin', 'cast:stuart holmes',\n", " 'cast:susan sarandon', 'cast:susumu terajima',\n", " 'cast:sylvester stallone', 'cast:tadanobu asano',\n", " 'cast:takashi shimura', 'cast:tara strong', 'cast:tchéky karyo',\n", " 'cast:terence stamp', 'cast:teri garr', 'cast:terrence howard',\n", " 'cast:thomas jane', 'cast:thomas kretschmann',\n", " 'cast:thomas mitchell', 'cast:til schweiger', 'cast:tilda swinton',\n", " 'cast:tim blake nelson', 'cast:tim curry', 'cast:tim robbins',\n", " 'cast:tim roth', 'cast:tim thomerson', 'cast:timothy hutton',\n", " 'cast:timothy spall', 'cast:toby jones', 'cast:tom arnold',\n", " 'cast:tom berenger', 'cast:tom cruise', 'cast:tom dugan',\n", " 'cast:tom hanks', 'cast:tom kenny', 'cast:tom lister jr.',\n", " 'cast:tom selleck', 'cast:tom sizemore', 'cast:tom skerritt',\n", " 'cast:tom wilkinson', 'cast:tommy lee jones', 'cast:tomás milián',\n", " 'cast:toni collette', 'cast:tony curtis', 'cast:tony shalhoub',\n", " 'cast:tony todd', 'cast:toshirō mifune', 'cast:tracey walter',\n", " 'cast:treat williams', 'cast:trevor howard', 'cast:udo kier',\n", " 'cast:ugo tognazzi', 'cast:uma thurman', 'cast:una merkel',\n", " 'cast:val kilmer', 'cast:van johnson', 'cast:vanessa redgrave',\n", " 'cast:vernon dobtcheff', 'cast:viggo mortensen',\n", " 'cast:vince vaughn', 'cast:vincent cassel',\n", " \"cast:vincent d'onofrio\", 'cast:vincent price', 'cast:ving rhames',\n", " 'cast:vinnie jones', 'cast:viola davis', 'cast:virginia brissac',\n", " 'cast:virginia madsen', 'cast:vittorio gassman',\n", " 'cast:vivica a. fox', 'cast:wade boteler', 'cast:wallace ford',\n", " 'cast:wallace shawn', 'cast:walter brennan', 'cast:walter huston',\n", " 'cast:walter matthau', 'cast:walter pidgeon', 'cast:walter sande',\n", " 'cast:ward bond', 'cast:warren oates', 'cast:werner herzog',\n", " 'cast:wesley snipes', 'cast:whit bissell', 'cast:whoopi goldberg',\n", " 'cast:will arnett', 'cast:will ferrell', 'cast:will patton',\n", " 'cast:will wright', 'cast:willard robertson', 'cast:willem dafoe',\n", " 'cast:william b. davidson', 'cast:william demarest',\n", " 'cast:william fichtner', 'cast:william forsythe',\n", " 'cast:william h. macy', 'cast:william holden', 'cast:william hurt',\n", " 'cast:william powell', 'cast:william sadler',\n", " 'cast:william schallert', 'cast:william shatner',\n", " 'cast:winona ryder', 'cast:woody allen', 'cast:woody harrelson',\n", " 'cast:xander berkeley', 'cast:yuen biao'], dtype=object)" ] }, "metadata": {}, "execution_count": 28 } ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 645 }, "id": "F59_kK_Jsy6L", "outputId": "b588ab8d-79fe-4261-e403-66a1ce38b5c6" }, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py:3678: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n", " self[col] = igetitem(value, i)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " title id adult video genre: \\\n", "0 Toy Story 862 0 0 0 \n", "1 Jumanji 8844 0 0 0 \n", "2 Grumpier Old Men 15602 0 0 0 \n", "3 Waiting to Exhale 31357 0 0 0 \n", "4 Father of the Bride Part II 11862 0 0 0 \n", "... ... ... ... ... ... \n", "46159 Subdue 439050 0 0 0 \n", "46160 Century of Birthing 111109 0 0 0 \n", "46161 Betrayal 67758 0 0 0 \n", "46162 Satan Triumphant 227506 0 0 1 \n", "46163 Queerama 461257 0 0 1 \n", "\n", " genre:action genre:adventure genre:animation genre:comedy \\\n", "0 0 0 1 1 \n", "1 0 1 0 0 \n", "2 0 0 0 1 \n", "3 0 0 0 1 \n", "4 0 0 0 1 \n", "... ... ... ... ... \n", "46159 0 0 0 0 \n", "46160 0 0 0 0 \n", "46161 1 0 0 0 \n", "46162 0 0 0 0 \n", "46163 0 0 0 0 \n", "\n", " genre:crime ... cast:william hurt cast:william powell \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "2 0 ... 0 0 \n", "3 0 ... 0 0 \n", "4 0 ... 0 0 \n", "... ... ... ... ... \n", "46159 0 ... 0 0 \n", "46160 0 ... 0 0 \n", "46161 0 ... 0 0 \n", "46162 0 ... 0 0 \n", "46163 0 ... 0 0 \n", "\n", " cast:william sadler cast:william schallert cast:william shatner \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "... ... ... ... \n", "46159 0 0 0 \n", "46160 0 0 0 \n", "46161 0 0 0 \n", "46162 0 0 0 \n", "46163 0 0 0 \n", "\n", " cast:winona ryder cast:woody allen cast:woody harrelson \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "... ... ... ... \n", "46159 0 0 0 \n", "46160 0 0 0 \n", "46161 0 0 0 \n", "46162 0 0 0 \n", "46163 0 0 0 \n", "\n", " cast:xander berkeley cast:yuen biao \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "... ... ... \n", "46159 0 0 \n", "46160 0 0 \n", "46161 0 0 \n", "46162 0 0 \n", "46163 0 0 \n", "\n", "[46164 rows x 3675 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleidadultvideogenre:genre:actiongenre:adventuregenre:animationgenre:comedygenre:crime...cast:william hurtcast:william powellcast:william sadlercast:william schallertcast:william shatnercast:winona rydercast:woody allencast:woody harrelsoncast:xander berkeleycast:yuen biao
0Toy Story86200000110...0000000000
1Jumanji884400001000...0000000000
2Grumpier Old Men1560200000010...0000000000
3Waiting to Exhale3135700000010...0000000000
4Father of the Bride Part II1186200000010...0000000000
..................................................................
46159Subdue43905000000000...0000000000
46160Century of Birthing11110900000000...0000000000
46161Betrayal6775800010000...0000000000
46162Satan Triumphant22750600100000...0000000000
46163Queerama46125700100000...0000000000
\n", "

46164 rows × 3675 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 29 } ], "source": [ "metadata = pd.concat([metadata[['title','id','adult','video']], \n", " pd.DataFrame(genre_data, columns=genre_cols),\n", " pd.DataFrame(countries_data, columns=countries_cols),\n", " pd.DataFrame(collection_data, columns=collection_cols),\n", " pd.DataFrame(keyword_data, columns=keyword_cols),\n", " pd.DataFrame(companies_data, columns=companies_cols),\n", " pd.DataFrame(lang_data, columns=lang_cols)], axis=1)\n", "\n", "credits[credit_cols] = credit_data\n", "metadata = pd.merge(metadata, credits, how='inner', on='id')\n", "metadata" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "id": "jONYrkyX4ylp" }, "outputs": [], "source": [ "#metadata.drop(['production_countries', 'genres', 'belongs_to_collection', 'keywords', 'production_companies', 'original_language'], axis=1, inplace=True)" ] }, { "cell_type": "markdown", "metadata": { "id": "Xw1AygX879iQ" }, "source": [ "list of all numerical features(everything except id and title)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KvUlIctgo58P", "outputId": "48ceca68-86eb-47fc-8860-37346cbaffd7" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array(['adult', 'video', 'genre:', ..., 'cast:woody harrelson',\n", " 'cast:xander berkeley', 'cast:yuen biao'], dtype=object)" ] }, "metadata": {}, "execution_count": 31 } ], "source": [ "feature_cols = np.concatenate((np.array(['adult', 'video']), genre_cols,countries_cols,collection_cols,keyword_cols,companies_cols,lang_cols,credit_cols))\n", "feature_cols\n", "#metadata[feature_cols] = metadata[feature_cols].astype('int8')" ] }, { "cell_type": "code", "source": [ "del genre_data,countries_data,collection_data,keyword_data,companies_data,lang_data,credit_data\n", "del genre_cols,countries_cols,collection_cols,keyword_cols,companies_cols,lang_cols,credit_cols" ], "metadata": { "id": "CavDxPCEVFfR" }, "execution_count": 32, "outputs": [] }, { "cell_type": "code", "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "orMZ0gXh6znM", "outputId": "990f15a7-091d-4618-f87d-9d237b500fb6" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(3672,)" ] }, "metadata": {}, "execution_count": 33 } ], "source": [ "feature_cols.shape" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 609 }, "id": "vRWJ9z7I591C", "outputId": "29b45b84-7eaa-4873-d1fa-2a7f17c50bb9" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title id adult video genre: \\\n", "0 Toy Story 862 0 0 0 \n", "1 Jumanji 8844 0 0 0 \n", "2 Grumpier Old Men 15602 0 0 0 \n", "3 Waiting to Exhale 31357 0 0 0 \n", "4 Father of the Bride Part II 11862 0 0 0 \n", "... ... ... ... ... ... \n", "46159 Subdue 439050 0 0 0 \n", "46160 Century of Birthing 111109 0 0 0 \n", "46161 Betrayal 67758 0 0 0 \n", "46162 Satan Triumphant 227506 0 0 1 \n", "46163 Queerama 461257 0 0 1 \n", "\n", " genre:action genre:adventure genre:animation genre:comedy \\\n", "0 0 0 1 1 \n", "1 0 1 0 0 \n", "2 0 0 0 1 \n", "3 0 0 0 1 \n", "4 0 0 0 1 \n", "... ... ... ... ... \n", "46159 0 0 0 0 \n", "46160 0 0 0 0 \n", "46161 1 0 0 0 \n", "46162 0 0 0 0 \n", "46163 0 0 0 0 \n", "\n", " genre:crime ... cast:william hurt cast:william powell \\\n", "0 0 ... 0 0 \n", "1 0 ... 0 0 \n", "2 0 ... 0 0 \n", "3 0 ... 0 0 \n", "4 0 ... 0 0 \n", "... ... ... ... ... \n", "46159 0 ... 0 0 \n", "46160 0 ... 0 0 \n", "46161 0 ... 0 0 \n", "46162 0 ... 0 0 \n", "46163 0 ... 0 0 \n", "\n", " cast:william sadler cast:william schallert cast:william shatner \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "... ... ... ... \n", "46159 0 0 0 \n", "46160 0 0 0 \n", "46161 0 0 0 \n", "46162 0 0 0 \n", "46163 0 0 0 \n", "\n", " cast:winona ryder cast:woody allen cast:woody harrelson \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "... ... ... ... \n", "46159 0 0 0 \n", "46160 0 0 0 \n", "46161 0 0 0 \n", "46162 0 0 0 \n", "46163 0 0 0 \n", "\n", " cast:xander berkeley cast:yuen biao \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "... ... ... \n", "46159 0 0 \n", "46160 0 0 \n", "46161 0 0 \n", "46162 0 0 \n", "46163 0 0 \n", "\n", "[46164 rows x 3675 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleidadultvideogenre:genre:actiongenre:adventuregenre:animationgenre:comedygenre:crime...cast:william hurtcast:william powellcast:william sadlercast:william schallertcast:william shatnercast:winona rydercast:woody allencast:woody harrelsoncast:xander berkeleycast:yuen biao
0Toy Story86200000110...0000000000
1Jumanji884400001000...0000000000
2Grumpier Old Men1560200000010...0000000000
3Waiting to Exhale3135700000010...0000000000
4Father of the Bride Part II1186200000010...0000000000
..................................................................
46159Subdue43905000000000...0000000000
46160Century of Birthing11110900000000...0000000000
46161Betrayal6775800010000...0000000000
46162Satan Triumphant22750600100000...0000000000
46163Queerama46125700100000...0000000000
\n", "

46164 rows × 3675 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 34 } ], "source": [ "metadata" ] }, { "cell_type": "code", "source": [ "def split_dataframe(df, holdout_fraction=0.1):\n", " test = df.sample(frac=holdout_fraction, replace=False)\n", " train = df[~df.index.isin(test.index)]\n", " return train, test\n", "\n", "train, test = split_dataframe(metadata)" ], "metadata": { "id": "s-OAc5zG-qkJ" }, "execution_count": 35, "outputs": [] }, { "cell_type": "code", "source": [ "allIds = metadata['id']\n", "\n", "number_of_batches = 4\n", "batches = np.array_split(train, number_of_batches)\n", "mf.log_param('number of batches', number_of_batches)\n", "del metadata\n", "del train" ], "metadata": { "id": "_sS14fV3Zr6n" }, "execution_count": 36, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "BZvzSjJUG3nX" }, "source": [ "## Algorithm\n" ] }, { "cell_type": "code", "source": [ "batches[0]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 696 }, "id": "e6kQZWCWgBC2", "outputId": "cefa0dc8-47b6-41df-c12e-bc0f12c59845" }, "execution_count": 37, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title id adult video genre: genre:action \\\n", "0 Toy Story 862 0 0 0 0 \n", "2 Grumpier Old Men 15602 0 0 0 0 \n", "3 Waiting to Exhale 31357 0 0 0 0 \n", "4 Father of the Bride Part II 11862 0 0 0 0 \n", "5 Heat 949 0 0 0 1 \n", "... ... ... ... ... ... ... \n", "11542 The Bothersome Man 13318 0 0 0 0 \n", "11543 Don't Drink the Water 10462 0 0 0 0 \n", "11544 The Good German 182 0 0 0 0 \n", "11546 Letters from Iwo Jima 1251 0 0 0 1 \n", "11547 Presenting Lily Mars 43512 0 0 0 0 \n", "\n", " genre:adventure genre:animation genre:comedy genre:crime ... \\\n", "0 0 1 1 0 ... \n", "2 0 0 1 0 ... \n", "3 0 0 1 0 ... \n", "4 0 0 1 0 ... \n", "5 0 0 0 1 ... \n", "... ... ... ... ... ... \n", "11542 0 0 1 0 ... \n", "11543 0 0 1 0 ... \n", "11544 0 0 0 1 ... \n", "11546 1 0 0 0 ... \n", "11547 0 0 0 0 ... \n", "\n", " cast:william hurt cast:william powell cast:william sadler \\\n", "0 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "5 0 0 0 \n", "... ... ... ... \n", "11542 0 0 0 \n", "11543 0 0 0 \n", "11544 0 0 0 \n", "11546 0 0 0 \n", "11547 0 0 0 \n", "\n", " cast:william schallert cast:william shatner cast:winona ryder \\\n", "0 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "5 0 0 0 \n", "... ... ... ... \n", "11542 0 0 0 \n", "11543 0 0 0 \n", "11544 0 0 0 \n", "11546 0 0 0 \n", "11547 0 0 0 \n", "\n", " cast:woody allen cast:woody harrelson cast:xander berkeley \\\n", "0 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "5 0 0 1 \n", "... ... ... ... \n", "11542 0 0 0 \n", "11543 1 0 0 \n", "11544 0 0 0 \n", "11546 0 0 0 \n", "11547 0 0 0 \n", "\n", " cast:yuen biao \n", "0 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "5 0 \n", "... ... \n", "11542 0 \n", "11543 0 \n", "11544 0 \n", "11546 0 \n", "11547 0 \n", "\n", "[10387 rows x 3675 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titleidadultvideogenre:genre:actiongenre:adventuregenre:animationgenre:comedygenre:crime...cast:william hurtcast:william powellcast:william sadlercast:william schallertcast:william shatnercast:winona rydercast:woody allencast:woody harrelsoncast:xander berkeleycast:yuen biao
0Toy Story86200000110...0000000000
2Grumpier Old Men1560200000010...0000000000
3Waiting to Exhale3135700000010...0000000000
4Father of the Bride Part II1186200000010...0000000000
5Heat94900010001...0000000010
..................................................................
11542The Bothersome Man1331800000010...0000000000
11543Don't Drink the Water1046200000010...0000001000
11544The Good German18200000001...0000000000
11546Letters from Iwo Jima125100011000...0000000000
11547Presenting Lily Mars4351200000000...0000000000
\n", "

10387 rows × 3675 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "id": "XyGIImAhG7ZI" }, "outputs": [], "source": [ "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "markdown", "metadata": { "id": "SUJ0Cc9H0KIF" }, "source": [ "`content_based_recommmeder` returns a list of movie ids based on it's input. the input should be a dataframe which has `movieId`, `rating` columns(like `ratings_small.csv` but without `userId`)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "id": "u2IFqWvQKWb3" }, "outputs": [], "source": [ "number_of_batches =1\n", "def content_based_recommender_movie(movieId):\n", " print(\"movie title is:\", metadata[metadata['id']==movieId])\n", " sim_mat= cosine_similarity(metadata[feature_cols])\n", " return sim_mat\n", "\n", "#content_based_recommender_movie(272)" ] }, { "cell_type": "code", "source": [ "batches[1].describe()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 411 }, "id": "K_kmTTPmv3GZ", "outputId": "82087e57-01db-42cb-cfde-b950569ff26a" }, "execution_count": 40, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id adult video genre: genre:action \\\n", "count 10387.000000 10387.000000 10387.0 10387.000000 10387.000000 \n", "mean 70745.557909 0.000096 0.0 0.039569 0.151054 \n", "std 63999.684940 0.009812 0.0 0.194953 0.358119 \n", "min 3.000000 0.000000 0.0 0.000000 0.000000 \n", "25% 25769.500000 0.000000 0.0 0.000000 0.000000 \n", "50% 50675.000000 0.000000 0.0 0.000000 0.000000 \n", "75% 94217.000000 0.000000 0.0 0.000000 0.000000 \n", "max 469172.000000 1.000000 0.0 1.000000 1.000000 \n", "\n", " genre:adventure genre:animation genre:comedy genre:crime \\\n", "count 10387.000000 10387.000000 10387.000000 10387.000000 \n", "mean 0.071628 0.038125 0.265813 0.094541 \n", "std 0.257883 0.191506 0.441786 0.292594 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 0.000000 \n", "75% 0.000000 0.000000 1.000000 0.000000 \n", "max 1.000000 1.000000 1.000000 1.000000 \n", "\n", " genre:documentary ... cast:william hurt cast:william powell \\\n", "count 10387.000000 ... 10387.000000 10387.000000 \n", "mean 0.111678 ... 0.001637 0.000674 \n", "std 0.314985 ... 0.040424 0.025952 \n", "min 0.000000 ... 0.000000 0.000000 \n", "25% 0.000000 ... 0.000000 0.000000 \n", "50% 0.000000 ... 0.000000 0.000000 \n", "75% 0.000000 ... 0.000000 0.000000 \n", "max 1.000000 ... 1.000000 1.000000 \n", "\n", " cast:william sadler cast:william schallert cast:william shatner \\\n", "count 10387.000000 10387.000000 10387.000000 \n", "mean 0.001348 0.001252 0.000674 \n", "std 0.036690 0.037983 0.025952 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 \n", "75% 0.000000 0.000000 0.000000 \n", "max 1.000000 2.000000 1.000000 \n", "\n", " cast:winona ryder cast:woody allen cast:woody harrelson \\\n", "count 10387.000000 10387.000000 10387.000000 \n", "mean 0.001059 0.000578 0.002214 \n", "std 0.032527 0.024028 0.047007 \n", "min 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 \n", "75% 0.000000 0.000000 0.000000 \n", "max 1.000000 1.000000 1.000000 \n", "\n", " cast:xander berkeley cast:yuen biao \n", "count 10387.000000 10387.000000 \n", "mean 0.000770 0.001155 \n", "std 0.027743 0.036697 \n", "min 0.000000 0.000000 \n", "25% 0.000000 0.000000 \n", "50% 0.000000 0.000000 \n", "75% 0.000000 0.000000 \n", "max 1.000000 2.000000 \n", "\n", "[8 rows x 3673 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idadultvideogenre:genre:actiongenre:adventuregenre:animationgenre:comedygenre:crimegenre:documentary...cast:william hurtcast:william powellcast:william sadlercast:william schallertcast:william shatnercast:winona rydercast:woody allencast:woody harrelsoncast:xander berkeleycast:yuen biao
count10387.00000010387.00000010387.010387.00000010387.00000010387.00000010387.00000010387.00000010387.00000010387.000000...10387.00000010387.00000010387.00000010387.00000010387.00000010387.00000010387.00000010387.00000010387.00000010387.000000
mean70745.5579090.0000960.00.0395690.1510540.0716280.0381250.2658130.0945410.111678...0.0016370.0006740.0013480.0012520.0006740.0010590.0005780.0022140.0007700.001155
std63999.6849400.0098120.00.1949530.3581190.2578830.1915060.4417860.2925940.314985...0.0404240.0259520.0366900.0379830.0259520.0325270.0240280.0470070.0277430.036697
min3.0000000.0000000.00.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%25769.5000000.0000000.00.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
50%50675.0000000.0000000.00.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
75%94217.0000000.0000000.00.0000000.0000000.0000000.0000001.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
max469172.0000001.0000000.01.0000001.0000001.0000001.0000001.0000001.0000001.000000...1.0000001.0000001.0000002.0000001.0000001.0000001.0000001.0000001.0000002.000000
\n", "

8 rows × 3673 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 40 } ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "id": "YgtIqUAvvoSC", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a15673ec-f5f3-42a3-a503-493261375544" }, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(10387, 3)" ] }, "metadata": {}, "execution_count": 41 } ], "source": [ "from sklearn.metrics.pairwise import euclidean_distances as dist\n", "def content_based_recommender(user, df, k=10, movieIds=allIds):\n", " user_movies = pd.merge(user,df,how='inner',left_on='movieId',right_on='id')\n", " user_movies[feature_cols] = user_movies[feature_cols].multiply(user_movies['rating'], axis=\"index\")\n", " mean_user_movies = user_movies[feature_cols].mean(axis=0)\n", " sim_mat = cosine_similarity(df[feature_cols][df.id.isin(movieIds)], mean_user_movies[feature_cols].values.reshape(1,-1))\n", " temp_data = {'id':df['id'][df.id.isin(movieIds)], 'title':df['title'][df.id.isin(movieIds)], 'sim':sim_mat.flatten()}\n", " return pd.DataFrame(temp_data)\n", "\n", "def content_based_all_batches(user, k=10, movieIds=allIds):\n", " ans = content_based_recommender(user, batches[0], k, movieIds)\n", " for i in range(1,number_of_batches):\n", " ans.append(content_based_recommender(user, batches[i], k, movieIds))\n", " return ans.sort_values(by='sim', ascending=False)\n", " \n", "\n", "content_based_k = 10\n", "mf.log_param('content based k', content_based_k)\n", "#xx = content_based_recommender(rating[rating['userId'] == 1], batches[1], content_based_k)\n", "xx = content_based_all_batches(rating[rating['userId'] == 1], content_based_k)\n", "xx.shape" ] }, { "cell_type": "markdown", "source": [ "# Collaborative Filtering" ], "metadata": { "id": "lzvbAt8G4dXl" } }, { "cell_type": "markdown", "source": [ "### import libraries" ], "metadata": { "id": "8M7o2QcASuz_" } }, { "cell_type": "code", "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.utils.extmath import randomized_svd" ], "metadata": { "id": "O4tIp_1tSu0A" }, "execution_count": 42, "outputs": [] }, { "cell_type": "markdown", "source": [ "### explore datasets" ], "metadata": { "id": "pHlQSPF7S2HM" } }, { "cell_type": "code", "source": [ "rating = pd.read_csv('/content/IMDB/ratings_small.csv')\n", "rating.head()" ], "metadata": { "id": "1lyBZ3Tf1oGN", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "47f50819-5da6-49d6-9b39-928a31c19dea" }, "execution_count": 43, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating timestamp\n", "0 1 31 2.5 1260759144\n", "1 1 1029 3.0 1260759179\n", "2 1 1061 3.0 1260759182\n", "3 1 1129 2.0 1260759185\n", "4 1 1172 4.0 1260759205" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtimestamp
01312.51260759144
1110293.01260759179
2110613.01260759182
3111292.01260759185
4111724.01260759205
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 43 } ] }, { "cell_type": "code", "source": [ "rating.shape" ], "metadata": { "id": "8X7DuYoXV2OT", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "7b2810a0-f159-4217-834d-cf882efe0705" }, "execution_count": 44, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(100004, 4)" ] }, "metadata": {}, "execution_count": 44 } ] }, { "cell_type": "code", "source": [ "links_small = pd.read_csv('/content/IMDB/links_small.csv')\n", "links_small.head()" ], "metadata": { "id": "lHxWpbnLSDqM", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "b7c1e95e-8a4b-46e5-ca06-33baea81964b" }, "execution_count": 45, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId imdbId tmdbId\n", "0 1 114709 862.0\n", "1 2 113497 8844.0\n", "2 3 113228 15602.0\n", "3 4 114885 31357.0\n", "4 5 113041 11862.0" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdimdbIdtmdbId
01114709862.0
121134978844.0
2311322815602.0
3411488531357.0
4511304111862.0
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 45 } ] }, { "cell_type": "code", "source": [ "credits = pd.read_csv('/content/IMDB/credits.csv')\n", "credits.head()" ], "metadata": { "id": "Z0jZ8QI_SM39", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "3e4c5fcc-e763-4ba3-d699-9fda6cdce923" }, "execution_count": 46, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " cast \\\n", "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n", "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n", "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n", "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n", "4 [{'cast_id': 1, 'character': 'George Banks', '... \n", "\n", " crew id \n", "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n", "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n", "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n", "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n", "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[{'cast_id': 14, 'character': 'Woody (voice)',...[{'credit_id': '52fe4284c3a36847f8024f49', 'de...862
1[{'cast_id': 1, 'character': 'Alan Parrish', '...[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...8844
2[{'cast_id': 2, 'character': 'Max Goldman', 'c...[{'credit_id': '52fe466a9251416c75077a89', 'de...15602
3[{'cast_id': 1, 'character': \"Savannah 'Vannah...[{'credit_id': '52fe44779251416c91011acb', 'de...31357
4[{'cast_id': 1, 'character': 'George Banks', '...[{'credit_id': '52fe44959251416c75039ed7', 'de...11862
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 46 } ] }, { "cell_type": "code", "source": [ "movie = pd.read_csv('/content/IMDB/movies_metadata.csv')\n", "movie.head()" ], "metadata": { "id": "oIeGPmPI1tAk", "colab": { "base_uri": "https://localhost:8080/", "height": 787 }, "outputId": "4ab975fd-8432-418d-c826-0e85e14b6704" }, "execution_count": 47, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "\n", " genres \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "\n", " homepage id imdb_id original_language \\\n", "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n", "1 NaN 8844 tt0113497 en \n", "2 NaN 15602 tt0113228 en \n", "3 NaN 31357 tt0114885 en \n", "4 NaN 11862 tt0113041 en \n", "\n", " original_title \\\n", "0 Toy Story \n", "1 Jumanji \n", "2 Grumpier Old Men \n", "3 Waiting to Exhale \n", "4 Father of the Bride Part II \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... \n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "\n", "[5 rows x 24 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepageidimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
\n", "

5 rows × 24 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 47 } ] }, { "cell_type": "code", "source": [ "movie = movie.rename(columns={'id': 'movieId'})" ], "metadata": { "id": "rmDYxAgOgRNj" }, "execution_count": 48, "outputs": [] }, { "cell_type": "code", "source": [ "movie.shape" ], "metadata": { "id": "DoSsZcRpjo7Y", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "2c9f7194-d614-443b-d091-5f68f0e90655" }, "execution_count": 49, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "(45466, 24)" ] }, "metadata": {}, "execution_count": 49 } ] }, { "cell_type": "code", "source": [ "movie.head()" ], "metadata": { "id": "6XmWaDvFgeGU", "colab": { "base_uri": "https://localhost:8080/", "height": 750 }, "outputId": "8be37316-480a-43d3-82fa-40b236c09c26" }, "execution_count": 50, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "\n", " genres \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "\n", " homepage movieId imdb_id original_language \\\n", "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n", "1 NaN 8844 tt0113497 en \n", "2 NaN 15602 tt0113228 en \n", "3 NaN 31357 tt0114885 en \n", "4 NaN 11862 tt0113041 en \n", "\n", " original_title \\\n", "0 Toy Story \n", "1 Jumanji \n", "2 Grumpier Old Men \n", "3 Waiting to Exhale \n", "4 Father of the Bride Part II \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... \n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "\n", "[5 rows x 24 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepagemovieIdimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
\n", "

5 rows × 24 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 50 } ] }, { "cell_type": "markdown", "source": [ "### data preprocessing" ], "metadata": { "id": "oD9RMwahqemy" } }, { "cell_type": "markdown", "source": [ "There are three rows entered by mistake, so we remove that row." ], "metadata": { "id": "Wy2LqLxnklN1" } }, { "cell_type": "code", "source": [ "movie = movie[(movie['movieId']!='1997-08-20') & (movie['movieId']!='2012-09-29') & (movie['movieId']!='2014-01-01')]" ], "metadata": { "id": "OnIMWw3Nj3Dp" }, "execution_count": 51, "outputs": [] }, { "cell_type": "code", "source": [ "def find_names(x):\n", " if x == '':\n", " return ''\n", " genre_arr = eval(str(x))\n", " return ','.join(i['name'] for i in eval(str(x)))\n", " \n", "movie['genres'] = movie['genres'].fillna('')" ], "metadata": { "id": "kO8m6SsepBIg", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "ff9d7d36-89d2-438e-f1d3-0b8f6cd8ea24" }, "execution_count": 52, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ ":7: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " movie['genres'] = movie['genres'].fillna('')\n" ] } ] }, { "cell_type": "code", "source": [ "movie['genres']=movie['genres'].apply(find_names)" ], "metadata": { "id": "vOfKcOQ-pBIg" }, "execution_count": 53, "outputs": [] }, { "cell_type": "code", "source": [ "movie.movieId = movie.movieId.astype(\"uint64\")" ], "metadata": { "id": "0p-yhNZ3iRsl" }, "execution_count": 54, "outputs": [] }, { "cell_type": "markdown", "source": [ "only keep rating for movies with metadata in movie dataset" ], "metadata": { "id": "DWgywXEKuq2O" } }, { "cell_type": "code", "source": [ "new_rating = pd.merge(rating, movie, how='inner', on=[\"movieId\"])" ], "metadata": { "id": "psgzmBFLtcmx" }, "execution_count": 55, "outputs": [] }, { "cell_type": "code", "source": [ "new_rating = new_rating[[\"userId\", \"movieId\", \"rating\"]]" ], "metadata": { "id": "z9DjgdvYuhOW" }, "execution_count": 56, "outputs": [] }, { "cell_type": "code", "source": [ "movie.head()" ], "metadata": { "id": "gQ4VSPNUuFOc", "colab": { "base_uri": "https://localhost:8080/", "height": 750 }, "outputId": "d4245b96-1958-4704-e3eb-cc9b64effca3" }, "execution_count": 57, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "\n", " genres homepage movieId \\\n", "0 Animation,Comedy,Family http://toystory.disney.com/toy-story 862 \n", "1 Adventure,Fantasy,Family NaN 8844 \n", "2 Romance,Comedy NaN 15602 \n", "3 Comedy,Drama,Romance NaN 31357 \n", "4 Comedy NaN 11862 \n", "\n", " imdb_id original_language original_title \\\n", "0 tt0114709 en Toy Story \n", "1 tt0113497 en Jumanji \n", "2 tt0113228 en Grumpier Old Men \n", "3 tt0114885 en Waiting to Exhale \n", "4 tt0113041 en Father of the Bride Part II \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... \n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "\n", "[5 rows x 24 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepagemovieIdimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000Animation,Comedy,Familyhttp://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000Adventure,Fantasy,FamilyNaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0Romance,ComedyNaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000Comedy,Drama,RomanceNaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0ComedyNaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
\n", "

5 rows × 24 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 57 } ] }, { "cell_type": "code", "source": [ "new_rating.head()" ], "metadata": { "id": "gQE2lTG0tua6", "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "outputId": "51c28cea-a4ad-4e89-ced5-3e3953b6cc1f" }, "execution_count": 58, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating\n", "0 1 1371 2.5\n", "1 4 1371 4.0\n", "2 7 1371 3.0\n", "3 19 1371 4.0\n", "4 21 1371 3.0" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdrating
0113712.5
1413714.0
2713713.0
31913714.0
42113713.0
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 58 } ] }, { "cell_type": "code", "source": [ "train, test = split_dataframe(new_rating)" ], "metadata": { "id": "mcdRTQyqAyhU" }, "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "source": [ "### matrix factorization" ], "metadata": { "id": "rpZ7r095Z2-G" } }, { "cell_type": "code", "source": [ "inter_mat_df = rating.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)\n", "inter_mat_df" ], "metadata": { "id": "7oG8lkIRUX1M", "colab": { "base_uri": "https://localhost:8080/", "height": 455 }, "outputId": "cd42edf3-2e15-4dec-8dfb-1a60bbd1ca4a" }, "execution_count": 59, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "movieId 1 2 3 4 5 6 7 8 \\\n", "userId \n", "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "5 0.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... \n", "667 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 \n", "668 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "669 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "670 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "671 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "movieId 9 10 ... 161084 161155 161594 161830 161918 161944 \\\n", "userId ... \n", "1 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.0 4.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.0 4.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "5 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... ... ... ... \n", "667 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "668 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "669 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "670 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "671 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", "movieId 162376 162542 162672 163949 \n", "userId \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "5 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... \n", "667 0.0 0.0 0.0 0.0 \n", "668 0.0 0.0 0.0 0.0 \n", "669 0.0 0.0 0.0 0.0 \n", "670 0.0 0.0 0.0 0.0 \n", "671 0.0 0.0 0.0 0.0 \n", "\n", "[671 rows x 9066 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieId12345678910...161084161155161594161830161918161944162376162542162672163949
userId
10.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
20.00.00.00.00.00.00.00.00.04.0...0.00.00.00.00.00.00.00.00.00.0
30.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
40.00.00.00.00.00.00.00.00.04.0...0.00.00.00.00.00.00.00.00.00.0
50.00.04.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
..................................................................
6670.00.00.00.00.04.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
6680.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
6690.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
6704.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
6715.00.00.00.00.00.00.00.00.00.0...0.00.00.00.00.00.00.00.00.00.0
\n", "

671 rows × 9066 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 59 } ] }, { "cell_type": "code", "source": [ "inter_mat = inter_mat_df.to_numpy()" ], "metadata": { "id": "9It-QDN_UcYt" }, "execution_count": 60, "outputs": [] }, { "cell_type": "code", "source": [ "ratings_mean = np.mean(inter_mat, axis = 1)\n", "inter_mat_normal = inter_mat - ratings_mean.reshape(-1, 1)" ], "metadata": { "id": "_ksd9gu8UfGt" }, "execution_count": 61, "outputs": [] }, { "cell_type": "code", "source": [ "inter_mat_normal" ], "metadata": { "id": "2eayKPzpUiX_", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e38be4f9-b9b7-4408-df0d-e589f5a240a5" }, "execution_count": 62, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[-0.00562541, -0.00562541, -0.00562541, ..., -0.00562541,\n", " -0.00562541, -0.00562541],\n", " [-0.02923009, -0.02923009, -0.02923009, ..., -0.02923009,\n", " -0.02923009, -0.02923009],\n", " [-0.02007501, -0.02007501, -0.02007501, ..., -0.02007501,\n", " -0.02007501, -0.02007501],\n", " ...,\n", " [-0.01367748, -0.01367748, -0.01367748, ..., -0.01367748,\n", " -0.01367748, -0.01367748],\n", " [ 3.98698434, -0.01301566, -0.01301566, ..., -0.01301566,\n", " -0.01301566, -0.01301566],\n", " [ 4.95030885, -0.04969115, -0.04969115, ..., -0.04969115,\n", " -0.04969115, -0.04969115]])" ] }, "metadata": {}, "execution_count": 62 } ] }, { "cell_type": "markdown", "source": [ "We use singular value decomposition for matrix factorization" ], "metadata": { "id": "__cFBTZSVEbK" } }, { "cell_type": "code", "source": [ "svd_U, svd_sigma, svd_V = randomized_svd(inter_mat_normal, \n", " n_components=15,\n", " n_iter=5,\n", " random_state=47)" ], "metadata": { "id": "9FcuzszEU-Xv" }, "execution_count": 63, "outputs": [] }, { "cell_type": "markdown", "source": [ "This function gives the diagonal form" ], "metadata": { "id": "2z3Zj7hGVhtJ" } }, { "cell_type": "code", "source": [ "svd_sigma = np.diag(svd_sigma)" ], "metadata": { "id": "BQnpVVH_VgUG" }, "execution_count": 64, "outputs": [] }, { "cell_type": "markdown", "source": [ "Making predictions" ], "metadata": { "id": "JrTuTHy_VpsO" } }, { "cell_type": "code", "source": [ "rating_weights = np.dot(np.dot(svd_U, svd_sigma), svd_V) + ratings_mean.reshape(-1, 1)" ], "metadata": { "id": "pPnlGgkKVpKd" }, "execution_count": 65, "outputs": [] }, { "cell_type": "code", "source": [ "weights_df = pd.DataFrame(rating_weights, columns = inter_mat_df.columns)" ], "metadata": { "id": "sKdvldPoVsfs" }, "execution_count": 66, "outputs": [] }, { "cell_type": "code", "source": [ "weights_df.head()" ], "metadata": { "id": "FzLYVyH8VyXq", "colab": { "base_uri": "https://localhost:8080/", "height": 299 }, "outputId": "0360b11f-2954-496c-e926-5d55a6067ed8" }, "execution_count": 67, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "movieId 1 2 3 4 5 6 7 \\\n", "0 -0.081106 0.024332 -0.016835 -0.006440 -0.028982 0.031704 -0.000823 \n", "1 1.010433 1.676149 0.180399 0.133225 0.332127 0.659165 0.161904 \n", "2 1.023170 0.389789 -0.031488 0.031783 0.000714 0.135239 -0.080027 \n", "3 1.980784 1.192729 0.128675 0.106373 -0.234511 -0.550348 -0.085587 \n", "4 1.216316 0.926650 0.122319 0.063811 0.628848 -0.137074 0.356099 \n", "\n", "movieId 8 9 10 ... 161084 161155 161594 \\\n", "0 -0.004803 -0.005659 0.039295 ... -0.004629 -0.004407 0.011158 \n", "1 0.061847 0.088149 2.271430 ... -0.000064 -0.001081 0.006137 \n", "2 0.030734 -0.026625 0.397392 ... 0.003256 0.007455 -0.012356 \n", "3 -0.143159 -0.015308 1.372206 ... 0.032565 -0.021875 0.084978 \n", "4 0.082625 -0.068154 0.567814 ... -0.028169 -0.028221 0.000680 \n", "\n", "movieId 161830 161918 161944 162376 162542 162672 163949 \n", "0 -0.004547 -0.004340 -0.005154 0.019218 -0.005677 -0.005391 -0.004297 \n", "1 -0.003853 -0.003784 -0.004673 0.011201 -0.002404 -0.003039 0.003864 \n", "2 -0.000387 -0.006532 -0.000590 -0.024486 0.015854 0.014274 -0.005391 \n", "3 0.008436 0.018263 -0.022110 0.133075 -0.016431 -0.014345 0.076347 \n", "4 -0.024315 -0.023941 -0.061806 0.013551 -0.023701 -0.024246 -0.031276 \n", "\n", "[5 rows x 9066 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieId12345678910...161084161155161594161830161918161944162376162542162672163949
0-0.0811060.024332-0.016835-0.006440-0.0289820.031704-0.000823-0.004803-0.0056590.039295...-0.004629-0.0044070.011158-0.004547-0.004340-0.0051540.019218-0.005677-0.005391-0.004297
11.0104331.6761490.1803990.1332250.3321270.6591650.1619040.0618470.0881492.271430...-0.000064-0.0010810.006137-0.003853-0.003784-0.0046730.011201-0.002404-0.0030390.003864
21.0231700.389789-0.0314880.0317830.0007140.135239-0.0800270.030734-0.0266250.397392...0.0032560.007455-0.012356-0.000387-0.006532-0.000590-0.0244860.0158540.014274-0.005391
31.9807841.1927290.1286750.106373-0.234511-0.550348-0.085587-0.143159-0.0153081.372206...0.032565-0.0218750.0849780.0084360.018263-0.0221100.133075-0.016431-0.0143450.076347
41.2163160.9266500.1223190.0638110.628848-0.1370740.3560990.082625-0.0681540.567814...-0.028169-0.0282210.000680-0.024315-0.023941-0.0618060.013551-0.023701-0.024246-0.031276
\n", "

5 rows × 9066 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 67 } ] }, { "cell_type": "markdown", "source": [ "making recommendations" ], "metadata": { "id": "IDO7q6EjZ8q1" } }, { "cell_type": "code", "source": [ "def recommend_top_k(preds_df, ratings_df, movie, userId, k=10):\n", " user_row = userId-1 \n", " sorted_user_predictions = preds_df.iloc[user_row].sort_values(ascending=False) \n", " user_data = ratings_df[ratings_df.userId == (userId)]\n", " user_rated = user_data.merge(movie, how = 'left', left_on = 'movieId', right_on = 'movieId'). \\\n", " sort_values(['rating'], ascending=False)\n", " user_preds = movie.merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',\n", " on = 'movieId').rename(columns = {user_row: 'prediction'}). \\\n", " sort_values('prediction', ascending = False). \\\n", " iloc[:k, :]\n", " return user_rated, user_preds" ], "metadata": { "id": "0sZanc2nV7ot" }, "execution_count": 68, "outputs": [] }, { "cell_type": "code", "source": [ "collaborative_k = 100\n", "user_rated, user_preds = recommend_top_k(weights_df, new_rating, movie, 220, collaborative_k)\n", "mf.log_param('collaborative k', collaborative_k)" ], "metadata": { "id": "RWZBPk3QX4Hg", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "609849d1-ca57-4831-b2d8-10ae2d0504d4" }, "execution_count": 69, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "100" ] }, "metadata": {}, "execution_count": 69 } ] }, { "cell_type": "code", "source": [ "user_preds.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 698 }, "id": "xmk5e3xln_Xk", "outputId": "e92bff6a-7b20-427a-fa05-1743c8e4a166" }, "execution_count": 70, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " adult belongs_to_collection budget \\\n", "6388 False {'id': 528, 'name': 'The Terminator Collection... 200000000 \n", "3382 False NaN 0 \n", "5325 False {'id': 86055, 'name': 'Men In Black Collection... 140000000 \n", "4020 False NaN 8000000 \n", "286 False {'id': 300546, 'name': 'Once were Warriors Col... 0 \n", "\n", " genres \\\n", "6388 Action,Thriller,Science Fiction \n", "3382 Drama,Science Fiction,Adventure,Mystery \n", "5325 Action,Adventure,Comedy,Science Fiction \n", "4020 Drama,Thriller \n", "286 Drama \n", "\n", " homepage movieId imdb_id \\\n", "6388 NaN 296 tt0181852 \n", "3382 NaN 593 tt0069293 \n", "5325 http://www.sonypictures.com/homevideo/meninbla... 608 tt0120912 \n", "4020 NaN 318 tt0120753 \n", "286 NaN 527 tt0110729 \n", "\n", " original_language original_title \\\n", "6388 en Terminator 3: Rise of the Machines \n", "3382 ru Солярис \n", "5325 en Men in Black II \n", "4020 en The Million Dollar Hotel \n", "286 en Once Were Warriors \n", "\n", " overview ... revenue \\\n", "6388 It's been 10 years since John Connor saved Ear... ... 435000000.0 \n", "3382 Ground control has been receiving strange tran... ... 0.0 \n", "5325 Kay and Jay reunite to provide our best, last ... ... 441818803.0 \n", "4020 The Million Dollar Hotel starts with a jump fr... ... 0.0 \n", "286 A drama about a Maori family lving in Auckland... ... 2201126.0 \n", "\n", " runtime spoken_languages status \\\n", "6388 109.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", "3382 167.0 [{'iso_639_1': 'ru', 'name': 'Pусский'}] Released \n", "5325 88.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", "4020 122.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", "286 99.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released \n", "\n", " tagline \\\n", "6388 The Machines Will Rise. \n", "3382 NaN \n", "5325 Same Planet. New Scum. \n", "4020 NaN \n", "286 A family in crisis, a life in chaos... Nothing... \n", "\n", " title video vote_average vote_count \\\n", "6388 Terminator 3: Rise of the Machines False 5.9 2177.0 \n", "3382 Solaris False 7.7 364.0 \n", "5325 Men in Black II False 6.1 3188.0 \n", "4020 The Million Dollar Hotel False 5.9 76.0 \n", "286 Once Were Warriors False 7.6 106.0 \n", "\n", " prediction \n", "6388 4.792743 \n", "3382 4.742942 \n", "5325 4.647800 \n", "4020 4.469385 \n", "286 4.236960 \n", "\n", "[5 rows x 25 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepagemovieIdimdb_idoriginal_languageoriginal_titleoverview...revenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_countprediction
6388False{'id': 528, 'name': 'The Terminator Collection...200000000Action,Thriller,Science FictionNaN296tt0181852enTerminator 3: Rise of the MachinesIt's been 10 years since John Connor saved Ear......435000000.0109.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedThe Machines Will Rise.Terminator 3: Rise of the MachinesFalse5.92177.04.792743
3382FalseNaN0Drama,Science Fiction,Adventure,MysteryNaN593tt0069293ruСолярисGround control has been receiving strange tran......0.0167.0[{'iso_639_1': 'ru', 'name': 'Pусский'}]ReleasedNaNSolarisFalse7.7364.04.742942
5325False{'id': 86055, 'name': 'Men In Black Collection...140000000Action,Adventure,Comedy,Science Fictionhttp://www.sonypictures.com/homevideo/meninbla...608tt0120912enMen in Black IIKay and Jay reunite to provide our best, last ......441818803.088.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedSame Planet. New Scum.Men in Black IIFalse6.13188.04.647800
4020FalseNaN8000000Drama,ThrillerNaN318tt0120753enThe Million Dollar HotelThe Million Dollar Hotel starts with a jump fr......0.0122.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNThe Million Dollar HotelFalse5.976.04.469385
286False{'id': 300546, 'name': 'Once were Warriors Col...0DramaNaN527tt0110729enOnce Were WarriorsA drama about a Maori family lving in Auckland......2201126.099.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedA family in crisis, a life in chaos... Nothing...Once Were WarriorsFalse7.6106.04.236960
\n", "

5 rows × 25 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 70 } ] }, { "cell_type": "code", "source": [ "user_rated.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 577 }, "id": "x6ohhdH0sF0H", "outputId": "5f8a17f7-3b50-48ad-9d10-809357834a3b" }, "execution_count": 71, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating adult belongs_to_collection budget \\\n", "0 220 2294 5.0 False NaN 22000000 \n", "46 220 1247 5.0 False NaN 85000000 \n", "25 220 2762 5.0 False NaN 0 \n", "27 220 260 5.0 False NaN 0 \n", "59 220 2324 5.0 False NaN 3250000 \n", "\n", " genres homepage imdb_id \\\n", "0 Comedy NaN tt0261392 \n", "46 Drama,Thriller,History http://www.thegoodshepherdmovie.com/ tt0343737 \n", "25 Drama,Crime NaN tt0029811 \n", "27 Action,Thriller,Mystery NaN tt0026029 \n", "59 Drama http://www.localcolormovie.com/ tt0472126 \n", "\n", " original_language ... release_date revenue runtime \\\n", "0 en ... 2001-08-22 33788161.0 104.0 \n", "46 en ... 2006-12-11 59908565.0 167.0 \n", "25 en ... 1937-11-01 0.0 83.0 \n", "27 en ... 1935-06-01 0.0 86.0 \n", "59 en ... 2006-09-19 32788.0 107.0 \n", "\n", " spoken_languages status \\\n", "0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", "46 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released \n", "25 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", "27 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", "59 [{'iso_639_1': 'en', 'name': 'English'}] Released \n", "\n", " tagline \\\n", "0 Hollywood had it coming \n", "46 The untold story of the most powerful covert a... \n", "25 A Brilliant Melodrama \n", "27 Handcuffed to the girl who double-crossed him \n", "59 NaN \n", "\n", " title video vote_average vote_count \n", "0 Jay and Silent Bob Strike Back False 6.4 491.0 \n", "46 The Good Shepherd False 6.3 342.0 \n", "25 Young and Innocent False 6.8 42.0 \n", "27 The 39 Steps False 7.4 217.0 \n", "59 Local Color False 6.1 8.0 \n", "\n", "[5 rows x 26 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingadultbelongs_to_collectionbudgetgenreshomepageimdb_idoriginal_language...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
022022945.0FalseNaN22000000ComedyNaNtt0261392en...2001-08-2233788161.0104.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedHollywood had it comingJay and Silent Bob Strike BackFalse6.4491.0
4622012475.0FalseNaN85000000Drama,Thriller,Historyhttp://www.thegoodshepherdmovie.com/tt0343737en...2006-12-1159908565.0167.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedThe untold story of the most powerful covert a...The Good ShepherdFalse6.3342.0
2522027625.0FalseNaN0Drama,CrimeNaNtt0029811en...1937-11-010.083.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedA Brilliant MelodramaYoung and InnocentFalse6.842.0
272202605.0FalseNaN0Action,Thriller,MysteryNaNtt0026029en...1935-06-010.086.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedHandcuffed to the girl who double-crossed himThe 39 StepsFalse7.4217.0
5922023245.0FalseNaN3250000Dramahttp://www.localcolormovie.com/tt0472126en...2006-09-1932788.0107.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNLocal ColorFalse6.18.0
\n", "

5 rows × 26 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 71 } ] }, { "cell_type": "code", "source": [ "user_rated[[\"title\", \"genres\"]].head(10)" ], "metadata": { "id": "18grZyJYmG5q", "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "outputId": "c064b2c7-9b04-4aae-b76f-58c1df0da2dc" }, "execution_count": 72, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title genres\n", "0 Jay and Silent Bob Strike Back Comedy\n", "46 The Good Shepherd Drama,Thriller,History\n", "25 Young and Innocent Drama,Crime\n", "27 The 39 Steps Action,Thriller,Mystery\n", "59 Local Color Drama\n", "31 The Big Sleep Crime,Drama,Mystery,Thriller\n", "33 The Talented Mr. Ripley Thriller,Crime,Drama\n", "42 The Big Parade Drama,Romance,War\n", "73 Dancer in the Dark Drama,Crime,Music\n", "110 Birdman of Alcatraz Drama" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenres
0Jay and Silent Bob Strike BackComedy
46The Good ShepherdDrama,Thriller,History
25Young and InnocentDrama,Crime
27The 39 StepsAction,Thriller,Mystery
59Local ColorDrama
31The Big SleepCrime,Drama,Mystery,Thriller
33The Talented Mr. RipleyThriller,Crime,Drama
42The Big ParadeDrama,Romance,War
73Dancer in the DarkDrama,Crime,Music
110Birdman of AlcatrazDrama
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 72 } ] }, { "cell_type": "code", "source": [ "user_preds[[\"title\", \"genres\"]].head(10)" ], "metadata": { "id": "Mtq_XCEFmLee", "colab": { "base_uri": "https://localhost:8080/", "height": 363 }, "outputId": "0cd7dd8d-2fde-4d76-f8c0-2131239e5738" }, "execution_count": 73, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title \\\n", "6388 Terminator 3: Rise of the Machines \n", "3382 Solaris \n", "5325 Men in Black II \n", "4020 The Million Dollar Hotel \n", "286 Once Were Warriors \n", "2100 Young and Innocent \n", "534 Sleepless in Seattle \n", "2137 Say Anything... \n", "11922 License to Wed \n", "33911 The Tunnel \n", "\n", " genres \n", "6388 Action,Thriller,Science Fiction \n", "3382 Drama,Science Fiction,Adventure,Mystery \n", "5325 Action,Adventure,Comedy,Science Fiction \n", "4020 Drama,Thriller \n", "286 Drama \n", "2100 Drama,Crime \n", "534 Comedy,Drama,Romance \n", "2137 Comedy,Drama,Romance \n", "11922 Comedy \n", "33911 Science Fiction " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlegenres
6388Terminator 3: Rise of the MachinesAction,Thriller,Science Fiction
3382SolarisDrama,Science Fiction,Adventure,Mystery
5325Men in Black IIAction,Adventure,Comedy,Science Fiction
4020The Million Dollar HotelDrama,Thriller
286Once Were WarriorsDrama
2100Young and InnocentDrama,Crime
534Sleepless in SeattleComedy,Drama,Romance
2137Say Anything...Comedy,Drama,Romance
11922License to WedComedy
33911The TunnelScience Fiction
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 73 } ] }, { "cell_type": "markdown", "metadata": { "id": "7h0B7szQuRbE" }, "source": [ "# Ensemble Model" ] }, { "cell_type": "code", "execution_count": 74, "metadata": { "id": "BIG1-TQ7g1HJ" }, "outputs": [], "source": [ "def ensemble(userId, k=10):\n", " user_rated, user_preds = recommend_top_k(weights_df, new_rating, movie, userId, k*k)\n", " content_based_result = content_based_all_batches(rating[rating['userId'] == userId], k=k, movieIds=user_preds['movieId'])\n", " return content_based_result[['id','title']]" ] }, { "cell_type": "code", "source": [ "ensemble_k=10\n", "mf.log_param('ensemble k', ensemble_k)\n", "ensemble(220, ensemble_k)" ], "metadata": { "id": "DSGgvMKIPUIu", "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "outputId": "8fa1410d-315a-47ff-bba9-65b14c568708" }, "execution_count": 75, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " id title\n", "2663 912 The Thomas Crown Affair\n", "1415 1968 Fools Rush In\n", "2077 1580 Rope\n", "2110 2762 Young and Innocent\n", "533 858 Sleepless in Seattle\n", "... ... ...\n", "255 11 Star Wars\n", "1315 377 A Nightmare on Elm Street\n", "1344 364 Batman Returns\n", "5524 1682 Mothra vs. Godzilla\n", "6146 2671 Ringu\n", "\n", "[67 rows x 2 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitle
2663912The Thomas Crown Affair
14151968Fools Rush In
20771580Rope
21102762Young and Innocent
533858Sleepless in Seattle
.........
25511Star Wars
1315377A Nightmare on Elm Street
1344364Batman Returns
55241682Mothra vs. Godzilla
61462671Ringu
\n", "

67 rows × 2 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 75 } ] }, { "cell_type": "markdown", "source": [ "# Evaluation" ], "metadata": { "id": "oFDjsFmJgvwa" } }, { "cell_type": "code", "source": [ "df_res = user_preds[[\"movieId\", \"prediction\"]]. \\\n", " merge(user_rated[[\"movieId\", \"rating\"]], how = 'outer', on = 'movieId')" ], "metadata": { "id": "lePskKh9rObl" }, "execution_count": 76, "outputs": [] }, { "cell_type": "code", "source": [ "df_res.sort_values(by='prediction',ascending=False,inplace=True)\n", "df_res" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "gHZkoIr-r6dM", "outputId": "946e5b4d-5c2f-4262-d601-ef915a3b0417" }, "execution_count": 77, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId prediction rating\n", "0 296.0 4.792743 5.0\n", "1 593.0 4.742942 4.0\n", "2 608.0 4.647800 5.0\n", "3 318.0 4.469385 NaN\n", "4 527.0 4.236960 5.0\n", ".. ... ... ...\n", "174 2269.0 NaN 1.0\n", "175 586.0 NaN 1.0\n", "176 344.0 NaN 1.0\n", "177 2054.0 NaN 1.0\n", "178 2617.0 NaN 1.0\n", "\n", "[179 rows x 3 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdpredictionrating
0296.04.7927435.0
1593.04.7429424.0
2608.04.6478005.0
3318.04.469385NaN
4527.04.2369605.0
............
1742269.0NaN1.0
175586.0NaN1.0
176344.0NaN1.0
1772054.0NaN1.0
1782617.0NaN1.0
\n", "

179 rows × 3 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 77 } ] }, { "cell_type": "code", "source": [ "threshold = 2\n", "df_res['prediction'] = df_res['prediction'] >= threshold\n", "df_res['rating'] = df_res['rating'] >= threshold\n", "df_res" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "Wq-kIghejSIO", "outputId": "4af55343-80d8-42e5-a61b-69615fa48659" }, "execution_count": 78, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId prediction rating\n", "0 296.0 True True\n", "1 593.0 True True\n", "2 608.0 True True\n", "3 318.0 True False\n", "4 527.0 True True\n", ".. ... ... ...\n", "174 2269.0 False False\n", "175 586.0 False False\n", "176 344.0 False False\n", "177 2054.0 False False\n", "178 2617.0 False False\n", "\n", "[179 rows x 3 columns]" ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdpredictionrating
0296.0TrueTrue
1593.0TrueTrue
2608.0TrueTrue
3318.0TrueFalse
4527.0TrueTrue
............
1742269.0FalseFalse
175586.0FalseFalse
176344.0FalseFalse
1772054.0FalseFalse
1782617.0FalseFalse
\n", "

179 rows × 3 columns

\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 78 } ] }, { "cell_type": "code", "source": [ "def precision_at_k(df, k=10, y_test: str='rating', y_pred='prediction'): \n", " dfK = df.head(k)\n", " sum_df = dfK[y_pred].sum()\n", " true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]\n", " if sum_df > 0:\n", " return true_pred/sum_df\n", " else:\n", " return None\n", "\n", "def recall_at_k(df, k=10, y_test='rating', y_pred='prediction'):\n", " dfK = df.head(k)\n", " sum_df = df[y_test].sum()\n", " true_pred = dfK[dfK[y_pred] & dfK[y_test]].shape[0]\n", " if sum_df > 0:\n", " return true_pred/sum_df\n", " else:\n", " return None" ], "metadata": { "id": "z3iLPKpZqQ0e" }, "execution_count": 79, "outputs": [] }, { "cell_type": "code", "source": [ "prec_at_k = precision_at_k(df_res, 100, y_test='rating', y_pred='prediction')\n", "rec_at_k = recall_at_k(df_res, 100, y_test='rating', y_pred='prediction')" ], "metadata": { "id": "IrMcSYwPsrV3" }, "execution_count": 80, "outputs": [] }, { "cell_type": "code", "source": [ "print(\"precision@k: \", prec_at_k)\n", "print(\"recall@k: \", rec_at_k)\n", "mf.log_metric('recall', rec_at_k)\n", "mf.log_metric('precision', prec_at_k)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "r2mMjEt5s31U", "outputId": "6d69f9d6-10eb-4ee5-90d5-ee8a4495c391" }, "execution_count": 81, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "precision@k: 0.7941176470588235\n", "recall@k: 0.21774193548387097\n" ] } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "nj5lPtCfP17d" }, "execution_count": 81, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "z1HlQhn9gyTJ" }, "source": [ "# MLOps" ] }, { "cell_type": "code", "execution_count": 82, "metadata": { "id": "d7GJNfWMgzy3" }, "outputs": [], "source": [ "def updata_batch(new_batch):\n", " number_of_batches = number_of_batches+1\n", " batches = batches.append(new_batch)\n", " mf.log_param('number of batches', number_of_batches)" ] }, { "cell_type": "code", "source": [], "metadata": { "id": "FkGxEvXj21Dw" }, "execution_count": 82, "outputs": [] } ], "metadata": { "colab": { "provenance": [], "toc_visible": true }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }