diff --git "a/ML_Final_Project.ipynb" "b/ML_Final_Project.ipynb"
new file mode 100644--- /dev/null
+++ "b/ML_Final_Project.ipynb"
@@ -0,0 +1,9930 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sdXLxyndTymr"
+ },
+ "source": [
+ "### install dependencies"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "y3Mq4BwgTMTY",
+ "outputId": "3d127f2f-7c24-4b24-8afa-3444755b7606"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Downloading...\n",
+ "From: https://drive.google.com/uc?id=1W3-WEplVSztLR3lvkyYdiKZGMT4y0cNi&confirm=t\n",
+ "To: /content/IMDB.zip\n",
+ "100% 61.4M/61.4M [00:00<00:00, 224MB/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Fetch the dataset archive from Google Drive with gdown; the recorded output shows it saved as /content/IMDB.zip.\n",
+ "!gdown \"1W3-WEplVSztLR3lvkyYdiKZGMT4y0cNi&confirm=t\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "oU3KUmsiGtQU"
+ },
+ "outputs": [],
+ "source": [
+ "# Extract the downloaded archive so the CSV reads below can find their files on a fresh runtime.\n",
+ "# -n skips files that already exist (re-running the cell is a no-op); -q keeps the output short.\n",
+ "!unzip -n -q IMDB.zip"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# mlflow is imported below and may not be preinstalled; %pip installs into the running kernel's environment.\n",
+ "%pip install -q mlflow"
+ ],
+ "metadata": {
+ "id": "a7swi2rM37ie"
+ },
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "XAq9DVtTvPim"
+ },
+ "source": [
+ "# Content-based filtering"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "yxKpdNo1UFvc"
+ },
+ "source": [
+ "### import libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "A_qqPpWqUGJm"
+ },
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "\n",
+ "import mlflow as mf\n",
+ "import numpy as np\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "#mf.log_artifacts({'rating':'/content/rating_small.csv', 'rating':'/content/rating_small.csv', 'movies':'/content/movies_metadata.csv','keywords':'/content/keywords.csv', 'credits':'/content/credits.csv'})"
+ ],
+ "metadata": {
+ "id": "LuqMSMvrEg1H"
+ },
+ "execution_count": 5,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "gaQ7KyStURHR"
+ },
+ "source": [
+ "### read data from file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 423
+ },
+ "id": "0E7xGgKWqofc",
+ "outputId": "b233ab4c-aef3-4997-a27f-acdeaf5dd900"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id keywords\n",
+ "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n",
+ "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n",
+ "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n",
+ "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n",
+ "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...\n",
+ "... ... ...\n",
+ "46414 439050 [{'id': 10703, 'name': 'tragic love'}]\n",
+ "46415 111109 [{'id': 2679, 'name': 'artist'}, {'id': 14531,...\n",
+ "46416 67758 []\n",
+ "46417 227506 []\n",
+ "46418 461257 []\n",
+ "\n",
+ "[46419 rows x 2 columns]"
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " keywords | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 862 | \n",
+ " [{'id': 931, 'name': 'jealousy'}, {'id': 4290,... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 8844 | \n",
+ " [{'id': 10090, 'name': 'board game'}, {'id': 1... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 15602 | \n",
+ " [{'id': 1495, 'name': 'fishing'}, {'id': 12392... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 31357 | \n",
+ " [{'id': 818, 'name': 'based on novel'}, {'id':... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 11862 | \n",
+ " [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n... | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 46414 | \n",
+ " 439050 | \n",
+ " [{'id': 10703, 'name': 'tragic love'}] | \n",
+ "
\n",
+ " \n",
+ " 46415 | \n",
+ " 111109 | \n",
+ " [{'id': 2679, 'name': 'artist'}, {'id': 14531,... | \n",
+ "
\n",
+ " \n",
+ " 46416 | \n",
+ " 67758 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 46417 | \n",
+ " 227506 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 46418 | \n",
+ " 461257 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
46419 rows × 2 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 6
+ }
+ ],
+ "source": [
+ "# Load the per-movie keyword table. The recorded output shows columns `id` and `keywords`,\n",
+ "# where `keywords` is the string form of a list of {'id', 'name'} dicts.\n",
+ "keywords = pd.read_csv('/content/IMDB/keywords.csv')\n",
+ "keywords"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 423
+ },
+ "id": "AeT9NJibvkW4",
+ "outputId": "abfc4537-ad26-4e7b-8120-84b83994df17"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " userId movieId rating timestamp\n",
+ "0 1 31 2.5 1260759144\n",
+ "1 1 1029 3.0 1260759179\n",
+ "2 1 1061 3.0 1260759182\n",
+ "3 1 1129 2.0 1260759185\n",
+ "4 1 1172 4.0 1260759205\n",
+ "... ... ... ... ...\n",
+ "99999 671 6268 2.5 1065579370\n",
+ "100000 671 6269 4.0 1065149201\n",
+ "100001 671 6365 4.0 1070940363\n",
+ "100002 671 6385 2.5 1070979663\n",
+ "100003 671 6565 3.5 1074784724\n",
+ "\n",
+ "[100004 rows x 4 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " rating | \n",
+ " timestamp | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 31 | \n",
+ " 2.5 | \n",
+ " 1260759144 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1029 | \n",
+ " 3.0 | \n",
+ " 1260759179 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1061 | \n",
+ " 3.0 | \n",
+ " 1260759182 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1129 | \n",
+ " 2.0 | \n",
+ " 1260759185 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1172 | \n",
+ " 4.0 | \n",
+ " 1260759205 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 99999 | \n",
+ " 671 | \n",
+ " 6268 | \n",
+ " 2.5 | \n",
+ " 1065579370 | \n",
+ "
\n",
+ " \n",
+ " 100000 | \n",
+ " 671 | \n",
+ " 6269 | \n",
+ " 4.0 | \n",
+ " 1065149201 | \n",
+ "
\n",
+ " \n",
+ " 100001 | \n",
+ " 671 | \n",
+ " 6365 | \n",
+ " 4.0 | \n",
+ " 1070940363 | \n",
+ "
\n",
+ " \n",
+ " 100002 | \n",
+ " 671 | \n",
+ " 6385 | \n",
+ " 2.5 | \n",
+ " 1070979663 | \n",
+ "
\n",
+ " \n",
+ " 100003 | \n",
+ " 671 | \n",
+ " 6565 | \n",
+ " 3.5 | \n",
+ " 1074784724 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
100004 rows × 4 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 7
+ }
+ ],
+ "source": [
+ "# Load the user ratings. The recorded output shows columns userId, movieId, rating and timestamp.\n",
+ "rating = pd.read_csv('/content/IMDB/ratings_small.csv')\n",
+ "rating"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Load cast/crew credits. The recorded output shows columns cast, crew and id,\n",
+ "# with cast and crew stored as the string form of lists of dicts.\n",
+ "credits = pd.read_csv('/content/IMDB/credits.csv')\n",
+ "credits"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 423
+ },
+ "id": "jMaH5Yrs72sK",
+ "outputId": "b7b0cc27-6c5f-4b6d-b719-7aa148adfa2e"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " cast \\\n",
+ "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
+ "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
+ "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
+ "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
+ "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
+ "... ... \n",
+ "45471 [{'cast_id': 0, 'character': '', 'credit_id': ... \n",
+ "45472 [{'cast_id': 1002, 'character': 'Sister Angela... \n",
+ "45473 [{'cast_id': 6, 'character': 'Emily Shaw', 'cr... \n",
+ "45474 [{'cast_id': 2, 'character': '', 'credit_id': ... \n",
+ "45475 [] \n",
+ "\n",
+ " crew id \n",
+ "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n",
+ "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n",
+ "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n",
+ "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n",
+ "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 \n",
+ "... ... ... \n",
+ "45471 [{'credit_id': '5894a97d925141426c00818c', 'de... 439050 \n",
+ "45472 [{'credit_id': '52fe4af1c3a36847f81e9b15', 'de... 111109 \n",
+ "45473 [{'credit_id': '52fe4776c3a368484e0c8387', 'de... 67758 \n",
+ "45474 [{'credit_id': '533bccebc3a36844cf0011a7', 'de... 227506 \n",
+ "45475 [{'credit_id': '593e676c92514105b702e68e', 'de... 461257 \n",
+ "\n",
+ "[45476 rows x 3 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cast | \n",
+ " crew | \n",
+ " id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " [{'cast_id': 14, 'character': 'Woody (voice)',... | \n",
+ " [{'credit_id': '52fe4284c3a36847f8024f49', 'de... | \n",
+ " 862 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " [{'cast_id': 1, 'character': 'Alan Parrish', '... | \n",
+ " [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... | \n",
+ " 8844 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [{'cast_id': 2, 'character': 'Max Goldman', 'c... | \n",
+ " [{'credit_id': '52fe466a9251416c75077a89', 'de... | \n",
+ " 15602 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [{'cast_id': 1, 'character': \"Savannah 'Vannah... | \n",
+ " [{'credit_id': '52fe44779251416c91011acb', 'de... | \n",
+ " 31357 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [{'cast_id': 1, 'character': 'George Banks', '... | \n",
+ " [{'credit_id': '52fe44959251416c75039ed7', 'de... | \n",
+ " 11862 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45471 | \n",
+ " [{'cast_id': 0, 'character': '', 'credit_id': ... | \n",
+ " [{'credit_id': '5894a97d925141426c00818c', 'de... | \n",
+ " 439050 | \n",
+ "
\n",
+ " \n",
+ " 45472 | \n",
+ " [{'cast_id': 1002, 'character': 'Sister Angela... | \n",
+ " [{'credit_id': '52fe4af1c3a36847f81e9b15', 'de... | \n",
+ " 111109 | \n",
+ "
\n",
+ " \n",
+ " 45473 | \n",
+ " [{'cast_id': 6, 'character': 'Emily Shaw', 'cr... | \n",
+ " [{'credit_id': '52fe4776c3a368484e0c8387', 'de... | \n",
+ " 67758 | \n",
+ "
\n",
+ " \n",
+ " 45474 | \n",
+ " [{'cast_id': 2, 'character': '', 'credit_id': ... | \n",
+ " [{'credit_id': '533bccebc3a36844cf0011a7', 'de... | \n",
+ " 227506 | \n",
+ "
\n",
+ " \n",
+ " 45475 | \n",
+ " [] | \n",
+ " [{'credit_id': '593e676c92514105b702e68e', 'de... | \n",
+ " 461257 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
45476 rows × 3 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 8
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "kYA5KUWDFcZ2",
+ "outputId": "0d7f210e-341e-4070-f2a6-ac4993658e96"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " adult belongs_to_collection budget \\\n",
+ "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
+ "1 False NaN 65000000 \n",
+ "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
+ "3 False NaN 16000000 \n",
+ "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
+ "... ... ... ... \n",
+ "45461 False NaN 0 \n",
+ "45462 False NaN 0 \n",
+ "45463 False NaN 0 \n",
+ "45464 False NaN 0 \n",
+ "45465 False NaN 0 \n",
+ "\n",
+ " genres \\\n",
+ "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
+ "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
+ "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
+ "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
+ "4 [{'id': 35, 'name': 'Comedy'}] \n",
+ "... ... \n",
+ "45461 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n",
+ "45462 [{'id': 18, 'name': 'Drama'}] \n",
+ "45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n",
+ "45464 [] \n",
+ "45465 [] \n",
+ "\n",
+ " homepage id imdb_id \\\n",
+ "0 http://toystory.disney.com/toy-story 862 tt0114709 \n",
+ "1 NaN 8844 tt0113497 \n",
+ "2 NaN 15602 tt0113228 \n",
+ "3 NaN 31357 tt0114885 \n",
+ "4 NaN 11862 tt0113041 \n",
+ "... ... ... ... \n",
+ "45461 http://www.imdb.com/title/tt6209470/ 439050 tt6209470 \n",
+ "45462 NaN 111109 tt2028550 \n",
+ "45463 NaN 67758 tt0303758 \n",
+ "45464 NaN 227506 tt0008536 \n",
+ "45465 NaN 461257 tt6980792 \n",
+ "\n",
+ " original_language original_title \\\n",
+ "0 en Toy Story \n",
+ "1 en Jumanji \n",
+ "2 en Grumpier Old Men \n",
+ "3 en Waiting to Exhale \n",
+ "4 en Father of the Bride Part II \n",
+ "... ... ... \n",
+ "45461 fa رگ خواب \n",
+ "45462 tl Siglo ng Pagluluwal \n",
+ "45463 en Betrayal \n",
+ "45464 en Satana likuyushchiy \n",
+ "45465 en Queerama \n",
+ "\n",
+ " overview ... release_date \\\n",
+ "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
+ "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
+ "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
+ "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
+ "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
+ "... ... ... ... \n",
+ "45461 Rising and falling between a man and woman. ... NaN \n",
+ "45462 An artist struggles to finish his work while a... ... 2011-11-17 \n",
+ "45463 When one of her hits goes wrong, a professiona... ... 2003-08-01 \n",
+ "45464 In a small town live two brothers, one a minis... ... 1917-10-21 \n",
+ "45465 50 years after decriminalisation of homosexual... ... 2017-06-09 \n",
+ "\n",
+ " revenue runtime spoken_languages \\\n",
+ "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
+ "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "... ... ... ... \n",
+ "45461 0.0 90.0 [{'iso_639_1': 'fa', 'name': 'فارسی'}] \n",
+ "45462 0.0 360.0 [{'iso_639_1': 'tl', 'name': ''}] \n",
+ "45463 0.0 90.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "45464 0.0 87.0 [] \n",
+ "45465 0.0 75.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "\n",
+ " status tagline \\\n",
+ "0 Released NaN \n",
+ "1 Released Roll the dice and unleash the excitement! \n",
+ "2 Released Still Yelling. Still Fighting. Still Ready for... \n",
+ "3 Released Friends are the people who let you be yourself... \n",
+ "4 Released Just When His World Is Back To Normal... He's ... \n",
+ "... ... ... \n",
+ "45461 Released Rising and falling between a man and woman \n",
+ "45462 Released NaN \n",
+ "45463 Released A deadly game of wits. \n",
+ "45464 Released NaN \n",
+ "45465 Released NaN \n",
+ "\n",
+ " title video vote_average vote_count \n",
+ "0 Toy Story False 7.7 5415.0 \n",
+ "1 Jumanji False 6.9 2413.0 \n",
+ "2 Grumpier Old Men False 6.5 92.0 \n",
+ "3 Waiting to Exhale False 6.1 34.0 \n",
+ "4 Father of the Bride Part II False 5.7 173.0 \n",
+ "... ... ... ... ... \n",
+ "45461 Subdue False 4.0 1.0 \n",
+ "45462 Century of Birthing False 9.0 3.0 \n",
+ "45463 Betrayal False 3.8 6.0 \n",
+ "45464 Satan Triumphant False 0.0 0.0 \n",
+ "45465 Queerama False 0.0 0.0 \n",
+ "\n",
+ "[45466 rows x 24 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " adult | \n",
+ " belongs_to_collection | \n",
+ " budget | \n",
+ " genres | \n",
+ " homepage | \n",
+ " id | \n",
+ " imdb_id | \n",
+ " original_language | \n",
+ " original_title | \n",
+ " overview | \n",
+ " ... | \n",
+ " release_date | \n",
+ " revenue | \n",
+ " runtime | \n",
+ " spoken_languages | \n",
+ " status | \n",
+ " tagline | \n",
+ " title | \n",
+ " video | \n",
+ " vote_average | \n",
+ " vote_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " False | \n",
+ " {'id': 10194, 'name': 'Toy Story Collection', ... | \n",
+ " 30000000 | \n",
+ " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
+ " http://toystory.disney.com/toy-story | \n",
+ " 862 | \n",
+ " tt0114709 | \n",
+ " en | \n",
+ " Toy Story | \n",
+ " Led by Woody, Andy's toys live happily in his ... | \n",
+ " ... | \n",
+ " 1995-10-30 | \n",
+ " 373554033.0 | \n",
+ " 81.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Toy Story | \n",
+ " False | \n",
+ " 7.7 | \n",
+ " 5415.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 65000000 | \n",
+ " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
+ " NaN | \n",
+ " 8844 | \n",
+ " tt0113497 | \n",
+ " en | \n",
+ " Jumanji | \n",
+ " When siblings Judy and Peter discover an encha... | \n",
+ " ... | \n",
+ " 1995-12-15 | \n",
+ " 262797249.0 | \n",
+ " 104.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | \n",
+ " Released | \n",
+ " Roll the dice and unleash the excitement! | \n",
+ " Jumanji | \n",
+ " False | \n",
+ " 6.9 | \n",
+ " 2413.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " False | \n",
+ " {'id': 119050, 'name': 'Grumpy Old Men Collect... | \n",
+ " 0 | \n",
+ " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
+ " NaN | \n",
+ " 15602 | \n",
+ " tt0113228 | \n",
+ " en | \n",
+ " Grumpier Old Men | \n",
+ " A family wedding reignites the ancient feud be... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 0.0 | \n",
+ " 101.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Still Yelling. Still Fighting. Still Ready for... | \n",
+ " Grumpier Old Men | \n",
+ " False | \n",
+ " 6.5 | \n",
+ " 92.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 16000000 | \n",
+ " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
+ " NaN | \n",
+ " 31357 | \n",
+ " tt0114885 | \n",
+ " en | \n",
+ " Waiting to Exhale | \n",
+ " Cheated on, mistreated and stepped on, the wom... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 81452156.0 | \n",
+ " 127.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Friends are the people who let you be yourself... | \n",
+ " Waiting to Exhale | \n",
+ " False | \n",
+ " 6.1 | \n",
+ " 34.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " False | \n",
+ " {'id': 96871, 'name': 'Father of the Bride Col... | \n",
+ " 0 | \n",
+ " [{'id': 35, 'name': 'Comedy'}] | \n",
+ " NaN | \n",
+ " 11862 | \n",
+ " tt0113041 | \n",
+ " en | \n",
+ " Father of the Bride Part II | \n",
+ " Just when George Banks has recovered from his ... | \n",
+ " ... | \n",
+ " 1995-02-10 | \n",
+ " 76578911.0 | \n",
+ " 106.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Just When His World Is Back To Normal... He's ... | \n",
+ " Father of the Bride Part II | \n",
+ " False | \n",
+ " 5.7 | \n",
+ " 173.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 45461 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... | \n",
+ " http://www.imdb.com/title/tt6209470/ | \n",
+ " 439050 | \n",
+ " tt6209470 | \n",
+ " fa | \n",
+ " رگ خواب | \n",
+ " Rising and falling between a man and woman. | \n",
+ " ... | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 90.0 | \n",
+ " [{'iso_639_1': 'fa', 'name': 'فارسی'}] | \n",
+ " Released | \n",
+ " Rising and falling between a man and woman | \n",
+ " Subdue | \n",
+ " False | \n",
+ " 4.0 | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 45462 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " [{'id': 18, 'name': 'Drama'}] | \n",
+ " NaN | \n",
+ " 111109 | \n",
+ " tt2028550 | \n",
+ " tl | \n",
+ " Siglo ng Pagluluwal | \n",
+ " An artist struggles to finish his work while a... | \n",
+ " ... | \n",
+ " 2011-11-17 | \n",
+ " 0.0 | \n",
+ " 360.0 | \n",
+ " [{'iso_639_1': 'tl', 'name': ''}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Century of Birthing | \n",
+ " False | \n",
+ " 9.0 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 45463 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... | \n",
+ " NaN | \n",
+ " 67758 | \n",
+ " tt0303758 | \n",
+ " en | \n",
+ " Betrayal | \n",
+ " When one of her hits goes wrong, a professiona... | \n",
+ " ... | \n",
+ " 2003-08-01 | \n",
+ " 0.0 | \n",
+ " 90.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " A deadly game of wits. | \n",
+ " Betrayal | \n",
+ " False | \n",
+ " 3.8 | \n",
+ " 6.0 | \n",
+ "
\n",
+ " \n",
+ " 45464 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " [] | \n",
+ " NaN | \n",
+ " 227506 | \n",
+ " tt0008536 | \n",
+ " en | \n",
+ " Satana likuyushchiy | \n",
+ " In a small town live two brothers, one a minis... | \n",
+ " ... | \n",
+ " 1917-10-21 | \n",
+ " 0.0 | \n",
+ " 87.0 | \n",
+ " [] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Satan Triumphant | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 45465 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " [] | \n",
+ " NaN | \n",
+ " 461257 | \n",
+ " tt6980792 | \n",
+ " en | \n",
+ " Queerama | \n",
+ " 50 years after decriminalisation of homosexual... | \n",
+ " ... | \n",
+ " 2017-06-09 | \n",
+ " 0.0 | \n",
+ " 75.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Queerama | \n",
+ " False | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
45466 rows × 24 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 9
+ }
+ ],
+ "source": [
+ "# Load the movie metadata table (24 columns per the recorded output).\n",
+ "# NOTE(review): the recorded run emitted a DtypeWarning about mixed types in column index 10;\n",
+ "# passing an explicit dtype or low_memory=False is what the warning suggests - TODO confirm downstream impact.\n",
+ "metadata = pd.read_csv('/content/IMDB/movies_metadata.csv')\n",
+ "metadata"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7Y3S-BFBTTCY"
+ },
+ "source": [
+ "Keep only the relevant columns of released movies:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "3zJqU8dUTTtY"
+ },
+ "outputs": [],
+ "source": [
+ "# Columns retained for the content-based features.\n",
+ "cols = np.array([\n",
+ "    'adult', 'belongs_to_collection', 'genres', 'id', 'original_language',\n",
+ "    'title', 'production_countries', 'production_companies', 'video',\n",
+ "])\n",
+ "# Restrict to rows whose status is 'Released', then project onto the retained columns.\n",
+ "is_released = metadata['status'] == 'Released'\n",
+ "metadata = metadata[is_released][cols]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "WiV8m-zP7dxy",
+ "outputId": "ec13b127-a699-47a5-ce02-3dda6c354267"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "adult False\n",
+ "belongs_to_collection NaN\n",
+ "genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n",
+ "id 8844\n",
+ "original_language en\n",
+ "title Jumanji\n",
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o...\n",
+ "production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...\n",
+ "video False\n",
+ "Name: 1, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 11
+ }
+ ],
+ "source": [
+ "# Spot-check the row at position 1 after the column selection.\n",
+ "metadata.iloc[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "bBdpvpIl4vqJ",
+ "outputId": "a118ca77-0276-4408-f1df-6896507e8131"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "adult False\n",
+ "belongs_to_collection \n",
+ "genres [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...\n",
+ "id 8844\n",
+ "original_language en\n",
+ "title Jumanji\n",
+ "production_countries [{'iso_3166_1': 'US', 'name': 'United States o...\n",
+ "production_companies [{'name': 'TriStar Pictures', 'id': 559}, {'na...\n",
+ "video False\n",
+ "Name: 1, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 12
+ }
+ ],
+ "source": [
+ "def find_collection(x):\n",
+ "    \"\"\"Return the collection name held in one `belongs_to_collection` cell.\n",
+ "\n",
+ "    x is either an empty/missing value or the string form of a dict literal\n",
+ "    with a 'name' key. Returns '' for '', None and NaN, otherwise the 'name'\n",
+ "    value. Raises ValueError/SyntaxError if the text is not a plain literal.\n",
+ "    \"\"\"\n",
+ "    # NaN is the only float that differs from itself; treat it like an empty cell.\n",
+ "    if x is None or x == '' or (isinstance(x, float) and x != x):\n",
+ "        return ''\n",
+ "    # literal_eval accepts only Python literals, so text read from the CSV is never\n",
+ "    # executed (unlike eval).\n",
+ "    return ast.literal_eval(str(x))['name']\n",
+ "\n",
+ "# Blank out missing collections so find_collection receives '' instead of NaN,\n",
+ "# then reduce every cell to the collection name.\n",
+ "metadata['belongs_to_collection'] = (\n",
+ "    metadata['belongs_to_collection'].fillna('').apply(find_collection)\n",
+ ")\n",
+ "metadata.iloc[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "VRqAmj5aABKi",
+ "outputId": "8aff7bec-5c5c-4829-8422-f19d264542e6"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "adult False\n",
+ "belongs_to_collection \n",
+ "genres Adventure,Fantasy,Family\n",
+ "id 8844\n",
+ "original_language en\n",
+ "title Jumanji\n",
+ "production_countries United States of America\n",
+ "production_companies TriStar Pictures,Teitler Film,Interscope Commu...\n",
+ "video False\n",
+ "Name: 1, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 13
+ }
+ ],
+ "source": [
+ "def find_names(x):\n",
+ "    \"\"\"Join the 'name' fields of a stringified list of dicts with commas.\n",
+ "\n",
+ "    x is either an empty/missing value or the string form of a list of dicts\n",
+ "    that each carry a 'name' key. Returns '' for '', None, NaN and an empty\n",
+ "    list. Raises ValueError/SyntaxError if the text is not a plain literal.\n",
+ "    \"\"\"\n",
+ "    # NaN is the only float that differs from itself; treat it like an empty cell.\n",
+ "    if x is None or x == '' or (isinstance(x, float) and x != x):\n",
+ "        return ''\n",
+ "    # Parse once with literal_eval: it accepts only Python literals, so text read\n",
+ "    # from the CSV is never executed (unlike eval).\n",
+ "    entries = ast.literal_eval(str(x))\n",
+ "    return ','.join(entry['name'] for entry in entries)\n",
+ " \n",
+ "metadata['genres'] = metadata['genres'].fillna('')\n",
+ "# Collapse each stringified list of dicts into a comma-separated string of names.\n",
+ "for name_col in ('genres', 'production_countries', 'production_companies'):\n",
+ "    metadata[name_col] = metadata[name_col].apply(find_names)\n",
+ "credits['cast'] = credits['cast'].apply(find_names)\n",
+ "metadata.iloc[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GfHP8lcEzi6c",
+ "outputId": "4be60bff-1d92-4333-bfe8-4d5105e9858a"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "adult False\n",
+ "belongs_to_collection \n",
+ "genres Adventure,Fantasy,Family\n",
+ "id 8844\n",
+ "original_language en\n",
+ "title Jumanji\n",
+ "production_countries United States of America\n",
+ "production_companies TriStar Pictures,Teitler Film,Interscope Commu...\n",
+ "video False\n",
+ "keywords board game,disappearance,based on children's b...\n",
+ "Name: 1, dtype: object"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 14
+ }
+ ],
+ "source": [
+ "keywords['keywords'] = keywords['keywords'].apply(find_names)\n",
+ "metadata['id'] = metadata['id'].astype(int)\n",
+ "# `id` is the join key. The recorded run produced 46022 merged rows from at most 45466 metadata rows,\n",
+ "# so repeated ids in `keywords` were duplicating movies. Keep one keywords row per id and let pandas\n",
+ "# verify (validate='many_to_one') that the right-hand key is unique.\n",
+ "metadata = pd.merge(\n",
+ "    metadata,\n",
+ "    keywords.drop_duplicates(subset='id'),\n",
+ "    how='inner',\n",
+ "    on='id',\n",
+ "    validate='many_to_one',\n",
+ ")\n",
+ "metadata.iloc[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "id": "_sNely8jO2Co"
+ },
+ "outputs": [],
+ "source": [
+ "def to_int(x):\n",
+ "    \"\"\"Map a true flag to 1 and anything else to 0.\n",
+ "\n",
+ "    Both the string 'True' and the boolean True count as true. The `adult` column holds\n",
+ "    strings while the `video` column holds real booleans, and True == 'True' is False,\n",
+ "    so a plain string comparison would turn every boolean True into 0.\n",
+ "    \"\"\"\n",
+ "    return 1 if str(x) == 'True' else 0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "iUHJJHwyHcz-",
+ "outputId": "798f1640-33c5-42a5-9d2c-d5c757bdc539"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['False', 'True'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 16
+ }
+ ],
+ "source": [
+ "# List the distinct values present in `adult` before it is converted to 0/1.\n",
+ "metadata['adult'].unique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kB8thP2fJ9Af"
+ },
+ "source": [
+ "The `adult` column has 3 values other than True or False. They were entered by mistake, so we remove those rows."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "U1d3Z-88KPYW",
+ "outputId": "a6b4bfec-73fa-488e-d004-1fc5f1c3c461"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([False, True], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 17
+ }
+ ],
+ "source": [
+ "# Keep only rows whose `adult` value is one of the two expected strings, then pass it through to_int.\n",
+ "has_valid_adult = metadata['adult'].isin(['True', 'False'])\n",
+ "metadata = metadata[has_valid_adult]\n",
+ "metadata['adult'] = metadata['adult'].apply(to_int)\n",
+ "metadata['video'].unique()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ifUvKXYbQi2I"
+ },
+ "source": [
+ "Remove rows with missing `video` values and replace 'True' and 'False' with 1 and 0:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "id": "72DHRHQLLIxQ"
+ },
+ "outputs": [],
+ "source": [
+ "# Drop rows with a missing `video` flag, then pass the remaining flags through to_int.\n",
+ "has_video = metadata['video'].notna()\n",
+ "metadata = metadata[has_video]\n",
+ "metadata['video'] = metadata['video'].apply(to_int)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3XUewQIcKkv_"
+ },
+ "source": [
+ "## Vectorize string features"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 661
+ },
+ "id": "P0n1lJnUKj_-",
+ "outputId": "f72d7774-db3d-4de7-bd56-8a7316b6dcc1"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " adult belongs_to_collection genres \\\n",
+ "0 0 Toy Story Collection Animation,Comedy,Family \n",
+ "1 0 Adventure,Fantasy,Family \n",
+ "2 0 Grumpy Old Men Collection Romance,Comedy \n",
+ "3 0 Comedy,Drama,Romance \n",
+ "4 0 Father of the Bride Collection Comedy \n",
+ "... ... ... ... \n",
+ "46017 0 Drama,Family \n",
+ "46018 0 Drama \n",
+ "46019 0 Action,Drama,Thriller \n",
+ "46020 0 \n",
+ "46021 0 \n",
+ "\n",
+ " id original_language title \\\n",
+ "0 862 en Toy Story \n",
+ "1 8844 en Jumanji \n",
+ "2 15602 en Grumpier Old Men \n",
+ "3 31357 en Waiting to Exhale \n",
+ "4 11862 en Father of the Bride Part II \n",
+ "... ... ... ... \n",
+ "46017 439050 fa Subdue \n",
+ "46018 111109 tl Century of Birthing \n",
+ "46019 67758 en Betrayal \n",
+ "46020 227506 en Satan Triumphant \n",
+ "46021 461257 en Queerama \n",
+ "\n",
+ " production_countries \\\n",
+ "0 United States of America \n",
+ "1 United States of America \n",
+ "2 United States of America \n",
+ "3 United States of America \n",
+ "4 United States of America \n",
+ "... ... \n",
+ "46017 Iran \n",
+ "46018 Philippines \n",
+ "46019 United States of America \n",
+ "46020 Russia \n",
+ "46021 United Kingdom \n",
+ "\n",
+ " production_companies video \\\n",
+ "0 Pixar Animation Studios 0 \n",
+ "1 TriStar Pictures,Teitler Film,Interscope Commu... 0 \n",
+ "2 Warner Bros.,Lancaster Gate 0 \n",
+ "3 Twentieth Century Fox Film Corporation 0 \n",
+ "4 Sandollar Productions,Touchstone Pictures 0 \n",
+ "... ... ... \n",
+ "46017 0 \n",
+ "46018 Sine Olivia 0 \n",
+ "46019 American World Pictures 0 \n",
+ "46020 Yermoliev 0 \n",
+ "46021 0 \n",
+ "\n",
+ " keywords \n",
+ "0 jealousy,toy,boy,friendship,friends,rivalry,bo... \n",
+ "1 board game,disappearance,based on children's b... \n",
+ "2 fishing,best friend,duringcreditsstinger,old men \n",
+ "3 based on novel,interracial relationship,single... \n",
+ "4 baby,midlife crisis,confidence,aging,daughter,... \n",
+ "... ... \n",
+ "46017 tragic love \n",
+ "46018 artist,play,pinoy \n",
+ "46019 \n",
+ "46020 \n",
+ "46021 \n",
+ "\n",
+ "[46022 rows x 10 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " adult | \n",
+ " belongs_to_collection | \n",
+ " genres | \n",
+ " id | \n",
+ " original_language | \n",
+ " title | \n",
+ " production_countries | \n",
+ " production_companies | \n",
+ " video | \n",
+ " keywords | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " Toy Story Collection | \n",
+ " Animation,Comedy,Family | \n",
+ " 862 | \n",
+ " en | \n",
+ " Toy Story | \n",
+ " United States of America | \n",
+ " Pixar Animation Studios | \n",
+ " 0 | \n",
+ " jealousy,toy,boy,friendship,friends,rivalry,bo... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 0 | \n",
+ " | \n",
+ " Adventure,Fantasy,Family | \n",
+ " 8844 | \n",
+ " en | \n",
+ " Jumanji | \n",
+ " United States of America | \n",
+ " TriStar Pictures,Teitler Film,Interscope Commu... | \n",
+ " 0 | \n",
+ " board game,disappearance,based on children's b... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0 | \n",
+ " Grumpy Old Men Collection | \n",
+ " Romance,Comedy | \n",
+ " 15602 | \n",
+ " en | \n",
+ " Grumpier Old Men | \n",
+ " United States of America | \n",
+ " Warner Bros.,Lancaster Gate | \n",
+ " 0 | \n",
+ " fishing,best friend,duringcreditsstinger,old men | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0 | \n",
+ " | \n",
+ " Comedy,Drama,Romance | \n",
+ " 31357 | \n",
+ " en | \n",
+ " Waiting to Exhale | \n",
+ " United States of America | \n",
+ " Twentieth Century Fox Film Corporation | \n",
+ " 0 | \n",
+ " based on novel,interracial relationship,single... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0 | \n",
+ " Father of the Bride Collection | \n",
+ " Comedy | \n",
+ " 11862 | \n",
+ " en | \n",
+ " Father of the Bride Part II | \n",
+ " United States of America | \n",
+ " Sandollar Productions,Touchstone Pictures | \n",
+ " 0 | \n",
+ " baby,midlife crisis,confidence,aging,daughter,... | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 46017 | \n",
+ " 0 | \n",
+ " | \n",
+ " Drama,Family | \n",
+ " 439050 | \n",
+ " fa | \n",
+ " Subdue | \n",
+ " Iran | \n",
+ " | \n",
+ " 0 | \n",
+ " tragic love | \n",
+ "
\n",
+ " \n",
+ " 46018 | \n",
+ " 0 | \n",
+ " | \n",
+ " Drama | \n",
+ " 111109 | \n",
+ " tl | \n",
+ " Century of Birthing | \n",
+ " Philippines | \n",
+ " Sine Olivia | \n",
+ " 0 | \n",
+ " artist,play,pinoy | \n",
+ "
\n",
+ " \n",
+ " 46019 | \n",
+ " 0 | \n",
+ " | \n",
+ " Action,Drama,Thriller | \n",
+ " 67758 | \n",
+ " en | \n",
+ " Betrayal | \n",
+ " United States of America | \n",
+ " American World Pictures | \n",
+ " 0 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 46020 | \n",
+ " 0 | \n",
+ " | \n",
+ " | \n",
+ " 227506 | \n",
+ " en | \n",
+ " Satan Triumphant | \n",
+ " Russia | \n",
+ " Yermoliev | \n",
+ " 0 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 46021 | \n",
+ " 0 | \n",
+ " | \n",
+ " | \n",
+ " 461257 | \n",
+ " en | \n",
+ " Queerama | \n",
+ " United Kingdom | \n",
+ " | \n",
+ " 0 | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
46022 rows × 10 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 19
+ }
+ ],
+ "source": [
+ "# Show the metadata frame; pandas abbreviates it to the first and last rows.\n",
+ "metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "id": "2wnOtfe1m4aq"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.feature_extraction.text import CountVectorizer\n",
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+ "\n",
+ "def my_tok(text):\n",
+ "    \"\"\"Split a comma-separated string into clean tokens.\n",
+ "\n",
+ "    Each piece is stripped of surrounding whitespace and empty pieces are\n",
+ "    dropped, so '' yields [] and 'a, b' yields ['a', 'b'].  Without this the\n",
+ "    vectorizers learn a spurious empty feature (e.g. 'genre:') and\n",
+ "    whitespace-only duplicates (' new york city' vs 'new york city').\n",
+ "    \"\"\"\n",
+ "    return [tok.strip() for tok in text.split(',') if tok.strip()]\n",
+ "\n",
+ "def _vectorize_column(vectorizer_cls, col_name, feature_name, limit, df):\n",
+ "    \"\"\"Fit `vectorizer_cls` on df[col_name]; return (dense matrix, prefixed names).\"\"\"\n",
+ "    if df is None:\n",
+ "        # Resolve the default at call time so a re-assigned `metadata` is picked up\n",
+ "        # (a `df=metadata` default stays bound to the frame seen at def time).\n",
+ "        df = metadata\n",
+ "    vectorizer = vectorizer_cls(tokenizer=my_tok, max_features=limit, min_df=2)\n",
+ "    vec_data = vectorizer.fit_transform(df[col_name]).toarray()\n",
+ "    # str + object array concatenates element-wise -> '<feature_name>:<token>'\n",
+ "    vec_cols = feature_name + ':' + vectorizer.get_feature_names_out()\n",
+ "    return vec_data, vec_cols\n",
+ "\n",
+ "def vectorize_string(col_name, feature_name, limit=None, df=None):\n",
+ "    \"\"\"Bag-of-words counts for a comma-separated text column.\n",
+ "\n",
+ "    Args:\n",
+ "        col_name: column of `df` holding the comma-separated strings.\n",
+ "        feature_name: prefix for the output names ('<feature_name>:<token>').\n",
+ "        limit: forwarded as `max_features`; None keeps every token that passes min_df=2.\n",
+ "        df: frame to read from; None means the notebook-level `metadata`.\n",
+ "\n",
+ "    Returns:\n",
+ "        (vec_data, vec_cols): dense count array and the matching feature names.\n",
+ "    \"\"\"\n",
+ "    return _vectorize_column(CountVectorizer, col_name, feature_name, limit, df)\n",
+ "\n",
+ "def tfidf(col_name, feature_name, limit=None, df=None):\n",
+ "    \"\"\"TF-IDF weights for a comma-separated text column.\n",
+ "\n",
+ "    Same arguments and return layout as `vectorize_string`, but the values\n",
+ "    come from `TfidfVectorizer` instead of raw counts.\n",
+ "    \"\"\"\n",
+ "    return _vectorize_column(TfidfVectorizer, col_name, feature_name, limit, df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "owM0Q_INPDyD",
+ "outputId": "8975c88b-bbe0-43b4-ad2e-a6ee8eb326d8"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['genre:', 'genre:action', 'genre:adventure', 'genre:animation',\n",
+ " 'genre:comedy', 'genre:crime', 'genre:documentary', 'genre:drama',\n",
+ " 'genre:family', 'genre:fantasy', 'genre:foreign', 'genre:history',\n",
+ " 'genre:horror', 'genre:music', 'genre:mystery', 'genre:romance',\n",
+ " 'genre:science fiction', 'genre:thriller', 'genre:tv movie',\n",
+ " 'genre:war', 'genre:western'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 21
+ }
+ ],
+ "source": [
+ "# Count-vectorize the comma-separated genres; columns are named 'genre:<token>'.\n",
+ "genre_data, genre_cols = vectorize_string(col_name='genres', feature_name='genre')\n",
+ "genre_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "0N15-vMrswuZ",
+ "outputId": "dbe96ed0-bff9-441c-afd2-a05fa5ab60ed"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['company:', 'company: the', 'company:amblin entertainment',\n",
+ " 'company:american international pictures (aip)', 'company:arte',\n",
+ " 'company:arte france cinéma', 'company:bbc', 'company:bbc films',\n",
+ " 'company:blumhouse productions',\n",
+ " 'company:british broadcasting corporation (bbc)', 'company:canal+',\n",
+ " 'company:canal+ españa', 'company:castle rock entertainment',\n",
+ " 'company:centre national de la cinématographie (cnc)',\n",
+ " 'company:channel four films', 'company:ciné+',\n",
+ " 'company:cinécinéma', 'company:cj entertainment',\n",
+ " 'company:columbia pictures',\n",
+ " 'company:columbia pictures corporation', 'company:dc comics',\n",
+ " 'company:dimension films', 'company:dreamworks skg',\n",
+ " 'company:dune entertainment', 'company:eurimages',\n",
+ " 'company:europacorp', 'company:film i väst', 'company:film4',\n",
+ " 'company:first national pictures', 'company:focus features',\n",
+ " 'company:fox 2000 pictures', 'company:fox film corporation',\n",
+ " 'company:fox searchlight pictures', 'company:france 2 cinéma',\n",
+ " 'company:france 3 cinéma', 'company:gaumont',\n",
+ " 'company:hallmark entertainment',\n",
+ " 'company:hammer film productions', 'company:hbo films',\n",
+ " 'company:hollywood pictures', 'company:home box office (hbo)',\n",
+ " 'company:imagine entertainment', 'company:lakeshore entertainment',\n",
+ " 'company:lenfilm', 'company:lions gate films', 'company:lionsgate',\n",
+ " 'company:m6 films', 'company:metro-goldwyn-mayer (mgm)',\n",
+ " 'company:millennium films', 'company:miramax films',\n",
+ " 'company:monogram pictures', 'company:morgan creek productions',\n",
+ " 'company:mosfilm', 'company:netflix', 'company:new line cinema',\n",
+ " 'company:new regency pictures', 'company:new world pictures',\n",
+ " 'company:nikkatsu', 'company:nordisk film',\n",
+ " 'company:nu image films', 'company:orion pictures',\n",
+ " 'company:paramount pictures', 'company:pathé',\n",
+ " 'company:pixar animation studios',\n",
+ " 'company:polygram filmed entertainment', 'company:rai cinema',\n",
+ " 'company:regency enterprises', 'company:relativity media',\n",
+ " 'company:rko radio pictures', 'company:samuel goldwyn company',\n",
+ " 'company:screen gems', 'company:shaw brothers',\n",
+ " 'company:shôchiku eiga', 'company:studiocanal',\n",
+ " 'company:summit entertainment', 'company:svensk filmindustri (sf)',\n",
+ " 'company:televisión española (tve)',\n",
+ " 'company:tf1 films production', 'company:the rank organisation',\n",
+ " 'company:the weinstein company', 'company:tla releasing',\n",
+ " 'company:toho company', 'company:touchstone pictures',\n",
+ " 'company:tristar pictures',\n",
+ " 'company:twentieth century fox film corporation',\n",
+ " 'company:téléfilm canada', 'company:uk film council',\n",
+ " 'company:united artists',\n",
+ " 'company:universal international pictures (ui)',\n",
+ " 'company:universal pictures', 'company:village roadshow pictures',\n",
+ " 'company:walt disney pictures', 'company:walt disney productions',\n",
+ " 'company:warner bros.', 'company:warner bros. animation',\n",
+ " 'company:westdeutscher rundfunk (wdr)', 'company:wild bunch',\n",
+ " 'company:working title films', 'company:zentropa entertainments',\n",
+ " 'company:zweites deutsches fernsehen (zdf)'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 22
+ }
+ ],
+ "source": [
+ "# Cap the vocabulary at 100 production-company features (passed on as max_features).\n",
+ "companies_data, companies_cols = vectorize_string(\n",
+ "    col_name='production_companies', feature_name='company', limit=100\n",
+ ")\n",
+ "companies_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "zB0rmWDG7ktg",
+ "outputId": "f162c8e1-00af-43c3-8d91-a47365b2e2be"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['country:', 'country:afghanistan', 'country:albania',\n",
+ " 'country:algeria', 'country:angola', 'country:antarctica',\n",
+ " 'country:argentina', 'country:armenia', 'country:aruba',\n",
+ " 'country:australia', 'country:austria', 'country:azerbaijan',\n",
+ " 'country:bahamas', 'country:bangladesh', 'country:belarus',\n",
+ " 'country:belgium', 'country:bhutan', 'country:bolivia',\n",
+ " 'country:bosnia and herzegovina', 'country:botswana',\n",
+ " 'country:brazil', 'country:bulgaria', 'country:burkina faso',\n",
+ " 'country:cambodia', 'country:cameroon', 'country:canada',\n",
+ " 'country:chad', 'country:chile', 'country:china',\n",
+ " 'country:colombia', 'country:congo', 'country:costa rica',\n",
+ " \"country:cote d'ivoire\", 'country:croatia', 'country:cuba',\n",
+ " 'country:cyprus', 'country:czech republic',\n",
+ " 'country:czechoslovakia', 'country:denmark',\n",
+ " 'country:dominican republic', 'country:east germany',\n",
+ " 'country:ecuador', 'country:egypt', 'country:el salvador',\n",
+ " 'country:estonia', 'country:ethiopia', 'country:finland',\n",
+ " 'country:france', 'country:georgia', 'country:germany',\n",
+ " 'country:ghana', 'country:greece', 'country:guatemala',\n",
+ " 'country:hong kong', 'country:hungary', 'country:iceland',\n",
+ " 'country:india', 'country:indonesia', 'country:iran',\n",
+ " 'country:iraq', 'country:ireland', 'country:israel',\n",
+ " 'country:italy', 'country:jamaica', 'country:japan',\n",
+ " 'country:jordan', 'country:kazakhstan', 'country:kenya',\n",
+ " 'country:kyrgyz republic',\n",
+ " \"country:lao people's democratic republic\", 'country:latvia',\n",
+ " 'country:lebanon', 'country:liberia',\n",
+ " 'country:libyan arab jamahiriya', 'country:liechtenstein',\n",
+ " 'country:lithuania', 'country:luxembourg', 'country:macao',\n",
+ " 'country:macedonia', 'country:malaysia', 'country:mali',\n",
+ " 'country:malta', 'country:mauritania', 'country:mexico',\n",
+ " 'country:monaco', 'country:mongolia', 'country:montenegro',\n",
+ " 'country:morocco', 'country:namibia', 'country:nepal',\n",
+ " 'country:netherlands', 'country:new zealand', 'country:nicaragua',\n",
+ " 'country:nigeria', 'country:north korea', 'country:norway',\n",
+ " 'country:pakistan', 'country:palestinian territory',\n",
+ " 'country:panama', 'country:papua new guinea', 'country:paraguay',\n",
+ " 'country:peru', 'country:philippines', 'country:poland',\n",
+ " 'country:portugal', 'country:puerto rico', 'country:qatar',\n",
+ " 'country:romania', 'country:russia', 'country:rwanda',\n",
+ " 'country:saudi arabia', 'country:senegal', 'country:serbia',\n",
+ " 'country:serbia and montenegro', 'country:singapore',\n",
+ " 'country:slovakia', 'country:slovenia', 'country:south africa',\n",
+ " 'country:south korea', 'country:soviet union', 'country:spain',\n",
+ " 'country:sri lanka', 'country:sweden', 'country:switzerland',\n",
+ " 'country:syrian arab republic', 'country:taiwan',\n",
+ " 'country:tajikistan', 'country:tanzania', 'country:thailand',\n",
+ " 'country:trinidad and tobago', 'country:tunisia', 'country:turkey',\n",
+ " 'country:uganda', 'country:ukraine',\n",
+ " 'country:united arab emirates', 'country:united kingdom',\n",
+ " 'country:united states of america', 'country:uruguay',\n",
+ " 'country:uzbekistan', 'country:venezuela', 'country:vietnam',\n",
+ " 'country:yugoslavia', 'country:zimbabwe'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 23
+ }
+ ],
+ "source": [
+ "# Count-vectorize production countries; no vocabulary cap is applied here.\n",
+ "countries_data, countries_cols = vectorize_string(\n",
+ "    col_name='production_countries', feature_name='country'\n",
+ ")\n",
+ "countries_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HgG4hhTf8vHI",
+ "outputId": "24440e58-e2d0-4373-f2f9-6f5428b0da52"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['collection:', 'collection: amore e… - collezione',\n",
+ " 'collection: band of assassins collection', ...,\n",
+ " 'collection:что творят мужчины! (коллекция)',\n",
+ " 'collection:男はつらいよ シリーズ', 'collection:식객 시리즈'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 24
+ }
+ ],
+ "source": [
+ "# Count-vectorize the collection column; columns are named 'collection:<token>'.\n",
+ "collection_data, collection_cols = vectorize_string(\n",
+ "    col_name='belongs_to_collection', feature_name='collection'\n",
+ ")\n",
+ "collection_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "qOywVvXeRi-O",
+ "outputId": "a9af8bdd-9e91-414d-d005-8a3b0f2de3c0"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['lang:', 'lang:ab', 'lang:af', 'lang:am', 'lang:ar', 'lang:bg',\n",
+ " 'lang:bm', 'lang:bn', 'lang:bo', 'lang:bs', 'lang:ca', 'lang:cn',\n",
+ " 'lang:cs', 'lang:da', 'lang:de', 'lang:el', 'lang:en', 'lang:es',\n",
+ " 'lang:et', 'lang:eu', 'lang:fa', 'lang:fi', 'lang:fr', 'lang:he',\n",
+ " 'lang:hi', 'lang:hr', 'lang:hu', 'lang:id', 'lang:is', 'lang:it',\n",
+ " 'lang:iu', 'lang:ja', 'lang:ka', 'lang:kk', 'lang:kn', 'lang:ko',\n",
+ " 'lang:ku', 'lang:ky', 'lang:lo', 'lang:lt', 'lang:lv', 'lang:mk',\n",
+ " 'lang:ml', 'lang:mn', 'lang:mr', 'lang:ms', 'lang:nb', 'lang:ne',\n",
+ " 'lang:nl', 'lang:no', 'lang:pa', 'lang:pl', 'lang:ps', 'lang:pt',\n",
+ " 'lang:ro', 'lang:ru', 'lang:sh', 'lang:sk', 'lang:sl', 'lang:sq',\n",
+ " 'lang:sr', 'lang:sv', 'lang:ta', 'lang:te', 'lang:th', 'lang:tl',\n",
+ " 'lang:tr', 'lang:uk', 'lang:ur', 'lang:vi', 'lang:wo', 'lang:xx',\n",
+ " 'lang:zh'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 25
+ }
+ ],
+ "source": [
+ "# Missing language codes become '' so the vectorizer only ever receives strings.\n",
+ "metadata['original_language'] = metadata['original_language'].fillna('')\n",
+ "lang_data, lang_cols = vectorize_string(col_name='original_language', feature_name='lang')\n",
+ "lang_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "HXQOBJj67v0R",
+ "outputId": "37f1f697-4f00-461d-8bef-8af66913895e"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(1333,)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 26
+ }
+ ],
+ "source": [
+ "# How many collection feature names the vectorizer kept (1-D array of names).\n",
+ "collection_cols.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cbSfr6HHE3Od",
+ "outputId": "472739c5-8a0e-4855-c546-e653e8cab5a9"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['keyword:', 'keyword: new york city', 'keyword:1930s',\n",
+ " 'keyword:1940s', 'keyword:1950s', 'keyword:1960s', 'keyword:1970s',\n",
+ " 'keyword:1980s', 'keyword:19th century', 'keyword:3d',\n",
+ " 'keyword:accident', 'keyword:actor', 'keyword:actress',\n",
+ " 'keyword:addiction', 'keyword:adolescence', 'keyword:adoption',\n",
+ " 'keyword:adult animation', 'keyword:adultery', 'keyword:adventure',\n",
+ " 'keyword:africa', 'keyword:african american',\n",
+ " 'keyword:aftercreditsstinger', 'keyword:afterlife',\n",
+ " 'keyword:age difference', 'keyword:agent', 'keyword:aging',\n",
+ " 'keyword:aids', 'keyword:airplane', 'keyword:airplane crash',\n",
+ " 'keyword:airport', 'keyword:alcohol', 'keyword:alcoholic',\n",
+ " 'keyword:alcoholism', 'keyword:alien', 'keyword:alien invasion',\n",
+ " 'keyword:alien life-form', 'keyword:ambition', 'keyword:ambush',\n",
+ " 'keyword:american', 'keyword:american football', 'keyword:amnesia',\n",
+ " 'keyword:amusement park', 'keyword:anarchic comedy',\n",
+ " 'keyword:android', 'keyword:angel', 'keyword:animal',\n",
+ " 'keyword:animal attack', 'keyword:animal horror',\n",
+ " 'keyword:animation', 'keyword:anime', 'keyword:anthology',\n",
+ " 'keyword:anthropomorphism', 'keyword:apache', 'keyword:apartment',\n",
+ " 'keyword:apocalypse', 'keyword:archaeologist', 'keyword:architect',\n",
+ " 'keyword:argentina', 'keyword:arizona', 'keyword:army',\n",
+ " 'keyword:arranged marriage', 'keyword:art',\n",
+ " 'keyword:artificial intelligence', 'keyword:artist',\n",
+ " 'keyword:assassin', 'keyword:assassination', 'keyword:astronaut',\n",
+ " 'keyword:asylum', 'keyword:atomic bomb', 'keyword:attack',\n",
+ " 'keyword:australia', 'keyword:author', 'keyword:autism',\n",
+ " 'keyword:aviation', 'keyword:axe murder', 'keyword:b movie',\n",
+ " 'keyword:baby', 'keyword:babysitter', 'keyword:ballet',\n",
+ " 'keyword:bank', 'keyword:bank robber', 'keyword:bank robbery',\n",
+ " 'keyword:bar', 'keyword:baseball',\n",
+ " \"keyword:based on children's book\", 'keyword:based on comic',\n",
+ " 'keyword:based on manga', 'keyword:based on novel',\n",
+ " 'keyword:based on play or musical', 'keyword:based on true events',\n",
+ " 'keyword:based on true story', 'keyword:based on tv series',\n",
+ " 'keyword:based on video game',\n",
+ " 'keyword:based on young adult novel', 'keyword:basketball',\n",
+ " 'keyword:battle', 'keyword:beach', 'keyword:bear',\n",
+ " 'keyword:beautiful woman', 'keyword:becoming an adult',\n",
+ " 'keyword:beer', 'keyword:berlin', 'keyword:best friend',\n",
+ " 'keyword:betrayal', 'keyword:bible', 'keyword:biker',\n",
+ " 'keyword:bikini', 'keyword:biography', 'keyword:birthday',\n",
+ " 'keyword:black and white', 'keyword:black magic',\n",
+ " 'keyword:black people', 'keyword:blackmail',\n",
+ " 'keyword:blaxploitation', 'keyword:blindness', 'keyword:blood',\n",
+ " 'keyword:blood splatter', 'keyword:boarding school',\n",
+ " 'keyword:boat', 'keyword:bodyguard', 'keyword:bollywood',\n",
+ " 'keyword:bomb', 'keyword:bondage', 'keyword:book',\n",
+ " 'keyword:boston', 'keyword:bounty hunter', 'keyword:boxer',\n",
+ " 'keyword:boy', 'keyword:boyfriend', 'keyword:brazil',\n",
+ " 'keyword:brazilian', 'keyword:break-up', 'keyword:bride',\n",
+ " 'keyword:bridge', 'keyword:brit noir', 'keyword:british',\n",
+ " 'keyword:brothel', 'keyword:brother',\n",
+ " 'keyword:brother brother relationship',\n",
+ " 'keyword:brother sister relationship', 'keyword:brutality',\n",
+ " 'keyword:bully', 'keyword:bullying', 'keyword:bus',\n",
+ " 'keyword:business', 'keyword:business man', 'keyword:cabin',\n",
+ " 'keyword:california', 'keyword:camp', 'keyword:camping',\n",
+ " 'keyword:canada', 'keyword:cancer', 'keyword:cannibal',\n",
+ " 'keyword:cannibalism', 'keyword:canuxploitation',\n",
+ " 'keyword:capitalism', 'keyword:captain', 'keyword:car',\n",
+ " 'keyword:car accident', 'keyword:car chase', 'keyword:car crash',\n",
+ " 'keyword:car race', 'keyword:career', 'keyword:carnival',\n",
+ " 'keyword:casino', 'keyword:castle', 'keyword:cat',\n",
+ " 'keyword:catholic', 'keyword:catholicism', 'keyword:cattle',\n",
+ " 'keyword:cave', 'keyword:celebration', 'keyword:celebrity',\n",
+ " 'keyword:cell phone', 'keyword:cemetery', 'keyword:chainsaw',\n",
+ " 'keyword:chaos', 'keyword:charlie chan', 'keyword:chase',\n",
+ " 'keyword:cheating', 'keyword:cheerleader', 'keyword:chicago',\n",
+ " 'keyword:child', 'keyword:child abuse', 'keyword:childhood',\n",
+ " 'keyword:childhood friends', 'keyword:children', 'keyword:china',\n",
+ " 'keyword:chinese', 'keyword:christian', 'keyword:christianity',\n",
+ " 'keyword:christmas', 'keyword:church', 'keyword:cia',\n",
+ " 'keyword:cigarette smoking', 'keyword:cinema', 'keyword:circus',\n",
+ " 'keyword:city', 'keyword:civil war', 'keyword:classic noir',\n",
+ " 'keyword:climbing', 'keyword:cocaine', 'keyword:coffin',\n",
+ " 'keyword:cold war', 'keyword:college', 'keyword:coma',\n",
+ " 'keyword:combat', 'keyword:comedian', 'keyword:comedy',\n",
+ " 'keyword:comic book', 'keyword:coming of age',\n",
+ " 'keyword:coming out', 'keyword:communism', 'keyword:communist',\n",
+ " 'keyword:competition', 'keyword:composer', 'keyword:computer',\n",
+ " 'keyword:con man', 'keyword:concert', 'keyword:conspiracy',\n",
+ " 'keyword:cop', 'keyword:corporation', 'keyword:corpse',\n",
+ " 'keyword:corruption', 'keyword:countryside', 'keyword:couple',\n",
+ " 'keyword:court', 'keyword:court case', 'keyword:courtroom',\n",
+ " 'keyword:cover-up', 'keyword:cowardliness', 'keyword:cowboy',\n",
+ " 'keyword:creature', 'keyword:crime', 'keyword:criminal',\n",
+ " 'keyword:cruelty', 'keyword:crush', 'keyword:cuba', 'keyword:cult',\n",
+ " 'keyword:cult film', 'keyword:curse', 'keyword:cyberpunk',\n",
+ " 'keyword:cyborg', 'keyword:dance', 'keyword:dancer',\n",
+ " 'keyword:dancing', 'keyword:danger', 'keyword:dark comedy',\n",
+ " 'keyword:date', 'keyword:dating', 'keyword:daughter',\n",
+ " 'keyword:dc comics', 'keyword:death', 'keyword:death of a friend',\n",
+ " 'keyword:debt', 'keyword:decapitation', 'keyword:deception',\n",
+ " 'keyword:delusion', 'keyword:demon', 'keyword:department store',\n",
+ " 'keyword:depression', 'keyword:desert', 'keyword:desire',\n",
+ " 'keyword:detective', 'keyword:devil', 'keyword:diamond',\n",
+ " 'keyword:diary', 'keyword:diner', 'keyword:dinosaur',\n",
+ " 'keyword:dirty cop', 'keyword:disabled', 'keyword:disappearance',\n",
+ " 'keyword:disaster', 'keyword:disguise', 'keyword:disney short',\n",
+ " 'keyword:divorce', 'keyword:doctor', 'keyword:documentary',\n",
+ " 'keyword:dog', 'keyword:doppelganger', 'keyword:double life',\n",
+ " 'keyword:dracula', 'keyword:dragon', 'keyword:drama',\n",
+ " 'keyword:dream', 'keyword:drinking', 'keyword:drowning',\n",
+ " 'keyword:drug', 'keyword:drug abuse', 'keyword:drug addiction',\n",
+ " 'keyword:drug dealer', 'keyword:drug lord', 'keyword:drug traffic',\n",
+ " 'keyword:drug use', 'keyword:drunk', 'keyword:drunkenness',\n",
+ " 'keyword:duel', 'keyword:duringcreditsstinger',\n",
+ " 'keyword:dying and death', 'keyword:dysfunctional family',\n",
+ " 'keyword:dystopia', 'keyword:dystopic future',\n",
+ " 'keyword:earthquake', 'keyword:economics', 'keyword:education',\n",
+ " 'keyword:egypt', 'keyword:england', 'keyword:epic',\n",
+ " 'keyword:erotic movie', 'keyword:eroticism', 'keyword:escape',\n",
+ " 'keyword:escape from prison', 'keyword:espionage',\n",
+ " 'keyword:europe', 'keyword:evil', 'keyword:ex-con',\n",
+ " 'keyword:exorcism', 'keyword:exotic island', 'keyword:expedition',\n",
+ " 'keyword:experiment', 'keyword:experimental film',\n",
+ " 'keyword:exploitation', 'keyword:explosion',\n",
+ " 'keyword:extramarital affair', 'keyword:extreme violence',\n",
+ " 'keyword:factory', 'keyword:fairy tale', 'keyword:faith',\n",
+ " 'keyword:falling in love', 'keyword:false identity',\n",
+ " 'keyword:falsely accused', 'keyword:family',\n",
+ " 'keyword:family relationships', 'keyword:fantasy', 'keyword:farm',\n",
+ " 'keyword:farmer', 'keyword:fashion', 'keyword:fate',\n",
+ " 'keyword:father', 'keyword:father daughter relationship',\n",
+ " 'keyword:father son relationship', 'keyword:fbi',\n",
+ " 'keyword:fbi agent', 'keyword:fear', 'keyword:female friendship',\n",
+ " 'keyword:female homosexuality', 'keyword:female nudity',\n",
+ " 'keyword:female protagonist', 'keyword:feminism',\n",
+ " 'keyword:femme fatale', 'keyword:fight', 'keyword:fighter',\n",
+ " 'keyword:film director', 'keyword:film making',\n",
+ " 'keyword:film noir', 'keyword:filmmaker', 'keyword:filmmaking',\n",
+ " 'keyword:fire', 'keyword:fisherman', 'keyword:fistfight',\n",
+ " 'keyword:flashback', 'keyword:florida', 'keyword:flying',\n",
+ " 'keyword:flying saucer', 'keyword:food', 'keyword:forbidden love',\n",
+ " 'keyword:forest', 'keyword:found footage', 'keyword:france',\n",
+ " 'keyword:frankenstein', 'keyword:fraud', 'keyword:freedom',\n",
+ " 'keyword:french', 'keyword:french noir', 'keyword:friends',\n",
+ " 'keyword:friendship', 'keyword:fugitive', 'keyword:funeral',\n",
+ " 'keyword:future', 'keyword:gambler', 'keyword:gambling',\n",
+ " 'keyword:gang', 'keyword:gangster', 'keyword:gas station',\n",
+ " 'keyword:gay', 'keyword:gay interest', 'keyword:gay man',\n",
+ " 'keyword:gay relationship', 'keyword:general', 'keyword:german',\n",
+ " 'keyword:germany', 'keyword:ghost', 'keyword:giallo',\n",
+ " 'keyword:giant monster', 'keyword:girl', 'keyword:girlfriend',\n",
+ " 'keyword:god', 'keyword:gold', 'keyword:good vs evil',\n",
+ " 'keyword:gore', 'keyword:gothic', 'keyword:gothic horror',\n",
+ " 'keyword:government', 'keyword:greece', 'keyword:greed',\n",
+ " 'keyword:grief', 'keyword:guilt', 'keyword:gun',\n",
+ " 'keyword:gunfight', 'keyword:gunslinger', 'keyword:gypsy',\n",
+ " 'keyword:hacker', 'keyword:halloween', 'keyword:hallucination',\n",
+ " 'keyword:hammer horror', 'keyword:haunted house',\n",
+ " 'keyword:haunting', 'keyword:hawaii', 'keyword:heavy metal',\n",
+ " 'keyword:heist', 'keyword:helicopter', 'keyword:hell',\n",
+ " 'keyword:hero', 'keyword:heroin', 'keyword:high school',\n",
+ " 'keyword:highway', 'keyword:hip-hop', 'keyword:hippie',\n",
+ " 'keyword:historical figure', 'keyword:history',\n",
+ " 'keyword:hitchhiker', 'keyword:hitman', 'keyword:holiday',\n",
+ " 'keyword:hollywood', 'keyword:holocaust', 'keyword:home invasion',\n",
+ " 'keyword:homeless person', 'keyword:homophobia',\n",
+ " 'keyword:homosexuality', 'keyword:honeymoon', 'keyword:hong kong',\n",
+ " 'keyword:hoodlum', 'keyword:horror', 'keyword:horse',\n",
+ " 'keyword:hospital', 'keyword:hostage', 'keyword:hotel',\n",
+ " 'keyword:hotel room', 'keyword:house',\n",
+ " 'keyword:human experimentation', 'keyword:humiliation',\n",
+ " 'keyword:hustler', 'keyword:hypnosis', 'keyword:identity',\n",
+ " 'keyword:illegal drugs', 'keyword:illegal prostitution',\n",
+ " 'keyword:illness', 'keyword:imax', 'keyword:immigrant',\n",
+ " 'keyword:immigration', 'keyword:immortality', 'keyword:incest',\n",
+ " 'keyword:independent film', 'keyword:india', 'keyword:indian lead',\n",
+ " 'keyword:individual', 'keyword:infection', 'keyword:infidelity',\n",
+ " 'keyword:inheritance', 'keyword:insanity', 'keyword:intelligence',\n",
+ " 'keyword:internet', 'keyword:interracial relationship',\n",
+ " 'keyword:interview', 'keyword:invasion', 'keyword:inventor',\n",
+ " 'keyword:investigation', 'keyword:ireland', 'keyword:island',\n",
+ " 'keyword:isolation', 'keyword:israel', 'keyword:italian',\n",
+ " 'keyword:italy', 'keyword:jail', 'keyword:japan',\n",
+ " 'keyword:japanese', 'keyword:jazz', 'keyword:jealousy',\n",
+ " 'keyword:jesus christ', 'keyword:jew', 'keyword:jewish',\n",
+ " 'keyword:journalism', 'keyword:journalist', 'keyword:journey',\n",
+ " 'keyword:judge', 'keyword:jungle', 'keyword:justice',\n",
+ " 'keyword:juvenile delinquent', 'keyword:kaiju',\n",
+ " 'keyword:kidnapping', 'keyword:kids', 'keyword:kids and family',\n",
+ " 'keyword:killer', 'keyword:king', 'keyword:kingdom',\n",
+ " 'keyword:kiss', 'keyword:knife', 'keyword:knight', 'keyword:korea',\n",
+ " 'keyword:korean movie', 'keyword:kung fu', 'keyword:laboratory',\n",
+ " 'keyword:ladykiller', 'keyword:lake', 'keyword:las vegas',\n",
+ " 'keyword:lawyer', 'keyword:legend', 'keyword:lesbian',\n",
+ " 'keyword:lesbian relationship', 'keyword:lesbian sex',\n",
+ " 'keyword:letter', 'keyword:lgbt', 'keyword:lie', 'keyword:lion',\n",
+ " 'keyword:little boy', 'keyword:little girl',\n",
+ " 'keyword:london england', 'keyword:loneliness',\n",
+ " 'keyword:los angeles', 'keyword:loss of father',\n",
+ " 'keyword:loss of lover', 'keyword:loss of mother',\n",
+ " 'keyword:loss of virginity', 'keyword:love', 'keyword:love affair',\n",
+ " 'keyword:love at first sight', \"keyword:love of one's life\",\n",
+ " 'keyword:love triangle', 'keyword:lover', 'keyword:lovers',\n",
+ " 'keyword:lovesickness', 'keyword:lust', 'keyword:mad scientist',\n",
+ " 'keyword:madness', 'keyword:madrid', 'keyword:mafia',\n",
+ " 'keyword:magic', 'keyword:maid', 'keyword:malayalam',\n",
+ " 'keyword:male female relationship', 'keyword:male friendship',\n",
+ " 'keyword:male nudity', 'keyword:manhattan', 'keyword:maniac',\n",
+ " 'keyword:manipulation', 'keyword:mansion', 'keyword:marijuana',\n",
+ " 'keyword:marriage', 'keyword:marriage crisis',\n",
+ " 'keyword:marriage proposal', 'keyword:married couple',\n",
+ " 'keyword:martial arts', 'keyword:marvel comic', 'keyword:mask',\n",
+ " 'keyword:mass murder', 'keyword:massacre', 'keyword:masturbation',\n",
+ " 'keyword:mayor', 'keyword:melodrama', 'keyword:memory',\n",
+ " 'keyword:memory loss', 'keyword:mental illness',\n",
+ " 'keyword:mercenary', 'keyword:mexican', 'keyword:mexico',\n",
+ " 'keyword:midlife crisis', 'keyword:military',\n",
+ " 'keyword:millionaire', 'keyword:mind control',\n",
+ " 'keyword:miniseries', 'keyword:missing person', 'keyword:mission',\n",
+ " 'keyword:mission of murder', 'keyword:mistaken identity',\n",
+ " 'keyword:mobster', 'keyword:mockumentary', 'keyword:model',\n",
+ " 'keyword:money', 'keyword:monk', 'keyword:monkey',\n",
+ " 'keyword:monster', 'keyword:moon', 'keyword:motel',\n",
+ " 'keyword:mother', 'keyword:mother daughter relationship',\n",
+ " 'keyword:mother son relationship', 'keyword:motorcycle',\n",
+ " 'keyword:mountain', 'keyword:movie star', 'keyword:mumblegore',\n",
+ " 'keyword:mummy', 'keyword:murder', 'keyword:murderer',\n",
+ " 'keyword:museum', 'keyword:music', 'keyword:music band',\n",
+ " 'keyword:musical', 'keyword:musician', 'keyword:muslim',\n",
+ " 'keyword:mutant', 'keyword:mutation', 'keyword:mystery',\n",
+ " 'keyword:mythology', 'keyword:nanny', 'keyword:narration',\n",
+ " 'keyword:nasa', 'keyword:native american', 'keyword:nature',\n",
+ " 'keyword:navy', 'keyword:nazi germany', 'keyword:nazis',\n",
+ " 'keyword:neighbor', 'keyword:neo-noir', 'keyword:nerd',\n",
+ " 'keyword:new england', 'keyword:new love', 'keyword:new orleans',\n",
+ " 'keyword:new york', 'keyword:new york city', 'keyword:new zealand',\n",
+ " 'keyword:newspaper', 'keyword:nightclub', 'keyword:nightmare',\n",
+ " 'keyword:ninja', 'keyword:nudity', 'keyword:nun', 'keyword:nurse',\n",
+ " 'keyword:obsession', 'keyword:occult', 'keyword:ocean',\n",
+ " 'keyword:older man younger woman relationship',\n",
+ " 'keyword:older woman younger man relationship',\n",
+ " 'keyword:olympic games', 'keyword:on the run',\n",
+ " 'keyword:one-night stand', 'keyword:opera',\n",
+ " 'keyword:organized crime', 'keyword:orphan', 'keyword:orphanage',\n",
+ " 'keyword:outer space', 'keyword:outlaw', 'keyword:painter',\n",
+ " 'keyword:painting', 'keyword:parallel world', 'keyword:paranoia',\n",
+ " 'keyword:parent child relationship', 'keyword:paris',\n",
+ " 'keyword:parody', 'keyword:party', 'keyword:passion',\n",
+ " 'keyword:peasant', 'keyword:period drama', 'keyword:philippines',\n",
+ " 'keyword:philosophy', 'keyword:photographer',\n",
+ " 'keyword:photography', 'keyword:pig', 'keyword:pilot',\n",
+ " 'keyword:pimp', 'keyword:pirate', 'keyword:pistol',\n",
+ " 'keyword:planned murder', 'keyword:playboy', 'keyword:poet',\n",
+ " 'keyword:poetry', 'keyword:poison', 'keyword:poker',\n",
+ " 'keyword:police', 'keyword:police brutality',\n",
+ " 'keyword:police corruption', 'keyword:police detective',\n",
+ " 'keyword:police officer', 'keyword:police operation',\n",
+ " 'keyword:policeman', 'keyword:political', 'keyword:politician',\n",
+ " 'keyword:politics', 'keyword:pornography', 'keyword:possession',\n",
+ " 'keyword:post-apocalyptic', 'keyword:poverty', 'keyword:power',\n",
+ " 'keyword:pre-code', 'keyword:pregnancy', 'keyword:pregnant',\n",
+ " 'keyword:president', 'keyword:priest', 'keyword:prince',\n",
+ " 'keyword:princess', 'keyword:prison', 'keyword:prisoner',\n",
+ " 'keyword:prisoners of war', 'keyword:private detective',\n",
+ " 'keyword:professor', 'keyword:propaganda', 'keyword:prophecy',\n",
+ " 'keyword:prostitute', 'keyword:prostitution', 'keyword:protest',\n",
+ " 'keyword:proto-slasher', 'keyword:psychiatrist', 'keyword:psychic',\n",
+ " 'keyword:psychological thriller', 'keyword:psychologist',\n",
+ " 'keyword:psychology', 'keyword:psychopath', 'keyword:puberty',\n",
+ " 'keyword:punk', 'keyword:puppet', 'keyword:queen',\n",
+ " 'keyword:racism', 'keyword:radio', 'keyword:rain', 'keyword:ranch',\n",
+ " 'keyword:ransom', 'keyword:rape', 'keyword:rebel',\n",
+ " 'keyword:redemption', 'keyword:relationship',\n",
+ " 'keyword:relationship problems', 'keyword:religion',\n",
+ " 'keyword:remake', 'keyword:reporter', 'keyword:rescue',\n",
+ " 'keyword:resistance', 'keyword:restaurant', 'keyword:resurrection',\n",
+ " 'keyword:revenge', 'keyword:revolution', 'keyword:rifle',\n",
+ " 'keyword:ritual', 'keyword:rivalry', 'keyword:river',\n",
+ " 'keyword:road movie', 'keyword:road trip', 'keyword:robbery',\n",
+ " 'keyword:robot', 'keyword:rock', 'keyword:rock and roll',\n",
+ " 'keyword:rock band', 'keyword:rock music', 'keyword:rock star',\n",
+ " 'keyword:romance', 'keyword:romantic comedy', 'keyword:rome',\n",
+ " 'keyword:roommate', 'keyword:royalty', 'keyword:runaway',\n",
+ " 'keyword:rural setting', 'keyword:russia', 'keyword:russian',\n",
+ " 'keyword:sacrifice', 'keyword:sadism', 'keyword:sadness',\n",
+ " 'keyword:sailor', 'keyword:salesman', 'keyword:saloon',\n",
+ " 'keyword:samurai', 'keyword:san francisco', 'keyword:santa claus',\n",
+ " 'keyword:satire', 'keyword:saving the world', 'keyword:scandal',\n",
+ " 'keyword:schizophrenia', 'keyword:school', 'keyword:science',\n",
+ " 'keyword:science fiction', 'keyword:scientist', 'keyword:scotland',\n",
+ " 'keyword:sea', 'keyword:search', 'keyword:secret',\n",
+ " 'keyword:secret agent', 'keyword:secret identity',\n",
+ " 'keyword:secret love', 'keyword:secretary', 'keyword:seduction',\n",
+ " 'keyword:sequel', 'keyword:serial killer',\n",
+ " 'keyword:series of murders', 'keyword:sex',\n",
+ " 'keyword:sexploitation', 'keyword:sexual abuse',\n",
+ " 'keyword:sexuality', 'keyword:shakespeare', 'keyword:shark',\n",
+ " 'keyword:sheriff', 'keyword:sherlock holmes', 'keyword:ship',\n",
+ " 'keyword:shipwreck', 'keyword:shooting', 'keyword:shootout',\n",
+ " 'keyword:short', 'keyword:shotgun', 'keyword:showdown',\n",
+ " 'keyword:shower', 'keyword:silent film', 'keyword:singer',\n",
+ " 'keyword:singing', 'keyword:single', 'keyword:single mother',\n",
+ " 'keyword:single parent', 'keyword:sister',\n",
+ " 'keyword:sister sister relationship', 'keyword:slapstick',\n",
+ " 'keyword:slasher', 'keyword:slavery', 'keyword:small town',\n",
+ " 'keyword:smuggling', 'keyword:snake', 'keyword:sniper',\n",
+ " 'keyword:snow', 'keyword:soccer', 'keyword:society',\n",
+ " 'keyword:soldier', 'keyword:son', 'keyword:song',\n",
+ " 'keyword:south africa', 'keyword:south korea',\n",
+ " 'keyword:southern usa', 'keyword:soviet union', 'keyword:space',\n",
+ " 'keyword:space marine', 'keyword:space opera',\n",
+ " 'keyword:space travel', 'keyword:spacecraft', 'keyword:spaceship',\n",
+ " 'keyword:spaghetti western', 'keyword:spain', 'keyword:spider',\n",
+ " 'keyword:spirit', 'keyword:spoof', 'keyword:sport', 'keyword:spy',\n",
+ " 'keyword:stalker', 'keyword:stalking', 'keyword:stand-up comedy',\n",
+ " 'keyword:stop motion', 'keyword:storm', 'keyword:stranded',\n",
+ " 'keyword:stranger', 'keyword:street gang', 'keyword:strip club',\n",
+ " 'keyword:stripper', 'keyword:student', 'keyword:submarine',\n",
+ " 'keyword:subway', 'keyword:success', 'keyword:suicide',\n",
+ " 'keyword:suicide attempt', 'keyword:summer', 'keyword:summer camp',\n",
+ " 'keyword:summer vacation', 'keyword:super powers',\n",
+ " 'keyword:superhero', 'keyword:supernatural',\n",
+ " 'keyword:supernatural powers', 'keyword:surfing',\n",
+ " 'keyword:surreal', 'keyword:surrealism', 'keyword:surveillance',\n",
+ " 'keyword:survival', 'keyword:survivor', 'keyword:suspense',\n",
+ " 'keyword:suspicion', 'keyword:swamp', 'keyword:sweden',\n",
+ " 'keyword:swimming pool', 'keyword:sword',\n",
+ " 'keyword:sword and sorcery', 'keyword:sword fight',\n",
+ " 'keyword:swordplay', 'keyword:talking animal', 'keyword:tattoo',\n",
+ " 'keyword:taxi', 'keyword:taxi driver', 'keyword:teacher',\n",
+ " 'keyword:technology', 'keyword:teen comedy', 'keyword:teen movie',\n",
+ " 'keyword:teenage boy', 'keyword:teenage crush',\n",
+ " 'keyword:teenage girl', 'keyword:teenager', 'keyword:telekinesis',\n",
+ " 'keyword:television', 'keyword:terminal illness', 'keyword:terror',\n",
+ " 'keyword:terrorism', 'keyword:terrorist', 'keyword:texas',\n",
+ " 'keyword:thailand', 'keyword:theater', 'keyword:theft',\n",
+ " 'keyword:therapist', 'keyword:thief', 'keyword:thriller',\n",
+ " 'keyword:time travel', 'keyword:tokyo japan', 'keyword:torture',\n",
+ " 'keyword:tourist', 'keyword:tragedy', 'keyword:train',\n",
+ " 'keyword:training', 'keyword:traitor', 'keyword:transformation',\n",
+ " 'keyword:transvestism', 'keyword:trapped', 'keyword:trauma',\n",
+ " 'keyword:travel', 'keyword:treasure', 'keyword:treasure hunt',\n",
+ " 'keyword:trial', 'keyword:truck', 'keyword:turkey',\n",
+ " 'keyword:tv movie', 'keyword:tv show', 'keyword:twins',\n",
+ " 'keyword:u.s. army', 'keyword:u.s. navy', 'keyword:ufo',\n",
+ " 'keyword:uncle', 'keyword:undead', 'keyword:undercover',\n",
+ " 'keyword:undercover agent', 'keyword:undercover cop',\n",
+ " 'keyword:underdog', 'keyword:underwater', 'keyword:underwear',\n",
+ " 'keyword:unemployment', 'keyword:university',\n",
+ " 'keyword:unrequited love', 'keyword:unsimulated sex',\n",
+ " 'keyword:unsociability', 'keyword:upper class', 'keyword:usa',\n",
+ " 'keyword:usa president', 'keyword:vacation', 'keyword:vampire',\n",
+ " 'keyword:venice', 'keyword:victim', 'keyword:video game',\n",
+ " 'keyword:video nasty', 'keyword:vietnam',\n",
+ " 'keyword:vietnam veteran', 'keyword:vietnam war',\n",
+ " 'keyword:vigilante', 'keyword:village', 'keyword:violence',\n",
+ " 'keyword:virgin', 'keyword:virtual reality', 'keyword:virus',\n",
+ " 'keyword:vision', 'keyword:volcano', 'keyword:voodoo',\n",
+ " 'keyword:voyeur', 'keyword:voyeurism', 'keyword:waitress',\n",
+ " 'keyword:war', 'keyword:war crimes', 'keyword:war veteran',\n",
+ " 'keyword:washington d.c.', 'keyword:water', 'keyword:wealth',\n",
+ " 'keyword:weapon', 'keyword:wedding', 'keyword:werewolf',\n",
+ " 'keyword:wheelchair', 'keyword:widow', 'keyword:widower',\n",
+ " 'keyword:wife', 'keyword:wife husband relationship',\n",
+ " 'keyword:wilderness', 'keyword:winter', 'keyword:wish',\n",
+ " 'keyword:witch', 'keyword:witchcraft', 'keyword:wolf',\n",
+ " 'keyword:woman director', 'keyword:women', 'keyword:woods',\n",
+ " 'keyword:world war i', 'keyword:world war ii', 'keyword:wrestling',\n",
+ " 'keyword:writer', 'keyword:xenophobia', 'keyword:yakuza',\n",
+ " 'keyword:young adult', 'keyword:youth', 'keyword:zombie'],\n",
+ " dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 27
+ }
+ ],
+ "source": [
+ "# Build the keyword feature block with the tfidf helper; the returned\n",
+ "# feature names are prefixed with 'keyword:' (see this cell's output).\n",
+ "keyword_data, keyword_cols = tfidf(\n",
+ "    'keywords',\n",
+ "    'keyword',\n",
+ "    1000,  # presumably the max number of features -- confirm against tfidf\n",
+ ")\n",
+ "keyword_cols"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Remove the 'crew' column from the shared credits frame (in place).\n",
+ "# errors='ignore' keeps this cell re-runnable: without it a second run\n",
+ "# raises KeyError because 'crew' has already been dropped.\n",
+ "credits.drop(columns=['crew'], inplace=True, errors='ignore')\n",
+ "# NOTE(review): the feature names in this cell's output include 'cast:' and\n",
+ "# 'cast: jr.', which look like empty / split-off name fragments -- check the\n",
+ "# tokenization inside vectorize_string.\n",
+ "credit_data, credit_cols = vectorize_string('cast','cast', 1000, df=credits)\n",
+ "credit_cols"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "EGKfKnjM-qYH",
+ "outputId": "53ca2cee-2879-4b83-cb05-5382bad526a1"
+ },
+ "execution_count": 28,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['cast:', 'cast: jr.', 'cast:adam sandler', 'cast:adam scott',\n",
+ " 'cast:addison richards', 'cast:adolfo celi', 'cast:adrien brody',\n",
+ " 'cast:agnes moorehead', 'cast:aidan quinn', 'cast:ajay devgn',\n",
+ " 'cast:akim tamiroff', 'cast:akshay kumar', 'cast:al pacino',\n",
+ " 'cast:alain delon', 'cast:alan arkin', 'cast:alan bates',\n",
+ " 'cast:alan cumming', 'cast:alan hale', 'cast:alan mowbray',\n",
+ " 'cast:alan napier', 'cast:alan rickman', 'cast:alan tudyk',\n",
+ " 'cast:alberto sordi', 'cast:alec baldwin', 'cast:alec guinness',\n",
+ " 'cast:alfre woodard', 'cast:alfred molina', 'cast:allen jenkins',\n",
+ " 'cast:allison janney', 'cast:amitabh bachchan', 'cast:amy adams',\n",
+ " 'cast:amy poehler', 'cast:andré dussollier', 'cast:andy garcía',\n",
+ " 'cast:andy lau', 'cast:andy serkis', 'cast:angela lansbury',\n",
+ " 'cast:angelina jolie', 'cast:anjelica huston', 'cast:ann doran',\n",
+ " 'cast:ann-margret', 'cast:anne bancroft', 'cast:anne heche',\n",
+ " 'cast:anthony hopkins', 'cast:anthony lapaglia',\n",
+ " 'cast:anthony mackie', 'cast:anthony quinn', 'cast:anthony wong',\n",
+ " 'cast:antonio banderas', 'cast:anupam kher', 'cast:armand assante',\n",
+ " 'cast:arnold schwarzenegger', 'cast:arthur kennedy',\n",
+ " 'cast:ava gardner', 'cast:barbara hale', 'cast:barbara hershey',\n",
+ " 'cast:barbara stanwyck', 'cast:barry corbin',\n",
+ " 'cast:barry sullivan', 'cast:barton maclane',\n",
+ " 'cast:basil rathbone', 'cast:beau bridges', 'cast:bela lugosi',\n",
+ " 'cast:ben affleck', 'cast:ben johnson', 'cast:ben kingsley',\n",
+ " 'cast:ben stiller', 'cast:benicio del toro', 'cast:bernard blier',\n",
+ " 'cast:bernard lee', 'cast:bert moorhouse', 'cast:bess flowers',\n",
+ " 'cast:beth grant', 'cast:bette davis', 'cast:beulah bondi',\n",
+ " \"cast:beverly d'angelo\", 'cast:bill hader', 'cast:bill moseley',\n",
+ " 'cast:bill murray', 'cast:bill nighy', 'cast:bill paxton',\n",
+ " 'cast:bill pullman', 'cast:billy bevan', 'cast:billy bob thornton',\n",
+ " 'cast:billy connolly', 'cast:billy crystal', 'cast:billy gilbert',\n",
+ " 'cast:billy zane', 'cast:bing crosby', 'cast:blythe danner',\n",
+ " 'cast:bob balaban', 'cast:bob gunton', 'cast:bob hope',\n",
+ " 'cast:bob hoskins', 'cast:bobby cannavale', 'cast:boris karloff',\n",
+ " 'cast:brad dourif', 'cast:brad pitt', 'cast:brendan fraser',\n",
+ " 'cast:brendan gleeson', 'cast:brian cox', 'cast:brian dennehy',\n",
+ " 'cast:brian donlevy', 'cast:brian keith', 'cast:brion james',\n",
+ " 'cast:brooks benedict', 'cast:bruce campbell',\n",
+ " 'cast:bruce davison', 'cast:bruce dern', 'cast:bruce greenwood',\n",
+ " 'cast:bruce mcgill', 'cast:bruce willis', 'cast:bud spencer',\n",
+ " 'cast:burgess meredith', 'cast:burt lancaster',\n",
+ " 'cast:burt reynolds', 'cast:burt young', 'cast:buster keaton',\n",
+ " 'cast:byron foulger', 'cast:c. aubrey smith',\n",
+ " 'cast:c. thomas howell', 'cast:callum keith rennie',\n",
+ " 'cast:cameron diaz', 'cast:cameron mitchell', 'cast:carla gugino',\n",
+ " 'cast:carol kane', 'cast:caroline aaron', 'cast:carrie fisher',\n",
+ " 'cast:cary elwes', 'cast:cary grant', 'cast:cary-hiroyuki tagawa',\n",
+ " 'cast:cate blanchett', 'cast:catherine deneuve',\n",
+ " 'cast:catherine keener', \"cast:catherine o'hara\",\n",
+ " 'cast:cecil kellaway', 'cast:cedric hardwicke',\n",
+ " 'cast:charles boyer', 'cast:charles bronson', 'cast:charles dance',\n",
+ " 'cast:charles durning', 'cast:charles halton', 'cast:charles lane',\n",
+ " 'cast:charles laughton', 'cast:charles mcgraw',\n",
+ " 'cast:charles middleton', 'cast:charles s. dutton',\n",
+ " 'cast:charles trowbridge', 'cast:charlie chaplin',\n",
+ " 'cast:charlie sheen', 'cast:charlize theron',\n",
+ " 'cast:charlotte rampling', 'cast:charlton heston',\n",
+ " 'cast:cheech marin', 'cast:chevy chase', 'cast:chill wills',\n",
+ " 'cast:chishu ryu', 'cast:chloë sevigny', 'cast:chris cooper',\n",
+ " 'cast:chris rock', 'cast:christian bale', 'cast:christian slater',\n",
+ " 'cast:christina ricci', 'cast:christopher lee',\n",
+ " 'cast:christopher lloyd', 'cast:christopher mcdonald',\n",
+ " 'cast:christopher plummer', 'cast:christopher walken',\n",
+ " 'cast:ciarán hinds', 'cast:clancy brown', 'cast:clarence muse',\n",
+ " 'cast:clark gable', 'cast:claude rains', 'cast:claudia cardinale',\n",
+ " 'cast:clifton collins jr', 'cast:clint eastwood',\n",
+ " 'cast:clint howard', 'cast:clive owen', 'cast:cloris leachman',\n",
+ " 'cast:colin farrell', 'cast:colin firth', 'cast:colleen camp',\n",
+ " 'cast:colm meaney', 'cast:corbin bernsen', 'cast:crispin glover',\n",
+ " 'cast:cuba gooding jr.', 'cast:cyril cusack', 'cast:cyril ring',\n",
+ " 'cast:dabney coleman', 'cast:dakota fanning', 'cast:dan aykroyd',\n",
+ " 'cast:dan hedaya', 'cast:dana andrews', 'cast:daniel brühl',\n",
+ " 'cast:daniel craig', 'cast:danny aiello', 'cast:danny devito',\n",
+ " 'cast:danny glover', 'cast:danny huston', 'cast:danny trejo',\n",
+ " 'cast:daryl hannah', 'cast:david arquette', 'cast:david bradley',\n",
+ " 'cast:david carradine', 'cast:david cross', 'cast:david keith',\n",
+ " 'cast:david koechner', 'cast:david morse', 'cast:david niven',\n",
+ " 'cast:david ogden stiers', 'cast:david paymer',\n",
+ " 'cast:david strathairn', 'cast:david thewlis', 'cast:david warner',\n",
+ " 'cast:dean jagger', 'cast:dean martin', 'cast:dean stockwell',\n",
+ " 'cast:debbie reynolds', 'cast:dee wallace', 'cast:demi moore',\n",
+ " 'cast:denholm elliott', 'cast:denis leary', 'cast:dennis haysbert',\n",
+ " 'cast:dennis hopper', \"cast:dennis o'keefe\", 'cast:dennis quaid',\n",
+ " 'cast:denzel washington', 'cast:derek jacobi',\n",
+ " 'cast:dermot mulroney', 'cast:diane keaton', 'cast:diane lane',\n",
+ " 'cast:dianne wiest', 'cast:dick miller', 'cast:diego abatantuono',\n",
+ " 'cast:dolph lundgren', 'cast:dom deluise', 'cast:don beddoe',\n",
+ " 'cast:don cheadle', 'cast:donal logue', 'cast:donald crisp',\n",
+ " 'cast:donald meek', 'cast:donald pleasence',\n",
+ " 'cast:donald sutherland', 'cast:doris lloyd',\n",
+ " 'cast:douglas fowley', 'cast:douglass dumbrille',\n",
+ " 'cast:drew barrymore', 'cast:dub taylor', 'cast:dustin hoffman',\n",
+ " 'cast:dylan baker', 'cast:e.e. clive', 'cast:ed asner',\n",
+ " 'cast:ed begley jr.', 'cast:ed harris', 'cast:ed lauter',\n",
+ " 'cast:eddie albert', 'cast:eddie izzard', 'cast:eddie marsan',\n",
+ " 'cast:eddie murphy', 'cast:edgar buchanan', \"cast:edmond o'brien\",\n",
+ " 'cast:edmund mortimer', 'cast:edward arnold', 'cast:edward brophy',\n",
+ " 'cast:edward g. robinson', 'cast:edward herrmann',\n",
+ " 'cast:eleanor parker', 'cast:eli wallach', 'cast:elias koteas',\n",
+ " 'cast:elijah wood', 'cast:elisha cook jr.', 'cast:elizabeth banks',\n",
+ " 'cast:elizabeth taylor', 'cast:ellen barkin', 'cast:ellen burstyn',\n",
+ " 'cast:ellen corby', 'cast:elliott gould', 'cast:emily watson',\n",
+ " 'cast:emma thompson', 'cast:emmett vogan', 'cast:emory parnell',\n",
+ " 'cast:eric idle', 'cast:eric roberts', 'cast:eric stoltz',\n",
+ " 'cast:eric tsang', 'cast:ernest borgnine', 'cast:ernie hudson',\n",
+ " 'cast:ethan hawke', 'cast:eugene levy', 'cast:eugene pallette',\n",
+ " 'cast:ewan mcgregor', 'cast:f. murray abraham',\n",
+ " 'cast:famke janssen', 'cast:faye dunaway', 'cast:fernando rey',\n",
+ " 'cast:forest whitaker', 'cast:frances mcdormand',\n",
+ " 'cast:franco nero', 'cast:frank faylen', 'cast:frank ferguson',\n",
+ " 'cast:frank langella', 'cast:frank mayo', 'cast:frank mchugh',\n",
+ " 'cast:frank mills', 'cast:frank morgan', 'cast:frank puglia',\n",
+ " 'cast:frank reicher', 'cast:frank sinatra', 'cast:frank welker',\n",
+ " 'cast:frankie faison', 'cast:fred astaire', 'cast:fred macmurray',\n",
+ " 'cast:fred tatasciore', 'cast:fred ward', 'cast:fred willard',\n",
+ " 'cast:fredric march', 'cast:gabriel byrne', 'cast:gary busey',\n",
+ " 'cast:gary cole', 'cast:gary cooper', 'cast:gary lewis',\n",
+ " 'cast:gary oldman', 'cast:gene hackman', 'cast:gene lockhart',\n",
+ " 'cast:geoffrey lewis', 'cast:geoffrey rush',\n",
+ " 'cast:george buck flower', 'cast:george c. scott',\n",
+ " 'cast:george chandler', 'cast:george clooney', 'cast:george davis',\n",
+ " 'cast:george irving', 'cast:george kennedy', 'cast:george sanders',\n",
+ " 'cast:george segal', 'cast:george tobias',\n",
+ " 'cast:geraldine chaplin', 'cast:gian maria volonté',\n",
+ " 'cast:giancarlo esposito', 'cast:gina gershon',\n",
+ " 'cast:ginger rogers', 'cast:gino corrado', 'cast:giovanni ribisi',\n",
+ " 'cast:giuliano gemma', 'cast:glenn close', 'cast:glenn ford',\n",
+ " 'cast:grace zabriskie', 'cast:grady sutton', 'cast:grant mitchell',\n",
+ " 'cast:greg kinnear', 'cast:gregory peck', 'cast:grey griffin',\n",
+ " 'cast:griffin dunne', 'cast:guinn williams', 'cast:guy kibbee',\n",
+ " 'cast:guy pearce', 'cast:gwyneth paltrow', 'cast:gérard depardieu',\n",
+ " 'cast:hal holbrook', 'cast:hank azaria', 'cast:harold miller',\n",
+ " 'cast:harrison ford', 'cast:harry andrews', 'cast:harry carey',\n",
+ " 'cast:harry cording', 'cast:harry davenport',\n",
+ " 'cast:harry dean stanton', 'cast:harry hayden',\n",
+ " 'cast:harry morgan', 'cast:harry strang', 'cast:harvey keitel',\n",
+ " 'cast:heather graham', 'cast:helen mirren',\n",
+ " 'cast:helena bonham carter', 'cast:henry fonda',\n",
+ " \"cast:henry o'neill\", 'cast:henry silva', 'cast:henry stephenson',\n",
+ " 'cast:herbert lom', 'cast:herbert marshall', 'cast:holmes herbert',\n",
+ " 'cast:hugh jackman', 'cast:hugo weaving', 'cast:humphrey bogart',\n",
+ " 'cast:huntz hall', 'cast:héctor elizondo', 'cast:ian holm',\n",
+ " 'cast:ian mckellen', 'cast:ian mcshane', 'cast:ian wolfe',\n",
+ " 'cast:ingrid bergman', 'cast:irving bacon',\n",
+ " 'cast:isabella rossellini', 'cast:isabelle huppert',\n",
+ " 'cast:j. carrol naish', 'cast:j. farrell macdonald',\n",
+ " 'cast:j.k. simmons', 'cast:jack black', 'cast:jack elam',\n",
+ " 'cast:jack lemmon', 'cast:jack nicholson', 'cast:jack palance',\n",
+ " 'cast:jack warden', 'cast:jackie chan', 'cast:jacqueline bisset',\n",
+ " 'cast:james caan', 'cast:james cagney', 'cast:james coburn',\n",
+ " 'cast:james cosmo', 'cast:james cromwell', 'cast:james earl jones',\n",
+ " 'cast:james flavin', 'cast:james fox', 'cast:james franco',\n",
+ " 'cast:james gandolfini', 'cast:james garner', 'cast:james gleason',\n",
+ " 'cast:james hong', 'cast:james le gros', 'cast:james marsden',\n",
+ " 'cast:james mason', 'cast:james rebhorn', 'cast:james remar',\n",
+ " 'cast:james russo', 'cast:james stewart', 'cast:james whitmore',\n",
+ " 'cast:james woods', 'cast:jamie lee curtis', 'cast:jane darwell',\n",
+ " 'cast:jane fonda', 'cast:jane lynch', 'cast:janeane garofalo',\n",
+ " 'cast:janet leigh', 'cast:jared harris', 'cast:jason alexander',\n",
+ " 'cast:jason bateman', 'cast:jason flemyng', 'cast:jason isaacs',\n",
+ " 'cast:jason lee', 'cast:jason robards', 'cast:jason statham',\n",
+ " 'cast:jean reno', 'cast:jean rochefort', 'cast:jean-claude brialy',\n",
+ " 'cast:jean-claude van damme', 'cast:jean-louis trintignant',\n",
+ " 'cast:jean-paul belmondo', 'cast:jeanne moreau',\n",
+ " 'cast:jeff bennett', 'cast:jeff bridges', 'cast:jeff corey',\n",
+ " 'cast:jeff daniels', 'cast:jeff goldblum', 'cast:jeffrey combs',\n",
+ " 'cast:jeffrey tambor', 'cast:jennifer jason leigh',\n",
+ " 'cast:jennifer tilly', 'cast:jeremy irons', 'cast:jeremy piven',\n",
+ " 'cast:jerome cowan', 'cast:jerry lewis', 'cast:jessica lange',\n",
+ " 'cast:jet li', 'cast:jim backus', 'cast:jim belushi',\n",
+ " 'cast:jim broadbent', 'cast:jim carrey', 'cast:jim cummings',\n",
+ " 'cast:joan blondell', 'cast:joan crawford', 'cast:joan cusack',\n",
+ " 'cast:jodie foster', 'cast:joe mantegna', 'cast:joe pantoliano',\n",
+ " 'cast:joe sawyer', 'cast:joel mccrea', 'cast:john c. mcginley',\n",
+ " 'cast:john c. reilly', 'cast:john candy', 'cast:john carradine',\n",
+ " 'cast:john carroll lynch', 'cast:john cleese', 'cast:john cusack',\n",
+ " 'cast:john diehl', 'cast:john dimaggio', 'cast:john george',\n",
+ " 'cast:john gielgud', 'cast:john goodman', 'cast:john hawkes',\n",
+ " 'cast:john heard', 'cast:john hoyt', 'cast:john hurt',\n",
+ " 'cast:john ireland', 'cast:john leguizamo', 'cast:john litel',\n",
+ " 'cast:john lithgow', 'cast:john malkovich', 'cast:john mcintire',\n",
+ " 'cast:john michael higgins', 'cast:john miljan', 'cast:john mills',\n",
+ " 'cast:john qualen', 'cast:john ratzenberger',\n",
+ " 'cast:john rhys-davies', 'cast:john ridgely', 'cast:john savage',\n",
+ " 'cast:john saxon', 'cast:john travolta', 'cast:john turturro',\n",
+ " 'cast:john wayne', 'cast:johnny depp', 'cast:johnny lever',\n",
+ " 'cast:jon gries', 'cast:jon lovitz', 'cast:jon polito',\n",
+ " 'cast:jon voight', 'cast:jonah hill', 'cast:jonathan hale',\n",
+ " 'cast:jonathan pryce', 'cast:joseph cotten', 'cast:joseph crehan',\n",
+ " 'cast:joseph gordon-levitt', 'cast:josh brolin', 'cast:josh lucas',\n",
+ " 'cast:joss ackland', 'cast:jude law', 'cast:judi dench',\n",
+ " 'cast:judy greer', 'cast:julia roberts', 'cast:julianne moore',\n",
+ " 'cast:juliette binoche', 'cast:juliette lewis', 'cast:justin long',\n",
+ " 'cast:jürgen prochnow', 'cast:kane hodder', 'cast:kareena kapoor',\n",
+ " 'cast:karen black', 'cast:karl malden', 'cast:kate winslet',\n",
+ " 'cast:katharine hepburn', 'cast:kathleen freeman',\n",
+ " 'cast:kathy baker', 'cast:kathy bates', 'cast:keanu reeves',\n",
+ " 'cast:keenan wynn', 'cast:keith carradine', 'cast:keith david',\n",
+ " 'cast:kenneth branagh', 'cast:kevin bacon', 'cast:kevin corrigan',\n",
+ " 'cast:kevin costner', 'cast:kevin dunn', 'cast:kevin kline',\n",
+ " 'cast:kevin mccarthy', 'cast:kevin michael richardson',\n",
+ " 'cast:kevin pollak', 'cast:kevin smith', 'cast:kevin spacey',\n",
+ " 'cast:kiefer sutherland', 'cast:kim basinger', 'cast:kirk douglas',\n",
+ " 'cast:kirsten dunst', 'cast:klaus kinski',\n",
+ " 'cast:kris kristofferson', 'cast:kristen stewart',\n",
+ " 'cast:kristen wiig', 'cast:kristin scott thomas', 'cast:ku feng',\n",
+ " 'cast:kurt russell', 'cast:l.q. jones', 'cast:lam suet',\n",
+ " 'cast:lambert wilson', 'cast:lance henriksen',\n",
+ " 'cast:lane chandler', 'cast:larry miller', 'cast:larry steers',\n",
+ " 'cast:laura dern', 'cast:laura linney', 'cast:lauren bacall',\n",
+ " 'cast:laurence fishburne', 'cast:laurence olivier',\n",
+ " 'cast:lee j. cobb', 'cast:lee marvin', 'cast:lee phelps',\n",
+ " 'cast:lee van cleef', 'cast:lena headey', 'cast:leo gorcey',\n",
+ " 'cast:leo white', 'cast:leoda richards', 'cast:leon ames',\n",
+ " 'cast:leslie nielsen', 'cast:leslie phillips', 'cast:lewis stone',\n",
+ " 'cast:leyland hodgson', 'cast:liam neeson', 'cast:liev schreiber',\n",
+ " 'cast:lili taylor', 'cast:lin shaye', 'cast:lionel barrymore',\n",
+ " 'cast:lionel stander', 'cast:lloyd bridges', 'cast:lochlyn munro',\n",
+ " 'cast:lon chaney jr.', 'cast:lou diamond phillips',\n",
+ " 'cast:louis calhern', 'cast:louis gossett', 'cast:louise beavers',\n",
+ " 'cast:lucille ball', 'cast:lucy liu', 'cast:luis guzmán',\n",
+ " 'cast:lukas haas', 'cast:luke wilson', 'cast:lyle talbot',\n",
+ " 'cast:m. emmet walsh', 'cast:mae marsh', 'cast:maggie smith',\n",
+ " 'cast:malcolm mcdowell', 'cast:marc lawrence', 'cast:marcel dalio',\n",
+ " 'cast:marcello mastroianni', 'cast:marcia gay harden',\n",
+ " 'cast:margo martindale', 'cast:maria bello',\n",
+ " 'cast:marion cotillard', 'cast:marisa tomei',\n",
+ " 'cast:mark boone junior', 'cast:mark hamill', 'cast:mark ruffalo',\n",
+ " 'cast:mark strong', 'cast:mark wahlberg', 'cast:marlon brando',\n",
+ " 'cast:martin balsam', 'cast:martin landau', 'cast:martin scorsese',\n",
+ " 'cast:martin sheen', 'cast:mary astor', 'cast:mary field',\n",
+ " 'cast:mary gordon', 'cast:mary kay place', 'cast:mary steenburgen',\n",
+ " 'cast:masako nozawa', 'cast:mathieu amalric', 'cast:matt damon',\n",
+ " 'cast:matt dillon', 'cast:matt frewer', 'cast:matthew broderick',\n",
+ " 'cast:matthew lillard', 'cast:matthew mcconaughey',\n",
+ " 'cast:matthew modine', 'cast:maury chaykin', 'cast:max von sydow',\n",
+ " 'cast:mel blanc', 'cast:mel gibson', 'cast:melanie griffith',\n",
+ " 'cast:melissa leo', 'cast:melvyn douglas', 'cast:meryl streep',\n",
+ " 'cast:mia farrow', 'cast:michael biehn', 'cast:michael caine',\n",
+ " 'cast:michael clarke duncan', 'cast:michael douglas',\n",
+ " 'cast:michael gambon', 'cast:michael gough',\n",
+ " 'cast:michael hordern', 'cast:michael ironside',\n",
+ " 'cast:michael j. fox', 'cast:michael keaton',\n",
+ " 'cast:michael lerner', 'cast:michael lonsdale',\n",
+ " 'cast:michael madsen', 'cast:michael mckean',\n",
+ " 'cast:michael murphy', 'cast:michael nyqvist', 'cast:michael paré',\n",
+ " 'cast:michael peña', 'cast:michael rapaport',\n",
+ " 'cast:michael rooker', 'cast:michael shannon',\n",
+ " 'cast:michael sheen', 'cast:michael york', 'cast:michel piccoli',\n",
+ " 'cast:michelle pfeiffer', 'cast:mickey rooney',\n",
+ " 'cast:mickey rourke', 'cast:mike epps', 'cast:mike mazurki',\n",
+ " 'cast:mike starr', 'cast:milton kibbee', 'cast:mira sorvino',\n",
+ " 'cast:miranda richardson', 'cast:miriam margolyes',\n",
+ " 'cast:missi pyle', 'cast:molly shannon', 'cast:monte blue',\n",
+ " 'cast:morgan freeman', 'cast:moroni olsen', 'cast:morris ankrum',\n",
+ " 'cast:myrna loy', 'cast:naomi watts', 'cast:naseeruddin shah',\n",
+ " 'cast:nastassja kinski', 'cast:natalie portman',\n",
+ " 'cast:natasha lyonne', 'cast:ned beatty', 'cast:nestor paiva',\n",
+ " 'cast:nick nolte', 'cast:nicolas cage', 'cast:nicole kidman',\n",
+ " 'cast:nigel bruce', 'cast:noah beery', 'cast:octavia spencer',\n",
+ " 'cast:olin howland', 'cast:oliver platt', 'cast:oliver reed',\n",
+ " 'cast:olivier gourmet', 'cast:olympia dukakis', 'cast:om puri',\n",
+ " 'cast:omar sharif', 'cast:orson welles', 'cast:owen wilson',\n",
+ " 'cast:pam grier', 'cast:paolo villaggio', 'cast:paresh rawal',\n",
+ " 'cast:parker posey', 'cast:pat flaherty', 'cast:pat hingle',\n",
+ " 'cast:patricia clarkson', 'cast:patrick bauchau',\n",
+ " 'cast:patrick stewart', 'cast:patrick warburton',\n",
+ " 'cast:patton oswalt', 'cast:paul dooley', 'cast:paul fix',\n",
+ " 'cast:paul giamatti', 'cast:paul guilfoyle', 'cast:paul harvey',\n",
+ " 'cast:paul newman', 'cast:paul rudd', 'cast:paul sorvino',\n",
+ " 'cast:penélope cruz', 'cast:pete postlethwaite',\n",
+ " 'cast:peter boyle', 'cast:peter coyote', 'cast:peter cushing',\n",
+ " 'cast:peter falk', 'cast:peter fonda', 'cast:peter gallagher',\n",
+ " 'cast:peter lawford', 'cast:peter lorre', \"cast:peter o'toole\",\n",
+ " 'cast:peter sarsgaard', 'cast:peter sellers',\n",
+ " 'cast:peter stormare', 'cast:peter ustinov',\n",
+ " 'cast:philip baker hall', 'cast:philip ettington',\n",
+ " 'cast:philip seymour hoffman', 'cast:philippe noiret',\n",
+ " 'cast:pierce brosnan', 'cast:pierre watkin',\n",
+ " 'cast:priyanka chopra', 'cast:pruitt taylor vince',\n",
+ " 'cast:queen latifah', 'cast:r. lee ermey', 'cast:rachel weisz',\n",
+ " 'cast:rade serbedzija', 'cast:ralph bellamy', 'cast:ralph fiennes',\n",
+ " 'cast:ralph richardson', 'cast:randolph scott', 'cast:randy quaid',\n",
+ " 'cast:ray liotta', 'cast:ray milland', 'cast:ray teal',\n",
+ " 'cast:ray winstone', 'cast:ray wise', 'cast:raymond burr',\n",
+ " 'cast:reese witherspoon', 'cast:reginald denny',\n",
+ " 'cast:reginald owen', 'cast:regis toomey', 'cast:ren osugi',\n",
+ " 'cast:rhys ifans', 'cast:rhys williams', 'cast:richard anderson',\n",
+ " 'cast:richard burton', 'cast:richard dreyfuss',\n",
+ " 'cast:richard e. grant', 'cast:richard gere',\n",
+ " 'cast:richard harris', 'cast:richard jaeckel',\n",
+ " 'cast:richard jenkins', 'cast:richard kind', 'cast:richard masur',\n",
+ " 'cast:richard pryor', 'cast:richard riehle', 'cast:richard schiff',\n",
+ " 'cast:richard widmark', 'cast:rip torn', 'cast:rob lowe',\n",
+ " 'cast:rob paulsen', 'cast:rob schneider', 'cast:robbie coltrane',\n",
+ " 'cast:robert barrat', 'cast:robert de niro',\n",
+ " 'cast:robert downey jr.', 'cast:robert duvall',\n",
+ " 'cast:robert englund', 'cast:robert forster', 'cast:robert loggia',\n",
+ " 'cast:robert mitchum', 'cast:robert morley', 'cast:robert patrick',\n",
+ " 'cast:robert redford', 'cast:robert ryan', 'cast:robert taylor',\n",
+ " 'cast:robert vaughn', 'cast:robert wagner', 'cast:robert warwick',\n",
+ " 'cast:robert young', 'cast:robin williams', 'cast:robin wright',\n",
+ " 'cast:rock hudson', 'cast:rod steiger', 'cast:roddy mcdowall',\n",
+ " 'cast:ron jeremy', 'cast:ron livingston', 'cast:ron perlman',\n",
+ " 'cast:rosanna arquette', 'cast:rosario dawson', 'cast:rose byrne',\n",
+ " 'cast:roy scheider', 'cast:royal dano', 'cast:russell crowe',\n",
+ " 'cast:russell hicks', 'cast:rutger hauer', 'cast:ryan reynolds',\n",
+ " 'cast:salma hayek', 'cast:salman khan', 'cast:sam elliott',\n",
+ " 'cast:sam harris', 'cast:sam neill', 'cast:sam rockwell',\n",
+ " 'cast:sam shepard', 'cast:sammo hung', 'cast:samuel l. jackson',\n",
+ " 'cast:samuel s. hinds', 'cast:sandra bullock',\n",
+ " 'cast:sarah silverman', 'cast:scarlett johansson',\n",
+ " 'cast:scott glenn', 'cast:scott wilson', 'cast:sean astin',\n",
+ " 'cast:sean bean', 'cast:sean connery', 'cast:sean penn',\n",
+ " 'cast:sean young', 'cast:selmer jackson', 'cast:seth green',\n",
+ " 'cast:seth rogen', 'cast:seymour cassel', 'cast:shah rukh khan',\n",
+ " 'cast:sharon stone', 'cast:shelley winters',\n",
+ " 'cast:shirley henderson', 'cast:shirley maclaine', 'cast:sid haig',\n",
+ " 'cast:sid james', 'cast:sidney poitier', 'cast:sig ruman',\n",
+ " 'cast:sigourney weaver', 'cast:simon pegg', 'cast:simon yam',\n",
+ " 'cast:sissy spacek', 'cast:sophia loren', 'cast:spencer charters',\n",
+ " 'cast:spencer tracy', 'cast:stacy keach', 'cast:stan lee',\n",
+ " 'cast:stanley tucci', 'cast:stellan skarsgård',\n",
+ " 'cast:stephen dorff', 'cast:stephen fry', 'cast:stephen lang',\n",
+ " 'cast:stephen mchattie', 'cast:stephen rea', 'cast:stephen root',\n",
+ " 'cast:stephen tobolowsky', 'cast:sterling holloway',\n",
+ " 'cast:steve buscemi', 'cast:steve coogan', 'cast:steve guttenberg',\n",
+ " 'cast:steve martin', 'cast:steve zahn', 'cast:steven geray',\n",
+ " 'cast:steven seagal', 'cast:stockard channing',\n",
+ " 'cast:strother martin', 'cast:stuart holmes',\n",
+ " 'cast:susan sarandon', 'cast:susumu terajima',\n",
+ " 'cast:sylvester stallone', 'cast:tadanobu asano',\n",
+ " 'cast:takashi shimura', 'cast:tara strong', 'cast:tchéky karyo',\n",
+ " 'cast:terence stamp', 'cast:teri garr', 'cast:terrence howard',\n",
+ " 'cast:thomas jane', 'cast:thomas kretschmann',\n",
+ " 'cast:thomas mitchell', 'cast:til schweiger', 'cast:tilda swinton',\n",
+ " 'cast:tim blake nelson', 'cast:tim curry', 'cast:tim robbins',\n",
+ " 'cast:tim roth', 'cast:tim thomerson', 'cast:timothy hutton',\n",
+ " 'cast:timothy spall', 'cast:toby jones', 'cast:tom arnold',\n",
+ " 'cast:tom berenger', 'cast:tom cruise', 'cast:tom dugan',\n",
+ " 'cast:tom hanks', 'cast:tom kenny', 'cast:tom lister jr.',\n",
+ " 'cast:tom selleck', 'cast:tom sizemore', 'cast:tom skerritt',\n",
+ " 'cast:tom wilkinson', 'cast:tommy lee jones', 'cast:tomás milián',\n",
+ " 'cast:toni collette', 'cast:tony curtis', 'cast:tony shalhoub',\n",
+ " 'cast:tony todd', 'cast:toshirō mifune', 'cast:tracey walter',\n",
+ " 'cast:treat williams', 'cast:trevor howard', 'cast:udo kier',\n",
+ " 'cast:ugo tognazzi', 'cast:uma thurman', 'cast:una merkel',\n",
+ " 'cast:val kilmer', 'cast:van johnson', 'cast:vanessa redgrave',\n",
+ " 'cast:vernon dobtcheff', 'cast:viggo mortensen',\n",
+ " 'cast:vince vaughn', 'cast:vincent cassel',\n",
+ " \"cast:vincent d'onofrio\", 'cast:vincent price', 'cast:ving rhames',\n",
+ " 'cast:vinnie jones', 'cast:viola davis', 'cast:virginia brissac',\n",
+ " 'cast:virginia madsen', 'cast:vittorio gassman',\n",
+ " 'cast:vivica a. fox', 'cast:wade boteler', 'cast:wallace ford',\n",
+ " 'cast:wallace shawn', 'cast:walter brennan', 'cast:walter huston',\n",
+ " 'cast:walter matthau', 'cast:walter pidgeon', 'cast:walter sande',\n",
+ " 'cast:ward bond', 'cast:warren oates', 'cast:werner herzog',\n",
+ " 'cast:wesley snipes', 'cast:whit bissell', 'cast:whoopi goldberg',\n",
+ " 'cast:will arnett', 'cast:will ferrell', 'cast:will patton',\n",
+ " 'cast:will wright', 'cast:willard robertson', 'cast:willem dafoe',\n",
+ " 'cast:william b. davidson', 'cast:william demarest',\n",
+ " 'cast:william fichtner', 'cast:william forsythe',\n",
+ " 'cast:william h. macy', 'cast:william holden', 'cast:william hurt',\n",
+ " 'cast:william powell', 'cast:william sadler',\n",
+ " 'cast:william schallert', 'cast:william shatner',\n",
+ " 'cast:winona ryder', 'cast:woody allen', 'cast:woody harrelson',\n",
+ " 'cast:xander berkeley', 'cast:yuen biao'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 28
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 645
+ },
+ "id": "F59_kK_Jsy6L",
+ "outputId": "b588ab8d-79fe-4261-e403-66a1ce38b5c6"
+ },
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py:3678: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`\n",
+ " self[col] = igetitem(value, i)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " title id adult video genre: \\\n",
+ "0 Toy Story 862 0 0 0 \n",
+ "1 Jumanji 8844 0 0 0 \n",
+ "2 Grumpier Old Men 15602 0 0 0 \n",
+ "3 Waiting to Exhale 31357 0 0 0 \n",
+ "4 Father of the Bride Part II 11862 0 0 0 \n",
+ "... ... ... ... ... ... \n",
+ "46159 Subdue 439050 0 0 0 \n",
+ "46160 Century of Birthing 111109 0 0 0 \n",
+ "46161 Betrayal 67758 0 0 0 \n",
+ "46162 Satan Triumphant 227506 0 0 1 \n",
+ "46163 Queerama 461257 0 0 1 \n",
+ "\n",
+ " genre:action genre:adventure genre:animation genre:comedy \\\n",
+ "0 0 0 1 1 \n",
+ "1 0 1 0 0 \n",
+ "2 0 0 0 1 \n",
+ "3 0 0 0 1 \n",
+ "4 0 0 0 1 \n",
+ "... ... ... ... ... \n",
+ "46159 0 0 0 0 \n",
+ "46160 0 0 0 0 \n",
+ "46161 1 0 0 0 \n",
+ "46162 0 0 0 0 \n",
+ "46163 0 0 0 0 \n",
+ "\n",
+ " genre:crime ... cast:william hurt cast:william powell \\\n",
+ "0 0 ... 0 0 \n",
+ "1 0 ... 0 0 \n",
+ "2 0 ... 0 0 \n",
+ "3 0 ... 0 0 \n",
+ "4 0 ... 0 0 \n",
+ "... ... ... ... ... \n",
+ "46159 0 ... 0 0 \n",
+ "46160 0 ... 0 0 \n",
+ "46161 0 ... 0 0 \n",
+ "46162 0 ... 0 0 \n",
+ "46163 0 ... 0 0 \n",
+ "\n",
+ " cast:william sadler cast:william schallert cast:william shatner \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "... ... ... ... \n",
+ "46159 0 0 0 \n",
+ "46160 0 0 0 \n",
+ "46161 0 0 0 \n",
+ "46162 0 0 0 \n",
+ "46163 0 0 0 \n",
+ "\n",
+ " cast:winona ryder cast:woody allen cast:woody harrelson \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "... ... ... ... \n",
+ "46159 0 0 0 \n",
+ "46160 0 0 0 \n",
+ "46161 0 0 0 \n",
+ "46162 0 0 0 \n",
+ "46163 0 0 0 \n",
+ "\n",
+ " cast:xander berkeley cast:yuen biao \n",
+ "0 0 0 \n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "... ... ... \n",
+ "46159 0 0 \n",
+ "46160 0 0 \n",
+ "46161 0 0 \n",
+ "46162 0 0 \n",
+ "46163 0 0 \n",
+ "\n",
+ "[46164 rows x 3675 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " id | \n",
+ " adult | \n",
+ " video | \n",
+ " genre: | \n",
+ " genre:action | \n",
+ " genre:adventure | \n",
+ " genre:animation | \n",
+ " genre:comedy | \n",
+ " genre:crime | \n",
+ " ... | \n",
+ " cast:william hurt | \n",
+ " cast:william powell | \n",
+ " cast:william sadler | \n",
+ " cast:william schallert | \n",
+ " cast:william shatner | \n",
+ " cast:winona ryder | \n",
+ " cast:woody allen | \n",
+ " cast:woody harrelson | \n",
+ " cast:xander berkeley | \n",
+ " cast:yuen biao | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Toy Story | \n",
+ " 862 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Jumanji | \n",
+ " 8844 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Grumpier Old Men | \n",
+ " 15602 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Waiting to Exhale | \n",
+ " 31357 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Father of the Bride Part II | \n",
+ " 11862 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 46159 | \n",
+ " Subdue | \n",
+ " 439050 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46160 | \n",
+ " Century of Birthing | \n",
+ " 111109 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46161 | \n",
+ " Betrayal | \n",
+ " 67758 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46162 | \n",
+ " Satan Triumphant | \n",
+ " 227506 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46163 | \n",
+ " Queerama | \n",
+ " 461257 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
46164 rows × 3675 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 29
+ }
+ ],
+ "source": [
+ "# Assemble the per-movie feature table: the identifying columns plus one\n",
+ "# encoded feature block per attribute, joined column-wise.\n",
+ "metadata = pd.concat([metadata[['title','id','adult','video']],\n",
+ "                      pd.DataFrame(genre_data, columns=genre_cols),\n",
+ "                      pd.DataFrame(countries_data, columns=countries_cols),\n",
+ "                      pd.DataFrame(collection_data, columns=collection_cols),\n",
+ "                      pd.DataFrame(keyword_data, columns=keyword_cols),\n",
+ "                      pd.DataFrame(companies_data, columns=companies_cols),\n",
+ "                      pd.DataFrame(lang_data, columns=lang_cols)], axis=1)\n",
+ "\n",
+ "# Attach every credit feature in one concat. Assigning them with\n",
+ "# `credits[credit_cols] = credit_data` inserts the columns one at a time and\n",
+ "# triggers pandas' 'highly fragmented' PerformanceWarning.\n",
+ "# Any existing copy of these columns is dropped first, so they are replaced\n",
+ "# rather than duplicated.\n",
+ "credits = pd.concat(\n",
+ "    [credits.drop(columns=credit_cols, errors='ignore'),\n",
+ "     pd.DataFrame(credit_data, columns=credit_cols, index=credits.index)],\n",
+ "    axis=1)\n",
+ "metadata = pd.merge(metadata, credits, how='inner', on='id')\n",
+ "metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {
+ "id": "jONYrkyX4ylp"
+ },
+ "outputs": [],
+ "source": [
+ "#metadata.drop(['production_countries', 'genres', 'belongs_to_collection', 'keywords', 'production_companies', 'original_language'], axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Xw1AygX879iQ"
+ },
+ "source": [
+ "List of all numerical feature columns (everything except `id` and `title`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "KvUlIctgo58P",
+ "outputId": "48ceca68-86eb-47fc-8860-37346cbaffd7"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array(['adult', 'video', 'genre:', ..., 'cast:woody harrelson',\n",
+ " 'cast:xander berkeley', 'cast:yuen biao'], dtype=object)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 31
+ }
+ ],
+ "source": [
+ "# Collect the names of all feature columns: the two flag columns followed by\n",
+ "# the column names of every encoded group.\n",
+ "feature_cols = np.concatenate([\n",
+ "    np.array(['adult', 'video']),\n",
+ "    genre_cols,\n",
+ "    countries_cols,\n",
+ "    collection_cols,\n",
+ "    keyword_cols,\n",
+ "    companies_cols,\n",
+ "    lang_cols,\n",
+ "    credit_cols,\n",
+ "])\n",
+ "feature_cols\n",
+ "#metadata[feature_cols] = metadata[feature_cols].astype('int8')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Release the intermediate encoded arrays and their column-name arrays; their\n",
+ "# contents already live in `metadata` and `feature_cols`.\n",
+ "del (genre_data, countries_data, collection_data, keyword_data,\n",
+ "     companies_data, lang_data, credit_data)\n",
+ "del (genre_cols, countries_cols, collection_cols, keyword_cols,\n",
+ "     companies_cols, lang_cols, credit_cols)"
+ ],
+ "metadata": {
+ "id": "CavDxPCEVFfR"
+ },
+ "execution_count": 32,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "orMZ0gXh6znM",
+ "outputId": "990f15a7-091d-4618-f87d-9d237b500fb6"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(3672,)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 33
+ }
+ ],
+ "source": [
+ "feature_cols.shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 609
+ },
+ "id": "vRWJ9z7I591C",
+ "outputId": "29b45b84-7eaa-4873-d1fa-2a7f17c50bb9"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " title id adult video genre: \\\n",
+ "0 Toy Story 862 0 0 0 \n",
+ "1 Jumanji 8844 0 0 0 \n",
+ "2 Grumpier Old Men 15602 0 0 0 \n",
+ "3 Waiting to Exhale 31357 0 0 0 \n",
+ "4 Father of the Bride Part II 11862 0 0 0 \n",
+ "... ... ... ... ... ... \n",
+ "46159 Subdue 439050 0 0 0 \n",
+ "46160 Century of Birthing 111109 0 0 0 \n",
+ "46161 Betrayal 67758 0 0 0 \n",
+ "46162 Satan Triumphant 227506 0 0 1 \n",
+ "46163 Queerama 461257 0 0 1 \n",
+ "\n",
+ " genre:action genre:adventure genre:animation genre:comedy \\\n",
+ "0 0 0 1 1 \n",
+ "1 0 1 0 0 \n",
+ "2 0 0 0 1 \n",
+ "3 0 0 0 1 \n",
+ "4 0 0 0 1 \n",
+ "... ... ... ... ... \n",
+ "46159 0 0 0 0 \n",
+ "46160 0 0 0 0 \n",
+ "46161 1 0 0 0 \n",
+ "46162 0 0 0 0 \n",
+ "46163 0 0 0 0 \n",
+ "\n",
+ " genre:crime ... cast:william hurt cast:william powell \\\n",
+ "0 0 ... 0 0 \n",
+ "1 0 ... 0 0 \n",
+ "2 0 ... 0 0 \n",
+ "3 0 ... 0 0 \n",
+ "4 0 ... 0 0 \n",
+ "... ... ... ... ... \n",
+ "46159 0 ... 0 0 \n",
+ "46160 0 ... 0 0 \n",
+ "46161 0 ... 0 0 \n",
+ "46162 0 ... 0 0 \n",
+ "46163 0 ... 0 0 \n",
+ "\n",
+ " cast:william sadler cast:william schallert cast:william shatner \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "... ... ... ... \n",
+ "46159 0 0 0 \n",
+ "46160 0 0 0 \n",
+ "46161 0 0 0 \n",
+ "46162 0 0 0 \n",
+ "46163 0 0 0 \n",
+ "\n",
+ " cast:winona ryder cast:woody allen cast:woody harrelson \\\n",
+ "0 0 0 0 \n",
+ "1 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "... ... ... ... \n",
+ "46159 0 0 0 \n",
+ "46160 0 0 0 \n",
+ "46161 0 0 0 \n",
+ "46162 0 0 0 \n",
+ "46163 0 0 0 \n",
+ "\n",
+ " cast:xander berkeley cast:yuen biao \n",
+ "0 0 0 \n",
+ "1 0 0 \n",
+ "2 0 0 \n",
+ "3 0 0 \n",
+ "4 0 0 \n",
+ "... ... ... \n",
+ "46159 0 0 \n",
+ "46160 0 0 \n",
+ "46161 0 0 \n",
+ "46162 0 0 \n",
+ "46163 0 0 \n",
+ "\n",
+ "[46164 rows x 3675 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " id | \n",
+ " adult | \n",
+ " video | \n",
+ " genre: | \n",
+ " genre:action | \n",
+ " genre:adventure | \n",
+ " genre:animation | \n",
+ " genre:comedy | \n",
+ " genre:crime | \n",
+ " ... | \n",
+ " cast:william hurt | \n",
+ " cast:william powell | \n",
+ " cast:william sadler | \n",
+ " cast:william schallert | \n",
+ " cast:william shatner | \n",
+ " cast:winona ryder | \n",
+ " cast:woody allen | \n",
+ " cast:woody harrelson | \n",
+ " cast:xander berkeley | \n",
+ " cast:yuen biao | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Toy Story | \n",
+ " 862 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Jumanji | \n",
+ " 8844 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Grumpier Old Men | \n",
+ " 15602 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Waiting to Exhale | \n",
+ " 31357 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Father of the Bride Part II | \n",
+ " 11862 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 46159 | \n",
+ " Subdue | \n",
+ " 439050 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46160 | \n",
+ " Century of Birthing | \n",
+ " 111109 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46161 | \n",
+ " Betrayal | \n",
+ " 67758 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46162 | \n",
+ " Satan Triumphant | \n",
+ " 227506 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 46163 | \n",
+ " Queerama | \n",
+ " 461257 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
46164 rows × 3675 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 34
+ }
+ ],
+ "source": [
+ "metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def split_dataframe(df, holdout_fraction=0.1, random_state=None):\n",
+ "    \"\"\"Randomly split `df` into a training part and a held-out test part.\n",
+ "\n",
+ "    Args:\n",
+ "        df: DataFrame to split. Rows are assigned by index label, so the\n",
+ "            labels should be unique.\n",
+ "        holdout_fraction: Fraction of rows sampled (without replacement)\n",
+ "            into the test part.\n",
+ "        random_state: Optional seed forwarded to `DataFrame.sample`. The\n",
+ "            default None gives a different split on every call; pass an int\n",
+ "            to make the split reproducible.\n",
+ "\n",
+ "    Returns:\n",
+ "        (train, test): the rows that were not sampled and the sampled rows.\n",
+ "    \"\"\"\n",
+ "    test = df.sample(frac=holdout_fraction, replace=False, random_state=random_state)\n",
+ "    # Everything whose index label was not drawn into `test` is training data.\n",
+ "    train = df[~df.index.isin(test.index)]\n",
+ "    return train, test\n",
+ "\n",
+ "train, test = split_dataframe(metadata)"
+ ],
+ "metadata": {
+ "id": "s-OAc5zG-qkJ"
+ },
+ "execution_count": 35,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Keep the ids of every movie (train and test rows) before `metadata` is deleted below.\n",
+ "allIds = metadata['id']\n",
+ "\n",
+ "# Split the training rows into `number_of_batches` chunks and record the setting in MLflow.\n",
+ "number_of_batches = 4\n",
+ "batches = np.array_split(train, number_of_batches)\n",
+ "mf.log_param('number of batches', number_of_batches)\n",
+ "# Drop the names of the full and training frames; from here on the rows are\n",
+ "# reached through `batches` and `test`. Code that still reads `metadata`\n",
+ "# after this cell raises NameError unless the frame is rebuilt.\n",
+ "del metadata\n",
+ "del train"
+ ],
+ "metadata": {
+ "id": "_sS14fV3Zr6n"
+ },
+ "execution_count": 36,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "BZvzSjJUG3nX"
+ },
+ "source": [
+ "## Algorithm\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "batches[0]"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 696
+ },
+ "id": "e6kQZWCWgBC2",
+ "outputId": "cefa0dc8-47b6-41df-c12e-bc0f12c59845"
+ },
+ "execution_count": 37,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " title id adult video genre: genre:action \\\n",
+ "0 Toy Story 862 0 0 0 0 \n",
+ "2 Grumpier Old Men 15602 0 0 0 0 \n",
+ "3 Waiting to Exhale 31357 0 0 0 0 \n",
+ "4 Father of the Bride Part II 11862 0 0 0 0 \n",
+ "5 Heat 949 0 0 0 1 \n",
+ "... ... ... ... ... ... ... \n",
+ "11542 The Bothersome Man 13318 0 0 0 0 \n",
+ "11543 Don't Drink the Water 10462 0 0 0 0 \n",
+ "11544 The Good German 182 0 0 0 0 \n",
+ "11546 Letters from Iwo Jima 1251 0 0 0 1 \n",
+ "11547 Presenting Lily Mars 43512 0 0 0 0 \n",
+ "\n",
+ " genre:adventure genre:animation genre:comedy genre:crime ... \\\n",
+ "0 0 1 1 0 ... \n",
+ "2 0 0 1 0 ... \n",
+ "3 0 0 1 0 ... \n",
+ "4 0 0 1 0 ... \n",
+ "5 0 0 0 1 ... \n",
+ "... ... ... ... ... ... \n",
+ "11542 0 0 1 0 ... \n",
+ "11543 0 0 1 0 ... \n",
+ "11544 0 0 0 1 ... \n",
+ "11546 1 0 0 0 ... \n",
+ "11547 0 0 0 0 ... \n",
+ "\n",
+ " cast:william hurt cast:william powell cast:william sadler \\\n",
+ "0 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "5 0 0 0 \n",
+ "... ... ... ... \n",
+ "11542 0 0 0 \n",
+ "11543 0 0 0 \n",
+ "11544 0 0 0 \n",
+ "11546 0 0 0 \n",
+ "11547 0 0 0 \n",
+ "\n",
+ " cast:william schallert cast:william shatner cast:winona ryder \\\n",
+ "0 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "5 0 0 0 \n",
+ "... ... ... ... \n",
+ "11542 0 0 0 \n",
+ "11543 0 0 0 \n",
+ "11544 0 0 0 \n",
+ "11546 0 0 0 \n",
+ "11547 0 0 0 \n",
+ "\n",
+ " cast:woody allen cast:woody harrelson cast:xander berkeley \\\n",
+ "0 0 0 0 \n",
+ "2 0 0 0 \n",
+ "3 0 0 0 \n",
+ "4 0 0 0 \n",
+ "5 0 0 1 \n",
+ "... ... ... ... \n",
+ "11542 0 0 0 \n",
+ "11543 1 0 0 \n",
+ "11544 0 0 0 \n",
+ "11546 0 0 0 \n",
+ "11547 0 0 0 \n",
+ "\n",
+ " cast:yuen biao \n",
+ "0 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ "5 0 \n",
+ "... ... \n",
+ "11542 0 \n",
+ "11543 0 \n",
+ "11544 0 \n",
+ "11546 0 \n",
+ "11547 0 \n",
+ "\n",
+ "[10387 rows x 3675 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " id | \n",
+ " adult | \n",
+ " video | \n",
+ " genre: | \n",
+ " genre:action | \n",
+ " genre:adventure | \n",
+ " genre:animation | \n",
+ " genre:comedy | \n",
+ " genre:crime | \n",
+ " ... | \n",
+ " cast:william hurt | \n",
+ " cast:william powell | \n",
+ " cast:william sadler | \n",
+ " cast:william schallert | \n",
+ " cast:william shatner | \n",
+ " cast:winona ryder | \n",
+ " cast:woody allen | \n",
+ " cast:woody harrelson | \n",
+ " cast:xander berkeley | \n",
+ " cast:yuen biao | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Toy Story | \n",
+ " 862 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " Grumpier Old Men | \n",
+ " 15602 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Waiting to Exhale | \n",
+ " 31357 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Father of the Bride Part II | \n",
+ " 11862 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " Heat | \n",
+ " 949 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 11542 | \n",
+ " The Bothersome Man | \n",
+ " 13318 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 11543 | \n",
+ " Don't Drink the Water | \n",
+ " 10462 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 11544 | \n",
+ " The Good German | \n",
+ " 182 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 11546 | \n",
+ " Letters from Iwo Jima | \n",
+ " 1251 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 11547 | \n",
+ " Presenting Lily Mars | \n",
+ " 43512 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10387 rows × 3675 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 37
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "id": "XyGIImAhG7ZI"
+ },
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics.pairwise import cosine_similarity"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SUJ0Cc9H0KIF"
+ },
+ "source": [
+ "`content_based_recommender` returns a list of movie ids based on its input. The input should be a dataframe with `movieId` and `rating` columns (like `ratings_small.csv` but without `userId`)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {
+ "id": "u2IFqWvQKWb3"
+ },
+ "outputs": [],
+ "source": [
+ "number_of_batches =1\n",
+ "def content_based_recommender_movie(movieId, df=None):\n",
+ "    \"\"\"Print the title of `movieId` and return the pairwise cosine similarities.\n",
+ "\n",
+ "    Args:\n",
+ "        movieId: Value to look up in the `id` column.\n",
+ "        df: DataFrame holding `id`, `title` and every column listed in\n",
+ "            `feature_cols`. Defaults to the notebook-level `metadata` frame;\n",
+ "            pass a frame explicitly (e.g. one of `batches`) once `metadata`\n",
+ "            has been deleted.\n",
+ "\n",
+ "    Returns:\n",
+ "        Square array of cosine similarities between all rows of `df`, in the\n",
+ "        row order of `df`. Its size grows with len(df) ** 2.\n",
+ "    \"\"\"\n",
+ "    if df is None:\n",
+ "        # Original behaviour: use the global frame. It is deleted earlier in\n",
+ "        # this notebook, so this raises NameError unless it has been rebuilt.\n",
+ "        df = metadata\n",
+ "    # Show only the title(s) of the matching row(s), as the label promises.\n",
+ "    print(\"movie title is:\", df.loc[df['id'] == movieId, 'title'].tolist())\n",
+ "    sim_mat = cosine_similarity(df[feature_cols])\n",
+ "    return sim_mat\n",
+ "\n",
+ "#content_based_recommender_movie(272)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "batches[1].describe()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 411
+ },
+ "id": "K_kmTTPmv3GZ",
+ "outputId": "82087e57-01db-42cb-cfde-b950569ff26a"
+ },
+ "execution_count": 40,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id adult video genre: genre:action \\\n",
+ "count 10387.000000 10387.000000 10387.0 10387.000000 10387.000000 \n",
+ "mean 70745.557909 0.000096 0.0 0.039569 0.151054 \n",
+ "std 63999.684940 0.009812 0.0 0.194953 0.358119 \n",
+ "min 3.000000 0.000000 0.0 0.000000 0.000000 \n",
+ "25% 25769.500000 0.000000 0.0 0.000000 0.000000 \n",
+ "50% 50675.000000 0.000000 0.0 0.000000 0.000000 \n",
+ "75% 94217.000000 0.000000 0.0 0.000000 0.000000 \n",
+ "max 469172.000000 1.000000 0.0 1.000000 1.000000 \n",
+ "\n",
+ " genre:adventure genre:animation genre:comedy genre:crime \\\n",
+ "count 10387.000000 10387.000000 10387.000000 10387.000000 \n",
+ "mean 0.071628 0.038125 0.265813 0.094541 \n",
+ "std 0.257883 0.191506 0.441786 0.292594 \n",
+ "min 0.000000 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 0.000000 0.000000 0.000000 \n",
+ "50% 0.000000 0.000000 0.000000 0.000000 \n",
+ "75% 0.000000 0.000000 1.000000 0.000000 \n",
+ "max 1.000000 1.000000 1.000000 1.000000 \n",
+ "\n",
+ " genre:documentary ... cast:william hurt cast:william powell \\\n",
+ "count 10387.000000 ... 10387.000000 10387.000000 \n",
+ "mean 0.111678 ... 0.001637 0.000674 \n",
+ "std 0.314985 ... 0.040424 0.025952 \n",
+ "min 0.000000 ... 0.000000 0.000000 \n",
+ "25% 0.000000 ... 0.000000 0.000000 \n",
+ "50% 0.000000 ... 0.000000 0.000000 \n",
+ "75% 0.000000 ... 0.000000 0.000000 \n",
+ "max 1.000000 ... 1.000000 1.000000 \n",
+ "\n",
+ " cast:william sadler cast:william schallert cast:william shatner \\\n",
+ "count 10387.000000 10387.000000 10387.000000 \n",
+ "mean 0.001348 0.001252 0.000674 \n",
+ "std 0.036690 0.037983 0.025952 \n",
+ "min 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 0.000000 0.000000 \n",
+ "50% 0.000000 0.000000 0.000000 \n",
+ "75% 0.000000 0.000000 0.000000 \n",
+ "max 1.000000 2.000000 1.000000 \n",
+ "\n",
+ " cast:winona ryder cast:woody allen cast:woody harrelson \\\n",
+ "count 10387.000000 10387.000000 10387.000000 \n",
+ "mean 0.001059 0.000578 0.002214 \n",
+ "std 0.032527 0.024028 0.047007 \n",
+ "min 0.000000 0.000000 0.000000 \n",
+ "25% 0.000000 0.000000 0.000000 \n",
+ "50% 0.000000 0.000000 0.000000 \n",
+ "75% 0.000000 0.000000 0.000000 \n",
+ "max 1.000000 1.000000 1.000000 \n",
+ "\n",
+ " cast:xander berkeley cast:yuen biao \n",
+ "count 10387.000000 10387.000000 \n",
+ "mean 0.000770 0.001155 \n",
+ "std 0.027743 0.036697 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 0.000000 \n",
+ "50% 0.000000 0.000000 \n",
+ "75% 0.000000 0.000000 \n",
+ "max 1.000000 2.000000 \n",
+ "\n",
+ "[8 rows x 3673 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " adult | \n",
+ " video | \n",
+ " genre: | \n",
+ " genre:action | \n",
+ " genre:adventure | \n",
+ " genre:animation | \n",
+ " genre:comedy | \n",
+ " genre:crime | \n",
+ " genre:documentary | \n",
+ " ... | \n",
+ " cast:william hurt | \n",
+ " cast:william powell | \n",
+ " cast:william sadler | \n",
+ " cast:william schallert | \n",
+ " cast:william shatner | \n",
+ " cast:winona ryder | \n",
+ " cast:woody allen | \n",
+ " cast:woody harrelson | \n",
+ " cast:xander berkeley | \n",
+ " cast:yuen biao | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.0 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " ... | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ " 10387.000000 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 70745.557909 | \n",
+ " 0.000096 | \n",
+ " 0.0 | \n",
+ " 0.039569 | \n",
+ " 0.151054 | \n",
+ " 0.071628 | \n",
+ " 0.038125 | \n",
+ " 0.265813 | \n",
+ " 0.094541 | \n",
+ " 0.111678 | \n",
+ " ... | \n",
+ " 0.001637 | \n",
+ " 0.000674 | \n",
+ " 0.001348 | \n",
+ " 0.001252 | \n",
+ " 0.000674 | \n",
+ " 0.001059 | \n",
+ " 0.000578 | \n",
+ " 0.002214 | \n",
+ " 0.000770 | \n",
+ " 0.001155 | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 63999.684940 | \n",
+ " 0.009812 | \n",
+ " 0.0 | \n",
+ " 0.194953 | \n",
+ " 0.358119 | \n",
+ " 0.257883 | \n",
+ " 0.191506 | \n",
+ " 0.441786 | \n",
+ " 0.292594 | \n",
+ " 0.314985 | \n",
+ " ... | \n",
+ " 0.040424 | \n",
+ " 0.025952 | \n",
+ " 0.036690 | \n",
+ " 0.037983 | \n",
+ " 0.025952 | \n",
+ " 0.032527 | \n",
+ " 0.024028 | \n",
+ " 0.047007 | \n",
+ " 0.027743 | \n",
+ " 0.036697 | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 3.000000 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 25769.500000 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 50675.000000 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 94217.000000 | \n",
+ " 0.000000 | \n",
+ " 0.0 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 1.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " ... | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 469172.000000 | \n",
+ " 1.000000 | \n",
+ " 0.0 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " ... | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 2.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 1.000000 | \n",
+ " 2.000000 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
8 rows × 3673 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 40
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {
+ "id": "YgtIqUAvvoSC",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "a15673ec-f5f3-42a3-a503-493261375544"
+ },
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(10387, 3)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 41
+ }
+ ],
+ "source": [
+ "from sklearn.metrics.pairwise import euclidean_distances as dist\n",
+ "def content_based_recommender(user, df, k=10, movieIds=allIds):\n",
+ "    \"\"\"Score the candidate movies of one batch against a user's rating-weighted profile.\n",
+ "\n",
+ "    user     -- ratings of a single user (needs 'movieId' and 'rating' columns).\n",
+ "    df       -- one movie batch with 'id', 'title' and the `feature_cols` columns.\n",
+ "    k        -- kept for interface compatibility; the result is not truncated to k rows.\n",
+ "    movieIds -- ids of the candidate movies to score (defaults to `allIds`).\n",
+ "\n",
+ "    Returns a DataFrame with columns 'id', 'title' and 'sim' for the candidates found in `df`.\n",
+ "    \"\"\"\n",
+ "    # Compute the candidate selection once instead of re-evaluating the mask per column.\n",
+ "    candidates = df[df.id.isin(movieIds)]\n",
+ "    user_movies = pd.merge(user, df, how='inner', left_on='movieId', right_on='id')\n",
+ "    if user_movies.empty or candidates.empty:\n",
+ "        # With no rated movie in this batch the mean profile is all-NaN, and with no candidate\n",
+ "        # there is nothing to compare; cosine_similarity rejects both inputs, so report a\n",
+ "        # neutral similarity of 0 for whatever candidates exist.\n",
+ "        sims = np.zeros(len(candidates))\n",
+ "    else:\n",
+ "        # Weight every feature row by the rating the user gave that movie, then average.\n",
+ "        weighted = user_movies[feature_cols].multiply(user_movies['rating'], axis='index')\n",
+ "        profile = weighted.mean(axis=0).values.reshape(1, -1)\n",
+ "        sims = cosine_similarity(candidates[feature_cols], profile).flatten()\n",
+ "    return pd.DataFrame({'id': candidates['id'], 'title': candidates['title'], 'sim': sims})\n",
+ "\n",
+ "def content_based_all_batches(user, k=10, movieIds=allIds):\n",
+ "    \"\"\"Score every batch for `user` and return all candidates sorted by similarity, best first.\"\"\"\n",
+ "    # DataFrame.append returns a new frame (and was removed in pandas 2.0), so calling it without\n",
+ "    # using the result silently dropped every batch after the first. Collect and concat instead.\n",
+ "    per_batch = [content_based_recommender(user, batches[i], k, movieIds) for i in range(number_of_batches)]\n",
+ "    return pd.concat(per_batch).sort_values(by='sim', ascending=False)\n",
+ " \n",
+ "\n",
+ "# Value passed as `k` to the recommender; also recorded as an MLflow run parameter.\n",
+ "content_based_k = 10\n",
+ "mf.log_param('content based k', content_based_k)\n",
+ "#xx = content_based_recommender(rating[rating['userId'] == 1], batches[1], content_based_k)\n",
+ "# Smoke test: score the candidates for user 1 and display the shape of the result.\n",
+ "xx = content_based_all_batches(rating[rating['userId'] == 1], content_based_k)\n",
+ "xx.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Collaborative Filtering"
+ ],
+ "metadata": {
+ "id": "lzvbAt8G4dXl"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### import libraries"
+ ],
+ "metadata": {
+ "id": "8M7o2QcASuz_"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from sklearn.utils.extmath import randomized_svd"
+ ],
+ "metadata": {
+ "id": "O4tIp_1tSu0A"
+ },
+ "execution_count": 42,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### explore datasets"
+ ],
+ "metadata": {
+ "id": "pHlQSPF7S2HM"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "rating = pd.read_csv('/content/IMDB/ratings_small.csv')\n",
+ "rating.head()"
+ ],
+ "metadata": {
+ "id": "1lyBZ3Tf1oGN",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "outputId": "47f50819-5da6-49d6-9b39-928a31c19dea"
+ },
+ "execution_count": 43,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " userId movieId rating timestamp\n",
+ "0 1 31 2.5 1260759144\n",
+ "1 1 1029 3.0 1260759179\n",
+ "2 1 1061 3.0 1260759182\n",
+ "3 1 1129 2.0 1260759185\n",
+ "4 1 1172 4.0 1260759205"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " rating | \n",
+ " timestamp | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 31 | \n",
+ " 2.5 | \n",
+ " 1260759144 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1029 | \n",
+ " 3.0 | \n",
+ " 1260759179 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1061 | \n",
+ " 3.0 | \n",
+ " 1260759182 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1129 | \n",
+ " 2.0 | \n",
+ " 1260759185 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1172 | \n",
+ " 4.0 | \n",
+ " 1260759205 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 43
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "rating.shape"
+ ],
+ "metadata": {
+ "id": "8X7DuYoXV2OT",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "7b2810a0-f159-4217-834d-cf882efe0705"
+ },
+ "execution_count": 44,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(100004, 4)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 44
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "links_small = pd.read_csv('/content/IMDB/links_small.csv')\n",
+ "links_small.head()"
+ ],
+ "metadata": {
+ "id": "lHxWpbnLSDqM",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "outputId": "b7c1e95e-8a4b-46e5-ca06-33baea81964b"
+ },
+ "execution_count": 45,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " movieId imdbId tmdbId\n",
+ "0 1 114709 862.0\n",
+ "1 2 113497 8844.0\n",
+ "2 3 113228 15602.0\n",
+ "3 4 114885 31357.0\n",
+ "4 5 113041 11862.0"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " movieId | \n",
+ " imdbId | \n",
+ " tmdbId | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 114709 | \n",
+ " 862.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 113497 | \n",
+ " 8844.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 113228 | \n",
+ " 15602.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 114885 | \n",
+ " 31357.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 113041 | \n",
+ " 11862.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 45
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "credits = pd.read_csv('/content/IMDB/credits.csv')\n",
+ "credits.head()"
+ ],
+ "metadata": {
+ "id": "Z0jZ8QI_SM39",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "outputId": "3e4c5fcc-e763-4ba3-d699-9fda6cdce923"
+ },
+ "execution_count": 46,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " cast \\\n",
+ "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n",
+ "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n",
+ "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n",
+ "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n",
+ "4 [{'cast_id': 1, 'character': 'George Banks', '... \n",
+ "\n",
+ " crew id \n",
+ "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n",
+ "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n",
+ "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n",
+ "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n",
+ "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " cast | \n",
+ " crew | \n",
+ " id | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " [{'cast_id': 14, 'character': 'Woody (voice)',... | \n",
+ " [{'credit_id': '52fe4284c3a36847f8024f49', 'de... | \n",
+ " 862 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " [{'cast_id': 1, 'character': 'Alan Parrish', '... | \n",
+ " [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... | \n",
+ " 8844 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [{'cast_id': 2, 'character': 'Max Goldman', 'c... | \n",
+ " [{'credit_id': '52fe466a9251416c75077a89', 'de... | \n",
+ " 15602 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [{'cast_id': 1, 'character': \"Savannah 'Vannah... | \n",
+ " [{'credit_id': '52fe44779251416c91011acb', 'de... | \n",
+ " 31357 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [{'cast_id': 1, 'character': 'George Banks', '... | \n",
+ " [{'credit_id': '52fe44959251416c75039ed7', 'de... | \n",
+ " 11862 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 46
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# low_memory=False parses each column in a single pass, so pandas infers one dtype per column\n",
+ "# instead of emitting a DtypeWarning about mixed types in column 10.\n",
+ "movie = pd.read_csv('/content/IMDB/movies_metadata.csv', low_memory=False)\n",
+ "movie.head()"
+ ],
+ "metadata": {
+ "id": "oIeGPmPI1tAk",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 787
+ },
+ "outputId": "4ab975fd-8432-418d-c826-0e85e14b6704"
+ },
+ "execution_count": 47,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.8/dist-packages/IPython/core/interactiveshell.py:3326: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.\n",
+ " exec(code_obj, self.user_global_ns, self.user_ns)\n"
+ ]
+ },
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " adult belongs_to_collection budget \\\n",
+ "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
+ "1 False NaN 65000000 \n",
+ "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
+ "3 False NaN 16000000 \n",
+ "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
+ "\n",
+ " genres \\\n",
+ "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
+ "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
+ "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
+ "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
+ "4 [{'id': 35, 'name': 'Comedy'}] \n",
+ "\n",
+ " homepage id imdb_id original_language \\\n",
+ "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n",
+ "1 NaN 8844 tt0113497 en \n",
+ "2 NaN 15602 tt0113228 en \n",
+ "3 NaN 31357 tt0114885 en \n",
+ "4 NaN 11862 tt0113041 en \n",
+ "\n",
+ " original_title \\\n",
+ "0 Toy Story \n",
+ "1 Jumanji \n",
+ "2 Grumpier Old Men \n",
+ "3 Waiting to Exhale \n",
+ "4 Father of the Bride Part II \n",
+ "\n",
+ " overview ... release_date \\\n",
+ "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
+ "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
+ "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
+ "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
+ "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
+ "\n",
+ " revenue runtime spoken_languages \\\n",
+ "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
+ "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "\n",
+ " status tagline \\\n",
+ "0 Released NaN \n",
+ "1 Released Roll the dice and unleash the excitement! \n",
+ "2 Released Still Yelling. Still Fighting. Still Ready for... \n",
+ "3 Released Friends are the people who let you be yourself... \n",
+ "4 Released Just When His World Is Back To Normal... He's ... \n",
+ "\n",
+ " title video vote_average vote_count \n",
+ "0 Toy Story False 7.7 5415.0 \n",
+ "1 Jumanji False 6.9 2413.0 \n",
+ "2 Grumpier Old Men False 6.5 92.0 \n",
+ "3 Waiting to Exhale False 6.1 34.0 \n",
+ "4 Father of the Bride Part II False 5.7 173.0 \n",
+ "\n",
+ "[5 rows x 24 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " adult | \n",
+ " belongs_to_collection | \n",
+ " budget | \n",
+ " genres | \n",
+ " homepage | \n",
+ " id | \n",
+ " imdb_id | \n",
+ " original_language | \n",
+ " original_title | \n",
+ " overview | \n",
+ " ... | \n",
+ " release_date | \n",
+ " revenue | \n",
+ " runtime | \n",
+ " spoken_languages | \n",
+ " status | \n",
+ " tagline | \n",
+ " title | \n",
+ " video | \n",
+ " vote_average | \n",
+ " vote_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " False | \n",
+ " {'id': 10194, 'name': 'Toy Story Collection', ... | \n",
+ " 30000000 | \n",
+ " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
+ " http://toystory.disney.com/toy-story | \n",
+ " 862 | \n",
+ " tt0114709 | \n",
+ " en | \n",
+ " Toy Story | \n",
+ " Led by Woody, Andy's toys live happily in his ... | \n",
+ " ... | \n",
+ " 1995-10-30 | \n",
+ " 373554033.0 | \n",
+ " 81.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Toy Story | \n",
+ " False | \n",
+ " 7.7 | \n",
+ " 5415.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 65000000 | \n",
+ " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
+ " NaN | \n",
+ " 8844 | \n",
+ " tt0113497 | \n",
+ " en | \n",
+ " Jumanji | \n",
+ " When siblings Judy and Peter discover an encha... | \n",
+ " ... | \n",
+ " 1995-12-15 | \n",
+ " 262797249.0 | \n",
+ " 104.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | \n",
+ " Released | \n",
+ " Roll the dice and unleash the excitement! | \n",
+ " Jumanji | \n",
+ " False | \n",
+ " 6.9 | \n",
+ " 2413.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " False | \n",
+ " {'id': 119050, 'name': 'Grumpy Old Men Collect... | \n",
+ " 0 | \n",
+ " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
+ " NaN | \n",
+ " 15602 | \n",
+ " tt0113228 | \n",
+ " en | \n",
+ " Grumpier Old Men | \n",
+ " A family wedding reignites the ancient feud be... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 0.0 | \n",
+ " 101.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Still Yelling. Still Fighting. Still Ready for... | \n",
+ " Grumpier Old Men | \n",
+ " False | \n",
+ " 6.5 | \n",
+ " 92.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 16000000 | \n",
+ " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
+ " NaN | \n",
+ " 31357 | \n",
+ " tt0114885 | \n",
+ " en | \n",
+ " Waiting to Exhale | \n",
+ " Cheated on, mistreated and stepped on, the wom... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 81452156.0 | \n",
+ " 127.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Friends are the people who let you be yourself... | \n",
+ " Waiting to Exhale | \n",
+ " False | \n",
+ " 6.1 | \n",
+ " 34.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " False | \n",
+ " {'id': 96871, 'name': 'Father of the Bride Col... | \n",
+ " 0 | \n",
+ " [{'id': 35, 'name': 'Comedy'}] | \n",
+ " NaN | \n",
+ " 11862 | \n",
+ " tt0113041 | \n",
+ " en | \n",
+ " Father of the Bride Part II | \n",
+ " Just when George Banks has recovered from his ... | \n",
+ " ... | \n",
+ " 1995-02-10 | \n",
+ " 76578911.0 | \n",
+ " 106.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Just When His World Is Back To Normal... He's ... | \n",
+ " Father of the Bride Part II | \n",
+ " False | \n",
+ " 5.7 | \n",
+ " 173.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 24 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 47
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "movie = movie.rename(columns={'id': 'movieId'})"
+ ],
+ "metadata": {
+ "id": "rmDYxAgOgRNj"
+ },
+ "execution_count": 48,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "movie.shape"
+ ],
+ "metadata": {
+ "id": "DoSsZcRpjo7Y",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "2c9f7194-d614-443b-d091-5f68f0e90655"
+ },
+ "execution_count": 49,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(45466, 24)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 49
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "movie.head()"
+ ],
+ "metadata": {
+ "id": "6XmWaDvFgeGU",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 750
+ },
+ "outputId": "8be37316-480a-43d3-82fa-40b236c09c26"
+ },
+ "execution_count": 50,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " adult belongs_to_collection budget \\\n",
+ "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
+ "1 False NaN 65000000 \n",
+ "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
+ "3 False NaN 16000000 \n",
+ "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
+ "\n",
+ " genres \\\n",
+ "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n",
+ "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n",
+ "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n",
+ "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n",
+ "4 [{'id': 35, 'name': 'Comedy'}] \n",
+ "\n",
+ " homepage movieId imdb_id original_language \\\n",
+ "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n",
+ "1 NaN 8844 tt0113497 en \n",
+ "2 NaN 15602 tt0113228 en \n",
+ "3 NaN 31357 tt0114885 en \n",
+ "4 NaN 11862 tt0113041 en \n",
+ "\n",
+ " original_title \\\n",
+ "0 Toy Story \n",
+ "1 Jumanji \n",
+ "2 Grumpier Old Men \n",
+ "3 Waiting to Exhale \n",
+ "4 Father of the Bride Part II \n",
+ "\n",
+ " overview ... release_date \\\n",
+ "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
+ "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
+ "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
+ "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
+ "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
+ "\n",
+ " revenue runtime spoken_languages \\\n",
+ "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
+ "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "\n",
+ " status tagline \\\n",
+ "0 Released NaN \n",
+ "1 Released Roll the dice and unleash the excitement! \n",
+ "2 Released Still Yelling. Still Fighting. Still Ready for... \n",
+ "3 Released Friends are the people who let you be yourself... \n",
+ "4 Released Just When His World Is Back To Normal... He's ... \n",
+ "\n",
+ " title video vote_average vote_count \n",
+ "0 Toy Story False 7.7 5415.0 \n",
+ "1 Jumanji False 6.9 2413.0 \n",
+ "2 Grumpier Old Men False 6.5 92.0 \n",
+ "3 Waiting to Exhale False 6.1 34.0 \n",
+ "4 Father of the Bride Part II False 5.7 173.0 \n",
+ "\n",
+ "[5 rows x 24 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " adult | \n",
+ " belongs_to_collection | \n",
+ " budget | \n",
+ " genres | \n",
+ " homepage | \n",
+ " movieId | \n",
+ " imdb_id | \n",
+ " original_language | \n",
+ " original_title | \n",
+ " overview | \n",
+ " ... | \n",
+ " release_date | \n",
+ " revenue | \n",
+ " runtime | \n",
+ " spoken_languages | \n",
+ " status | \n",
+ " tagline | \n",
+ " title | \n",
+ " video | \n",
+ " vote_average | \n",
+ " vote_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " False | \n",
+ " {'id': 10194, 'name': 'Toy Story Collection', ... | \n",
+ " 30000000 | \n",
+ " [{'id': 16, 'name': 'Animation'}, {'id': 35, '... | \n",
+ " http://toystory.disney.com/toy-story | \n",
+ " 862 | \n",
+ " tt0114709 | \n",
+ " en | \n",
+ " Toy Story | \n",
+ " Led by Woody, Andy's toys live happily in his ... | \n",
+ " ... | \n",
+ " 1995-10-30 | \n",
+ " 373554033.0 | \n",
+ " 81.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Toy Story | \n",
+ " False | \n",
+ " 7.7 | \n",
+ " 5415.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 65000000 | \n",
+ " [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... | \n",
+ " NaN | \n",
+ " 8844 | \n",
+ " tt0113497 | \n",
+ " en | \n",
+ " Jumanji | \n",
+ " When siblings Judy and Peter discover an encha... | \n",
+ " ... | \n",
+ " 1995-12-15 | \n",
+ " 262797249.0 | \n",
+ " 104.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | \n",
+ " Released | \n",
+ " Roll the dice and unleash the excitement! | \n",
+ " Jumanji | \n",
+ " False | \n",
+ " 6.9 | \n",
+ " 2413.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " False | \n",
+ " {'id': 119050, 'name': 'Grumpy Old Men Collect... | \n",
+ " 0 | \n",
+ " [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... | \n",
+ " NaN | \n",
+ " 15602 | \n",
+ " tt0113228 | \n",
+ " en | \n",
+ " Grumpier Old Men | \n",
+ " A family wedding reignites the ancient feud be... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 0.0 | \n",
+ " 101.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Still Yelling. Still Fighting. Still Ready for... | \n",
+ " Grumpier Old Men | \n",
+ " False | \n",
+ " 6.5 | \n",
+ " 92.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 16000000 | \n",
+ " [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... | \n",
+ " NaN | \n",
+ " 31357 | \n",
+ " tt0114885 | \n",
+ " en | \n",
+ " Waiting to Exhale | \n",
+ " Cheated on, mistreated and stepped on, the wom... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 81452156.0 | \n",
+ " 127.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Friends are the people who let you be yourself... | \n",
+ " Waiting to Exhale | \n",
+ " False | \n",
+ " 6.1 | \n",
+ " 34.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " False | \n",
+ " {'id': 96871, 'name': 'Father of the Bride Col... | \n",
+ " 0 | \n",
+ " [{'id': 35, 'name': 'Comedy'}] | \n",
+ " NaN | \n",
+ " 11862 | \n",
+ " tt0113041 | \n",
+ " en | \n",
+ " Father of the Bride Part II | \n",
+ " Just when George Banks has recovered from his ... | \n",
+ " ... | \n",
+ " 1995-02-10 | \n",
+ " 76578911.0 | \n",
+ " 106.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Just When His World Is Back To Normal... He's ... | \n",
+ " Father of the Bride Part II | \n",
+ " False | \n",
+ " 5.7 | \n",
+ " 173.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 24 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 50
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### data preprocessing"
+ ],
+ "metadata": {
+ "id": "oD9RMwahqemy"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Three rows were entered by mistake (their id holds a date string instead of a numeric id), so we remove those rows."
+ ],
+ "metadata": {
+ "id": "Wy2LqLxnklN1"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Drop the rows whose id field holds a date string. The explicit .copy() makes the filtered\n",
+ "# frame independent, so later column assignments do not raise SettingWithCopyWarning.\n",
+ "movie = movie[~movie['movieId'].isin(['1997-08-20', '2012-09-29', '2014-01-01'])].copy()"
+ ],
+ "metadata": {
+ "id": "OnIMWw3Nj3Dp"
+ },
+ "execution_count": 51,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import ast\n",
+ "\n",
+ "def find_names(x):\n",
+ "    \"\"\"Return the comma-joined 'name' values of a stringified list of dicts ('' for empty input).\"\"\"\n",
+ "    if x == '':\n",
+ "        return ''\n",
+ "    # literal_eval only accepts Python literals, so a malformed CSV cell cannot execute code,\n",
+ "    # and the value is parsed once instead of twice.\n",
+ "    entries = ast.literal_eval(str(x))\n",
+ "    return ','.join(entry['name'] for entry in entries)\n",
+ " \n",
+ "movie['genres'] = movie['genres'].fillna('')"
+ ],
+ "metadata": {
+ "id": "kO8m6SsepBIg",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "ff9d7d36-89d2-438e-f1d3-0b8f6cd8ea24"
+ },
+ "execution_count": 52,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ ":7: SettingWithCopyWarning: \n",
+ "A value is trying to be set on a copy of a slice from a DataFrame.\n",
+ "Try using .loc[row_indexer,col_indexer] = value instead\n",
+ "\n",
+ "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
+ " movie['genres'] = movie['genres'].fillna('')\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "movie['genres']=movie['genres'].apply(find_names)"
+ ],
+ "metadata": {
+ "id": "vOfKcOQ-pBIg"
+ },
+ "execution_count": 53,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "movie.movieId = movie.movieId.astype(\"uint64\")"
+ ],
+ "metadata": {
+ "id": "0p-yhNZ3iRsl"
+ },
+ "execution_count": 54,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Only keep ratings for movies that have metadata in the movie dataset."
+ ],
+ "metadata": {
+ "id": "DWgywXEKuq2O"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Inner join: keeps only the rating rows whose movieId also occurs in `movie`.\n",
+ "# NOTE(review): `rating.movieId` appears to share the id space of `links_small.movieId`, while\n",
+ "# `movie.movieId` was renamed from the metadata `id`, which lines up with `links_small.tmdbId`\n",
+ "# in the previews (e.g. Toy Story: movieId 1 vs tmdbId/id 862). Confirm whether this join\n",
+ "# should be routed through `links_small` instead of matching the two columns directly.\n",
+ "new_rating = pd.merge(rating, movie, how='inner', on=[\"movieId\"])"
+ ],
+ "metadata": {
+ "id": "psgzmBFLtcmx"
+ },
+ "execution_count": 55,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "new_rating = new_rating[[\"userId\", \"movieId\", \"rating\"]]"
+ ],
+ "metadata": {
+ "id": "z9DjgdvYuhOW"
+ },
+ "execution_count": 56,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "movie.head()"
+ ],
+ "metadata": {
+ "id": "gQ4VSPNUuFOc",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 750
+ },
+ "outputId": "d4245b96-1958-4704-e3eb-cc9b64effca3"
+ },
+ "execution_count": 57,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " adult belongs_to_collection budget \\\n",
+ "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n",
+ "1 False NaN 65000000 \n",
+ "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n",
+ "3 False NaN 16000000 \n",
+ "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n",
+ "\n",
+ " genres homepage movieId \\\n",
+ "0 Animation,Comedy,Family http://toystory.disney.com/toy-story 862 \n",
+ "1 Adventure,Fantasy,Family NaN 8844 \n",
+ "2 Romance,Comedy NaN 15602 \n",
+ "3 Comedy,Drama,Romance NaN 31357 \n",
+ "4 Comedy NaN 11862 \n",
+ "\n",
+ " imdb_id original_language original_title \\\n",
+ "0 tt0114709 en Toy Story \n",
+ "1 tt0113497 en Jumanji \n",
+ "2 tt0113228 en Grumpier Old Men \n",
+ "3 tt0114885 en Waiting to Exhale \n",
+ "4 tt0113041 en Father of the Bride Part II \n",
+ "\n",
+ " overview ... release_date \\\n",
+ "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n",
+ "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n",
+ "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n",
+ "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n",
+ "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n",
+ "\n",
+ " revenue runtime spoken_languages \\\n",
+ "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n",
+ "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n",
+ "\n",
+ " status tagline \\\n",
+ "0 Released NaN \n",
+ "1 Released Roll the dice and unleash the excitement! \n",
+ "2 Released Still Yelling. Still Fighting. Still Ready for... \n",
+ "3 Released Friends are the people who let you be yourself... \n",
+ "4 Released Just When His World Is Back To Normal... He's ... \n",
+ "\n",
+ " title video vote_average vote_count \n",
+ "0 Toy Story False 7.7 5415.0 \n",
+ "1 Jumanji False 6.9 2413.0 \n",
+ "2 Grumpier Old Men False 6.5 92.0 \n",
+ "3 Waiting to Exhale False 6.1 34.0 \n",
+ "4 Father of the Bride Part II False 5.7 173.0 \n",
+ "\n",
+ "[5 rows x 24 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " adult | \n",
+ " belongs_to_collection | \n",
+ " budget | \n",
+ " genres | \n",
+ " homepage | \n",
+ " movieId | \n",
+ " imdb_id | \n",
+ " original_language | \n",
+ " original_title | \n",
+ " overview | \n",
+ " ... | \n",
+ " release_date | \n",
+ " revenue | \n",
+ " runtime | \n",
+ " spoken_languages | \n",
+ " status | \n",
+ " tagline | \n",
+ " title | \n",
+ " video | \n",
+ " vote_average | \n",
+ " vote_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " False | \n",
+ " {'id': 10194, 'name': 'Toy Story Collection', ... | \n",
+ " 30000000 | \n",
+ " Animation,Comedy,Family | \n",
+ " http://toystory.disney.com/toy-story | \n",
+ " 862 | \n",
+ " tt0114709 | \n",
+ " en | \n",
+ " Toy Story | \n",
+ " Led by Woody, Andy's toys live happily in his ... | \n",
+ " ... | \n",
+ " 1995-10-30 | \n",
+ " 373554033.0 | \n",
+ " 81.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Toy Story | \n",
+ " False | \n",
+ " 7.7 | \n",
+ " 5415.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 65000000 | \n",
+ " Adventure,Fantasy,Family | \n",
+ " NaN | \n",
+ " 8844 | \n",
+ " tt0113497 | \n",
+ " en | \n",
+ " Jumanji | \n",
+ " When siblings Judy and Peter discover an encha... | \n",
+ " ... | \n",
+ " 1995-12-15 | \n",
+ " 262797249.0 | \n",
+ " 104.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | \n",
+ " Released | \n",
+ " Roll the dice and unleash the excitement! | \n",
+ " Jumanji | \n",
+ " False | \n",
+ " 6.9 | \n",
+ " 2413.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " False | \n",
+ " {'id': 119050, 'name': 'Grumpy Old Men Collect... | \n",
+ " 0 | \n",
+ " Romance,Comedy | \n",
+ " NaN | \n",
+ " 15602 | \n",
+ " tt0113228 | \n",
+ " en | \n",
+ " Grumpier Old Men | \n",
+ " A family wedding reignites the ancient feud be... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 0.0 | \n",
+ " 101.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Still Yelling. Still Fighting. Still Ready for... | \n",
+ " Grumpier Old Men | \n",
+ " False | \n",
+ " 6.5 | \n",
+ " 92.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 16000000 | \n",
+ " Comedy,Drama,Romance | \n",
+ " NaN | \n",
+ " 31357 | \n",
+ " tt0114885 | \n",
+ " en | \n",
+ " Waiting to Exhale | \n",
+ " Cheated on, mistreated and stepped on, the wom... | \n",
+ " ... | \n",
+ " 1995-12-22 | \n",
+ " 81452156.0 | \n",
+ " 127.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Friends are the people who let you be yourself... | \n",
+ " Waiting to Exhale | \n",
+ " False | \n",
+ " 6.1 | \n",
+ " 34.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " False | \n",
+ " {'id': 96871, 'name': 'Father of the Bride Col... | \n",
+ " 0 | \n",
+ " Comedy | \n",
+ " NaN | \n",
+ " 11862 | \n",
+ " tt0113041 | \n",
+ " en | \n",
+ " Father of the Bride Part II | \n",
+ " Just when George Banks has recovered from his ... | \n",
+ " ... | \n",
+ " 1995-02-10 | \n",
+ " 76578911.0 | \n",
+ " 106.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Just When His World Is Back To Normal... He's ... | \n",
+ " Father of the Bride Part II | \n",
+ " False | \n",
+ " 5.7 | \n",
+ " 173.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 24 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 57
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "new_rating.head()"
+ ],
+ "metadata": {
+ "id": "gQE2lTG0tua6",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "outputId": "51c28cea-a4ad-4e89-ced5-3e3953b6cc1f"
+ },
+ "execution_count": 58,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " userId movieId rating\n",
+ "0 1 1371 2.5\n",
+ "1 4 1371 4.0\n",
+ "2 7 1371 3.0\n",
+ "3 19 1371 4.0\n",
+ "4 21 1371 3.0"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " rating | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 1371 | \n",
+ " 2.5 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 1371 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 7 | \n",
+ " 1371 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 19 | \n",
+ " 1371 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 21 | \n",
+ " 1371 | \n",
+ " 3.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 58
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "train, test = split_dataframe(new_rating)"
+ ],
+ "metadata": {
+ "id": "mcdRTQyqAyhU"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### matrix factorization"
+ ],
+ "metadata": {
+ "id": "rpZ7r095Z2-G"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "inter_mat_df = rating.pivot(index = 'userId', columns ='movieId', values = 'rating').fillna(0)\n",
+ "inter_mat_df"
+ ],
+ "metadata": {
+ "id": "7oG8lkIRUX1M",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 455
+ },
+ "outputId": "cd42edf3-2e15-4dec-8dfb-1a60bbd1ca4a"
+ },
+ "execution_count": 59,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "movieId 1 2 3 4 5 6 7 8 \\\n",
+ "userId \n",
+ "1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "5 0.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... ... \n",
+ "667 0.0 0.0 0.0 0.0 0.0 4.0 0.0 0.0 \n",
+ "668 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "669 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "670 4.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "671 5.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "movieId 9 10 ... 161084 161155 161594 161830 161918 161944 \\\n",
+ "userId ... \n",
+ "1 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 4.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 4.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "5 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... ... ... ... ... ... \n",
+ "667 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "668 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "669 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "670 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "671 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "movieId 162376 162542 162672 163949 \n",
+ "userId \n",
+ "1 0.0 0.0 0.0 0.0 \n",
+ "2 0.0 0.0 0.0 0.0 \n",
+ "3 0.0 0.0 0.0 0.0 \n",
+ "4 0.0 0.0 0.0 0.0 \n",
+ "5 0.0 0.0 0.0 0.0 \n",
+ "... ... ... ... ... \n",
+ "667 0.0 0.0 0.0 0.0 \n",
+ "668 0.0 0.0 0.0 0.0 \n",
+ "669 0.0 0.0 0.0 0.0 \n",
+ "670 0.0 0.0 0.0 0.0 \n",
+ "671 0.0 0.0 0.0 0.0 \n",
+ "\n",
+ "[671 rows x 9066 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " movieId | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " ... | \n",
+ " 161084 | \n",
+ " 161155 | \n",
+ " 161594 | \n",
+ " 161830 | \n",
+ " 161918 | \n",
+ " 161944 | \n",
+ " 162376 | \n",
+ " 162542 | \n",
+ " 162672 | \n",
+ " 163949 | \n",
+ "
\n",
+ " \n",
+ " userId | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 667 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 668 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 669 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 670 | \n",
+ " 4.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 671 | \n",
+ " 5.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
671 rows × 9066 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 59
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "inter_mat = inter_mat_df.to_numpy()"
+ ],
+ "metadata": {
+ "id": "9It-QDN_UcYt"
+ },
+ "execution_count": 60,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "ratings_mean = np.mean(inter_mat, axis = 1)\n",
+ "inter_mat_normal = inter_mat - ratings_mean.reshape(-1, 1)"
+ ],
+ "metadata": {
+ "id": "_ksd9gu8UfGt"
+ },
+ "execution_count": 61,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "inter_mat_normal"
+ ],
+ "metadata": {
+ "id": "2eayKPzpUiX_",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "e38be4f9-b9b7-4408-df0d-e589f5a240a5"
+ },
+ "execution_count": 62,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "array([[-0.00562541, -0.00562541, -0.00562541, ..., -0.00562541,\n",
+ " -0.00562541, -0.00562541],\n",
+ " [-0.02923009, -0.02923009, -0.02923009, ..., -0.02923009,\n",
+ " -0.02923009, -0.02923009],\n",
+ " [-0.02007501, -0.02007501, -0.02007501, ..., -0.02007501,\n",
+ " -0.02007501, -0.02007501],\n",
+ " ...,\n",
+ " [-0.01367748, -0.01367748, -0.01367748, ..., -0.01367748,\n",
+ " -0.01367748, -0.01367748],\n",
+ " [ 3.98698434, -0.01301566, -0.01301566, ..., -0.01301566,\n",
+ " -0.01301566, -0.01301566],\n",
+ " [ 4.95030885, -0.04969115, -0.04969115, ..., -0.04969115,\n",
+ " -0.04969115, -0.04969115]])"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 62
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "We use a truncated singular value decomposition (SVD) to factorize the mean-centred user-movie matrix."
+ ],
+ "metadata": {
+ "id": "__cFBTZSVEbK"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "svd_U, svd_sigma, svd_V = randomized_svd(inter_mat_normal, \n",
+ " n_components=15,\n",
+ " n_iter=5,\n",
+ " random_state=47)"
+ ],
+ "metadata": {
+ "id": "9FcuzszEU-Xv"
+ },
+ "execution_count": 63,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "`np.diag` turns the 1-D array of singular values into a diagonal matrix so it can be used in the matrix product below."
+ ],
+ "metadata": {
+ "id": "2z3Zj7hGVhtJ"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Expand the 1-D singular values into a diagonal matrix.\n",
+ "# np.diag applied to a 2-D array extracts its diagonal instead, so re-running\n",
+ "# this cell used to collapse the matrix back into a vector; only convert once.\n",
+ "if svd_sigma.ndim == 1:\n",
+ "    svd_sigma = np.diag(svd_sigma)"
+ ],
+ "metadata": {
+ "id": "BQnpVVH_VgUG"
+ },
+ "execution_count": 64,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Making predictions"
+ ],
+ "metadata": {
+ "id": "JrTuTHy_VpsO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Rebuild the low-rank estimate (U . Sigma . V) and add each user's mean back.\n",
+ "low_rank_estimate = (svd_U @ svd_sigma) @ svd_V\n",
+ "rating_weights = low_rank_estimate + ratings_mean.reshape(-1, 1)"
+ ],
+ "metadata": {
+ "id": "pPnlGgkKVpKd"
+ },
+ "execution_count": 65,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "weights_df = pd.DataFrame(rating_weights, columns = inter_mat_df.columns)"
+ ],
+ "metadata": {
+ "id": "sKdvldPoVsfs"
+ },
+ "execution_count": 66,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "weights_df.head()"
+ ],
+ "metadata": {
+ "id": "FzLYVyH8VyXq",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 299
+ },
+ "outputId": "0360b11f-2954-496c-e926-5d55a6067ed8"
+ },
+ "execution_count": 67,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "movieId 1 2 3 4 5 6 7 \\\n",
+ "0 -0.081106 0.024332 -0.016835 -0.006440 -0.028982 0.031704 -0.000823 \n",
+ "1 1.010433 1.676149 0.180399 0.133225 0.332127 0.659165 0.161904 \n",
+ "2 1.023170 0.389789 -0.031488 0.031783 0.000714 0.135239 -0.080027 \n",
+ "3 1.980784 1.192729 0.128675 0.106373 -0.234511 -0.550348 -0.085587 \n",
+ "4 1.216316 0.926650 0.122319 0.063811 0.628848 -0.137074 0.356099 \n",
+ "\n",
+ "movieId 8 9 10 ... 161084 161155 161594 \\\n",
+ "0 -0.004803 -0.005659 0.039295 ... -0.004629 -0.004407 0.011158 \n",
+ "1 0.061847 0.088149 2.271430 ... -0.000064 -0.001081 0.006137 \n",
+ "2 0.030734 -0.026625 0.397392 ... 0.003256 0.007455 -0.012356 \n",
+ "3 -0.143159 -0.015308 1.372206 ... 0.032565 -0.021875 0.084978 \n",
+ "4 0.082625 -0.068154 0.567814 ... -0.028169 -0.028221 0.000680 \n",
+ "\n",
+ "movieId 161830 161918 161944 162376 162542 162672 163949 \n",
+ "0 -0.004547 -0.004340 -0.005154 0.019218 -0.005677 -0.005391 -0.004297 \n",
+ "1 -0.003853 -0.003784 -0.004673 0.011201 -0.002404 -0.003039 0.003864 \n",
+ "2 -0.000387 -0.006532 -0.000590 -0.024486 0.015854 0.014274 -0.005391 \n",
+ "3 0.008436 0.018263 -0.022110 0.133075 -0.016431 -0.014345 0.076347 \n",
+ "4 -0.024315 -0.023941 -0.061806 0.013551 -0.023701 -0.024246 -0.031276 \n",
+ "\n",
+ "[5 rows x 9066 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " movieId | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " ... | \n",
+ " 161084 | \n",
+ " 161155 | \n",
+ " 161594 | \n",
+ " 161830 | \n",
+ " 161918 | \n",
+ " 161944 | \n",
+ " 162376 | \n",
+ " 162542 | \n",
+ " 162672 | \n",
+ " 163949 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " -0.081106 | \n",
+ " 0.024332 | \n",
+ " -0.016835 | \n",
+ " -0.006440 | \n",
+ " -0.028982 | \n",
+ " 0.031704 | \n",
+ " -0.000823 | \n",
+ " -0.004803 | \n",
+ " -0.005659 | \n",
+ " 0.039295 | \n",
+ " ... | \n",
+ " -0.004629 | \n",
+ " -0.004407 | \n",
+ " 0.011158 | \n",
+ " -0.004547 | \n",
+ " -0.004340 | \n",
+ " -0.005154 | \n",
+ " 0.019218 | \n",
+ " -0.005677 | \n",
+ " -0.005391 | \n",
+ " -0.004297 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1.010433 | \n",
+ " 1.676149 | \n",
+ " 0.180399 | \n",
+ " 0.133225 | \n",
+ " 0.332127 | \n",
+ " 0.659165 | \n",
+ " 0.161904 | \n",
+ " 0.061847 | \n",
+ " 0.088149 | \n",
+ " 2.271430 | \n",
+ " ... | \n",
+ " -0.000064 | \n",
+ " -0.001081 | \n",
+ " 0.006137 | \n",
+ " -0.003853 | \n",
+ " -0.003784 | \n",
+ " -0.004673 | \n",
+ " 0.011201 | \n",
+ " -0.002404 | \n",
+ " -0.003039 | \n",
+ " 0.003864 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1.023170 | \n",
+ " 0.389789 | \n",
+ " -0.031488 | \n",
+ " 0.031783 | \n",
+ " 0.000714 | \n",
+ " 0.135239 | \n",
+ " -0.080027 | \n",
+ " 0.030734 | \n",
+ " -0.026625 | \n",
+ " 0.397392 | \n",
+ " ... | \n",
+ " 0.003256 | \n",
+ " 0.007455 | \n",
+ " -0.012356 | \n",
+ " -0.000387 | \n",
+ " -0.006532 | \n",
+ " -0.000590 | \n",
+ " -0.024486 | \n",
+ " 0.015854 | \n",
+ " 0.014274 | \n",
+ " -0.005391 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1.980784 | \n",
+ " 1.192729 | \n",
+ " 0.128675 | \n",
+ " 0.106373 | \n",
+ " -0.234511 | \n",
+ " -0.550348 | \n",
+ " -0.085587 | \n",
+ " -0.143159 | \n",
+ " -0.015308 | \n",
+ " 1.372206 | \n",
+ " ... | \n",
+ " 0.032565 | \n",
+ " -0.021875 | \n",
+ " 0.084978 | \n",
+ " 0.008436 | \n",
+ " 0.018263 | \n",
+ " -0.022110 | \n",
+ " 0.133075 | \n",
+ " -0.016431 | \n",
+ " -0.014345 | \n",
+ " 0.076347 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1.216316 | \n",
+ " 0.926650 | \n",
+ " 0.122319 | \n",
+ " 0.063811 | \n",
+ " 0.628848 | \n",
+ " -0.137074 | \n",
+ " 0.356099 | \n",
+ " 0.082625 | \n",
+ " -0.068154 | \n",
+ " 0.567814 | \n",
+ " ... | \n",
+ " -0.028169 | \n",
+ " -0.028221 | \n",
+ " 0.000680 | \n",
+ " -0.024315 | \n",
+ " -0.023941 | \n",
+ " -0.061806 | \n",
+ " 0.013551 | \n",
+ " -0.023701 | \n",
+ " -0.024246 | \n",
+ " -0.031276 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 9066 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 67
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "Making recommendations"
+ ],
+ "metadata": {
+ "id": "IDO7q6EjZ8q1"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def recommend_top_k(preds_df, ratings_df, movie, userId, k=10):\n",
+ " user_row = userId-1 \n",
+ " sorted_user_predictions = preds_df.iloc[user_row].sort_values(ascending=False) \n",
+ " user_data = ratings_df[ratings_df.userId == (userId)]\n",
+ " user_rated = user_data.merge(movie, how = 'left', left_on = 'movieId', right_on = 'movieId'). \\\n",
+ " sort_values(['rating'], ascending=False)\n",
+ " user_preds = movie.merge(pd.DataFrame(sorted_user_predictions).reset_index(), how = 'left',\n",
+ " on = 'movieId').rename(columns = {user_row: 'prediction'}). \\\n",
+ " sort_values('prediction', ascending = False). \\\n",
+ " iloc[:k, :]\n",
+ " return user_rated, user_preds"
+ ],
+ "metadata": {
+ "id": "0sZanc2nV7ot"
+ },
+ "execution_count": 68,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "collaborative_k = 100\n",
+ "user_rated, user_preds = recommend_top_k(weights_df, new_rating, movie, 220, collaborative_k)\n",
+ "mf.log_param('collaborative k', collaborative_k)"
+ ],
+ "metadata": {
+ "id": "RWZBPk3QX4Hg",
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "outputId": "609849d1-ca57-4831-b2d8-10ae2d0504d4"
+ },
+ "execution_count": 69,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "100"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 69
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "user_preds.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 698
+ },
+ "id": "xmk5e3xln_Xk",
+ "outputId": "e92bff6a-7b20-427a-fa05-1743c8e4a166"
+ },
+ "execution_count": 70,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " adult belongs_to_collection budget \\\n",
+ "6388 False {'id': 528, 'name': 'The Terminator Collection... 200000000 \n",
+ "3382 False NaN 0 \n",
+ "5325 False {'id': 86055, 'name': 'Men In Black Collection... 140000000 \n",
+ "4020 False NaN 8000000 \n",
+ "286 False {'id': 300546, 'name': 'Once were Warriors Col... 0 \n",
+ "\n",
+ " genres \\\n",
+ "6388 Action,Thriller,Science Fiction \n",
+ "3382 Drama,Science Fiction,Adventure,Mystery \n",
+ "5325 Action,Adventure,Comedy,Science Fiction \n",
+ "4020 Drama,Thriller \n",
+ "286 Drama \n",
+ "\n",
+ " homepage movieId imdb_id \\\n",
+ "6388 NaN 296 tt0181852 \n",
+ "3382 NaN 593 tt0069293 \n",
+ "5325 http://www.sonypictures.com/homevideo/meninbla... 608 tt0120912 \n",
+ "4020 NaN 318 tt0120753 \n",
+ "286 NaN 527 tt0110729 \n",
+ "\n",
+ " original_language original_title \\\n",
+ "6388 en Terminator 3: Rise of the Machines \n",
+ "3382 ru Солярис \n",
+ "5325 en Men in Black II \n",
+ "4020 en The Million Dollar Hotel \n",
+ "286 en Once Were Warriors \n",
+ "\n",
+ " overview ... revenue \\\n",
+ "6388 It's been 10 years since John Connor saved Ear... ... 435000000.0 \n",
+ "3382 Ground control has been receiving strange tran... ... 0.0 \n",
+ "5325 Kay and Jay reunite to provide our best, last ... ... 441818803.0 \n",
+ "4020 The Million Dollar Hotel starts with a jump fr... ... 0.0 \n",
+ "286 A drama about a Maori family lving in Auckland... ... 2201126.0 \n",
+ "\n",
+ " runtime spoken_languages status \\\n",
+ "6388 109.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
+ "3382 167.0 [{'iso_639_1': 'ru', 'name': 'Pусский'}] Released \n",
+ "5325 88.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
+ "4020 122.0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
+ "286 99.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released \n",
+ "\n",
+ " tagline \\\n",
+ "6388 The Machines Will Rise. \n",
+ "3382 NaN \n",
+ "5325 Same Planet. New Scum. \n",
+ "4020 NaN \n",
+ "286 A family in crisis, a life in chaos... Nothing... \n",
+ "\n",
+ " title video vote_average vote_count \\\n",
+ "6388 Terminator 3: Rise of the Machines False 5.9 2177.0 \n",
+ "3382 Solaris False 7.7 364.0 \n",
+ "5325 Men in Black II False 6.1 3188.0 \n",
+ "4020 The Million Dollar Hotel False 5.9 76.0 \n",
+ "286 Once Were Warriors False 7.6 106.0 \n",
+ "\n",
+ " prediction \n",
+ "6388 4.792743 \n",
+ "3382 4.742942 \n",
+ "5325 4.647800 \n",
+ "4020 4.469385 \n",
+ "286 4.236960 \n",
+ "\n",
+ "[5 rows x 25 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " adult | \n",
+ " belongs_to_collection | \n",
+ " budget | \n",
+ " genres | \n",
+ " homepage | \n",
+ " movieId | \n",
+ " imdb_id | \n",
+ " original_language | \n",
+ " original_title | \n",
+ " overview | \n",
+ " ... | \n",
+ " revenue | \n",
+ " runtime | \n",
+ " spoken_languages | \n",
+ " status | \n",
+ " tagline | \n",
+ " title | \n",
+ " video | \n",
+ " vote_average | \n",
+ " vote_count | \n",
+ " prediction | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6388 | \n",
+ " False | \n",
+ " {'id': 528, 'name': 'The Terminator Collection... | \n",
+ " 200000000 | \n",
+ " Action,Thriller,Science Fiction | \n",
+ " NaN | \n",
+ " 296 | \n",
+ " tt0181852 | \n",
+ " en | \n",
+ " Terminator 3: Rise of the Machines | \n",
+ " It's been 10 years since John Connor saved Ear... | \n",
+ " ... | \n",
+ " 435000000.0 | \n",
+ " 109.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " The Machines Will Rise. | \n",
+ " Terminator 3: Rise of the Machines | \n",
+ " False | \n",
+ " 5.9 | \n",
+ " 2177.0 | \n",
+ " 4.792743 | \n",
+ "
\n",
+ " \n",
+ " 3382 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " Drama,Science Fiction,Adventure,Mystery | \n",
+ " NaN | \n",
+ " 593 | \n",
+ " tt0069293 | \n",
+ " ru | \n",
+ " Солярис | \n",
+ " Ground control has been receiving strange tran... | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 167.0 | \n",
+ " [{'iso_639_1': 'ru', 'name': 'Pусский'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Solaris | \n",
+ " False | \n",
+ " 7.7 | \n",
+ " 364.0 | \n",
+ " 4.742942 | \n",
+ "
\n",
+ " \n",
+ " 5325 | \n",
+ " False | \n",
+ " {'id': 86055, 'name': 'Men In Black Collection... | \n",
+ " 140000000 | \n",
+ " Action,Adventure,Comedy,Science Fiction | \n",
+ " http://www.sonypictures.com/homevideo/meninbla... | \n",
+ " 608 | \n",
+ " tt0120912 | \n",
+ " en | \n",
+ " Men in Black II | \n",
+ " Kay and Jay reunite to provide our best, last ... | \n",
+ " ... | \n",
+ " 441818803.0 | \n",
+ " 88.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Same Planet. New Scum. | \n",
+ " Men in Black II | \n",
+ " False | \n",
+ " 6.1 | \n",
+ " 3188.0 | \n",
+ " 4.647800 | \n",
+ "
\n",
+ " \n",
+ " 4020 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 8000000 | \n",
+ " Drama,Thriller | \n",
+ " NaN | \n",
+ " 318 | \n",
+ " tt0120753 | \n",
+ " en | \n",
+ " The Million Dollar Hotel | \n",
+ " The Million Dollar Hotel starts with a jump fr... | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 122.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " The Million Dollar Hotel | \n",
+ " False | \n",
+ " 5.9 | \n",
+ " 76.0 | \n",
+ " 4.469385 | \n",
+ "
\n",
+ " \n",
+ " 286 | \n",
+ " False | \n",
+ " {'id': 300546, 'name': 'Once were Warriors Col... | \n",
+ " 0 | \n",
+ " Drama | \n",
+ " NaN | \n",
+ " 527 | \n",
+ " tt0110729 | \n",
+ " en | \n",
+ " Once Were Warriors | \n",
+ " A drama about a Maori family lving in Auckland... | \n",
+ " ... | \n",
+ " 2201126.0 | \n",
+ " 99.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | \n",
+ " Released | \n",
+ " A family in crisis, a life in chaos... Nothing... | \n",
+ " Once Were Warriors | \n",
+ " False | \n",
+ " 7.6 | \n",
+ " 106.0 | \n",
+ " 4.236960 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 25 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 70
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "user_rated.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 577
+ },
+ "id": "x6ohhdH0sF0H",
+ "outputId": "5f8a17f7-3b50-48ad-9d10-809357834a3b"
+ },
+ "execution_count": 71,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " userId movieId rating adult belongs_to_collection budget \\\n",
+ "0 220 2294 5.0 False NaN 22000000 \n",
+ "46 220 1247 5.0 False NaN 85000000 \n",
+ "25 220 2762 5.0 False NaN 0 \n",
+ "27 220 260 5.0 False NaN 0 \n",
+ "59 220 2324 5.0 False NaN 3250000 \n",
+ "\n",
+ " genres homepage imdb_id \\\n",
+ "0 Comedy NaN tt0261392 \n",
+ "46 Drama,Thriller,History http://www.thegoodshepherdmovie.com/ tt0343737 \n",
+ "25 Drama,Crime NaN tt0029811 \n",
+ "27 Action,Thriller,Mystery NaN tt0026029 \n",
+ "59 Drama http://www.localcolormovie.com/ tt0472126 \n",
+ "\n",
+ " original_language ... release_date revenue runtime \\\n",
+ "0 en ... 2001-08-22 33788161.0 104.0 \n",
+ "46 en ... 2006-12-11 59908565.0 167.0 \n",
+ "25 en ... 1937-11-01 0.0 83.0 \n",
+ "27 en ... 1935-06-01 0.0 86.0 \n",
+ "59 en ... 2006-09-19 32788.0 107.0 \n",
+ "\n",
+ " spoken_languages status \\\n",
+ "0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
+ "46 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released \n",
+ "25 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
+ "27 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
+ "59 [{'iso_639_1': 'en', 'name': 'English'}] Released \n",
+ "\n",
+ " tagline \\\n",
+ "0 Hollywood had it coming \n",
+ "46 The untold story of the most powerful covert a... \n",
+ "25 A Brilliant Melodrama \n",
+ "27 Handcuffed to the girl who double-crossed him \n",
+ "59 NaN \n",
+ "\n",
+ " title video vote_average vote_count \n",
+ "0 Jay and Silent Bob Strike Back False 6.4 491.0 \n",
+ "46 The Good Shepherd False 6.3 342.0 \n",
+ "25 Young and Innocent False 6.8 42.0 \n",
+ "27 The 39 Steps False 7.4 217.0 \n",
+ "59 Local Color False 6.1 8.0 \n",
+ "\n",
+ "[5 rows x 26 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " rating | \n",
+ " adult | \n",
+ " belongs_to_collection | \n",
+ " budget | \n",
+ " genres | \n",
+ " homepage | \n",
+ " imdb_id | \n",
+ " original_language | \n",
+ " ... | \n",
+ " release_date | \n",
+ " revenue | \n",
+ " runtime | \n",
+ " spoken_languages | \n",
+ " status | \n",
+ " tagline | \n",
+ " title | \n",
+ " video | \n",
+ " vote_average | \n",
+ " vote_count | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 220 | \n",
+ " 2294 | \n",
+ " 5.0 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 22000000 | \n",
+ " Comedy | \n",
+ " NaN | \n",
+ " tt0261392 | \n",
+ " en | \n",
+ " ... | \n",
+ " 2001-08-22 | \n",
+ " 33788161.0 | \n",
+ " 104.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Hollywood had it coming | \n",
+ " Jay and Silent Bob Strike Back | \n",
+ " False | \n",
+ " 6.4 | \n",
+ " 491.0 | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 220 | \n",
+ " 1247 | \n",
+ " 5.0 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 85000000 | \n",
+ " Drama,Thriller,History | \n",
+ " http://www.thegoodshepherdmovie.com/ | \n",
+ " tt0343737 | \n",
+ " en | \n",
+ " ... | \n",
+ " 2006-12-11 | \n",
+ " 59908565.0 | \n",
+ " 167.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}, {'iso... | \n",
+ " Released | \n",
+ " The untold story of the most powerful covert a... | \n",
+ " The Good Shepherd | \n",
+ " False | \n",
+ " 6.3 | \n",
+ " 342.0 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " 220 | \n",
+ " 2762 | \n",
+ " 5.0 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " Drama,Crime | \n",
+ " NaN | \n",
+ " tt0029811 | \n",
+ " en | \n",
+ " ... | \n",
+ " 1937-11-01 | \n",
+ " 0.0 | \n",
+ " 83.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " A Brilliant Melodrama | \n",
+ " Young and Innocent | \n",
+ " False | \n",
+ " 6.8 | \n",
+ " 42.0 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 220 | \n",
+ " 260 | \n",
+ " 5.0 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " Action,Thriller,Mystery | \n",
+ " NaN | \n",
+ " tt0026029 | \n",
+ " en | \n",
+ " ... | \n",
+ " 1935-06-01 | \n",
+ " 0.0 | \n",
+ " 86.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " Handcuffed to the girl who double-crossed him | \n",
+ " The 39 Steps | \n",
+ " False | \n",
+ " 7.4 | \n",
+ " 217.0 | \n",
+ "
\n",
+ " \n",
+ " 59 | \n",
+ " 220 | \n",
+ " 2324 | \n",
+ " 5.0 | \n",
+ " False | \n",
+ " NaN | \n",
+ " 3250000 | \n",
+ " Drama | \n",
+ " http://www.localcolormovie.com/ | \n",
+ " tt0472126 | \n",
+ " en | \n",
+ " ... | \n",
+ " 2006-09-19 | \n",
+ " 32788.0 | \n",
+ " 107.0 | \n",
+ " [{'iso_639_1': 'en', 'name': 'English'}] | \n",
+ " Released | \n",
+ " NaN | \n",
+ " Local Color | \n",
+ " False | \n",
+ " 6.1 | \n",
+ " 8.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 26 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 71
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "user_rated[[\"title\", \"genres\"]].head(10)"
+ ],
+ "metadata": {
+ "id": "18grZyJYmG5q",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 363
+ },
+ "outputId": "c064b2c7-9b04-4aae-b76f-58c1df0da2dc"
+ },
+ "execution_count": 72,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " title genres\n",
+ "0 Jay and Silent Bob Strike Back Comedy\n",
+ "46 The Good Shepherd Drama,Thriller,History\n",
+ "25 Young and Innocent Drama,Crime\n",
+ "27 The 39 Steps Action,Thriller,Mystery\n",
+ "59 Local Color Drama\n",
+ "31 The Big Sleep Crime,Drama,Mystery,Thriller\n",
+ "33 The Talented Mr. Ripley Thriller,Crime,Drama\n",
+ "42 The Big Parade Drama,Romance,War\n",
+ "73 Dancer in the Dark Drama,Crime,Music\n",
+ "110 Birdman of Alcatraz Drama"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " genres | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Jay and Silent Bob Strike Back | \n",
+ " Comedy | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " The Good Shepherd | \n",
+ " Drama,Thriller,History | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " Young and Innocent | \n",
+ " Drama,Crime | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " The 39 Steps | \n",
+ " Action,Thriller,Mystery | \n",
+ "
\n",
+ " \n",
+ " 59 | \n",
+ " Local Color | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 31 | \n",
+ " The Big Sleep | \n",
+ " Crime,Drama,Mystery,Thriller | \n",
+ "
\n",
+ " \n",
+ " 33 | \n",
+ " The Talented Mr. Ripley | \n",
+ " Thriller,Crime,Drama | \n",
+ "
\n",
+ " \n",
+ " 42 | \n",
+ " The Big Parade | \n",
+ " Drama,Romance,War | \n",
+ "
\n",
+ " \n",
+ " 73 | \n",
+ " Dancer in the Dark | \n",
+ " Drama,Crime,Music | \n",
+ "
\n",
+ " \n",
+ " 110 | \n",
+ " Birdman of Alcatraz | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 72
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "user_preds[[\"title\", \"genres\"]].head(10)"
+ ],
+ "metadata": {
+ "id": "Mtq_XCEFmLee",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 363
+ },
+ "outputId": "0cd7dd8d-2fde-4d76-f8c0-2131239e5738"
+ },
+ "execution_count": 73,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " title \\\n",
+ "6388 Terminator 3: Rise of the Machines \n",
+ "3382 Solaris \n",
+ "5325 Men in Black II \n",
+ "4020 The Million Dollar Hotel \n",
+ "286 Once Were Warriors \n",
+ "2100 Young and Innocent \n",
+ "534 Sleepless in Seattle \n",
+ "2137 Say Anything... \n",
+ "11922 License to Wed \n",
+ "33911 The Tunnel \n",
+ "\n",
+ " genres \n",
+ "6388 Action,Thriller,Science Fiction \n",
+ "3382 Drama,Science Fiction,Adventure,Mystery \n",
+ "5325 Action,Adventure,Comedy,Science Fiction \n",
+ "4020 Drama,Thriller \n",
+ "286 Drama \n",
+ "2100 Drama,Crime \n",
+ "534 Comedy,Drama,Romance \n",
+ "2137 Comedy,Drama,Romance \n",
+ "11922 Comedy \n",
+ "33911 Science Fiction "
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " title | \n",
+ " genres | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6388 | \n",
+ " Terminator 3: Rise of the Machines | \n",
+ " Action,Thriller,Science Fiction | \n",
+ "
\n",
+ " \n",
+ " 3382 | \n",
+ " Solaris | \n",
+ " Drama,Science Fiction,Adventure,Mystery | \n",
+ "
\n",
+ " \n",
+ " 5325 | \n",
+ " Men in Black II | \n",
+ " Action,Adventure,Comedy,Science Fiction | \n",
+ "
\n",
+ " \n",
+ " 4020 | \n",
+ " The Million Dollar Hotel | \n",
+ " Drama,Thriller | \n",
+ "
\n",
+ " \n",
+ " 286 | \n",
+ " Once Were Warriors | \n",
+ " Drama | \n",
+ "
\n",
+ " \n",
+ " 2100 | \n",
+ " Young and Innocent | \n",
+ " Drama,Crime | \n",
+ "
\n",
+ " \n",
+ " 534 | \n",
+ " Sleepless in Seattle | \n",
+ " Comedy,Drama,Romance | \n",
+ "
\n",
+ " \n",
+ " 2137 | \n",
+ " Say Anything... | \n",
+ " Comedy,Drama,Romance | \n",
+ "
\n",
+ " \n",
+ " 11922 | \n",
+ " License to Wed | \n",
+ " Comedy | \n",
+ "
\n",
+ " \n",
+ " 33911 | \n",
+ " The Tunnel | \n",
+ " Science Fiction | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 73
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7h0B7szQuRbE"
+ },
+ "source": [
+ "# Ensemble Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {
+ "id": "BIG1-TQ7g1HJ"
+ },
+ "outputs": [],
+ "source": [
+ "def ensemble(userId, k=10):\n",
+ "    \"\"\"Chain recommend_top_k and content_based_all_batches for one user.\n",
+ "\n",
+ "    Args:\n",
+ "        userId: matched against rating['userId'] and forwarded to recommend_top_k.\n",
+ "        k: forwarded to content_based_all_batches; recommend_top_k is asked for\n",
+ "            k*k candidates so the second stage has a larger pool to choose from.\n",
+ "\n",
+ "    Returns:\n",
+ "        The 'id' and 'title' columns of the content_based_all_batches result.\n",
+ "    \"\"\"\n",
+ "    # Only the predictions half of the pair is needed; the rated half is discarded.\n",
+ "    _, candidate_preds = recommend_top_k(weights_df, new_rating, movie, userId, k * k)\n",
+ "    user_history = rating[rating['userId'] == userId]\n",
+ "    # NOTE(review): the recorded run returned 67 rows for k=10, so the result is\n",
+ "    # apparently not capped at k rows - confirm that is intended.\n",
+ "    reranked = content_based_all_batches(\n",
+ "        user_history,\n",
+ "        k=k,\n",
+ "        movieIds=candidate_preds['movieId'],\n",
+ "    )\n",
+ "    return reranked[['id', 'title']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Record the k used for this ensemble run as an MLflow param before calling ensemble.\n",
+ "ensemble_k=10\n",
+ "mf.log_param('ensemble k', ensemble_k)\n",
+ "# 220 is passed as ensemble's userId argument.\n",
+ "ensemble(220, ensemble_k)"
+ ],
+ "metadata": {
+ "id": "DSGgvMKIPUIu",
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 423
+ },
+ "outputId": "8fa1410d-315a-47ff-bba9-65b14c568708"
+ },
+ "execution_count": 75,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " id title\n",
+ "2663 912 The Thomas Crown Affair\n",
+ "1415 1968 Fools Rush In\n",
+ "2077 1580 Rope\n",
+ "2110 2762 Young and Innocent\n",
+ "533 858 Sleepless in Seattle\n",
+ "... ... ...\n",
+ "255 11 Star Wars\n",
+ "1315 377 A Nightmare on Elm Street\n",
+ "1344 364 Batman Returns\n",
+ "5524 1682 Mothra vs. Godzilla\n",
+ "6146 2671 Ringu\n",
+ "\n",
+ "[67 rows x 2 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " title | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2663 | \n",
+ " 912 | \n",
+ " The Thomas Crown Affair | \n",
+ "
\n",
+ " \n",
+ " 1415 | \n",
+ " 1968 | \n",
+ " Fools Rush In | \n",
+ "
\n",
+ " \n",
+ " 2077 | \n",
+ " 1580 | \n",
+ " Rope | \n",
+ "
\n",
+ " \n",
+ " 2110 | \n",
+ " 2762 | \n",
+ " Young and Innocent | \n",
+ "
\n",
+ " \n",
+ " 533 | \n",
+ " 858 | \n",
+ " Sleepless in Seattle | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 255 | \n",
+ " 11 | \n",
+ " Star Wars | \n",
+ "
\n",
+ " \n",
+ " 1315 | \n",
+ " 377 | \n",
+ " A Nightmare on Elm Street | \n",
+ "
\n",
+ " \n",
+ " 1344 | \n",
+ " 364 | \n",
+ " Batman Returns | \n",
+ "
\n",
+ " \n",
+ " 5524 | \n",
+ " 1682 | \n",
+ " Mothra vs. Godzilla | \n",
+ "
\n",
+ " \n",
+ " 6146 | \n",
+ " 2671 | \n",
+ " Ringu | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
67 rows × 2 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 75
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Evaluation"
+ ],
+ "metadata": {
+ "id": "oFDjsFmJgvwa"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Outer join on movieId keeps movies that appear in either frame; the column coming\n",
+ "# from the side that lacks a movie is left as NaN.\n",
+ "df_res = pd.merge(\n",
+ "    user_preds[[\"movieId\", \"prediction\"]],\n",
+ "    user_rated[[\"movieId\", \"rating\"]],\n",
+ "    how='outer',\n",
+ "    on='movieId',\n",
+ ")"
+ ],
+ "metadata": {
+ "id": "lePskKh9rObl"
+ },
+ "execution_count": 76,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Rank by predicted score, best first; rows without a prediction (NaN) are placed last.\n",
+ "# Rebinding instead of inplace=True avoids mutating a frame an earlier cell produced.\n",
+ "df_res = df_res.sort_values(by='prediction', ascending=False, na_position='last')\n",
+ "df_res"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 423
+ },
+ "id": "gHZkoIr-r6dM",
+ "outputId": "946e5b4d-5c2f-4262-d601-ef915a3b0417"
+ },
+ "execution_count": 77,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " movieId prediction rating\n",
+ "0 296.0 4.792743 5.0\n",
+ "1 593.0 4.742942 4.0\n",
+ "2 608.0 4.647800 5.0\n",
+ "3 318.0 4.469385 NaN\n",
+ "4 527.0 4.236960 5.0\n",
+ ".. ... ... ...\n",
+ "174 2269.0 NaN 1.0\n",
+ "175 586.0 NaN 1.0\n",
+ "176 344.0 NaN 1.0\n",
+ "177 2054.0 NaN 1.0\n",
+ "178 2617.0 NaN 1.0\n",
+ "\n",
+ "[179 rows x 3 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " movieId | \n",
+ " prediction | \n",
+ " rating | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 296.0 | \n",
+ " 4.792743 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 593.0 | \n",
+ " 4.742942 | \n",
+ " 4.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 608.0 | \n",
+ " 4.647800 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 318.0 | \n",
+ " 4.469385 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 527.0 | \n",
+ " 4.236960 | \n",
+ " 5.0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 174 | \n",
+ " 2269.0 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 175 | \n",
+ " 586.0 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 176 | \n",
+ " 344.0 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 177 | \n",
+ " 2054.0 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ " 178 | \n",
+ " 2617.0 | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
179 rows × 3 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 77
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "threshold = 2\n",
+ "# Turn the numeric scores into booleans (score >= threshold). NaN compares as False,\n",
+ "# so a missing prediction or a missing rating ends up as False.\n",
+ "for score_col in ('prediction', 'rating'):\n",
+ "    # Guard against re-running the cell: once a column is boolean, comparing it with\n",
+ "    # the threshold again would turn every row into False.\n",
+ "    if not pd.api.types.is_bool_dtype(df_res[score_col]):\n",
+ "        df_res[score_col] = df_res[score_col] >= threshold\n",
+ "df_res"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 423
+ },
+ "id": "Wq-kIghejSIO",
+ "outputId": "4af55343-80d8-42e5-a61b-69615fa48659"
+ },
+ "execution_count": 78,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " movieId prediction rating\n",
+ "0 296.0 True True\n",
+ "1 593.0 True True\n",
+ "2 608.0 True True\n",
+ "3 318.0 True False\n",
+ "4 527.0 True True\n",
+ ".. ... ... ...\n",
+ "174 2269.0 False False\n",
+ "175 586.0 False False\n",
+ "176 344.0 False False\n",
+ "177 2054.0 False False\n",
+ "178 2617.0 False False\n",
+ "\n",
+ "[179 rows x 3 columns]"
+ ],
+ "text/html": [
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " movieId | \n",
+ " prediction | \n",
+ " rating | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 296.0 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 593.0 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 608.0 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 318.0 | \n",
+ " True | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 527.0 | \n",
+ " True | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 174 | \n",
+ " 2269.0 | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 175 | \n",
+ " 586.0 | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 176 | \n",
+ " 344.0 | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 177 | \n",
+ " 2054.0 | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 178 | \n",
+ " 2617.0 | \n",
+ " False | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
179 rows × 3 columns
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 78
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "def precision_at_k(df, k=10, y_test: str='rating', y_pred='prediction'):\n",
+ "    \"\"\"Precision over the first k rows of df.\n",
+ "\n",
+ "    Args:\n",
+ "        df: frame whose y_test and y_pred columns are combined with `&`\n",
+ "            (boolean columns are expected); rows are taken in their current order.\n",
+ "        k: number of leading rows to consider.\n",
+ "        y_test: name of the ground-truth column.\n",
+ "        y_pred: name of the predicted column.\n",
+ "\n",
+ "    Returns:\n",
+ "        (rows in the first k where both columns are True) divided by\n",
+ "        (sum of y_pred in the first k), or None when that sum is not positive.\n",
+ "    \"\"\"\n",
+ "    top_k = df.head(k)\n",
+ "    predicted_positive = top_k[y_pred].sum()\n",
+ "    both_true = top_k[y_pred] & top_k[y_test]\n",
+ "    hits = len(top_k[both_true])\n",
+ "    if not predicted_positive > 0:\n",
+ "        return None\n",
+ "    return hits / predicted_positive\n",
+ "\n",
+ "def recall_at_k(df, k=10, y_test='rating', y_pred='prediction'):\n",
+ "    \"\"\"Recall of the first k rows of df against every positive in df.\n",
+ "\n",
+ "    Args:\n",
+ "        df: frame whose y_test and y_pred columns are combined with `&`\n",
+ "            (boolean columns are expected); rows are taken in their current order.\n",
+ "        k: number of leading rows to consider.\n",
+ "        y_test: name of the ground-truth column.\n",
+ "        y_pred: name of the predicted column.\n",
+ "\n",
+ "    Returns:\n",
+ "        (rows in the first k where both columns are True) divided by\n",
+ "        (sum of y_test over the whole frame), or None when that sum is not positive.\n",
+ "    \"\"\"\n",
+ "    top_k = df.head(k)\n",
+ "    # The denominator deliberately uses the full frame, not just the first k rows.\n",
+ "    total_relevant = df[y_test].sum()\n",
+ "    both_true = top_k[y_pred] & top_k[y_test]\n",
+ "    hits = len(top_k[both_true])\n",
+ "    if not total_relevant > 0:\n",
+ "        return None\n",
+ "    return hits / total_relevant"
+ ],
+ "metadata": {
+ "id": "z3iLPKpZqQ0e"
+ },
+ "execution_count": 79,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Evaluate over the first 100 rows of df_res (k=100) using its 'rating' and 'prediction' columns.\n",
+ "prec_at_k = precision_at_k(df_res, 100, y_test='rating', y_pred='prediction')\n",
+ "rec_at_k = recall_at_k(df_res, 100, y_test='rating', y_pred='prediction')"
+ ],
+ "metadata": {
+ "id": "IrMcSYwPsrV3"
+ },
+ "execution_count": 80,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(\"precision@k: \", prec_at_k)\n",
+ "print(\"recall@k: \", rec_at_k)\n",
+ "# precision_at_k / recall_at_k return None when the metric is undefined; only numeric\n",
+ "# values are sent to MLflow so an undefined metric does not abort the cell.\n",
+ "if rec_at_k is not None:\n",
+ "    mf.log_metric('recall', rec_at_k)\n",
+ "if prec_at_k is not None:\n",
+ "    mf.log_metric('precision', prec_at_k)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "r2mMjEt5s31U",
+ "outputId": "6d69f9d6-10eb-4ee5-90d5-ee8a4495c391"
+ },
+ "execution_count": 81,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "precision@k: 0.7941176470588235\n",
+ "recall@k: 0.21774193548387097\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "nj5lPtCfP17d"
+ },
+ "execution_count": 81,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "z1HlQhn9gyTJ"
+ },
+ "source": [
+ "# MLOps"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {
+ "id": "d7GJNfWMgzy3"
+ },
+ "outputs": [],
+ "source": [
+ "def updata_batch(new_batch):\n",
+ "    \"\"\"Add new_batch to the global `batches` and increment `number_of_batches`.\n",
+ "\n",
+ "    Args:\n",
+ "        new_batch: object handed to batches.append.\n",
+ "\n",
+ "    Side effects:\n",
+ "        Rebinds the module-level `batches` / `number_of_batches` and logs the new\n",
+ "        count to MLflow under 'number of batches'.\n",
+ "    \"\"\"\n",
+ "    # Both names are assigned below; without `global` Python treats them as locals\n",
+ "    # and the first read raises UnboundLocalError.\n",
+ "    global number_of_batches, batches\n",
+ "    appended = batches.append(new_batch)\n",
+ "    # list.append mutates in place and returns None, whereas DataFrame.append\n",
+ "    # (pandas < 2.0) returns a new object; rebind only when something came back.\n",
+ "    if appended is not None:\n",
+ "        batches = appended\n",
+ "    number_of_batches = number_of_batches + 1\n",
+ "    # NOTE(review): MLflow does not allow a param to change value within one run;\n",
+ "    # if this key was already logged, consider log_metric for the counter - confirm.\n",
+ "    mf.log_param('number of batches', number_of_batches)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [],
+ "metadata": {
+ "id": "FkGxEvXj21Dw"
+ },
+ "execution_count": 82,
+ "outputs": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file