{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": { "id": "c636ff09" }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import ast" ], "id": "c636ff09" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lTZzn2LhfKWx", "outputId": "385861c9-dac1-4814-9d04-58ef56a677c3" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')" ], "id": "lTZzn2LhfKWx" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "104070e2", "outputId": "d9ca22af-5ad6-42a7-c95e-138baf70793d" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":3: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n", " movies_data = pd.read_csv('/content/drive/MyDrive/movies_metadata.csv')\n" ] } ], "source": [ "# Single place to change when the CSV files live somewhere else.\n", "DATA_DIR = '/content/drive/MyDrive'\n", "\n", "credits = pd.read_csv(f'{DATA_DIR}/credits.csv')\n", "keywords = pd.read_csv(f'{DATA_DIR}/keywords.csv')\n", "# low_memory=False lets pandas infer each column's dtype from the whole file\n", "# instead of chunk by chunk, which avoids the mixed-type DtypeWarning.\n", "movies_data = pd.read_csv(f'{DATA_DIR}/movies_metadata.csv', low_memory=False)" ], "id": "104070e2" }, { "cell_type": "markdown", "metadata": { "id": "-gafAsafvJ3_" }, "source": [ "# New Section" ], "id": "-gafAsafvJ3_" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "3e393573", "outputId": "178a5220-4734-4131-b57e-aace00d6d0f5" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[{'cast_id': 14, 'character': 'Woody (voice)',...[{'credit_id': '52fe4284c3a36847f8024f49', 'de...862
1[{'cast_id': 1, 'character': 'Alan Parrish', '...[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...8844
2[{'cast_id': 2, 'character': 'Max Goldman', 'c...[{'credit_id': '52fe466a9251416c75077a89', 'de...15602
3[{'cast_id': 1, 'character': \"Savannah 'Vannah...[{'credit_id': '52fe44779251416c91011acb', 'de...31357
4[{'cast_id': 1, 'character': 'George Banks', '...[{'credit_id': '52fe44959251416c75039ed7', 'de...11862
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " cast \\\n", "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n", "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n", "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n", "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n", "4 [{'cast_id': 1, 'character': 'George Banks', '... \n", "\n", " crew id \n", "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n", "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n", "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n", "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n", "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head()" ], "id": "3e393573" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "r6l3nJHPDBX8", "outputId": "2b556319-7990-46bd-abcb-2aaabb7dfb65" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 45476 entries, 0 to 45475\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 cast 45476 non-null object\n", " 1 crew 45476 non-null object\n", " 2 id 45476 non-null int64 \n", "dtypes: int64(1), object(2)\n", "memory usage: 1.0+ MB\n" ] } ], "source": [ "credits.info()" ], "id": "r6l3nJHPDBX8" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0zYuGQpTDWCd", "outputId": "33690631-2a8c-4d34-8f24-a3a67df2946d" }, "outputs": [ { "data": { "text/plain": [ "(45476, 3)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.shape" ], "id": "0zYuGQpTDWCd" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "5ZoopuCYDbJc", "outputId": 
"c663f6ee-72f5-45e1-908a-2d07eba837c4" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...
18844[{'id': 10090, 'name': 'board game'}, {'id': 1...
215602[{'id': 1495, 'name': 'fishing'}, {'id': 12392...
331357[{'id': 818, 'name': 'based on novel'}, {'id':...
411862[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id keywords\n", "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n", "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n", "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n", "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n", "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..." ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords.head()" ], "id": "5ZoopuCYDbJc" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ec8af582", "outputId": "93d9cc90-17bb-4600-ebba-186df59d7759" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 46419 entries, 0 to 46418\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 46419 non-null int64 \n", " 1 keywords 46419 non-null object\n", "dtypes: int64(1), object(1)\n", "memory usage: 725.4+ KB\n" ] } ], "source": [ "keywords.info()" ], "id": "ec8af582" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "e210768e", "outputId": "c0cd641f-3835-4cd5-ec07-42636ff42501" }, "outputs": [ { "data": { "text/plain": [ "(46419, 2)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords.shape" ], "id": "e210768e" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 706 }, "id": "1e37233a", "outputId": "7cf84fd4-a67e-49a9-ae06-0f512a1323dd" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
adultbelongs_to_collectionbudgetgenreshomepageidimdb_idoriginal_languageoriginal_titleoverview...release_daterevenueruntimespoken_languagesstatustaglinetitlevideovote_averagevote_count
0False{'id': 10194, 'name': 'Toy Story Collection', ...30000000[{'id': 16, 'name': 'Animation'}, {'id': 35, '...http://toystory.disney.com/toy-story862tt0114709enToy StoryLed by Woody, Andy's toys live happily in his ......1995-10-30373554033.081.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedNaNToy StoryFalse7.75415.0
1FalseNaN65000000[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...NaN8844tt0113497enJumanjiWhen siblings Judy and Peter discover an encha......1995-12-15262797249.0104.0[{'iso_639_1': 'en', 'name': 'English'}, {'iso...ReleasedRoll the dice and unleash the excitement!JumanjiFalse6.92413.0
2False{'id': 119050, 'name': 'Grumpy Old Men Collect...0[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...NaN15602tt0113228enGrumpier Old MenA family wedding reignites the ancient feud be......1995-12-220.0101.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedStill Yelling. Still Fighting. Still Ready for...Grumpier Old MenFalse6.592.0
3FalseNaN16000000[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...NaN31357tt0114885enWaiting to ExhaleCheated on, mistreated and stepped on, the wom......1995-12-2281452156.0127.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedFriends are the people who let you be yourself...Waiting to ExhaleFalse6.134.0
4False{'id': 96871, 'name': 'Father of the Bride Col...0[{'id': 35, 'name': 'Comedy'}]NaN11862tt0113041enFather of the Bride Part IIJust when George Banks has recovered from his ......1995-02-1076578911.0106.0[{'iso_639_1': 'en', 'name': 'English'}]ReleasedJust When His World Is Back To Normal... He's ...Father of the Bride Part IIFalse5.7173.0
\n", "

5 rows × 24 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " adult belongs_to_collection budget \\\n", "0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n", "1 False NaN 65000000 \n", "2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n", "3 False NaN 16000000 \n", "4 False {'id': 96871, 'name': 'Father of the Bride Col... 0 \n", "\n", " genres \\\n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "\n", " homepage id imdb_id original_language \\\n", "0 http://toystory.disney.com/toy-story 862 tt0114709 en \n", "1 NaN 8844 tt0113497 en \n", "2 NaN 15602 tt0113228 en \n", "3 NaN 31357 tt0114885 en \n", "4 NaN 11862 tt0113041 en \n", "\n", " original_title \\\n", "0 Toy Story \n", "1 Jumanji \n", "2 Grumpier Old Men \n", "3 Waiting to Exhale \n", "4 Father of the Bride Part II \n", "\n", " overview ... release_date \\\n", "0 Led by Woody, Andy's toys live happily in his ... ... 1995-10-30 \n", "1 When siblings Judy and Peter discover an encha... ... 1995-12-15 \n", "2 A family wedding reignites the ancient feud be... ... 1995-12-22 \n", "3 Cheated on, mistreated and stepped on, the wom... ... 1995-12-22 \n", "4 Just when George Banks has recovered from his ... ... 1995-02-10 \n", "\n", " revenue runtime spoken_languages \\\n", "0 373554033.0 81.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "1 262797249.0 104.0 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... \n", "2 0.0 101.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "3 81452156.0 127.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "4 76578911.0 106.0 [{'iso_639_1': 'en', 'name': 'English'}] \n", "\n", " status tagline \\\n", "0 Released NaN \n", "1 Released Roll the dice and unleash the excitement! \n", "2 Released Still Yelling. Still Fighting. Still Ready for... 
\n", "3 Released Friends are the people who let you be yourself... \n", "4 Released Just When His World Is Back To Normal... He's ... \n", "\n", " title video vote_average vote_count \n", "0 Toy Story False 7.7 5415.0 \n", "1 Jumanji False 6.9 2413.0 \n", "2 Grumpier Old Men False 6.5 92.0 \n", "3 Waiting to Exhale False 6.1 34.0 \n", "4 Father of the Bride Part II False 5.7 173.0 \n", "\n", "[5 rows x 24 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_data.head()" ], "id": "1e37233a" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "d54c1ebb", "outputId": "1acc7173-03a6-4a25-dd87-6314c5f2539d" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 45466 entries, 0 to 45465\n", "Data columns (total 24 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 adult 45466 non-null object \n", " 1 belongs_to_collection 4494 non-null object \n", " 2 budget 45466 non-null object \n", " 3 genres 45466 non-null object \n", " 4 homepage 7782 non-null object \n", " 5 id 45466 non-null object \n", " 6 imdb_id 45449 non-null object \n", " 7 original_language 45455 non-null object \n", " 8 original_title 45466 non-null object \n", " 9 overview 44512 non-null object \n", " 10 popularity 45461 non-null object \n", " 11 poster_path 45080 non-null object \n", " 12 production_companies 45463 non-null object \n", " 13 production_countries 45463 non-null object \n", " 14 release_date 45379 non-null object \n", " 15 revenue 45460 non-null float64\n", " 16 runtime 45203 non-null float64\n", " 17 spoken_languages 45460 non-null object \n", " 18 status 45379 non-null object \n", " 19 tagline 20412 non-null object \n", " 20 title 45460 non-null object \n", " 21 video 45460 non-null object \n", " 22 vote_average 45460 non-null float64\n", " 23 vote_count 45460 non-null float64\n", 
"dtypes: float64(4), object(20)\n", "memory usage: 8.3+ MB\n" ] } ], "source": [ "movies_data.info()" ], "id": "d54c1ebb" }, { "cell_type": "markdown", "metadata": { "id": "2MH6zTNSDyLw" }, "source": [ "Null counts per column (45466 rows in total):\n", "\n", "- Mostly null: belongs_to_collection (40972), homepage (37684), tagline (25054)\n", "- Fewer than 100 nulls: imdb_id (17), original_language (11), popularity (5), production_companies (3), production_countries (3), release_date (87), revenue (6), spoken_languages (6), status (87), title (6), video (6), vote_average (6), vote_count (6)\n", "- Between 100 and 1000 nulls: overview (954), poster_path (386), runtime (263)\n", "\n", "Most columns contain null values, but the majority of them have fewer than 100. Three columns are mostly null, and three have between 100 and 1000 nulls.\n", "\n", "Open question: how should I proceed? Should I first remove the null values, or first drop the irrelevant features?" ], "id": "2MH6zTNSDyLw" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f5ad134d", "outputId": "cb9d8e25-2e77-4589-8e96-cd5c30b32c9f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "print(type(movies_data))\n", "print(type(keywords))\n", "print(type(credits))" ], "id": "f5ad134d" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8d83d48d" }, "outputs": [], "source": [ "#needed information -\n", "#id\n", "#title\n", "#overview\n", "#genres\n", "#keywords\n", "#cast\n", "#crew" ], "id": "8d83d48d" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aaXAWei3HSIL", "outputId": "f7e87d43-454d-4745-cb64-08997812b21a" }, "outputs": [ { "data": { "text/plain": [ "array([ 862, 8844, 15602, ..., 67758, 227506, 461257])" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits['id'].values" ], "id": "aaXAWei3HSIL" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "1cccbfb2", "outputId": "4e2aaa8c-6749-4368-8ae9-86805c815f99" }, "outputs": [ { "data": { "text/plain": [ "array([ 862, 8844, 15602, 31357, 11862])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head()['id'].values" ], "id": "1cccbfb2" }, { "cell_type": "markdown", "metadata": { "id": "AMf7jP92KJjt" }, "source": [ "Q = Are all three data similar? I mean id of credits data matching with title of movies_data" ], "id": "AMf7jP92KJjt" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7a5b1777" }, "outputs": [], "source": [ "# Take an explicit copy: the cleaning steps further down modify `movies`, and\n", "# modifying a bare column selection of movies_data triggers pandas'\n", "# SettingWithCopyWarning.\n", "movies = movies_data[['id','title','overview', 'genres']].copy()" ], "id": "7a5b1777" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "9a3918de", "outputId": "1bb3f7ad-d4d3-4e58-8962-85ec13e2a294" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy StoryLed by Woody, Andy's toys live happily in his ...[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
18844JumanjiWhen siblings Judy and Peter discover an encha...[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
215602Grumpier Old MenA family wedding reignites the ancient feud be...[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
331357Waiting to ExhaleCheated on, mistreated and stepped on, the wom...[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
411862Father of the Bride Part IIJust when George Banks has recovered from his ...[{'id': 35, 'name': 'Comedy'}]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 Led by Woody, Andy's toys live happily in his ... \n", "1 When siblings Judy and Peter discover an encha... \n", "2 A family wedding reignites the ancient feud be... \n", "3 Cheated on, mistreated and stepped on, the wom... \n", "4 Just when George Banks has recovered from his ... \n", "\n", " genres \n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "9a3918de" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cc87d23c", "outputId": "b9569915-7408-4689-dd76-0cdbcd7e3e3f" }, "outputs": [ { "data": { "text/plain": [ "pandas.core.frame.DataFrame" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(movies)" ], "id": "cc87d23c" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "4f515e4b", "outputId": "6e9f0190-4b62-4705-9779-8565629239ea" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy StoryLed by Woody, Andy's toys live happily in his ...[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
18844JumanjiWhen siblings Judy and Peter discover an encha...[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
215602Grumpier Old MenA family wedding reignites the ancient feud be...[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
331357Waiting to ExhaleCheated on, mistreated and stepped on, the wom...[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
411862Father of the Bride Part IIJust when George Banks has recovered from his ...[{'id': 35, 'name': 'Comedy'}]
...............
45461439050SubdueRising and falling between a man and woman.[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...
45462111109Century of BirthingAn artist struggles to finish his work while a...[{'id': 18, 'name': 'Drama'}]
4546367758BetrayalWhen one of her hits goes wrong, a professiona...[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...
45464227506Satan TriumphantIn a small town live two brothers, one a minis...[]
45465461257Queerama50 years after decriminalisation of homosexual...[]
\n", "

45466 rows × 4 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "... ... ... \n", "45461 439050 Subdue \n", "45462 111109 Century of Birthing \n", "45463 67758 Betrayal \n", "45464 227506 Satan Triumphant \n", "45465 461257 Queerama \n", "\n", " overview \\\n", "0 Led by Woody, Andy's toys live happily in his ... \n", "1 When siblings Judy and Peter discover an encha... \n", "2 A family wedding reignites the ancient feud be... \n", "3 Cheated on, mistreated and stepped on, the wom... \n", "4 Just when George Banks has recovered from his ... \n", "... ... \n", "45461 Rising and falling between a man and woman. \n", "45462 An artist struggles to finish his work while a... \n", "45463 When one of her hits goes wrong, a professiona... \n", "45464 In a small town live two brothers, one a minis... \n", "45465 50 years after decriminalisation of homosexual... \n", "\n", " genres \n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] \n", "... ... \n", "45461 [{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n... \n", "45462 [{'id': 18, 'name': 'Drama'}] \n", "45463 [{'id': 28, 'name': 'Action'}, {'id': 18, 'nam... \n", "45464 [] \n", "45465 [] \n", "\n", "[45466 rows x 4 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.reset_index(drop=True)" ], "id": "4f515e4b" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "4fee219d", "outputId": "34a73e9f-e0f4-4fea-fad4-3af815273ff3" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy StoryLed by Woody, Andy's toys live happily in his ...[{'id': 16, 'name': 'Animation'}, {'id': 35, '...
18844JumanjiWhen siblings Judy and Peter discover an encha...[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
215602Grumpier Old MenA family wedding reignites the ancient feud be...[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
331357Waiting to ExhaleCheated on, mistreated and stepped on, the wom...[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
411862Father of the Bride Part IIJust when George Banks has recovered from his ...[{'id': 35, 'name': 'Comedy'}]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 Led by Woody, Andy's toys live happily in his ... \n", "1 When siblings Judy and Peter discover an encha... \n", "2 A family wedding reignites the ancient feud be... \n", "3 Cheated on, mistreated and stepped on, the wom... \n", "4 Just when George Banks has recovered from his ... \n", "\n", " genres \n", "0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n", "1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n", "2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n", "3 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam... \n", "4 [{'id': 35, 'name': 'Comedy'}] " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "4fee219d" }, { "cell_type": "markdown", "metadata": { "id": "JklQN3yOvZA1" }, "source": [ "- Only the name is extracted from each entry in the genres column, but what about the id? Is it the same as the id column of the movie data, or are the two ids different?\n", "\n", "In a movie dataset, an 'id' that appears in two different features (the main 'id' feature and the 'id' key inside each dictionary of the 'genres' feature) generally serves two different purposes. Although both instances use the term 'id', they likely refer to different entities: one uniquely identifies a movie, and the other uniquely identifies a genre associated with that movie." 
], "id": "JklQN3yOvZA1" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "15N2F2xuvYlg" }, "outputs": [], "source": [], "id": "15N2F2xuvYlg" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "c91b75ca", "outputId": "9c9f6ae6-56d9-4b33-8b7a-9bc4117aa496" }, "outputs": [ { "data": { "text/plain": [ "id 0\n", "title 6\n", "overview 954\n", "genres 0\n", "dtype: int64" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.isnull().sum()" ], "id": "c91b75ca" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1c13eecf", "outputId": "de6d780d-3249-48d1-a707-f745f7214924" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " movies.dropna(inplace=True)\n" ] } ], "source": [ "movies.dropna(inplace=True)" ], "id": "1c13eecf" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ad028a7f", "outputId": "b913ccde-9b43-42c9-e551-9951402da04a" }, "outputs": [ { "data": { "text/plain": [ "30" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.duplicated().sum()" ], "id": "ad028a7f" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 36 }, "id": "1216bbee", "outputId": "b36da729-a157-400a-d8fa-87590903c165" }, "outputs": [ { "data": { "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" }, "text/plain": [ "\"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 
'Family'}]\"" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.iloc[0].genres" ], "id": "1216bbee" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "29a15327" }, "outputs": [], "source": [ "def convert(obj):\n", "    \"\"\"Return the 'name' value of every dict in a stringified list of dicts.\n", "\n", "    obj is normally the text of a Python literal such as\n", "    \"[{'id': 16, 'name': 'Animation'}]\", which is parsed with ast.literal_eval.\n", "    A missing cell (None or a float such as NaN) or a blank string gives [].\n", "    A list is processed directly, and items that are not dicts are kept as they\n", "    are, so applying convert to an already converted column does not fail.\n", "    \"\"\"\n", "    if isinstance(obj, str):\n", "        # literal_eval only accepts Python literals; it never runs code from the cell text.\n", "        obj = ast.literal_eval(obj) if obj.strip() else []\n", "    elif obj is None or isinstance(obj, float):\n", "        return []\n", "    return [item['name'] if isinstance(item, dict) else item for item in obj]" ], "id": "29a15327" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aaa3a067", "outputId": "0927b1c3-66ed-4d10-d0a4-bca3eb11cf71" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " movies['genres'] = movies['genres'].apply(convert)\n" ] } ], "source": [ "movies['genres'] = movies['genres'].apply(convert)" ], "id": "aaa3a067" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "ed55d66b", "outputId": "87f6f480-8b16-4eb2-e0d8-fde3d0070000" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy StoryLed by Woody, Andy's toys live happily in his ...[Animation, Comedy, Family]
18844JumanjiWhen siblings Judy and Peter discover an encha...[Adventure, Fantasy, Family]
215602Grumpier Old MenA family wedding reignites the ancient feud be...[Romance, Comedy]
331357Waiting to ExhaleCheated on, mistreated and stepped on, the wom...[Comedy, Drama, Romance]
411862Father of the Bride Part IIJust when George Banks has recovered from his ...[Comedy]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 Led by Woody, Andy's toys live happily in his ... \n", "1 When siblings Judy and Peter discover an encha... \n", "2 A family wedding reignites the ancient feud be... \n", "3 Cheated on, mistreated and stepped on, the wom... \n", "4 Just when George Banks has recovered from his ... \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "ed55d66b" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "hFOiNiljyMdp", "outputId": "5e50e02c-161b-469f-c1cc-0a366b36b5e5" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
overview
0Led by Woody, Andy's toys live happily in his ...
1When siblings Judy and Peter discover an encha...
2A family wedding reignites the ancient feud be...
3Cheated on, mistreated and stepped on, the wom...
4Just when George Banks has recovered from his ...
......
45461Rising and falling between a man and woman.
45462An artist struggles to finish his work while a...
45463When one of her hits goes wrong, a professiona...
45464In a small town live two brothers, one a minis...
4546550 years after decriminalisation of homosexual...
\n", "

44506 rows × 1 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " overview\n", "0 Led by Woody, Andy's toys live happily in his ...\n", "1 When siblings Judy and Peter discover an encha...\n", "2 A family wedding reignites the ancient feud be...\n", "3 Cheated on, mistreated and stepped on, the wom...\n", "4 Just when George Banks has recovered from his ...\n", "... ...\n", "45461 Rising and falling between a man and woman.\n", "45462 An artist struggles to finish his work while a...\n", "45463 When one of her hits goes wrong, a professiona...\n", "45464 In a small town live two brothers, one a minis...\n", "45465 50 years after decriminalisation of homosexual...\n", "\n", "[44506 rows x 1 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.iloc[:,2:3]" ], "id": "hFOiNiljyMdp" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "B0sYnGhFzPCd", "outputId": "b3766f3b-08b1-4a0b-cb35-13e8b2010ea2" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...
18844[{'id': 10090, 'name': 'board game'}, {'id': 1...
215602[{'id': 1495, 'name': 'fishing'}, {'id': 12392...
331357[{'id': 818, 'name': 'based on novel'}, {'id':...
411862[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id keywords\n", "0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n", "1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n", "2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392...\n", "3 31357 [{'id': 818, 'name': 'based on novel'}, {'id':...\n", "4 11862 [{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..." ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords.head()" ], "id": "B0sYnGhFzPCd" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "bee63035" }, "outputs": [], "source": [ "keywords['keywords'] = keywords['keywords'].apply(convert)" ], "id": "bee63035" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "7d682a09", "outputId": "304a7973-7d42-484b-bb2b-c3e41074cebe" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862[jealousy, toy, boy, friendship, friends, riva...
18844[board game, disappearance, based on children'...
215602[fishing, best friend, duringcreditsstinger, o...
331357[based on novel, interracial relationship, sin...
411862[baby, midlife crisis, confidence, aging, daug...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
def convert3(obj, limit=3):
    """Return the ``name`` of the first ``limit`` entries of a stringified list of dicts.

    The notebook applies this to ``credits['cast']``; with the default
    ``limit=3`` each row keeps only the names of its first three listed
    entries.  The cap used to be enforced with a manual counter and ``break``
    (the old ``#why??`` note); it is now an explicit, tunable parameter.

    Parameters
    ----------
    obj : str
        Python-literal text of a sequence of dicts, each carrying a ``'name'``
        key, e.g. ``"[{'cast_id': 14, 'name': 'Tom Hanks'}, ...]"``.
    limit : int or None, default 3
        Maximum number of names to return.  ``None`` returns every name.

    Returns
    -------
    list
        The ``'name'`` values of the first ``limit`` entries, in input order.

    Raises
    ------
    ValueError
        If ``limit`` is negative, or if ``ast.literal_eval`` rejects ``obj``.
    KeyError
        If one of the selected entries has no ``'name'`` key.
    """
    if limit is not None and limit < 0:
        raise ValueError("limit must be non-negative or None")
    # literal_eval only evaluates Python literals, so the CSV text is never executed.
    entries = list(ast.literal_eval(obj))
    # Slice before reading 'name' so entries past the cap are never touched.
    return [entry['name'] for entry in entries[:limit]]
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[{'cast_id': 14, 'character': 'Woody (voice)',...[{'credit_id': '52fe4284c3a36847f8024f49', 'de...862
1[{'cast_id': 1, 'character': 'Alan Parrish', '...[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...8844
2[{'cast_id': 2, 'character': 'Max Goldman', 'c...[{'credit_id': '52fe466a9251416c75077a89', 'de...15602
3[{'cast_id': 1, 'character': \"Savannah 'Vannah...[{'credit_id': '52fe44779251416c91011acb', 'de...31357
4[{'cast_id': 1, 'character': 'George Banks', '...[{'credit_id': '52fe44959251416c75039ed7', 'de...11862
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " cast \\\n", "0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n", "1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n", "2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n", "3 [{'cast_id': 1, 'character': \"Savannah 'Vannah... \n", "4 [{'cast_id': 1, 'character': 'George Banks', '... \n", "\n", " crew id \n", "0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n", "1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n", "2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 15602 \n", "3 [{'credit_id': '52fe44779251416c91011acb', 'de... 31357 \n", "4 [{'credit_id': '52fe44959251416c75039ed7', 'de... 11862 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head()" ], "id": "EiS-2IEm2OQ0" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "2c96335d" }, "outputs": [], "source": [ "credits['cast'] = credits['cast'].apply(convert3)" ], "id": "2c96335d" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "CQWMCQsb3lhM", "outputId": "9406acf9-ce09-4e1f-ebd9-6aaa0e9e4109" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[Tom Hanks, Tim Allen, Don Rickles][{'credit_id': '52fe4284c3a36847f8024f49', 'de...862
1[Robin Williams, Jonathan Hyde, Kirsten Dunst][{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...8844
2[Walter Matthau, Jack Lemmon, Ann-Margret][{'credit_id': '52fe466a9251416c75077a89', 'de...15602
3[Whitney Houston, Angela Bassett, Loretta Devine][{'credit_id': '52fe44779251416c91011acb', 'de...31357
4[Steve Martin, Diane Keaton, Martin Short][{'credit_id': '52fe44959251416c75039ed7', 'de...11862
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
def fetch_director(obj, job='Director'):
    """Return the names of crew entries whose ``'job'`` equals ``job``.

    The notebook applies this to ``credits['crew']`` with the default
    ``job='Director'``, turning each stringified crew list into a list of
    director names.

    Parameters
    ----------
    obj : str
        Python-literal text of a sequence of dicts, e.g.
        ``"[{'job': 'Director', 'name': 'John Lasseter'}, ...]"``.
    job : str, default 'Director'
        Job title to select.

    Returns
    -------
    list
        ``'name'`` values of the matching entries, in input order (empty when
        nothing matches).

    Raises
    ------
    ValueError
        If ``ast.literal_eval`` rejects ``obj``.
    KeyError
        If a matching entry has no ``'name'`` key.
    """
    # .get() lets entries without a 'job' key be skipped instead of aborting
    # the whole row with a KeyError.
    return [
        member['name']
        for member in ast.literal_eval(obj)
        if member.get('job') == job
    ]
"\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[Tom Hanks, Tim Allen, Don Rickles][John Lasseter]862
1[Robin Williams, Jonathan Hyde, Kirsten Dunst][Joe Johnston]8844
2[Walter Matthau, Jack Lemmon, Ann-Margret][Howard Deutch]15602
3[Whitney Houston, Angela Bassett, Loretta Devine][Forest Whitaker]31357
4[Steve Martin, Diane Keaton, Martin Short][Charles Shyer]11862
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " cast crew id\n", "0 [Tom Hanks, Tim Allen, Don Rickles] [John Lasseter] 862\n", "1 [Robin Williams, Jonathan Hyde, Kirsten Dunst] [Joe Johnston] 8844\n", "2 [Walter Matthau, Jack Lemmon, Ann-Margret] [Howard Deutch] 15602\n", "3 [Whitney Houston, Angela Bassett, Loretta Devine] [Forest Whitaker] 31357\n", "4 [Steve Martin, Diane Keaton, Martin Short] [Charles Shyer] 11862" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head()" ], "id": "438a1857" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "dfafc9d4", "outputId": "d88d0758-87cf-4a63-e89f-e0f7e46324b7" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862[jealousy, toy, boy, friendship, friends, riva...
18844[board game, disappearance, based on children'...
215602[fishing, best friend, duringcreditsstinger, o...
331357[based on novel, interracial relationship, sin...
411862[baby, midlife crisis, confidence, aging, daug...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id keywords\n", "0 862 [jealousy, toy, boy, friendship, friends, riva...\n", "1 8844 [board game, disappearance, based on children'...\n", "2 15602 [fishing, best friend, duringcreditsstinger, o...\n", "3 31357 [based on novel, interracial relationship, sin...\n", "4 11862 [baby, midlife crisis, confidence, aging, daug..." ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords.head()" ], "id": "dfafc9d4" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "SbjFhDQazfdF", "outputId": "135bc619-01f2-4402-d552-87c7a0421f99" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy StoryLed by Woody, Andy's toys live happily in his ...[Animation, Comedy, Family]
18844JumanjiWhen siblings Judy and Peter discover an encha...[Adventure, Fantasy, Family]
215602Grumpier Old MenA family wedding reignites the ancient feud be...[Romance, Comedy]
331357Waiting to ExhaleCheated on, mistreated and stepped on, the wom...[Comedy, Drama, Romance]
411862Father of the Bride Part IIJust when George Banks has recovered from his ...[Comedy]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 Led by Woody, Andy's toys live happily in his ... \n", "1 When siblings Judy and Peter discover an encha... \n", "2 A family wedding reignites the ancient feud be... \n", "3 Cheated on, mistreated and stepped on, the wom... \n", "4 Just when George Banks has recovered from his ... \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "SbjFhDQazfdF" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aa7a1a88", "outputId": "ca311972-6f3b-448b-edfb-e7ef413e5d8b" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " movies['overview'] = movies['overview'].apply(lambda x : x.split())\n" ] } ], "source": [ "movies['overview'] = movies['overview'].apply(lambda x : x.split())" ], "id": "aa7a1a88" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "b9438297", "outputId": "f012fa20-a702-449c-c2e6-61d94a2241fd" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family]
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family]
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy]
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance]
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
def _strip_spaces(tokens):
    """Return a new list with every space removed from each string in ``tokens``."""
    return [token.replace(" ", "") for token in tokens]


# Apply the same clean-up to every list-valued column, in the original order.
# NOTE(review): presumably this makes multi-word names/tags (e.g. "Tom Hanks")
# act as single tokens in a later step — confirm against the downstream cells.
for _frame, _column in (
    (movies, 'genres'),
    (keywords, 'keywords'),
    (credits, 'cast'),
    (credits, 'crew'),
):
    _frame[_column] = _frame[_column].apply(_strip_spaces)
"height": 206 }, "id": "65154ada", "outputId": "e2a364e7-3175-482b-93ee-729b1ebf794f" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family]
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family]
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy]
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance]
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 [Led, by, Woody,, Andy's, toys, live, happily,... \n", "1 [When, siblings, Judy, and, Peter, discover, a... \n", "2 [A, family, wedding, reignites, the, ancient, ... \n", "3 [Cheated, on,, mistreated, and, stepped, on,, ... \n", "4 [Just, when, George, Banks, has, recovered, fr... \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "65154ada" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "4d92d210", "outputId": "95ef5790-0c31-41da-c8c4-aab9ee223499" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[TomHanks, TimAllen, DonRickles][JohnLasseter]862
1[RobinWilliams, JonathanHyde, KirstenDunst][JoeJohnston]8844
2[WalterMatthau, JackLemmon, Ann-Margret][HowardDeutch]15602
3[WhitneyHouston, AngelaBassett, LorettaDevine][ForestWhitaker]31357
4[SteveMartin, DianeKeaton, MartinShort][CharlesShyer]11862
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " cast crew id\n", "0 [TomHanks, TimAllen, DonRickles] [JohnLasseter] 862\n", "1 [RobinWilliams, JonathanHyde, KirstenDunst] [JoeJohnston] 8844\n", "2 [WalterMatthau, JackLemmon, Ann-Margret] [HowardDeutch] 15602\n", "3 [WhitneyHouston, AngelaBassett, LorettaDevine] [ForestWhitaker] 31357\n", "4 [SteveMartin, DianeKeaton, MartinShort] [CharlesShyer] 11862" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head()" ], "id": "4d92d210" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "61c5de71", "outputId": "b31c07a2-73fa-45cd-e7d0-0378980b14f3" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idkeywords
0862[jealousy, toy, boy, friendship, friends, riva...
18844[boardgame, disappearance, basedonchildren'sbo...
215602[fishing, bestfriend, duringcreditsstinger, ol...
331357[basedonnovel, interracialrelationship, single...
411862[baby, midlifecrisis, confidence, aging, daugh...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id keywords\n", "0 862 [jealousy, toy, boy, friendship, friends, riva...\n", "1 8844 [boardgame, disappearance, basedonchildren'sbo...\n", "2 15602 [fishing, bestfriend, duringcreditsstinger, ol...\n", "3 31357 [basedonnovel, interracialrelationship, single...\n", "4 11862 [baby, midlifecrisis, confidence, aging, daugh..." ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords.head()" ], "id": "61c5de71" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "061b1c99", "outputId": "50529ec3-db7b-4d11-b8d7-1465b6b06a32" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family]
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family]
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy]
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance]
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 [Led, by, Woody,, Andy's, toys, live, happily,... \n", "1 [When, siblings, Judy, and, Peter, discover, a... \n", "2 [A, family, wedding, reignites, the, ancient, ... \n", "3 [Cheated, on,, mistreated, and, stepped, on,, ... \n", "4 [Just, when, George, Banks, has, recovered, fr... \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "061b1c99" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8920a39b", "outputId": "079aa645-b225-4d2f-c976-9b43b42e3f39" }, "outputs": [ { "data": { "text/plain": [ "44506" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_id = np.array(movies['id'])\n", "len(movies_id)" ], "id": "8920a39b" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "30149cf1", "outputId": "2953c0cf-2822-4b09-e39e-3543bcaf9ad6" }, "outputs": [ { "data": { "text/plain": [ "array(['862', '8844', '15602', ..., '67758', '227506', '461257'],\n", " dtype=object)" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_id" ], "id": "30149cf1" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "69cf0288", "outputId": "71846fcf-d98a-4c8b-d9bd-e7a94e52f3e1" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "45476\n" ] }, { "data": { "text/plain": [ "array([ 862, 8844, 15602, ..., 67758, 227506, 
461257])" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits_id = np.array(credits['id'])\n", "print(len(credits_id))\n", "credits_id" ], "id": "69cf0288" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "b1214f7d", "outputId": "270e1fd9-c31f-407d-e5b3-dea1b81bec4a" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "46419\n" ] }, { "data": { "text/plain": [ "array([ 862, 8844, 15602, ..., 67758, 227506, 461257])" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords_id = np.array(keywords['id'])\n", "print(len(keywords_id))\n", "keywords_id" ], "id": "b1214f7d" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "4e7ece27" }, "outputs": [], "source": [ "movies_id = np.array([int(item) for item in movies_id])\n", "credits_id = np.array([int(item) for item in credits_id])\n", "keywords_id = np.array([int(item) for item in keywords_id]) #checking all the id's are integer or not" ], "id": "4e7ece27" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "17158d8a", "outputId": "767f1b81-7fb9-4a84-de2a-3ecfa8d6b5e8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "44475\n" ] }, { "data": { "text/plain": [ "array([ 2, 3, 5, ..., 467731, 468343, 469172])" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "common1 = np.intersect1d(movies_id,credits_id)\n", "print(len(common1))\n", "common1" ], "id": "17158d8a" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "79ba7b2f", "outputId": "7f1dda93-cc26-431b-bc6a-e5dc6d32f276" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "44475\n" ] }, { "data": { "text/plain": [ "array([ 2, 3, 5, ..., 
467731, 468343, 469172])" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "common_id = np.intersect1d(common1,keywords_id)\n", "print(len(common_id))\n", "common_id" ], "id": "79ba7b2f" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 423 }, "id": "1082cd38", "outputId": "9d632389-d9f2-4d85-8e7f-06d3f58d6b86" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id
02
13
25
36
411
......
44470464819
44471465044
44472467731
44473468343
44474469172
\n", "

44475 rows × 1 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id\n", "0 2\n", "1 3\n", "2 5\n", "3 6\n", "4 11\n", "... ...\n", "44470 464819\n", "44471 465044\n", "44472 467731\n", "44473 468343\n", "44474 469172\n", "\n", "[44475 rows x 1 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df = pd.DataFrame(common_id, columns = ['id'])\n", "new_df" ], "id": "1082cd38" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-mVlKUH9_KS9", "outputId": "320a4436-e41b-48d6-bddc-234120ceb43f" }, "outputs": [ { "data": { "text/plain": [ "44506" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(movies.id)" ], "id": "-mVlKUH9_KS9" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "a_LOD348_GXo", "outputId": "d813bff2-9acc-4160-a632-be63ec8b5b31" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
castcrewid
0[TomHanks, TimAllen, DonRickles][JohnLasseter]862
1[RobinWilliams, JonathanHyde, KirstenDunst][JoeJohnston]8844
2[WalterMatthau, JackLemmon, Ann-Margret][HowardDeutch]15602
3[WhitneyHouston, AngelaBassett, LorettaDevine][ForestWhitaker]31357
4[SteveMartin, DianeKeaton, MartinShort][CharlesShyer]11862
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " cast crew id\n", "0 [TomHanks, TimAllen, DonRickles] [JohnLasseter] 862\n", "1 [RobinWilliams, JonathanHyde, KirstenDunst] [JoeJohnston] 8844\n", "2 [WalterMatthau, JackLemmon, Ann-Margret] [HowardDeutch] 15602\n", "3 [WhitneyHouston, AngelaBassett, LorettaDevine] [ForestWhitaker] 31357\n", "4 [SteveMartin, DianeKeaton, MartinShort] [CharlesShyer] 11862" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head()" ], "id": "a_LOD348_GXo" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "vwJYkpwcSfYN", "outputId": "01bd07ed-14c0-4231-c4f0-9bcc22b6d82d" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family]
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family]
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy]
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance]
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 [Led, by, Woody,, Andy's, toys, live, happily,... \n", "1 [When, siblings, Judy, and, Peter, discover, a... \n", "2 [A, family, wedding, reignites, the, ancient, ... \n", "3 [Cheated, on,, mistreated, and, stepped, on,, ... \n", "4 [Just, when, George, Banks, has, recovered, fr... \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "vwJYkpwcSfYN" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "fw3OCNiF_Lmx" }, "outputs": [], "source": [ "# movies = movies.reset_index(drop = True)\n", "# data_types = movies['id'].apply(type)\n", "# data_types.value_counts()" ], "id": "fw3OCNiF_Lmx" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "b6b61453" }, "outputs": [], "source": [ "#I tried to create a completely new dataframe called new_df with one column having the common id's of all 3 dataframes and\n", "#other column 'title' having 'title' of the movie corresponding to that id and 'tag' having sum of 'overview', 'cast', 'crew'\n", "# and 'keywords'\n", "\n", "#it is taking a LOT of time to process. I also tried different ways to remove NaN errors but no progress :(\n", "#tried using merge, pd.concat, pd.merge, converted int64 to object and tried etc. None worked. If you guys can do that then\n", "#preprocessing can be much easier without a need to define arrays for identities and stuff. Lets continue tomorrow! Good night." 
], "id": "b6b61453" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qui8zmCUEYTV", "outputId": "f2b17316-e723-4946-91a1-5417d493f38e" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Numeric values: 44506\n", "Non-numeric values: 0\n" ] } ], "source": [ "# Sanity check: how many values of the chosen `movies` column parse as numbers?\n", "column_name = 'id'\n", "\n", "# errors='coerce' turns every unparseable value into NaN so it can be counted\n", "# (values that were already missing are counted as non-numeric as well)\n", "numeric_values = pd.to_numeric(movies[column_name], errors='coerce')\n", "\n", "numeric_count = numeric_values.notna().sum()\n", "non_numeric_count = numeric_values.isna().sum()\n", "\n", "print(f\"Numeric values: {numeric_count}\")\n", "print(f\"Non-numeric values: {non_numeric_count}\")" ], "id": "qui8zmCUEYTV" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_F0aokROB91X", "outputId": "54ff0d7a-090c-4fbe-a33e-d93ab1ffdd71" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 46419 entries, 0 to 46418\n", "Data columns (total 2 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 46419 non-null int64 \n", " 1 keywords 46419 non-null object\n", "dtypes: int64(1), object(1)\n", "memory usage: 725.4+ KB\n" ] } ], "source": [ "keywords.info()" ], "id": "_F0aokROB91X" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KeE0Sz3CB9zd", "outputId": "8d141f30-fe0e-469e-ae24-b6d74968e069" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 45476 entries, 0 to 45475\n", "Data columns (total 3 columns):\n", " # Column Non-Null Count Dtype \n", "--- 
------ -------------- ----- \n", " 0 cast 45476 non-null object\n", " 1 crew 45476 non-null object\n", " 2 id 45476 non-null int64 \n", "dtypes: int64(1), object(2)\n", "memory usage: 1.0+ MB\n" ] } ], "source": [ "credits.info()" ], "id": "KeE0Sz3CB9zd" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "051RAMToB9sb", "outputId": "eddb62c4-ce6c-4ce7-e2b3-fb62d7af84f0" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 44506 entries, 0 to 45465\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 44506 non-null object\n", " 1 title 44506 non-null object\n", " 2 overview 44506 non-null object\n", " 3 genres 44506 non-null object\n", "dtypes: object(4)\n", "memory usage: 1.7+ MB\n" ] } ], "source": [ "movies.info()" ], "id": "051RAMToB9sb" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uuWnVk5xBDCk", "outputId": "83331c25-8562-45b2-b3a2-635faa071d6d" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " movies['id'] = movies['id'].astype(int)\n" ] } ], "source": [ "# Cast the id column from object to int so it can be joined with the int64 ids of credits/keywords.\n", "# pandas flags `movies` as a copy of a slice, so rebind the name to a new frame via assign()\n", "# instead of writing a column into it in place (which raises SettingWithCopyWarning).\n", "movies = movies.assign(id=movies['id'].astype(int))" ], "id": "uuWnVk5xBDCk" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jcCoT1UVS_XW", "outputId": "7a6d3b9a-7493-4bd1-e0f8-e17a7e05594f" }, "outputs": [ { "data": { "text/plain": [ "dtype('int64')" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ 
"movies['id'].dtypes" ], "id": "jcCoT1UVS_XW" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "f2atUOqsTexB", "outputId": "ef7e69f0-0f2c-4bf0-dad5-9dc19047bd08" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 44506 entries, 0 to 45465\n", "Data columns (total 4 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 44506 non-null int64 \n", " 1 title 44506 non-null object\n", " 2 overview 44506 non-null object\n", " 3 genres 44506 non-null object\n", "dtypes: int64(1), object(3)\n", "memory usage: 1.7+ MB\n" ] } ], "source": [ "movies.info()" ], "id": "f2atUOqsTexB" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "GcMDzl_ITh53", "outputId": "eedacce3-947c-4d7d-99cb-eae7a69e00d5" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenres
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family]
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family]
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy]
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance]
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy]
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 [Led, by, Woody,, Andy's, toys, live, happily,... \n", "1 [When, siblings, Judy, and, Peter, discover, a... \n", "2 [A, family, wedding, reignites, the, ancient, ... \n", "3 [Cheated, on,, mistreated, and, stepped, on,, ... \n", "4 [Just, when, George, Banks, has, recovered, fr... \n", "\n", " genres \n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] " ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ], "id": "GcMDzl_ITh53" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 597 }, "id": "7LBd3unxTjsA", "outputId": "31e7d057-d554-43f4-f70d-0d2f61311d91" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenrescastcrewkeywords
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family][TomHanks, TimAllen, DonRickles][JohnLasseter][jealousy, toy, boy, friendship, friends, riva...
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family][RobinWilliams, JonathanHyde, KirstenDunst][JoeJohnston][boardgame, disappearance, basedonchildren'sbo...
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy][WalterMatthau, JackLemmon, Ann-Margret][HowardDeutch][fishing, bestfriend, duringcreditsstinger, ol...
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance][WhitneyHouston, AngelaBassett, LorettaDevine][ForestWhitaker][basedonnovel, interracialrelationship, single...
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy][SteveMartin, DianeKeaton, MartinShort][CharlesShyer][baby, midlifecrisis, confidence, aging, daugh...
........................
9956878Homeward Bound: The Incredible Journey[Remake, of, the, popular, Disney, classic,, t...[Adventure, Comedy, Drama, Family][MichaelJ.Fox, SallyField, DonAmeche][DuwayneDunham][basedonnovel, cat, friendship, remake, dog, j...
99615944The Shaggy Dog[Through, an, ancient, spell,, a, boy, changes...[Comedy, Family][FredMacMurray, JeanHagen, TommyKirk][CharlesBarton][magic, dog, sheepdog]
99718444Swiss Family Robinson[After, being, shipwrecked,, the, Robinson, fa...[Adventure, Family][JohnMills, DorothyMcGuire, JamesMacArthur][KenAnnakin][treehouse, island, shipwreck, pirategang, swi...
99820723That Darn Cat![A, young, woman, suspects, foul, play, when, ...[Drama, Family, Comedy][HayleyMills, DeanJones, DorothyProvine][RobertStevenson][cat, hostage, kidnapping, map, sister, suburb...
99917320,000 Leagues Under the Sea[A, ship, sent, to, investigate, a, wave, of, ...[Adventure, Drama, ScienceFiction][KirkDouglas, JamesMason, PaulLukas][RichardFleischer][diving, ocean, submarine, julesverne, captain...
\n", "

1000 rows × 7 columns

\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", ".. ... ... \n", "995 6878 Homeward Bound: The Incredible Journey \n", "996 15944 The Shaggy Dog \n", "997 18444 Swiss Family Robinson \n", "998 20723 That Darn Cat! \n", "999 173 20,000 Leagues Under the Sea \n", "\n", " overview \\\n", "0 [Led, by, Woody,, Andy's, toys, live, happily,... \n", "1 [When, siblings, Judy, and, Peter, discover, a... \n", "2 [A, family, wedding, reignites, the, ancient, ... \n", "3 [Cheated, on,, mistreated, and, stepped, on,, ... \n", "4 [Just, when, George, Banks, has, recovered, fr... \n", ".. ... \n", "995 [Remake, of, the, popular, Disney, classic,, t... \n", "996 [Through, an, ancient, spell,, a, boy, changes... \n", "997 [After, being, shipwrecked,, the, Robinson, fa... \n", "998 [A, young, woman, suspects, foul, play, when, ... \n", "999 [A, ship, sent, to, investigate, a, wave, of, ... \n", "\n", " genres \\\n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] \n", ".. ... \n", "995 [Adventure, Comedy, Drama, Family] \n", "996 [Comedy, Family] \n", "997 [Adventure, Family] \n", "998 [Drama, Family, Comedy] \n", "999 [Adventure, Drama, ScienceFiction] \n", "\n", " cast crew \\\n", "0 [TomHanks, TimAllen, DonRickles] [JohnLasseter] \n", "1 [RobinWilliams, JonathanHyde, KirstenDunst] [JoeJohnston] \n", "2 [WalterMatthau, JackLemmon, Ann-Margret] [HowardDeutch] \n", "3 [WhitneyHouston, AngelaBassett, LorettaDevine] [ForestWhitaker] \n", "4 [SteveMartin, DianeKeaton, MartinShort] [CharlesShyer] \n", ".. ... ... 
\n", "995 [MichaelJ.Fox, SallyField, DonAmeche] [DuwayneDunham] \n", "996 [FredMacMurray, JeanHagen, TommyKirk] [CharlesBarton] \n", "997 [JohnMills, DorothyMcGuire, JamesMacArthur] [KenAnnakin] \n", "998 [HayleyMills, DeanJones, DorothyProvine] [RobertStevenson] \n", "999 [KirkDouglas, JamesMason, PaulLukas] [RichardFleischer] \n", "\n", " keywords \n", "0 [jealousy, toy, boy, friendship, friends, riva... \n", "1 [boardgame, disappearance, basedonchildren'sbo... \n", "2 [fishing, bestfriend, duringcreditsstinger, ol... \n", "3 [basedonnovel, interracialrelationship, single... \n", "4 [baby, midlifecrisis, confidence, aging, daugh... \n", ".. ... \n", "995 [basedonnovel, cat, friendship, remake, dog, j... \n", "996 [magic, dog, sheepdog] \n", "997 [treehouse, island, shipwreck, pirategang, swi... \n", "998 [cat, hostage, kidnapping, map, sister, suburb... \n", "999 [diving, ocean, submarine, julesverne, captain... \n", "\n", "[1000 rows x 7 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# At least one input repeats ids: the plain inner merge returned more rows than `movies` has,\n", "# because every duplicated id is paired with every matching row. Keep one row per id first\n", "# (drop_duplicates keeps the first occurrence).\n", "movies_unique = movies.drop_duplicates(subset='id')\n", "credits_unique = credits.drop_duplicates(subset='id')\n", "keywords_unique = keywords.drop_duplicates(subset='id')\n", "\n", "# validate='one_to_one' makes pandas raise MergeError if a duplicated key slips through\n", "df0 = pd.merge(movies_unique, credits_unique, on='id', how='inner', validate='one_to_one')\n", "df = pd.merge(df0, keywords_unique, on='id', how='inner', validate='one_to_one')\n", "df.iloc[0:1000,:]" ], "id": "7LBd3unxTjsA" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Aj0kSqXST5KT", "outputId": "a1492b5e-39fe-47f9-f1b4-f8b8691b5c4c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Int64Index: 45629 entries, 0 to 45628\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 id 45629 non-null int64 \n", " 1 title 45629 non-null object\n", " 2 overview 45629 non-null object\n", " 3 genres 45629 non-null object\n", " 4 cast 45629 non-null object\n", " 5 crew 45629 non-null object\n", " 6 keywords 45629 non-null object\n", "dtypes: int64(1), object(6)\n", "memory usage: 2.8+ MB\n" ] } ], "source": [ "df.info()" ], "id": 
"Aj0kSqXST5KT" }, { "cell_type": "markdown", "metadata": { "id": "-1fyNw0OVeCq" }, "source": [ "- The merge without de-duplication gave 45629 entries and no null values. How? An inner merge keeps only the ids that are present in every input, so there is never a missing value to fill in.\n", "- Initially: credits (45476), keywords (46419), movie_data (45466).\n", "- There are 44475 common ids, which is fewer than 45629. The extra rows are not ids that are missing from one of the tables (an inner merge drops those rows, it does not fill them with anything); they come from ids that occur more than once in an input, because each duplicate is paired with every matching row.\n", "- Should I proceed with this data? Yes, once every input is reduced to one row per id, which the merge cell above now does; the merged frame should then have one row per common id." ], "id": "-1fyNw0OVeCq" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zpuPTGGfVFvM" }, "outputs": [], "source": [ "#df['tags'] = df['overview'] + df['genres'] + df['cast'] + df['crew'] + df['keywords']" ], "id": "zpuPTGGfVFvM" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "eSO6rcWyiOY9" }, "outputs": [], "source": [ "#new_df = df[['id','title','tags']]" ], "id": "eSO6rcWyiOY9" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "tMqtic2ViP_6" }, "outputs": [], "source": [ "#new_df['tags'] = new_df['tags'].apply(lambda x: \" \".join(x))" ], "id": "tMqtic2ViP_6" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "iyEAegRziRs7" }, "outputs": [], "source": [ "\n", "#new_df.head()" ], "id": "iyEAegRziRs7" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 293 }, "id": "7_8Ko6uponm3", "outputId": "a6ddf392-9b43-48c4-a701-2f5130fb73e6" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenrescastcrewkeywords
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family][TomHanks, TimAllen, DonRickles][JohnLasseter][jealousy, toy, boy, friendship, friends, riva...
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family][RobinWilliams, JonathanHyde, KirstenDunst][JoeJohnston][boardgame, disappearance, basedonchildren'sbo...
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy][WalterMatthau, JackLemmon, Ann-Margret][HowardDeutch][fishing, bestfriend, duringcreditsstinger, ol...
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance][WhitneyHouston, AngelaBassett, LorettaDevine][ForestWhitaker][basedonnovel, interracialrelationship, single...
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy][SteveMartin, DianeKeaton, MartinShort][CharlesShyer][baby, midlifecrisis, confidence, aging, daugh...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 [Led, by, Woody,, Andy's, toys, live, happily,... \n", "1 [When, siblings, Judy, and, Peter, discover, a... \n", "2 [A, family, wedding, reignites, the, ancient, ... \n", "3 [Cheated, on,, mistreated, and, stepped, on,, ... \n", "4 [Just, when, George, Banks, has, recovered, fr... \n", "\n", " genres \\\n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] \n", "\n", " cast crew \\\n", "0 [TomHanks, TimAllen, DonRickles] [JohnLasseter] \n", "1 [RobinWilliams, JonathanHyde, KirstenDunst] [JoeJohnston] \n", "2 [WalterMatthau, JackLemmon, Ann-Margret] [HowardDeutch] \n", "3 [WhitneyHouston, AngelaBassett, LorettaDevine] [ForestWhitaker] \n", "4 [SteveMartin, DianeKeaton, MartinShort] [CharlesShyer] \n", "\n", " keywords \n", "0 [jealousy, toy, boy, friendship, friends, riva... \n", "1 [boardgame, disappearance, basedonchildren'sbo... \n", "2 [fishing, bestfriend, duringcreditsstinger, ol... \n", "3 [basedonnovel, interracialrelationship, single... \n", "4 [baby, midlifecrisis, confidence, aging, daugh... " ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ], "id": "7_8Ko6uponm3" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 293 }, "id": "weNa-_6kodC0", "outputId": "14e71682-79b1-4683-c696-3bd68c5951d3" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitleoverviewgenrescastcrewkeywordsdescription
0862Toy Story[Led, by, Woody,, Andy's, toys, live, happily,...[Animation, Comedy, Family][TomHanks, TimAllen, DonRickles][JohnLasseter][jealousy, toy, boy, friendship, friends, riva...[Animation, Comedy, Family, jealousy, toy, boy...
18844Jumanji[When, siblings, Judy, and, Peter, discover, a...[Adventure, Fantasy, Family][RobinWilliams, JonathanHyde, KirstenDunst][JoeJohnston][boardgame, disappearance, basedonchildren'sbo...[Adventure, Fantasy, Family, boardgame, disapp...
215602Grumpier Old Men[A, family, wedding, reignites, the, ancient, ...[Romance, Comedy][WalterMatthau, JackLemmon, Ann-Margret][HowardDeutch][fishing, bestfriend, duringcreditsstinger, ol...[Romance, Comedy, fishing, bestfriend, duringc...
331357Waiting to Exhale[Cheated, on,, mistreated, and, stepped, on,, ...[Comedy, Drama, Romance][WhitneyHouston, AngelaBassett, LorettaDevine][ForestWhitaker][basedonnovel, interracialrelationship, single...[Comedy, Drama, Romance, basedonnovel, interra...
411862Father of the Bride Part II[Just, when, George, Banks, has, recovered, fr...[Comedy][SteveMartin, DianeKeaton, MartinShort][CharlesShyer][baby, midlifecrisis, confidence, aging, daugh...[Comedy, baby, midlifecrisis, confidence, agin...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " overview \\\n", "0 [Led, by, Woody,, Andy's, toys, live, happily,... \n", "1 [When, siblings, Judy, and, Peter, discover, a... \n", "2 [A, family, wedding, reignites, the, ancient, ... \n", "3 [Cheated, on,, mistreated, and, stepped, on,, ... \n", "4 [Just, when, George, Banks, has, recovered, fr... \n", "\n", " genres \\\n", "0 [Animation, Comedy, Family] \n", "1 [Adventure, Fantasy, Family] \n", "2 [Romance, Comedy] \n", "3 [Comedy, Drama, Romance] \n", "4 [Comedy] \n", "\n", " cast crew \\\n", "0 [TomHanks, TimAllen, DonRickles] [JohnLasseter] \n", "1 [RobinWilliams, JonathanHyde, KirstenDunst] [JoeJohnston] \n", "2 [WalterMatthau, JackLemmon, Ann-Margret] [HowardDeutch] \n", "3 [WhitneyHouston, AngelaBassett, LorettaDevine] [ForestWhitaker] \n", "4 [SteveMartin, DianeKeaton, MartinShort] [CharlesShyer] \n", "\n", " keywords \\\n", "0 [jealousy, toy, boy, friendship, friends, riva... \n", "1 [boardgame, disappearance, basedonchildren'sbo... \n", "2 [fishing, bestfriend, duringcreditsstinger, ol... \n", "3 [basedonnovel, interracialrelationship, single... \n", "4 [baby, midlifecrisis, confidence, aging, daugh... \n", "\n", " description \n", "0 [Animation, Comedy, Family, jealousy, toy, boy... \n", "1 [Adventure, Fantasy, Family, boardgame, disapp... \n", "2 [Romance, Comedy, fishing, bestfriend, duringc... \n", "3 [Comedy, Drama, Romance, basedonnovel, interra... \n", "4 [Comedy, baby, midlifecrisis, confidence, agin... 
" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Combine genres and keywords into a single list-valued feature.\n", "# Adding two list-valued columns concatenates the lists row by row.\n", "#df['description'] = df.apply(lambda row: row['crew'] + row['genres'] + row['keywords'], axis=1)\n", "df['description'] = df['genres'] + df['keywords']\n", "\n", "df.head()" ], "id": "weNa-_6kodC0" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "pG8a_6YoajG4" }, "outputs": [], "source": [ "# Wrap the crew name(s) in one 'DirectorIs...' token, stored as a one-element list\n", "# so it can be concatenated with the other tag lists.\n", "df['director'] = df['crew'].apply(lambda names: [f\"DirectorIs{', '.join(names)}\"])" ], "id": "pG8a_6YoajG4" }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "_gGg6DSbapZB" }, "outputs": [], "source": [ "# Rebuilt from the source columns (not from the current 'description') so that re-running\n", "# this cell cannot append the director token a second time.\n", "df['description'] = df['genres'] + df['keywords'] + df['director']" ], "id": "_gGg6DSbapZB" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "47GLt8L1odBU", "outputId": "423143bb-7865-4616-b9ff-23373a216ea2" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtitledescription
0862Toy Story[Animation, Comedy, Family, jealousy, toy, boy...
18844Jumanji[Adventure, Fantasy, Family, boardgame, disapp...
215602Grumpier Old Men[Romance, Comedy, fishing, bestfriend, duringc...
331357Waiting to Exhale[Comedy, Drama, Romance, basedonnovel, interra...
411862Father of the Bride Part II[Comedy, baby, midlifecrisis, confidence, agin...
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "
\n", "
\n" ], "text/plain": [ " id title \\\n", "0 862 Toy Story \n", "1 8844 Jumanji \n", "2 15602 Grumpier Old Men \n", "3 31357 Waiting to Exhale \n", "4 11862 Father of the Bride Part II \n", "\n", " description \n", "0 [Animation, Comedy, Family, jealousy, toy, boy... \n", "1 [Adventure, Fantasy, Family, boardgame, disapp... \n", "2 [Romance, Comedy, fishing, bestfriend, duringc... \n", "3 [Comedy, Drama, Romance, basedonnovel, interra... \n", "4 [Comedy, baby, midlifecrisis, confidence, agin... " ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# .copy() gives final_df its own data, so adding columns to it later neither triggers\n", "# SettingWithCopyWarning nor depends on whether pandas handed back a view of df.\n", "final_df = df[['id', 'title', 'description']].copy()\n", "final_df.head()" ], "id": "47GLt8L1odBU" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "aIX8kCFc9K-x", "outputId": "e351fbda-edb3-45e0-f778-6870c6ae64ac" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ ":1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " final_df['embeddings']=np.zeros(len(final_df))\n" ] } ], "source": [ "final_df['embeddings']=np.zeros(len(final_df))" ], "id": "aIX8kCFc9K-x" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ESyvovG5MWlz", "outputId": "4d4b36c6-136c-4051-bb80-257de6dfe16c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.9/7.9 MB\u001b[0m \u001b[31m17.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m493.7/493.7 kB\u001b[0m \u001b[31m24.2 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.1/311.1 kB\u001b[0m \u001b[31m25.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m36.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m41.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m18.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h" ] } ], "source": [ "# Explicit %pip magic: installs into the environment of the running kernel and does not\n", "# depend on IPython automagic being enabled.\n", "%pip install -q transformers tqdm datasets huggingface_hub" ], "id": "ESyvovG5MWlz" }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 145, "referenced_widgets": [ "f6bfb10062f74681862b97fa846feebe", "9b16c5274c6542ababbd641f7a95d168", "715f4a87067646dcbc3d27c367d98d4a", "40399b454c954c4b971e011ce792f391", "63ef744c0e0848679f15240eecaa204c", "f3e550763c764e5baa6640853df6e5a9", "d2913d93f7ca40fab307e810d7c0f776", "1f044cf0e10d4191beadf659f42217cc", "d74217c01c4b4f39b55c01adba92953c", "0012dad044444fd2bce50c44e97851ae", "ca83cf2d89ea43a7b146734a16d47262", "c54f339dadab483f9f845ab6d25927d1", "a870e3a77aac4355a63ea9de9fc0605a", "2a3d1538c5f84060a3a89524093f0d5a" ] }, "id": "Fi_wDi1fMI4J", "outputId": "fc297040-f984-4ec1-f42b-a9ce71ea489d" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": 
"f6bfb10062f74681862b97fa846feebe", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
:30: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " final_df['embeddings'].iloc[i:i+batch_size] = final_df['description'].iloc[i:i+batch_size].progress_apply(get_embedding_batch)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "application/javascript": [ "\n", " async function download(id, filename, size) {\n", " if (!google.colab.kernel.accessAllowed) {\n", " return;\n", " }\n", " const div = document.createElement('div');\n", " const label = document.createElement('label');\n", " label.textContent = `Downloading \"${filename}\": `;\n", " div.appendChild(label);\n", " const progress = document.createElement('progress');\n", " progress.max = size;\n", " div.appendChild(progress);\n", " document.body.appendChild(div);\n", "\n", " const buffers = [];\n", " let downloaded = 0;\n", "\n", " const channel = await google.colab.kernel.comms.open(id);\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", "\n", " for await (const message of channel.messages) {\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", " if (message.buffers) {\n", " for (const buffer of message.buffers) {\n", " buffers.push(buffer);\n", " downloaded += buffer.byteLength;\n", " progress.value = downloaded;\n", " }\n", " }\n", " }\n", " const blob = new Blob(buffers, {type: 'application/binary'});\n", " const a = document.createElement('a');\n", " a.href = window.URL.createObjectURL(blob);\n", " a.download = filename;\n", " div.appendChild(a);\n", " a.click();\n", " div.remove();\n", " }\n", " " ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "application/javascript": [ "download(\"download_6ec2ab96-07d8-4d9a-9032-d2dd6a93716b\", \"hf_model_batch_16.csv\", 
43084762)" ] }, "metadata": {} }, { "output_type": "stream", "name": "stderr", "text": [ "Creating Embeddings: 100%|██████████| 2500/2500 [10:51<00:00, 3.84it/s]\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "application/javascript": [ "\n", " async function download(id, filename, size) {\n", " if (!google.colab.kernel.accessAllowed) {\n", " return;\n", " }\n", " const div = document.createElement('div');\n", " const label = document.createElement('label');\n", " label.textContent = `Downloading \"${filename}\": `;\n", " div.appendChild(label);\n", " const progress = document.createElement('progress');\n", " progress.max = size;\n", " div.appendChild(progress);\n", " document.body.appendChild(div);\n", "\n", " const buffers = [];\n", " let downloaded = 0;\n", "\n", " const channel = await google.colab.kernel.comms.open(id);\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", "\n", " for await (const message of channel.messages) {\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", " if (message.buffers) {\n", " for (const buffer of message.buffers) {\n", " buffers.push(buffer);\n", " downloaded += buffer.byteLength;\n", " progress.value = downloaded;\n", " }\n", " }\n", " }\n", " const blob = new Blob(buffers, {type: 'application/binary'});\n", " const a = document.createElement('a');\n", " a.href = window.URL.createObjectURL(blob);\n", " a.download = filename;\n", " div.appendChild(a);\n", " a.click();\n", " div.remove();\n", " }\n", " " ] }, "metadata": {} }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "application/javascript": [ "download(\"download_b6127942-048e-4afe-86f5-710c936a0936\", \"hf_model_batch_17.csv\", 45776462)" ] }, "metadata": {} }, { "output_type": "stream", "name": "stderr", "text": [ "Creating Embeddings: 100%|██████████| 629/629 [02:31<00:00, 4.16it/s]\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" 
import torch
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from tqdm import tqdm
from google.colab import files

# Register `Series.progress_apply` on pandas objects. Without this call the
# loop below only works if some other cell already ran `tqdm.pandas(...)`.
tqdm.pandas(desc="Creating Embeddings")

# `final_df` (columns: id, title, description, embeddings) must already exist.

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')


def get_embedding_batch(descriptions):
    """Embed `descriptions` with the BERT model loaded above.

    The input is tokenized with padding and truncation, passed through the
    model with gradient tracking disabled, and the model's
    `last_hidden_state` is averaged over dimension 1.

    Args:
        descriptions: A value accepted by the tokenizer (a string or a list
            of strings).

    Returns:
        numpy.ndarray: The averaged hidden states with size-1 dimensions
        squeezed out.

    NOTE(review): dimension 1 is presumably the token axis, in which case
    padding positions are included in the mean -- confirm.
    NOTE(review): each `description` value appears to be a list of keyword
    strings; if so, the tokenizer treats every keyword as a separate sequence
    and the result has one row per keyword rather than one vector per movie
    -- confirm this is intended.
    """
    tokens = tokenizer(descriptions, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**tokens)
    return output.last_hidden_state.mean(dim=1).squeeze().detach().numpy()


# Number of rows embedded (and checkpointed) per loop iteration.
batch_size = 2500

# Row range to process.
# NOTE(review): 40000 is hard-coded and looks like a resume point from an
# earlier partial run -- set it to 0 to embed every row.
start_row = 40000
end_row = len(final_df)

# The column is created as floats elsewhere; cast it to `object` once so that
# numpy arrays can be stored in its cells without an implicit dtype upcast.
final_df['embeddings'] = final_df['embeddings'].astype(object)
embeddings_col = final_df.columns.get_loc('embeddings')

# Process the DataFrame in batches
for i in range(start_row, end_row, batch_size):
    # Embed this batch of descriptions (one call per row, with a progress bar).
    batch_embeddings = (
        final_df['description'].iloc[i:i+batch_size].progress_apply(get_embedding_batch)
    )

    # Write the results with a single positional `.iloc` call on the frame.
    # The previous chained form (`final_df['embeddings'].iloc[...] = ...`)
    # assigns into a temporary Series: it raises SettingWithCopyWarning and
    # does not update `final_df` at all under pandas Copy-on-Write.
    final_df.iloc[i:i+batch_size, embeddings_col] = batch_embeddings.to_numpy()

    # Checkpoint to CSV after every batch.
    # NOTE(review): this writes the whole of `final_df`, not only the current
    # batch, under a per-batch file name -- confirm that is intended.
    csv_file_name = f'hf_model_batch_{i // batch_size}.csv'
    final_df.to_csv(csv_file_name, index=False)

    # Download the file
    files.download(csv_file_name)
], "metadata": { "id": "tHfPl5bNS71a" }, "id": "tHfPl5bNS71a" } ], "metadata": { "colab": { "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.3" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "0012dad044444fd2bce50c44e97851ae": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "1f044cf0e10d4191beadf659f42217cc": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "2a3d1538c5f84060a3a89524093f0d5a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "40399b454c954c4b971e011ce792f391": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LabelModel", "state": { "_dom_classes": [], "_model_module": 
"@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_ca83cf2d89ea43a7b146734a16d47262", "placeholder": "​", "style": "IPY_MODEL_c54f339dadab483f9f845ab6d25927d1", "value": "Your token has been saved to /root/.cache/huggingface/token" } }, "63ef744c0e0848679f15240eecaa204c": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LabelModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_a870e3a77aac4355a63ea9de9fc0605a", "placeholder": "​", "style": "IPY_MODEL_2a3d1538c5f84060a3a89524093f0d5a", "value": "Login successful" } }, "715f4a87067646dcbc3d27c367d98d4a": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LabelModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d74217c01c4b4f39b55c01adba92953c", "placeholder": "​", "style": "IPY_MODEL_0012dad044444fd2bce50c44e97851ae", "value": "Your token has been saved in your configured git credential helpers (store)." 
} }, "9b16c5274c6542ababbd641f7a95d168": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "LabelModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_d2913d93f7ca40fab307e810d7c0f776", "placeholder": "​", "style": "IPY_MODEL_1f044cf0e10d4191beadf659f42217cc", "value": "Token is valid (permission: write)." } }, "a870e3a77aac4355a63ea9de9fc0605a": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c54f339dadab483f9f845ab6d25927d1": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "DescriptionStyleModel", "state": { "_model_module": "@jupyter-widgets/controls", 
"_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "ca83cf2d89ea43a7b146734a16d47262": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d2913d93f7ca40fab307e810d7c0f776": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, 
"grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "d74217c01c4b4f39b55c01adba92953c": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "f3e550763c764e5baa6640853df6e5a9": { "model_module": "@jupyter-widgets/base", "model_module_version": "1.2.0", "model_name": "LayoutModel", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", 
"_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "f6bfb10062f74681862b97fa846feebe": { "model_module": "@jupyter-widgets/controls", "model_module_version": "1.5.0", "model_name": "VBoxModel", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [ "IPY_MODEL_9b16c5274c6542ababbd641f7a95d168", "IPY_MODEL_715f4a87067646dcbc3d27c367d98d4a", "IPY_MODEL_40399b454c954c4b971e011ce792f391", "IPY_MODEL_63ef744c0e0848679f15240eecaa204c" ], "layout": "IPY_MODEL_f3e550763c764e5baa6640853df6e5a9" } } } } }, "nbformat": 4, "nbformat_minor": 5 }