{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b468f6eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "from requests import get\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "from time import sleep\n",
    "from random import randint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1ad7963f",
   "metadata": {},
   "outputs": [],
   "source": [
    "titles = []\n",
    "years = []\n",
    "time = []\n",
    "imdb_ratings = []\n",
    "metascores = []\n",
    "votes = []\n",
    "us_gross = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5c2d321e",
   "metadata": {},
   "outputs": [],
   "source": [
    "headers = {'Accept-Language': 'en-US, en;q=0.5'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "f99c0c2e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([  1,  51, 101, 151, 201])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pages = np.arange(1, 251, 50)\n",
    "pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "4ff01a9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Storing each of the urls of 50 movies \n",
    "for page in pages:\n",
    "    # Getting the contents from the each url\n",
    "    page = requests.get('https://www.imdb.com/search/title/?groups=top_1000&start=' + str(page) + '&ref_=adv_nxt', headers=headers)\n",
    "    soup = BeautifulSoup(page.text, 'html.parser')\n",
    "    \n",
    "    # Aiming the part of the html we want to get the information from\n",
    "    movie_div = soup.find_all('div', class_='lister-item mode-advanced')\n",
    "    \n",
    "    # Controling the loop’s rate by pausing the execution of the loop for a specified amount of time\n",
    "    # Waiting time between requests for a number between 2-10 seconds\n",
    "#     sleep(randint(2,5))\n",
    "    \n",
    "    for container in movie_div:\n",
    "        # Scraping the movie's name\n",
    "        name = container.h3.a.text\n",
    "        titles.append(name)\n",
    "        \n",
    "        # Scraping the movie's year\n",
    "        year = container.h3.find('span', class_='lister-item-year').text\n",
    "        years.append(year)\n",
    "        \n",
    "        # Scraping the movie's length\n",
    "        runtime = container.find('span', class_='runtime').text if container.p.find('span', class_='runtime') else '-'\n",
    "        time.append(runtime)\n",
    "        \n",
    "        # Scraping the rating\n",
    "        imdb = float(container.strong.text)\n",
    "        imdb_ratings.append(imdb)\n",
    "        \n",
    "        # Scraping the metascore\n",
    "        m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'\n",
    "        metascores.append(m_score)\n",
    "        \n",
    "        # Scraping votes and gross earnings\n",
    "#         nv = container.find_all('span', attrs={'name':'nv'})\n",
    "#         vote = nv[0].text\n",
    "#         votes.append(vote)\n",
    "#         grosses = nv[1].text if len(nv) > 1 else '-'\n",
    "#         us_gross.append(grosses)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "7e788570",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie</th>\n",
       "      <th>year</th>\n",
       "      <th>time_minute</th>\n",
       "      <th>imdb_rating</th>\n",
       "      <th>metascore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Avatar: The Way of Water</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>192 min</td>\n",
       "      <td>7.8</td>\n",
       "      <td>67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Puss in Boots: The Last Wish</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>102 min</td>\n",
       "      <td>7.9</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The Banshees of Inisherin</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>114 min</td>\n",
       "      <td>7.8</td>\n",
       "      <td>87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Everything Everywhere All at Once</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>139 min</td>\n",
       "      <td>8.0</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The Fabelmans</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>151 min</td>\n",
       "      <td>7.7</td>\n",
       "      <td>84</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                               movie    year time_minute  imdb_rating  \\\n",
       "0           Avatar: The Way of Water  (2022)     192 min          7.8   \n",
       "1       Puss in Boots: The Last Wish  (2022)     102 min          7.9   \n",
       "2          The Banshees of Inisherin  (2022)     114 min          7.8   \n",
       "3  Everything Everywhere All at Once  (2022)     139 min          8.0   \n",
       "4                      The Fabelmans  (2022)     151 min          7.7   \n",
       "\n",
       "    metascore  \n",
       "0  67          \n",
       "1  75          \n",
       "2  87          \n",
       "3  81          \n",
       "4  84          "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies = pd.DataFrame({'movie':titles,\n",
    "                       'year':years,\n",
    "                       'time_minute':time,\n",
    "                       'imdb_rating':imdb_ratings,\n",
    "                       'metascore':metascores})\n",
    "\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3d3b75c",
   "metadata": {},
   "source": [
    "movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "8db3f51c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>movie</th>\n",
       "      <th>year</th>\n",
       "      <th>time_minute</th>\n",
       "      <th>imdb_rating</th>\n",
       "      <th>metascore</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Avatar: The Way of Water</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>192 min</td>\n",
       "      <td>7.8</td>\n",
       "      <td>67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Puss in Boots: The Last Wish</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>102 min</td>\n",
       "      <td>7.9</td>\n",
       "      <td>75</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>The Banshees of Inisherin</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>114 min</td>\n",
       "      <td>7.8</td>\n",
       "      <td>87</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Everything Everywhere All at Once</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>139 min</td>\n",
       "      <td>8.0</td>\n",
       "      <td>81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>The Fabelmans</td>\n",
       "      <td>(2022)</td>\n",
       "      <td>151 min</td>\n",
       "      <td>7.7</td>\n",
       "      <td>84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1795</th>\n",
       "      <td>Pan's Labyrinth</td>\n",
       "      <td>(2006)</td>\n",
       "      <td>118 min</td>\n",
       "      <td>8.2</td>\n",
       "      <td>98</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1796</th>\n",
       "      <td>Guardians of the Galaxy Vol. 2</td>\n",
       "      <td>(2017)</td>\n",
       "      <td>136 min</td>\n",
       "      <td>7.6</td>\n",
       "      <td>67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1797</th>\n",
       "      <td>The Wizard of Oz</td>\n",
       "      <td>(1939)</td>\n",
       "      <td>102 min</td>\n",
       "      <td>8.1</td>\n",
       "      <td>92</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1798</th>\n",
       "      <td>Psycho</td>\n",
       "      <td>(1960)</td>\n",
       "      <td>109 min</td>\n",
       "      <td>8.5</td>\n",
       "      <td>97</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1799</th>\n",
       "      <td>Before Sunrise</td>\n",
       "      <td>(1995)</td>\n",
       "      <td>101 min</td>\n",
       "      <td>8.1</td>\n",
       "      <td>77</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1800 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                  movie    year time_minute  imdb_rating  \\\n",
       "0              Avatar: The Way of Water  (2022)     192 min          7.8   \n",
       "1          Puss in Boots: The Last Wish  (2022)     102 min          7.9   \n",
       "2             The Banshees of Inisherin  (2022)     114 min          7.8   \n",
       "3     Everything Everywhere All at Once  (2022)     139 min          8.0   \n",
       "4                         The Fabelmans  (2022)     151 min          7.7   \n",
       "...                                 ...     ...         ...          ...   \n",
       "1795                    Pan's Labyrinth  (2006)     118 min          8.2   \n",
       "1796     Guardians of the Galaxy Vol. 2  (2017)     136 min          7.6   \n",
       "1797                   The Wizard of Oz  (1939)     102 min          8.1   \n",
       "1798                             Psycho  (1960)     109 min          8.5   \n",
       "1799                     Before Sunrise  (1995)     101 min          8.1   \n",
       "\n",
       "       metascore  \n",
       "0     67          \n",
       "1     75          \n",
       "2     87          \n",
       "3     81          \n",
       "4     84          \n",
       "...          ...  \n",
       "1795  98          \n",
       "1796  67          \n",
       "1797  92          \n",
       "1798  97          \n",
       "1799  77          \n",
       "\n",
       "[1800 rows x 5 columns]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "766721af",
   "metadata": {},
   "outputs": [],
   "source": [
    "movies.to_csv('movies.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "eb119479",
   "metadata": {},
   "outputs": [],
   "source": [
    "# movies['metascore'] = movies['metascore'].str.extract('(\\d+)')\n",
    "# convert it to float and if there are dashes turn it into NaN\n",
    "movies['metascore'] = pd.to_numeric(movies['metascore'], errors='coerce')\n",
    "movies2=movies.dropna(inplace=True)\n",
    "movies2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "0ad0bafa",
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "'NoneType' object is not subscriptable",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[1;32mIn [29], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m movies2[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124myear\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m movies2[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124myear\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mextract(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124md+)\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mint\u001b[39m)\n\u001b[0;32m      2\u001b[0m movies2[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtime_minute\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m movies2[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtime_minute\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mstr\u001b[38;5;241m.\u001b[39mextract(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m(\u001b[39m\u001b[38;5;124m\\\u001b[39m\u001b[38;5;124md+)\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mint\u001b[39m)\n",
      "\u001b[1;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable"
     ]
    }
   ],
   "source": [
    "movies2['year'] = movies2['year'].str.extract('(\\d+)').astype(int)\n",
    "movies2['time_minute'] = movies2['time_minute'].str.extract('(\\d+)').astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "ea6db0c2",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'NoneType' object has no attribute 'dtypes'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[1;32mIn [30], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m movies2\u001b[38;5;241m.\u001b[39mdtypes\n",
      "\u001b[1;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'dtypes'"
     ]
    }
   ],
   "source": [
    "movies2.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "d8585ecb",
   "metadata": {},
   "outputs": [],
   "source": [
    "movies.to_csv('movies2.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a62df0a",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}