{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b468f6eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "from requests import get\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "from time import sleep\n",
    "from random import randint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1ad7963f",
   "metadata": {},
   "outputs": [],
   "source": [
    "titles = []\n",
    "years = []\n",
    "time = []\n",
    "imdb_ratings = []\n",
    "metascores = []\n",
    "votes = []\n",
    "us_gross = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5c2d321e",
   "metadata": {},
   "outputs": [],
   "source": [
    "headers = {'Accept-Language': 'en-US, en;q=0.5'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "f99c0c2e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1, 51, 101, 151, 201])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pages = np.arange(1, 251, 50)\n",
    "pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "4ff01a9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _parse_movie(container):\n",
    "    \"\"\"Return the fields scraped from one search-result container.\n",
    "\n",
    "    `container` is one `div.lister-item.mode-advanced` element. The result is\n",
    "    the tuple `(title, year, runtime, rating, metascore)`. Runtime and\n",
    "    metascore fall back to '-' when the container has no such tag; the\n",
    "    rating is converted to float, the other fields stay as page text.\n",
    "    \"\"\"\n",
    "    header = container.h3\n",
    "    # Look each optional tag up once and reuse it, so the existence check and\n",
    "    # the extracted value always refer to the same element.\n",
    "    runtime_tag = container.find('span', class_='runtime')\n",
    "    metascore_tag = container.find('span', class_='metascore')\n",
    "    return (\n",
    "        header.a.text,\n",
    "        header.find('span', class_='lister-item-year').text,\n",
    "        runtime_tag.text if runtime_tag else '-',\n",
    "        float(container.strong.text),\n",
    "        metascore_tag.text if metascore_tag else '-',\n",
    "    )\n",
    "\n",
    "\n",
    "# Start from empty lists on every run, so that re-executing this cell replaces\n",
    "# the previous scrape instead of appending a second copy of every movie.\n",
    "titles, years, time = [], [], []\n",
    "imdb_ratings, metascores = [], []\n",
    "\n",
    "# Each results page lists 50 movies; `start` is the rank of its first movie.\n",
    "for page_number, start in enumerate(pages):\n",
    "    # Control the loop's rate: wait 2-5 seconds between consecutive requests.\n",
    "    if page_number:\n",
    "        sleep(randint(2, 5))\n",
    "\n",
    "    # Get the contents of the results page.\n",
    "    response = requests.get(\n",
    "        'https://www.imdb.com/search/title/?groups=top_1000&start=' + str(start) + '&ref_=adv_nxt',\n",
    "        headers=headers,\n",
    "        timeout=30,\n",
    "    )\n",
    "    # Stop on an HTTP error status rather than parsing the error response.\n",
    "    response.raise_for_status()\n",
    "    soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "    # The part of the HTML we want the information from: one <div> per movie.\n",
    "    movie_div = soup.find_all('div', class_='lister-item mode-advanced')\n",
    "\n",
    "    for container in movie_div:\n",
    "        # Parse every field before appending anything, so a failure on one\n",
    "        # movie cannot leave the lists with different lengths.\n",
    "        name, year, runtime, imdb, m_score = _parse_movie(container)\n",
    "        titles.append(name)\n",
    "        years.append(year)\n",
    "        time.append(runtime)\n",
    "        imdb_ratings.append(imdb)\n",
    "        metascores.append(m_score)\n",
    "\n",
    "        # Votes and gross earnings are not collected yet; this cell leaves\n",
    "        # `votes` and `us_gross` untouched.\n",
    "        # nv = container.find_all('span', attrs={'name':'nv'})\n",
    "        # vote = nv[0].text\n",
    "        # votes.append(vote)\n",
    "        # grosses = nv[1].text if len(nv) > 1 else '-'\n",
    "        # us_gross.append(grosses)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "7e788570",
   "metadata": {},
   "outputs": [ { "data": { "text/html": [ "
\n", " | movie | \n", "year | \n", "time_minute | \n", "imdb_rating | \n", "metascore | \n", "
---|---|---|---|---|---|
0 | \n", "Avatar: The Way of Water | \n", "(2022) | \n", "192 min | \n", "7.8 | \n", "67 | \n", "
1 | \n", "Puss in Boots: The Last Wish | \n", "(2022) | \n", "102 min | \n", "7.9 | \n", "75 | \n", "
2 | \n", "The Banshees of Inisherin | \n", "(2022) | \n", "114 min | \n", "7.8 | \n", "87 | \n", "
3 | \n", "Everything Everywhere All at Once | \n", "(2022) | \n", "139 min | \n", "8.0 | \n", "81 | \n", "
4 | \n", "The Fabelmans | \n", "(2022) | \n", "151 min | \n", "7.7 | \n", "84 | \n", "
\n", " | movie | \n", "year | \n", "time_minute | \n", "imdb_rating | \n", "metascore | \n", "
---|---|---|---|---|---|
0 | \n", "Avatar: The Way of Water | \n", "(2022) | \n", "192 min | \n", "7.8 | \n", "67 | \n", "
1 | \n", "Puss in Boots: The Last Wish | \n", "(2022) | \n", "102 min | \n", "7.9 | \n", "75 | \n", "
2 | \n", "The Banshees of Inisherin | \n", "(2022) | \n", "114 min | \n", "7.8 | \n", "87 | \n", "
3 | \n", "Everything Everywhere All at Once | \n", "(2022) | \n", "139 min | \n", "8.0 | \n", "81 | \n", "
4 | \n", "The Fabelmans | \n", "(2022) | \n", "151 min | \n", "7.7 | \n", "84 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1795 | \n", "Pan's Labyrinth | \n", "(2006) | \n", "118 min | \n", "8.2 | \n", "98 | \n", "
1796 | \n", "Guardians of the Galaxy Vol. 2 | \n", "(2017) | \n", "136 min | \n", "7.6 | \n", "67 | \n", "
1797 | \n", "The Wizard of Oz | \n", "(1939) | \n", "102 min | \n", "8.1 | \n", "92 | \n", "
1798 | \n", "Psycho | \n", "(1960) | \n", "109 min | \n", "8.5 | \n", "97 | \n", "
1799 | \n", "Before Sunrise | \n", "(1995) | \n", "101 min | \n", "8.1 | \n", "77 | \n", "
1800 rows × 5 columns
\n", "