{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b468f6eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "from requests import get\n",
    "from bs4 import BeautifulSoup\n",
    "\n",
    "from time import sleep\n",
    "from random import randint"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "1ad7963f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One list per scraped column. The scraping loop appends to them in step,\n",
    "# so position i in every list describes the same movie.\n",
    "# votes and us_gross are not filled yet (their scraping code is commented out).\n",
    "titles = []\n",
    "years = []\n",
    "time = []\n",
    "imdb_ratings = []\n",
    "metascores = []\n",
    "votes = []\n",
    "us_gross = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "5c2d321e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sent with every request: prefer en-US content, then any English variant\n",
    "headers = {'Accept-Language': 'en-US, en;q=0.5'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "f99c0c2e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 1, 51, 101, 151, 201])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Values for the `start` query parameter, one per results page (step of 50)\n",
    "pages = np.arange(1, 251, 50)\n",
    "pages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4ff01a9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start from empty lists so that re-running this cell replaces the previous\n",
    "# results instead of appending the same movies a second time.\n",
    "for column in (titles, years, time, imdb_ratings, metascores):\n",
    "    column.clear()\n",
    "\n",
    "for start in pages:\n",
    "    # Fetch one results page; `start` is the position of its first movie\n",
    "    response = requests.get('https://www.imdb.com/search/title/?groups=top_1000&start=' + str(start) + '&ref_=adv_nxt', headers=headers)\n",
    "    # Stop on an HTTP error instead of silently parsing an error page\n",
    "    response.raise_for_status()\n",
    "    soup = BeautifulSoup(response.text, 'html.parser')\n",
    "\n",
    "    # Every movie is a div with class 'lister-item mode-advanced'\n",
    "    movie_div = soup.find_all('div', class_='lister-item mode-advanced')\n",
    "\n",
    "    # Optional pause of 2-5 seconds between requests to limit the request rate;\n",
    "    # uncomment to enable.\n",
    "    # sleep(randint(2, 5))\n",
    "\n",
    "    for container in movie_div:\n",
    "        # Movie title\n",
    "        titles.append(container.h3.a.text)\n",
    "\n",
    "        # Release year as shown on the page, e.g. '(2022)'\n",
    "        years.append(container.h3.find('span', class_='lister-item-year').text)\n",
    "\n",
    "        # Runtime, e.g. '192 min'; '-' when the listing has none.\n",
    "        # Look the tag up once and use the same lookup for the test and the value.\n",
    "        runtime_tag = container.find('span', class_='runtime')\n",
    "        time.append(runtime_tag.text if runtime_tag else '-')\n",
    "\n",
    "        # IMDb rating\n",
    "        imdb_ratings.append(float(container.strong.text))\n",
    "\n",
    "        # Metascore; '-' when the movie has none\n",
    "        metascore_tag = container.find('span', class_='metascore')\n",
    "        metascores.append(metascore_tag.text if metascore_tag else '-')\n",
    "\n",
    "        # TODO: votes and US gross are not scraped yet; votes/us_gross stay empty\n",
    "        # nv = container.find_all('span', attrs={'name':'nv'})\n",
    "        # vote = nv[0].text\n",
    "        # votes.append(vote)\n",
    "        # grosses = nv[1].text if len(nv) > 1 else '-'\n",
    "        # us_gross.append(grosses)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "7e788570",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieyeartime_minuteimdb_ratingmetascore
0Avatar: The Way of Water(2022)192 min7.867
1Puss in Boots: The Last Wish(2022)102 min7.975
2The Banshees of Inisherin(2022)114 min7.887
3Everything Everywhere All at Once(2022)139 min8.081
4The Fabelmans(2022)151 min7.784
\n", "
" ], "text/plain": [ " movie year time_minute imdb_rating \\\n", "0 Avatar: The Way of Water (2022) 192 min 7.8 \n", "1 Puss in Boots: The Last Wish (2022) 102 min 7.9 \n", "2 The Banshees of Inisherin (2022) 114 min 7.8 \n", "3 Everything Everywhere All at Once (2022) 139 min 8.0 \n", "4 The Fabelmans (2022) 151 min 7.7 \n", "\n", " metascore \n", "0 67 \n", "1 75 \n", "2 87 \n", "3 81 \n", "4 84 " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "# Assemble the scraped lists into one table, one row per movie.\n",
    "# year and time_minute are still raw strings here (e.g. '(2022)', '192 min').\n",
    "movies = pd.DataFrame({'movie':titles,\n",
    "                       'year':years,\n",
    "                       'time_minute':time,\n",
    "                       'imdb_rating':imdb_ratings,\n",
    "                       'metascore':metascores})\n",
    "\n",
    "movies.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3d3b75c",
   "metadata": {},
   "source": [
    "## Full `movies` table\n",
    "\n",
    "The next cell displays the whole `movies` frame; pandas shows only the first and last rows plus the total row count."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "8db3f51c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieyeartime_minuteimdb_ratingmetascore
0Avatar: The Way of Water(2022)192 min7.867
1Puss in Boots: The Last Wish(2022)102 min7.975
2The Banshees of Inisherin(2022)114 min7.887
3Everything Everywhere All at Once(2022)139 min8.081
4The Fabelmans(2022)151 min7.784
..................
1795Pan's Labyrinth(2006)118 min8.298
1796Guardians of the Galaxy Vol. 2(2017)136 min7.667
1797The Wizard of Oz(1939)102 min8.192
1798Psycho(1960)109 min8.597
1799Before Sunrise(1995)101 min8.177
\n", "

1800 rows × 5 columns

\n", "
" ], "text/plain": [ " movie year time_minute imdb_rating \\\n", "0 Avatar: The Way of Water (2022) 192 min 7.8 \n", "1 Puss in Boots: The Last Wish (2022) 102 min 7.9 \n", "2 The Banshees of Inisherin (2022) 114 min 7.8 \n", "3 Everything Everywhere All at Once (2022) 139 min 8.0 \n", "4 The Fabelmans (2022) 151 min 7.7 \n", "... ... ... ... ... \n", "1795 Pan's Labyrinth (2006) 118 min 8.2 \n", "1796 Guardians of the Galaxy Vol. 2 (2017) 136 min 7.6 \n", "1797 The Wizard of Oz (1939) 102 min 8.1 \n", "1798 Psycho (1960) 109 min 8.5 \n", "1799 Before Sunrise (1995) 101 min 8.1 \n", "\n", " metascore \n", "0 67 \n", "1 75 \n", "2 87 \n", "3 81 \n", "4 84 \n", "... ... \n", "1795 98 \n", "1796 67 \n", "1797 92 \n", "1798 97 \n", "1799 77 \n", "\n", "[1800 rows x 5 columns]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "766721af",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the raw scrape before any cleaning\n",
    "movies.to_csv('movies.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eb119479",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert metascore to a number; '-' (no score) becomes NaN and that row is dropped.\n",
    "# dropna() is called without inplace=True on purpose: with inplace=True it\n",
    "# returns None, which would leave movies2 bound to None for the cells below.\n",
    "# assign() works on a copy, so `movies` keeps the raw scrape and this cell can be re-run.\n",
    "movies2 = movies.assign(\n",
    "    metascore=pd.to_numeric(movies['metascore'], errors='coerce')\n",
    ").dropna()\n",
    "movies2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ad0bafa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pull the digits out of '(2022)' and '192 min'.\n",
    "# - astype(str) lets this cell be re-run after the columns are already integers\n",
    "# - expand=False makes str.extract return a Series, not a one-column DataFrame\n",
    "# - values without digits (the '-' placeholder) become NaN and are dropped,\n",
    "#   because NaN cannot be cast to int\n",
    "movies2 = (\n",
    "    movies2.assign(\n",
    "        year=movies2['year'].astype(str).str.extract(r'(\\d+)', expand=False),\n",
    "        time_minute=movies2['time_minute'].astype(str).str.extract(r'(\\d+)', expand=False),\n",
    "    )\n",
    "    .dropna(subset=['year', 'time_minute'])\n",
    "    .astype({'year': int, 'time_minute': int})\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea6db0c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "movies2.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d8585ecb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the cleaned table (movies2), not the raw one\n",
    "movies2.to_csv('movies2.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a62df0a",
   "metadata": {},
"outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" } }, "nbformat": 4, "nbformat_minor": 5 }