{ "cells": [ { "cell_type": "markdown", "id": "29cc0f22", "metadata": { "toc": true }, "source": [ "

Table of Contents

\n", "
" ] }, { "cell_type": "code", "execution_count": 3, "id": "fd5af781", "metadata": {}, "outputs": [], "source": [
 "from pathlib import Path\n",
 "import re\n",
 "import time\n",
 "\n",
 "from bs4 import BeautifulSoup\n",
 "import fake_useragent\n",
 "import pandas as pd\n",
 "import requests"
] }, { "cell_type": "code", "execution_count": 4, "id": "b6333adb", "metadata": {}, "outputs": [], "source": [
 "# User-Agent string picked once here and sent with every request below.\n",
 "user = fake_useragent.UserAgent().random"
] }, { "cell_type": "code", "execution_count": 5, "id": "db46f2ff", "metadata": {}, "outputs": [], "source": [
 "URLS_FILE = 'serials.txt'  # one page URL per line\n",
 "OUTPUT_CSV = 'DATA.csv'    # checkpoint file, rewritten on every save\n",
 "START = 10000              # index of the first line of URLS_FILE to scrape in this run\n",
 "SAVE_EVERY = 50            # scraped pages between two checkpoints\n",
 "REQUEST_TIMEOUT = 30       # seconds to wait for a response before giving up\n",
 "COLUMNS = ['url', 'poster', 'title', 'ganres', 'description', 'age_limit']\n",
 "\n",
 "\n",
 "def _text_or_none(tag):\n",
 "    \"\"\"Return the text of a BeautifulSoup tag, or None when the tag was not found.\"\"\"\n",
 "    return None if tag is None else tag.text\n",
 "\n",
 "\n",
 "def parse_page(html):\n",
 "    \"\"\"Extract the scraped fields from the HTML of one page.\n",
 "\n",
 "    Returns a dict with the keys poster, title, ganres, description and\n",
 "    age_limit. A field whose tag is absent from the page is None; ganres is\n",
 "    an empty list when no genre badge is found.\n",
 "    \"\"\"\n",
 "    soup = BeautifulSoup(html, 'lxml')\n",
 "\n",
 "    # Title, with parenthesised fragments removed.\n",
 "    title = _text_or_none(soup.find(class_='text text_bold_giant color_white'))\n",
 "    if title is not None:\n",
 "        title = re.sub(r'\\([^)]*\\)', ' ', title).strip()\n",
 "\n",
 "    # Poster URL is taken from the <meta itemprop=\"image\"> tag.\n",
 "    poster_tag = soup.find('meta', itemprop='image')\n",
 "\n",
 "    return {\n",
 "        'poster': None if poster_tag is None else poster_tag.get('content'),\n",
 "        'title': title,\n",
 "        'ganres': [badge.text for badge in soup.find_all('span', class_='badge__text')],\n",
 "        'description': _text_or_none(soup.find('div', class_='p-movie-info__description-text')),\n",
 "        'age_limit': _text_or_none(soup.find('span', class_='label_restrict')),\n",
 "    }\n",
 "\n",
 "\n",
 "def save_checkpoint(records):\n",
 "    \"\"\"Write all rows scraped so far to OUTPUT_CSV; does nothing while there are no rows.\"\"\"\n",
 "    if not records:\n",
 "        return\n",
 "    res = pd.DataFrame(records, columns=COLUMNS)\n",
 "    res.to_csv(OUTPUT_CSV)\n",
 "    print(f'{len(res)} saved')\n",
 "\n",
 "\n",
 "with open(URLS_FILE, encoding='utf-8') as file:\n",
 "    # Split without filtering so that START indexes the raw lines of the file.\n",
 "    lst = file.read().split('\\n')\n",
 "\n",
 "records = []\n",
 "with requests.Session() as session:\n",
 "    session.headers.update({'Accept': '*/*', 'User-Agent': user})\n",
 "    try:\n",
 "        for url in lst[START:]:\n",
 "            url = url.strip()\n",
 "            if not url:\n",
 "                continue  # blank line, e.g. after the final newline of the file\n",
 "            try:\n",
 "                html = session.get(url, timeout=REQUEST_TIMEOUT).text\n",
 "            except requests.RequestException as error:\n",
 "                # Keep the URL in the output with empty fields instead of aborting the run.\n",
 "                print(f'failed to fetch {url}: {error}')\n",
 "                html = ''\n",
 "            records.append({'url': url, **parse_page(html)})\n",
 "            if len(records) % SAVE_EVERY == 0:\n",
 "                save_checkpoint(records)\n",
 "    finally:\n",
 "        # Flush the rows collected since the last checkpoint, also when the run is interrupted.\n",
 "        save_checkpoint(records)"
] }, { "cell_type": "code", "execution_count": null, "id": "1cbe183b", "metadata": {}, "outputs": [], "source": [
 "# Scraped chunks, in the order in which they are concatenated below.\n",
 "# Most files are named DATA<n>.csv; the irregular names are listed explicitly.\n",
 "PART_FILES = (\n",
 "    [f'DATA{i}.csv' for i in range(1, 15)]\n",
 "    + ['DATA15 - с 11880.csv']\n",
 "    + [f'DATA{i}.csv' for i in range(16, 28)]\n",
 "    + ['DATA0-5000.csv', 'DATA2-8.csv']\n",
 ")\n",
 "\n",
 "# Report every missing chunk at once instead of failing on the first one.\n",
 "missing = [name for name in PART_FILES if not Path(name).exists()]\n",
 "if missing:\n",
 "    raise FileNotFoundError(f'missing scraped chunks: {missing}')\n",
 "\n",
 "parts = [pd.read_csv(name, index_col='Unnamed: 0') for name in PART_FILES]"
] }, { "cell_type": "code", "execution_count": null, "id": "8cda987f", "metadata": {}, "outputs": [], "source": [
 "# Stack all chunks into one table with a fresh 0..n-1 index.\n",
 "data = pd.concat(parts, axis=0).reset_index(drop=True)"
] }, { "cell_type": "code", "execution_count": null, "id": "222e1aef", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlpostertitleganresdescriptionage_limitUnnamed: 0.1
14969https://kino.mail.ru/series_893084_rusalka/https://resizer.mail.ru/p/11575246-90fe-53c0-b...Русалка['мелодрама']Наташа Алпатова (Елена Шилова) — простая девуш...12 +NaN
14970https://kino.mail.ru/series_838624_sezon_ohoti/https://resizer.mail.ru/p/ca54339b-94e8-5813-a...Сезон охоты['драма', 'мелодрама', 'комедия', 'для взрослых']В центре сюжета — история молодого успешного б...18 +NaN
14971https://kino.mail.ru/series_783649_smertelnii_...https://resizer.mail.ru/p/767707c0-af9c-588a-a...Смертельный танец['детектив']В родное Заречье возвращается танцовщица Настя...16 +NaN
14972https://kino.mail.ru/series_781099_fantom/https://resizer.mail.ru/p/07f60bae-b56a-58ea-b...Фантом['боевик']Сериал расскажет о деятельности спецслужб Росс...12 +NaN
14973https://kino.mail.ru/series_773279_handerbi/https://resizer.mail.ru/p/a5ddec74-e1f8-512d-a...Хандерби['комедия']Сюжет сериала «Хандерби» начинается в 1831 год...NaNNaN
\n", "
" ], "text/plain": [ " url \\\n", "14969 https://kino.mail.ru/series_893084_rusalka/ \n", "14970 https://kino.mail.ru/series_838624_sezon_ohoti/ \n", "14971 https://kino.mail.ru/series_783649_smertelnii_... \n", "14972 https://kino.mail.ru/series_781099_fantom/ \n", "14973 https://kino.mail.ru/series_773279_handerbi/ \n", "\n", " poster title \\\n", "14969 https://resizer.mail.ru/p/11575246-90fe-53c0-b... Русалка \n", "14970 https://resizer.mail.ru/p/ca54339b-94e8-5813-a... Сезон охоты \n", "14971 https://resizer.mail.ru/p/767707c0-af9c-588a-a... Смертельный танец \n", "14972 https://resizer.mail.ru/p/07f60bae-b56a-58ea-b... Фантом \n", "14973 https://resizer.mail.ru/p/a5ddec74-e1f8-512d-a... Хандерби \n", "\n", " ganres \\\n", "14969 ['мелодрама'] \n", "14970 ['драма', 'мелодрама', 'комедия', 'для взрослых'] \n", "14971 ['детектив'] \n", "14972 ['боевик'] \n", "14973 ['комедия'] \n", "\n", " description age_limit \\\n", "14969 Наташа Алпатова (Елена Шилова) — простая девуш... 12 + \n", "14970 В центре сюжета — история молодого успешного б... 18 + \n", "14971 В родное Заречье возвращается танцовщица Настя... 16 + \n", "14972 Сериал расскажет о деятельности спецслужб Росс... 12 + \n", "14973 Сюжет сериала «Хандерби» начинается в 1831 год... 
NaN \n", "\n", " Unnamed: 0.1 \n", "14969 NaN \n", "14970 NaN \n", "14971 NaN \n", "14972 NaN \n", "14973 NaN " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.tail()" ] }, { "cell_type": "code", "execution_count": null, "id": "85c382bb", "metadata": {}, "outputs": [], "source": [
 "# Remove parenthesised fragments from titles; non-string values (e.g. NaN) pass through unchanged.\n",
 "data['title'] = data['title'].apply(lambda x: re.sub(r'\\([^)]*\\)', ' ', x).strip() if isinstance(x, str) else x)"
] }, { "cell_type": "code", "execution_count": null, "id": "fe021810", "metadata": {}, "outputs": [], "source": [
 "# Drop the stray index column; errors='ignore' keeps this cell re-runnable once the column is gone.\n",
 "data = data.drop(columns=['Unnamed: 0.1'], errors='ignore')"
] }, { "cell_type": "code", "execution_count": null, "id": "f317f7e3", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "Index: 14973 entries, 0 to 14973\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 url 14973 non-null object\n", " 1 poster 14785 non-null object\n", " 2 title 14785 non-null object\n", " 3 ganres 14973 non-null object\n", " 4 description 14730 non-null object\n", " 5 age_limit 13105 non-null object\n", "dtypes: object(6)\n", "memory usage: 818.8+ KB\n" ] } ], "source": [ "data.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "57f21838", "metadata": {}, "outputs": [], "source": [
 "# Remove rows that are identical in every column.\n",
 "data = data.drop_duplicates()"
] }, { "cell_type": "code", "execution_count": null, "id": "da517ed0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Sanity check: number of fully duplicated rows still present.\n",
 "data.duplicated().sum()"
] }, { "cell_type": "code", "execution_count": null, "id": "fcf403cf", "metadata": {}, "outputs": [], "source": [
 "# Set to True to write the merged table to disk; off by default so a re-run does not overwrite data.csv.\n",
 "SAVE_RESULT = False\n",
 "if SAVE_RESULT:\n",
 "    data.to_csv('data.csv')"
] } ], "metadata": { "kernelspec": { "display_name": ".elbrus2", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype":
"text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": true, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": true, "toc_position": {}, "toc_section_display": true, "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }