{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Rfoz1Nim_nx_"
      },
      "outputs": [],
      "source": [
        "from bs4 import BeautifulSoup\n",
        "import lxml\n",
        "import xml.etree.ElementTree as ET\n",
        "import csv\n",
        "import pandas as pd\n",
        "import requests\n",
        "from bs4 import BeautifulSoup, element\n",
        "import pandas as pd\n",
        "import csv\n",
        "from bs4 import element\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def get_book_links(category_id, base_url=\"https://www.biblio-globus.ru/catalog/category?id=\"):\n",
        "    \"\"\"Извлекает ссылки на книги из категории с указанным id.\"\"\"\n",
        "    page_number = 1\n",
        "\n",
        "    while True:\n",
        "        url = f\"{base_url}{category_id}&page={page_number}\"\n",
        "        response = requests.get(url)\n",
        "        soup = BeautifulSoup(response.text, 'html.parser')\n",
        "\n",
        "        # Извлечение ссылок на книги\n",
        "        links = soup.find_all('a', class_='img_link')\n",
        "        if not links:\n",
        "            print(f\"Сканирование category_id {category_id} page_number {page_number} завершено.\")\n",
        "            break  # Выход из цикла, если страница не содержит ссылок\n",
        "\n",
        "        for link in links:\n",
        "            book_link = link.get('href')\n",
        "            if book_link and book_link.startswith('/product/'):\n",
        "                full_link = f\"https://www.biblio-globus.ru{book_link}\"\n",
        "                # Запись в CSV-файл\n",
        "                with open('book_links.csv', 'a', newline='', encoding='utf-8') as file:\n",
        "                    writer = csv.writer(file)\n",
        "                    writer.writerow([full_link])\n",
        "\n",
        "        page_number += 1\n",
        "\n",
        "categories = [226, 227, 241, 242, 248, 250, 251, 6168, 6169, 6170, 6171, 262, 263, 6173, 6174, 6176, 6177, 6178, 6179, 6180, 6181, 6182, 6183, 6184, 6186, 6187, 6188, 6189]  # Добавьте остальные категории по необходимости\n",
        "\n",
        "# Создание заголовка CSV-файла\n",
        "with open('book_links.csv', 'w', newline='', encoding='utf-8') as file:\n",
        "    writer = csv.writer(file)\n",
        "    writer.writerow(['book_link'])\n",
        "\n",
        "# Получение ссылок для каждой категории\n",
        "for category in categories:\n",
        "    get_book_links(category)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_sl6wNU5SzEI",
        "outputId": "ca837e22-8919-4066-b2da-b376697c9971"
      },
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Сканирование category_id 226 page_number 151 завершено.\n",
            "Сканирование category_id 227 page_number 413 завершено.\n",
            "Сканирование category_id 241 page_number 90 завершено.\n",
            "Сканирование category_id 242 page_number 99 завершено.\n",
            "Сканирование category_id 248 page_number 5 завершено.\n",
            "Сканирование category_id 250 page_number 89 завершено.\n",
            "Сканирование category_id 251 page_number 96 завершено.\n",
            "Сканирование category_id 6168 page_number 33 завершено.\n",
            "Сканирование category_id 6169 page_number 34 завершено.\n",
            "Сканирование category_id 6170 page_number 9 завершено.\n",
            "Сканирование category_id 6171 page_number 13 завершено.\n",
            "Сканирование category_id 262 page_number 12 завершено.\n",
            "Сканирование category_id 263 page_number 16 завершено.\n",
            "Сканирование category_id 6173 page_number 32 завершено.\n",
            "Сканирование category_id 6174 page_number 3 завершено.\n",
            "Сканирование category_id 6176 page_number 4 завершено.\n",
            "Сканирование category_id 6177 page_number 18 завершено.\n",
            "Сканирование category_id 6178 page_number 10 завершено.\n",
            "Сканирование category_id 6179 page_number 1 завершено.\n",
            "Сканирование category_id 6180 page_number 1 завершено.\n",
            "Сканирование category_id 6181 page_number 8 завершено.\n",
            "Сканирование category_id 6182 page_number 35 завершено.\n",
            "Сканирование category_id 6183 page_number 4 завершено.\n",
            "Сканирование category_id 6184 page_number 3 завершено.\n",
            "Сканирование category_id 6186 page_number 6 завершено.\n",
            "Сканирование category_id 6187 page_number 64 завершено.\n",
            "Сканирование category_id 6188 page_number 73 завершено.\n",
            "Сканирование category_id 6189 page_number 3 завершено.\n"
          ]
        }
      ]
    },
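    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hedged sketch, not part of the original run: the crawler above issues a bare\n",
        "# requests.get() per page with no timeout, status check, or retry. A small\n",
        "# helper like this one -- a shared Session, an explicit timeout, and a few\n",
        "# retries with exponential backoff -- is one way to harden the pagination loop.\n",
        "# The names fetch, TIMEOUT, and RETRIES are illustrative, not from the notebook.\n",
        "import time\n",
        "\n",
        "session = requests.Session()\n",
        "TIMEOUT = 10   # seconds per request (assumed value)\n",
        "RETRIES = 3    # attempts before giving up (assumed value)\n",
        "\n",
        "def fetch(url):\n",
        "    \"\"\"GET a page with retries; return the response text, or None on failure.\"\"\"\n",
        "    for attempt in range(RETRIES):\n",
        "        try:\n",
        "            response = session.get(url, timeout=TIMEOUT)\n",
        "            response.raise_for_status()\n",
        "            return response.text\n",
        "        except requests.RequestException:\n",
        "            time.sleep(2 ** attempt)  # simple exponential backoff\n",
        "    return None\n"
      ]
    },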
    {
      "cell_type": "code",
      "source": [
        "def scrape_book_data(url):\n",
        "    collect = []\n",
        "    response = requests.get(url)\n",
        "    soup = BeautifulSoup(response.text, 'html.parser')\n",
        "    content = {\n",
        "        'image': soup.find('meta', property=\"og:image\")['content'] if soup.find('meta', property=\"og:image\") else '',\n",
        "        'author': soup.find('meta', property=\"og:book:author\")['content'] if soup.find('meta', property=\"og:book:author\") else '',\n",
        "        'title': soup.find('meta', property=\"og:title\")['content'] if soup.find('meta', property=\"og:title\") else '',\n",
        "        'annotation': soup.find('div', {\"class\": \"collapse\", \"id\": \"collapseExample\"}) if soup.find('div', {\"class\": \"collapse\", \"id\": \"collapseExample\"}) else ''\n",
        "    }\n",
        "\n",
        "    if content['annotation'] != '' and content['annotation'].children:\n",
        "        for el in content['annotation'].children:\n",
        "            if isinstance(el, element.Tag):\n",
        "                el.decompose()\n",
        "    collect.append(url)\n",
        "    collect.append(content['image'])\n",
        "    collect.append(content['title'])\n",
        "    collect.append(content['author'])\n",
        "    collect.append(content['annotation'].get_text(strip=True) if content['annotation'] != '' else '')\n",
        "    return collect\n",
        "\n",
        "# Загрузка списка URL из файла\n",
        "urls_df = pd.read_csv('book_links(1).csv')\n",
        "\n",
        "# Открытие файла для записи результатов\n",
        "with open('books.csv', 'w', newline='', encoding='utf-8') as csvfile:\n",
        "    writer = csv.writer(csvfile, escapechar='\\\\', quoting=csv.QUOTE_MINIMAL)\n",
        "    writer.writerow(['page_url', 'image_url', 'author', 'title', 'annotation'])\n",
        "\n",
        "    for index, row in urls_df.iterrows():\n",
        "        book_data = scrape_book_data(row['book_link'])\n",
        "        writer.writerow(book_data)\n",
        "        print(f\"Информация о книге: {row['book_link']} записана в файл books.csv\")\n",
        "\n"
      ],
      "metadata": {
        "id": "8U8VSC8KTONT"
      },
      "execution_count": 23,
      "outputs": []
    },
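    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hedged sanity check, not in the original notebook: read the scraped CSV back\n",
        "# into pandas, drop duplicate page URLs, and inspect a few rows to confirm the\n",
        "# columns line up with the header written above.\n",
        "books_df = pd.read_csv('books.csv')\n",
        "books_df = books_df.drop_duplicates(subset='page_url')\n",
        "print(books_df.shape)\n",
        "books_df.head()\n"
      ]
    }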
  ]
}