# booktoread / parsing .py
# Vladislawoo's picture
# Rename parsing (1).py to parsing .py
# e3f4a13
import json
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
def text(links):
    """Return the whitespace-stripped text of the first element in *links*.

    Args:
        links: An iterable of objects exposing a string ``text`` attribute.
            Only the first element is inspected.

    Returns:
        The first element's ``text`` with leading and trailing whitespace
        removed, or ``None`` when *links* yields no elements.
    """
    # next() with a default avoids referencing an unassigned local when the
    # iterable is empty, which previously raised UnboundLocalError.
    first = next(iter(links), None)
    if first is None:
        return None
    return first.text.strip()
# Crawl the biblio-globus.ru catalogue and collect one row per product:
# category list -> subcategory list -> paginated product listings -> product
# pages. Rows are accumulated in ``df`` and periodically dumped to CSV.

BASE_URL = 'https://www.biblio-globus.ru'

# Seconds to wait for the server before giving up on a request. Without a
# timeout a single stalled connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Characters deleted from the annotation text before it is stored.
# NOTE(review): the Latin letter 'm' is part of this set in the original
# script, so it is removed from every annotation; kept unchanged here --
# confirm whether that is intentional.
ANNOTATION_DROP_TABLE = str.maketrans('', '', '\n\r\tm\xa0')

# One session for the whole crawl so connections to the host are reused.
session = requests.Session()

url = BASE_URL + '/catalog/categories'
catalog = session.get(url, timeout=REQUEST_TIMEOUT)
catalog_soup = BeautifulSoup(catalog.text, 'lxml')
list_categories = catalog_soup.find_all('li', class_='list-group-item')

df = []
columns = ['product_url', 'image', 'author', 'title', 'annotation', 'genre']
n = 1

for link in tqdm(list_categories):
    category_url = BASE_URL + link.find('a')['href']
    category_page = session.get(category_url, timeout=REQUEST_TIMEOUT)
    category_soup = BeautifulSoup(category_page.text, 'lxml')
    list_subcategories = category_soup.find_all('a', class_='product-preview-title')

    for sub in tqdm(list_subcategories):
        # The subcategory id is the last path segment of the link.
        subcategory_id = sub['href'].split('/')[-1]
        page = 1
        while True:
            subcategory_url = (
                f'{BASE_URL}/catalog/category'
                f'?id={subcategory_id}&page={page}&sort=0'
            )
            subcategory_page = session.get(subcategory_url, timeout=REQUEST_TIMEOUT)
            subcategory_soup = BeautifulSoup(subcategory_page.text, 'lxml')
            subcategory_links = subcategory_soup.find_all('div', class_='text')
            # A listing page without any product entries ends the pagination.
            if not subcategory_links:
                break

            for product in subcategory_links:
                product_url = BASE_URL + product.find('a')['href']
                product_page = session.get(product_url, timeout=REQUEST_TIMEOUT)
                product_soup = BeautifulSoup(product_page.text, 'lxml')

                product_annotation = product_soup.find('div', id='collapseExample')
                if product_annotation:
                    annotation = product_annotation.text.translate(ANNOTATION_DROP_TABLE)
                    # Keep only the text preceding the 'Характеристики' marker.
                    annotation = annotation.split('Характеристики', 1)[0]
                    annotation = annotation.strip()
                else:
                    annotation = None

                # Author, title, image and genre come from the page's JSON-LD
                # block. A product whose block is absent, is not valid JSON,
                # or lacks one of the expected fields is skipped instead of
                # aborting the whole crawl.
                try:
                    product_json = product_soup.find('script', type='application/ld+json')
                    dict_json = json.loads(product_json.text)
                    author = dict_json['author']['name']
                    title = dict_json['name']
                    image = dict_json['image']
                    genre = dict_json['genre']
                except (AttributeError, KeyError, TypeError, json.JSONDecodeError):
                    continue

                df.append([product_url, image, author, title, annotation, genre])

            page += 1

    # Checkpoint: ``df`` is never cleared, so every file holds all rows
    # gathered so far and the highest-numbered file is the most complete.
    # NOTE(review): the original indentation was lost; writing the checkpoint
    # once per top-level category is an assumption -- confirm.
    data = pd.DataFrame(df, columns=columns)
    data.to_csv(f'data{n}.csv', index=False)
    n += 1