# Spaces: Sleeping
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Empty result table with one column per field recorded for a book.
book_fields = ['page_url', 'image_url', 'author', 'title', 'annotation']
df = pd.DataFrame(columns=book_fields)
def extract_data_from_page(page_number, timeout=30):
    """Scrape one catalogue listing page and return basic data for each book.

    Args:
        page_number: Value substituted into the ``page`` query parameter of
            the catalogue URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of dicts with the keys ``page_url``, ``title`` and ``author``,
        one per ``article.product-card`` element that could be parsed. The
        list is empty when the request fails or the page has no such cards.
    """
    site_root = 'https://www.chitai-gorod.ru'
    url = f'{site_root}/catalog/books/hudozhestvennaya-literatura-110001?page={page_number}'

    # Without a timeout a stalled connection would block the crawl forever;
    # a failed page is reported and skipped rather than parsed as if it were
    # a real listing.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching page {page_number}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    data = []
    for book in soup.find_all('article', class_='product-card'):
        try:
            book_url = book.find('a', class_='product-card__picture')['href']
            title = book.find('div', class_='product-title__head').get_text(strip=True)
            author = book.find('div', class_='product-title__author').get_text(strip=True)
        except (AttributeError, KeyError, TypeError) as e:
            # A missing element makes find() return None (TypeError on
            # subscripting, AttributeError on get_text); a link without an
            # href raises KeyError. Skip just this card.
            print(f"Error processing book: {e}")
            continue
        # urljoin copes with hrefs that are already absolute or lack a
        # leading slash, which plain concatenation would mangle.
        data.append({
            'page_url': urljoin(site_root, book_url),
            'title': title,
            'author': author,
        })
    return data
# Crawl the catalogue listing pages until enough books have been gathered.
# NOTE(review): the crawl starts at page 2 - confirm that skipping page 1 is
# intentional.
max_books = 5000
collected_rows = []
for page in range(2, 201):
    print(f"Processing page {page}...")
    collected_rows.extend(extract_data_from_page(page))
    if len(collected_rows) >= max_books:
        break

# Build the frame once instead of concatenating inside the loop, which copies
# every previously collected row on each page. `df` is still empty at this
# point, so only its column layout is carried over; dtype=object keeps the
# not-yet-filled columns able to receive strings later.
df = pd.DataFrame(collected_rows[:max_books], columns=df.columns, dtype=object)
def extract_book_details(book_url, timeout=30):
    """Fetch a book's page and extract its cover image URL and annotation.

    Args:
        book_url: Absolute URL of the book page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A tuple ``(image_url, annotation)``. Each element is ``None`` when
        the corresponding element is absent from the page; both are ``None``
        when the page could not be fetched or parsed.
    """
    try:
        response = requests.get(book_url, timeout=timeout)
        # Do not scrape an error page as if it were a book page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Keep the original lookup by ``name`` first; Open Graph tags are
        # normally declared with ``property``, so fall back to that.
        image_tag = soup.find('meta', {'name': 'og:image'})
        if image_tag is None:
            image_tag = soup.find('meta', {'property': 'og:image'})
        # .get() so a tag without a ``content`` attribute yields None instead
        # of raising and discarding the annotation as well.
        image_url = image_tag.get('content') if image_tag is not None else None

        annotation_tag = soup.find('div', {'itemprop': 'description'})
        annotation = (
            annotation_tag.get_text(strip=True)
            if annotation_tag is not None
            else None
        )
        return image_url, annotation
    except Exception as e:
        # Deliberately broad: this runs once per book in a long crawl, and a
        # single bad page must not abort the whole run.
        print(f"Error extracting details from {book_url}: {e}")
        return None, None
# Visit each collected book page (at most the first 5000 rows), store the
# cover image URL and annotation next to the listing data, then export the
# whole table to CSV.
for row_label, book_page_url in df['page_url'].head(5000).items():
    print(f"Fetching details for {book_page_url}...")
    details = extract_book_details(book_page_url)
    df.at[row_label, 'image_url'] = details[0]
    df.at[row_label, 'annotation'] = details[1]

df.to_csv('books_data_with_details.csv', index=False)