Spaces:

Leopat
/

thesis_chat_with_history_books

Sleeping

App Files Files

thesis_chat_with_history_books / finalthesis /scrape_epub.py

Leopat

upload src files

3b4f6eb verified 9 months ago

raw

history blame

2.68 kB

	"""
	This script downloads the top 30 most read books from Project Gutenberg in EPUB format.

	The script performs the following steps:
	1. Fetches the list of the most read books from Project Gutenberg.
	2. Parses the list to extract the top 30 books.
	3. Downloads the EPUB files of the top 30 books.
	4. Saves each EPUB file locally, named after the book title with whitespace replaced by underscores.

	Modules required:
	- requests: For making HTTP requests to Project Gutenberg.
	- BeautifulSoup (bs4): For parsing HTML content.
	- tqdm: For displaying a progress bar.
	- re: For regular expression operations.

	Usage:
	Run the script directly to start the download process.
	"""

	import os
	import re

	import requests
	from bs4 import BeautifulSoup
	from tqdm import tqdm

	def get_top_books(limit=30, timeout=30):
	    """
	    Fetches the list of most read books from Project Gutenberg.

	    Args:
	        limit (int | None): Maximum number of book links to return.
	            Defaults to 30; ``None`` returns every matching link.
	        timeout (float): Seconds to wait for the server before giving up.
	            Defaults to 30.

	    Returns:
	        list: BeautifulSoup tag objects for the ``<a>`` elements whose
	        ``href`` starts with ``/ebooks/``, at most ``limit`` of them.

	    Raises:
	        requests.HTTPError: If the server responds with an error status.
	        requests.RequestException: On connection failures or timeouts.
	    """
	    # Without a timeout a stalled connection would block the script forever.
	    response = requests.get(
	        "https://www.gutenberg.org/browse/scores/top",
	        timeout=timeout,
	    )
	    # Fail loudly rather than parsing an error page into an empty result.
	    response.raise_for_status()

	    soup = BeautifulSoup(response.content, 'html.parser')

	    # Keep only the first `limit` links to /ebooks/ pages found in ordered lists.
	    top_books = soup.select('ol li a[href^="/ebooks/"]')[:limit]
	    return top_books

	def download_epub(book_id, title, directory, timeout=60):
	    """
	    Downloads the EPUB file of a book from Project Gutenberg.

	    The ``.epub.noimages`` variant of the book is requested and, when the
	    server answers with status 200, written to ``<directory>/<title>.epub``
	    with every run of whitespace in the title replaced by an underscore.
	    Failures are reported on stdout instead of being raised, so a single
	    unavailable book does not stop a batch of downloads.

	    Args:
	        book_id (str): The ID of the book on Project Gutenberg.
	        title (str): The title of the book, used to build the file name.
	        directory (str): Folder the EPUB is saved in. It is created when
	            missing; an empty string means the current working directory.
	        timeout (float): Seconds to wait for the server before giving up.
	            Defaults to 60.

	    Returns:
	        None
	    """
	    epub_url = f"https://www.gutenberg.org/ebooks/{book_id}.epub.noimages"

	    try:
	        # Without a timeout a stalled connection would block the script forever.
	        epub_response = requests.get(epub_url, timeout=timeout)
	    except requests.RequestException as error:
	        print(f"Download failed for book ID: {book_id} ({error})")
	        return

	    # Check if the request was successful
	    if epub_response.status_code != 200:
	        print(f"EPUB not available for book ID: {book_id}")
	        return

	    # Replace whitespace in the title with underscores
	    formatted_title = re.sub(r'\s+', '_', title)
	    # os.path.join gives the same path whether or not `directory` ends
	    # with a separator, unlike plain string concatenation.
	    filename = os.path.join(directory, f"{formatted_title}.epub")

	    # open() does not create missing folders, so make sure the target exists.
	    if directory:
	        os.makedirs(directory, exist_ok=True)

	    # Save the EPUB file
	    with open(filename, 'wb') as file:
	        file.write(epub_response.content)
	    print(f"Downloaded: {filename}")

	def main():
	    """
	    Download the EPUB of every book returned by ``get_top_books``.

	    For each link, the book ID is the last path segment of its ``href`` and
	    the title is the link text with surrounding whitespace stripped and
	    ``/`` replaced by ``-`` so it cannot be read as a path separator.
	    Files are written to ``data/self_scraped/books_epub/``.

	    Returns:
	        None
	    """
	    output_dir = "data/self_scraped/books_epub/"

	    # tqdm wraps the iterable to show progress while the downloads run.
	    for link in tqdm(get_top_books(), desc='Downloading books...'):
	        # The href looks like "/ebooks/<id>"; keep only the trailing segment.
	        gutenberg_id = link['href'].rsplit('/', 1)[-1]

	        raw_title = link.get_text()
	        safe_title = raw_title.strip().replace('/', '-')

	        download_epub(gutenberg_id, safe_title, directory=output_dir)

	# Start the download only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()