# Residue from the page this file was copied from (not part of the script): "Spaces: Sleeping"
"""
This script downloads the top 30 most read books from Project Gutenberg in EPUB format.

The script performs the following steps:
1. Fetches the list of the most read books from Project Gutenberg.
2. Parses the list to extract the first 30 book links.
3. Downloads the no-images EPUB file of each of those books.
4. Saves each EPUB file under data/self_scraped/books_epub/, named after the book
   title with "/" replaced by "-" and whitespace replaced by underscores.

Modules required:
- requests: For making HTTP requests to Project Gutenberg.
- BeautifulSoup (bs4): For parsing HTML content.
- tqdm: For displaying a progress bar.
- re: For regular expression operations.

Usage:
Run the script directly to start the download process.
"""
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_top_books(limit=30):
    """
    Fetches the list of most read books from Project Gutenberg.

    Args:
        limit (int): Maximum number of book links to return. Defaults to 30.

    Returns:
        list: BeautifulSoup tag objects (``<a>`` elements inside ordered lists
            whose ``href`` starts with ``/ebooks/``), at most ``limit`` of them,
            in the order they appear on the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        "https://www.gutenberg.org/browse/scores/top",
        timeout=30,
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page into an empty
    # list, which would make the script finish "successfully" with no downloads.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    # Keep only the first `limit` matching links, in page order.
    book_links = soup.select('ol li a[href^="/ebooks/"]')
    return book_links[:limit]
def download_epub(book_id, title, directory):
    """
    Downloads the no-images EPUB file of a book from Project Gutenberg.

    The file is written to ``<directory>/<title>.epub``, where every run of
    whitespace in the title is replaced with an underscore and any path
    separator in the title is replaced with ``-``. A failed download is
    reported on stdout and skipped; it does not raise.

    Args:
        book_id (str): The ID of the book on Project Gutenberg.
        title (str): The title of the book, used to build the file name.
        directory (str): Directory to save the file in. It is created if it
            does not exist. An empty string means the current directory.

    Returns:
        None
    """
    epub_url = f"https://www.gutenberg.org/ebooks/{book_id}.epub.noimages"
    try:
        # The timeout prevents one stalled connection from blocking the batch.
        epub_response = requests.get(epub_url, timeout=60)
    except requests.RequestException as error:
        # One unreachable book should not abort the remaining downloads.
        print(f"Download failed for book ID: {book_id} ({error})")
        return

    if epub_response.status_code != 200:
        print(f"EPUB not available for book ID: {book_id}")
        return

    # Replace whitespace in the title with underscores.
    formatted_title = re.sub(r'\s+', '_', title)
    # The title comes from a scraped page; a path separator in it must not be
    # able to redirect the write outside `directory`.
    formatted_title = re.sub(r'[\\/]', '-', formatted_title)

    # open() does not create missing parent directories, so make sure the
    # target exists. os.makedirs("") raises, hence the guard.
    if directory:
        os.makedirs(directory, exist_ok=True)
    # os.path.join yields the same path whether or not `directory` ends with a
    # separator.
    filename = os.path.join(directory, f"{formatted_title}.epub")

    # Save the EPUB file.
    with open(filename, 'wb') as file:
        file.write(epub_response.content)
    print(f"Downloaded: {filename}")
def main(directory="data/self_scraped/books_epub/"):
    """
    Main function to download the most read books from Project Gutenberg in EPUB format.

    Fetches the book links with ``get_top_books()`` and calls
    ``download_epub()`` for each one, showing a tqdm progress bar.

    Args:
        directory (str): Directory the EPUB files are saved in. It is created
            if it does not exist. Defaults to "data/self_scraped/books_epub/".

    Returns:
        None
    """
    # Create the output directory once up front so the first file write does
    # not fail on a fresh checkout. os.makedirs("") raises, hence the guard.
    if directory:
        os.makedirs(directory, exist_ok=True)

    top_books = get_top_books()

    # Loop through the top books and download EPUB files.
    for book in tqdm(top_books, desc='Downloading books...'):
        # The book ID is the last path segment of the link's href.
        book_url = book['href']
        book_id = book_url.split('/')[-1]
        # "/" would be read as a directory separator in the file name.
        title = book.get_text().strip().replace('/', '-')
        download_epub(book_id, title, directory=directory)
# Start the download only when this file is executed as a script, so that
# importing it from another module has no side effects.
if __name__ == "__main__":
    main()