""" This script downloads the top 30 most read books from Project Gutenberg in EPUB format. The script performs the following steps: 1. Fetches the list of the most read books from Project Gutenberg. 2. Parses the list to extract the top 30 books. 3. Downloads the EPUB files of the top 30 books. 4. Saves the EPUB files locally with titles formatted to replace whitespace with underscores. Modules required: - requests: For making HTTP requests to Project Gutenberg. - BeautifulSoup (bs4): For parsing HTML content. - tqdm: For displaying a progress bar. - re: For regular expression operations. Usage: Run the script directly to start the download process. """ import requests from bs4 import BeautifulSoup from tqdm import tqdm import re def get_top_books(): """ Fetches the list of most read books from Project Gutenberg. Returns: list: A list of BeautifulSoup tag objects containing the top 30 most read books. """ # Get the list of most read books response = requests.get("http://www.gutenberg.org/browse/scores/top") soup = BeautifulSoup(response.content, 'html.parser') # Find the top 30 most read books top_books = soup.select('ol li a[href^="/ebooks/"]')[:30] return top_books def download_epub(book_id, title, directory): """ Downloads the EPUB file of a book from Project Gutenberg. Args: book_id (str): The ID of the book on Project Gutenberg. title (str): The title of the book. Returns: None """ epub_url = f"http://www.gutenberg.org/ebooks/{book_id}.epub.noimages" epub_response = requests.get(epub_url) # Check if the request was successful if epub_response.status_code == 200: # Replace whitespace in the title with underscores formatted_title = re.sub(r'\s+', '_', title) filename = f"{directory}{formatted_title}.epub" # Save the EPUB file with open(filename, 'wb') as file: file.write(epub_response.content) print(f"Downloaded: {filename}") else: print(f"EPUB not available for book ID: {book_id}") def main(): """ Main function to download the top 30 most read books from Project Gutenberg in EPUB format. Returns: None """ top_books = get_top_books() # Loop through the top books and download EPUB files for book in tqdm(top_books, desc='Downloading books...'): book_url = book['href'] book_id = book_url.split('/')[-1] title = book.get_text().strip().replace('/', '-') download_epub(book_id, title, directory="data/self_scraped/books_epub/") if __name__ == "__main__": main()