# Leopat's picture
# upload src files
# 3b4f6eb verified
# raw
# history blame
# 2.68 kB
"""
This script downloads the top 30 most read books from Project Gutenberg in EPUB format.
The script performs the following steps:
1. Fetches the list of the most read books from Project Gutenberg.
2. Parses the list to extract the top 30 books.
3. Downloads the EPUB files of the top 30 books.
4. Saves the EPUB files locally with titles formatted to replace whitespace with underscores.
Modules required:
- requests: For making HTTP requests to Project Gutenberg.
- BeautifulSoup (bs4): For parsing HTML content.
- tqdm: For displaying a progress bar.
- re: For regular expression operations.
Usage:
Run the script directly to start the download process.
"""
import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
def get_top_books(limit=30):
    """
    Fetches the list of most read books from Project Gutenberg.

    Args:
        limit (int): Maximum number of book links to return. Defaults to 30.

    Returns:
        list: BeautifulSoup tag objects (``<a>`` elements whose ``href``
            starts with ``/ebooks/``), at most ``limit`` of them, in the
            order they appear on the page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        "https://www.gutenberg.org/browse/scores/top",
        timeout=30,
    )
    # Fail loudly on an error response instead of parsing an error page and
    # silently returning an empty list.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    # Keep only the first `limit` book links found inside ordered lists.
    return soup.select('ol li a[href^="/ebooks/"]')[:limit]
def download_epub(book_id, title, directory):
    """
    Downloads the EPUB file (no-images edition) of a book from Project Gutenberg.

    The file is saved as ``<directory>/<title>.epub`` where runs of whitespace
    in the title are replaced by underscores and path separators by hyphens.
    The target directory is created if it does not exist yet.

    Args:
        book_id (str): The ID of the book on Project Gutenberg.
        title (str): The title of the book, used to build the file name.
        directory (str): Folder in which the EPUB is stored. A trailing
            slash is optional; an empty string means the current directory.

    Returns:
        None
    """
    epub_url = f"https://www.gutenberg.org/ebooks/{book_id}.epub.noimages"
    try:
        # Bounded wait so a single stalled download cannot block the batch.
        epub_response = requests.get(epub_url, timeout=60)
    except requests.RequestException as exc:
        # Report and move on: one unreachable book should not abort the rest.
        print(f"Download failed for book ID: {book_id} ({exc})")
        return

    # Check if the request was successful
    if epub_response.status_code != 200:
        print(f"EPUB not available for book ID: {book_id}")
        return

    # Replace whitespace in the title with underscores
    formatted_title = re.sub(r'\s+', '_', title)
    # Path separators in a title would redirect the write to another folder
    # (or make os.path.join discard `directory` entirely), so neutralise them.
    formatted_title = re.sub(r'[\\/]', '-', formatted_title)

    # os.makedirs('') raises, and an empty directory needs no creation anyway.
    if directory:
        os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, f"{formatted_title}.epub")

    # Save the EPUB file
    with open(filename, 'wb') as file:
        file.write(epub_response.content)
    print(f"Downloaded: {filename}")
def main():
    """
    Entry point: downloads the top 30 most read Project Gutenberg books as EPUB files.

    Returns:
        None
    """
    # Each link returned by get_top_books() stands for one book to download
    for link in tqdm(get_top_books(), desc='Downloading books...'):
        # The book ID is the last path segment of the link target
        identifier = link['href'].rsplit('/', 1)[-1]
        # A slash in the link text would act as a path separator in the file name
        clean_title = link.get_text().strip().replace('/', '-')
        download_epub(
            identifier,
            clean_title,
            directory="data/self_scraped/books_epub/",
        )
# Start the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()