"""
This script downloads the top 30 most read books from Project Gutenberg in EPUB format.

The script performs the following steps:
1. Fetches the list of the most read books from Project Gutenberg.
2. Parses the list to extract the top 30 books.
3. Downloads the EPUB files of the top 30 books.
4. Saves the EPUB files locally with titles formatted to replace whitespace with underscores.

Modules required:
- requests: For making HTTP requests to Project Gutenberg.
- BeautifulSoup (bs4): For parsing HTML content.
- tqdm: For displaying a progress bar.
- re: For regular expression operations.

Usage:
Run the script directly to start the download process.
"""

import os
import re

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def get_top_books():
    """
    Fetches the list of most read books from Project Gutenberg.

    Returns:
        list: Up to 30 BeautifulSoup tag objects, each an ``<a>`` element
        nested in an ordered list whose ``href`` starts with ``/ebooks/``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server responds with an HTTP error status.
    """
    # A timeout prevents the script from hanging forever on a stalled connection.
    response = requests.get("https://www.gutenberg.org/browse/scores/top", timeout=30)
    # Fail loudly on 4xx/5xx rather than parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # select() returns matches in document order, so the slice keeps the
    # first 30 book links found inside ordered lists on the page.
    top_books = soup.select('ol li a[href^="/ebooks/"]')[:30]
    return top_books

def download_epub(book_id, title, directory):
    """
    Downloads the EPUB file of a book from Project Gutenberg.

    The file is saved as ``<directory>/<title>.epub``, where runs of whitespace
    in the title are replaced with underscores and path separators with dashes.
    Failures are reported on stdout and do not raise, so a batch of downloads
    can continue past a single bad book.

    Args:
        book_id (str): The ID of the book on Project Gutenberg.
        title (str): The title of the book, used to build the filename.
        directory (str): Directory to save the EPUB into. It is created
            (including parents) if it does not exist. An empty string saves
            into the current working directory.

    Returns:
        None
    """
    epub_url = f"https://www.gutenberg.org/ebooks/{book_id}.epub.noimages"

    try:
        # A timeout prevents one stalled download from blocking the whole run.
        epub_response = requests.get(epub_url, timeout=60)
    except requests.RequestException as exc:
        print(f"Download failed for book ID: {book_id} ({exc})")
        return

    # Check if the request was successful
    if epub_response.status_code != 200:
        print(f"EPUB not available for book ID: {book_id}")
        return

    # Replace whitespace in the title with underscores
    formatted_title = re.sub(r'\s+', '_', title)
    # Titles come from scraped HTML; neutralise separators so the file
    # always lands inside `directory`.
    formatted_title = re.sub(r'[\\/]', '-', formatted_title)

    # os.makedirs('') raises, so only create the directory when one is given.
    if directory:
        os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, f"{formatted_title}.epub")

    # Save the EPUB file
    with open(filename, 'wb') as file:
        file.write(epub_response.content)
    print(f"Downloaded: {filename}")
def main():
    """
    Entry point: fetch the most read books list and download each one as an EPUB.

    Each link's book ID is taken from the last segment of its ``href`` and its
    visible text is used as the title for the saved file.

    Returns:
        None
    """
    target_dir = "data/self_scraped/books_epub/"

    # Walk the book links with a progress bar, downloading one EPUB per link
    for link in tqdm(get_top_books(), desc='Downloading books...'):
        # The ID is whatever follows the final slash of the link target
        book_id = link['href'].rsplit('/', 1)[-1]
        # Swap slashes for dashes so the title cannot be read as a sub-path
        raw_title = link.get_text().strip()
        title = raw_title.replace('/', '-')
        download_epub(book_id, title, directory=target_dir)

# Start the download only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()