# Spaces: lara1510 / Research_Assistant (Sleeping)
# File size: 2,318 Bytes

import logging
import os
from typing import Optional
from urllib.parse import urlparse

import weasyprint

# Configure the root logger at import time so INFO-level (and higher) records
# are emitted. Note that logging.basicConfig does nothing if the root logger
# already has handlers configured.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by the functions below.
logger = logging.getLogger(__name__)

# Path separators plus the characters reserved in Windows file names; each one
# is replaced with "_" when the output file name is built.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})


def _build_pdf_filename(domain, title) -> str:
    """Return the ``<domain>_<title>.pdf`` file name with unsafe characters replaced."""
    # Strip before testing for emptiness so that a whitespace-only title also
    # falls back to the default, and tolerate None from a failed lookup.
    cleaned_title = (title or "").strip() or "Untitled"
    stem = f"{domain or ''}_{cleaned_title.replace(' ', '_')}"
    # A netloc may carry a port ("host:8000"), and ":" is not a valid
    # file-name character on every platform.
    return f"{stem.translate(_UNSAFE_FILENAME_CHARS)}.pdf"


def create_pdf(url, output_dir="data") -> Optional[str]:
    """Render the page at *url* to a PDF file inside *output_dir*.

    The file is named ``<domain>_<title>.pdf``, where both parts come from
    ``get_webpage_title(url)``. An empty title becomes "Untitled", spaces in
    the title become underscores, and characters that are unsafe in file
    names are replaced with underscores.

    Args:
        url: Location of the page, passed unchanged to ``weasyprint.HTML``.
        output_dir: Directory to write the PDF into; created if missing.

    Returns:
        The path of the written PDF, or ``None`` if any step failed (the
        failure is logged, not raised).
    """
    try:
        # Render first, so no directory or file is created when the page
        # cannot be fetched or converted.
        pdf = weasyprint.HTML(url).write_pdf()

        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(output_dir, exist_ok=True)

        title, domain = get_webpage_title(url)
        file_path = os.path.join(output_dir, _build_pdf_filename(domain, title))

        with open(file_path, 'wb') as f:
            f.write(pdf)

        logger.info("PDF created successfully: %s", file_path)
        return file_path

    except Exception as e:
        # Deliberate best-effort boundary: callers get None instead of an
        # exception. logger.exception keeps the traceback for diagnosis.
        logger.exception("Error creating PDF: %s", e)
        return None
    

def get_webpage_title(url) -> tuple:
    """Derive a ``(title, domain)`` pair from the text of *url*.

    The page itself is not fetched or inspected. The "title" is the last
    non-empty segment of the URL path (so both ``/docs/page.html`` and
    ``/docs/page.html/`` give ``"page.html"``), and the domain is the URL's
    network location, which may include a port.

    Args:
        url: The URL to analyse.

    Returns:
        ``(title, domain)``. ``title`` is an empty string when the path has
        no segments (e.g. ``https://example.com/``). Returns ``(None, None)``
        if the URL cannot be parsed; the error is logged, not raised.
    """
    try:
        parsed_url = urlparse(url)

        # Network location part of the URL, e.g. "example.com" or "host:8000".
        domain = parsed_url.netloc

        # A path ending in "/" has an empty final component; fall back to the
        # segment before the trailing slash so such URLs still get a
        # distinguishing title instead of an empty one.
        parent, title = os.path.split(parsed_url.path)
        if not title:
            title = os.path.basename(parent)

        return title, domain

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with Nones.
        logger.error(f"Error getting webpage title and domain: {e}")
        return None, None