Spaces:
Sleeping
Sleeping
import logging
import os
import re
from typing import Optional, Tuple
from urllib.parse import urlparse

import weasyprint
logging.basicConfig(level=logging.INFO) | |
logger = logging.getLogger(__name__) | |
def _sanitize_filename_component(text: str) -> str:
    """Replace characters that are unsafe in file names with underscores."""
    # Spaces were already replaced by the original code; the other characters
    # are rejected by Windows file systems (':' appears whenever the URL's
    # netloc carries a port or credentials).
    return re.sub(r'[<>:"/\\|?*\x00-\x1f ]', "_", text)


def create_pdf(url, output_dir="data") -> Optional[str]:
    """Render the page at *url* to a PDF file inside *output_dir*.

    The file is named ``<domain>_<title>.pdf``, where both parts come from
    ``get_webpage_title(url)``. A missing title falls back to ``"Untitled"``
    and a missing domain to ``"unknown"``; characters that are unsafe in file
    names are replaced with underscores.

    Args:
        url: Location of the page, passed as the first positional argument
            to ``weasyprint.HTML``.
        output_dir: Directory that receives the PDF; created if absent.

    Returns:
        The path of the written PDF, or ``None`` if any step failed (the
        error is logged rather than raised).
    """
    try:
        # Render first so that a failed conversion leaves no empty directory.
        pdf = weasyprint.HTML(url).write_pdf()

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(output_dir, exist_ok=True)

        title, domain = get_webpage_title(url)
        # Strip before applying the fallback so whitespace-only values do not
        # produce an empty name component; None (helper failure) is covered
        # by the same expression.
        title = (title or "").strip() or "Untitled"
        domain = (domain or "").strip() or "unknown"

        file_name = (
            f"{_sanitize_filename_component(domain)}_"
            f"{_sanitize_filename_component(title)}.pdf"
        )
        file_path = os.path.join(output_dir, file_name)

        with open(file_path, "wb") as f:
            f.write(pdf)

        logger.info(f"PDF created successfully: {file_path}")
        return file_path
    except Exception as e:
        # Best-effort boundary: callers rely on None to signal failure.
        logger.error(f"Error creating PDF: {e}")
        return None
def get_webpage_title(url) -> Tuple[Optional[str], Optional[str]]:
    """Derive a title and a domain for *url* from the URL text alone.

    No request is made: the "title" is the last segment of the URL path and
    the "domain" is the URL's network location (``netloc``).

    Args:
        url: The URL to inspect.

    Returns:
        A ``(title, domain)`` tuple. ``title`` is an empty string when the
        path has no segments. Both elements are ``None`` if parsing raised
        an exception (the error is logged).
    """
    try:
        parsed_url = urlparse(url)
        domain = parsed_url.netloc
        # Drop trailing slashes first so ".../post/" yields "post" rather
        # than the empty basename of a path that ends in "/".
        title = os.path.basename(parsed_url.path.rstrip("/"))
        return title, domain
    except Exception as e:
        logger.error(f"Error getting webpage title and domain: {e}")
        return None, None