Spaces:

lara1510
/

Research_Assistant

Sleeping

App Files Files Community

Research_Assistant / pdf_converter.py

lara1510

Update pdf_converter.py

a96888a verified 7 months ago

raw

history blame

2.32 kB

	import logging
	import os
	from typing import Optional
	from urllib.parse import urlparse

	import weasyprint

	# Configure logging at INFO level as soon as this module is imported.
	# NOTE(review): basicConfig() acts on the process-wide root logger; confirm
	# that configuring it from this module (rather than the app entry point) is intended.
	logging.basicConfig(level=logging.INFO)
	# Module-level logger, named after this module, used by the functions below.
	logger = logging.getLogger(__name__)

	# Map spaces, control characters and characters that are reserved in file
	# names on common platforms (notably Windows) to underscores.
	_FILENAME_TRANSLATION = str.maketrans(
	    {ch: "_" for ch in '<>:"/\\|?* ' + "".join(map(chr, range(32)))}
	)


	def _build_pdf_filename(domain, title) -> str:
	    """Return a filesystem-safe ``<domain>_<title>.pdf`` file name."""
	    # A missing domain is treated like an empty one so the name never
	    # contains the literal text "None".
	    safe_domain = (domain or "").strip().translate(_FILENAME_TRANSLATION)
	    # Fall back to "Untitled" when the title is missing or only whitespace.
	    clean_title = (title or "").strip() or "Untitled"
	    safe_title = clean_title.translate(_FILENAME_TRANSLATION)
	    return f"{safe_domain}_{safe_title}.pdf"


	def create_pdf(url, output_dir="data") -> Optional[str]:
	    """Render the page at *url* to a PDF file inside *output_dir*.

	    The file is named ``<domain>_<title>.pdf``, where both parts come from
	    ``get_webpage_title(url)``; an empty title becomes ``"Untitled"``, and
	    spaces and characters unsafe in file names are replaced by underscores.

	    Args:
	        url: Location of the page, passed directly to ``weasyprint.HTML``.
	        output_dir: Directory that receives the PDF; created if missing.

	    Returns:
	        The path of the written PDF, or ``None`` if any step failed. Errors
	        are logged rather than raised.
	    """
	    try:
	        # Render first, so that nothing is created on disk when the page
	        # cannot be fetched or converted.
	        pdf = weasyprint.HTML(url).write_pdf()

	        # exist_ok avoids the race between checking for the directory and
	        # creating it.
	        os.makedirs(output_dir, exist_ok=True)

	        title, domain = get_webpage_title(url)
	        file_path = os.path.join(output_dir, _build_pdf_filename(domain, title))

	        with open(file_path, "wb") as f:
	            f.write(pdf)

	        logger.info("PDF created successfully: %s", file_path)
	        return file_path

	    except Exception as e:
	        # Deliberate best-effort boundary: callers get None instead of an
	        # exception. logger.exception keeps the traceback for diagnosis.
	        logger.exception("Error creating PDF: %s", e)
	        return None


	def get_webpage_title(url) -> tuple:
	    """Derive a ``(title, domain)`` pair from the text of *url*.

	    No request is made: the title is the last non-empty segment of the URL
	    path and the domain is the URL's network-location part (``netloc``).

	    Args:
	        url: The URL string to inspect.

	    Returns:
	        A ``(title, domain)`` tuple. ``title`` is ``""`` when the URL has no
	        path segments. ``(None, None)`` is returned, after logging the error,
	        if the URL cannot be parsed.
	    """
	    try:
	        parsed_url = urlparse(url)
	        domain = parsed_url.netloc

	        # URL paths always use "/" regardless of platform, so split on it
	        # directly. Trailing slashes are dropped first; otherwise
	        # ".../post/" would yield an empty title.
	        title = parsed_url.path.rstrip("/").rsplit("/", 1)[-1]

	        return title, domain

	    except Exception as e:
	        # Best-effort helper: report the failure and let the caller fall
	        # back to defaults. logger.exception keeps the traceback.
	        logger.exception("Error getting webpage title and domain: %s", e)
	        return None, None