Spaces:

lara1510
/

Research_Assistant

Sleeping

Research_Assistant / pdf_converter.py

Create pdf_converter.py

8a25a26 verified 2 months ago

No virus

1.75 kB

	import os
	import logging
	import weasyprint
	from urllib.parse import urlparse

	# Module-level setup, executed when this file is imported: configure the
	# root logger at INFO level, then create the logger used by this module.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	class DocumentHandler:
	    """Render a web page to PDF with WeasyPrint and save it to disk.

	    The output file is named ``<domain>_<title>.pdf``. The domain is taken
	    from the URL; the title comes from the page's ``<title>`` element, falling
	    back to the last URL path segment and finally to ``"Untitled"``.
	    """

	    # Upper bound on the title part of the file name, to stay well within
	    # common file-system name limits.
	    _MAX_TITLE_LENGTH = 100

	    def __init__(self, url):
	        """Remember the address of the page to convert.

	        Args:
	            url: Location of the page, passed as the first argument to
	                ``weasyprint.HTML``.
	        """
	        self.url = url

	    def create_pdf(self, output_dir="data"):
	        """Render ``self.url`` to a PDF file inside *output_dir*.

	        Args:
	            output_dir: Directory to write into; created if it is missing.

	        Returns:
	            The path of the written file, or ``None`` if rendering or
	            writing failed (the error is logged, not raised).
	        """
	        try:
	            # Render once and reuse the document for both the PDF bytes and
	            # the title, so the page is only fetched and laid out one time.
	            document = weasyprint.HTML(self.url).render()
	            pdf = document.write_pdf()

	            title = (
	                self._extract_title(document)
	                or self._title_from_url(self.url)
	                or "Untitled"
	            )
	            file_name = self._build_file_name(self.url, title)

	            # exist_ok avoids the check-then-create race of os.path.exists.
	            os.makedirs(output_dir, exist_ok=True)
	            file_path = os.path.join(output_dir, file_name)

	            with open(file_path, 'wb') as f:
	                f.write(pdf)
	            logger.info("PDF created successfully: %s", file_path)
	            return file_path
	        except Exception as e:
	            # Best-effort API: callers receive None instead of an exception.
	            logger.error("Error creating PDF: %s", e)
	            return None

	    def get_webpage_title(self):
	        """Return a title for the page at ``self.url``.

	        Returns:
	            The page's ``<title>`` text if it has one, otherwise the last
	            non-empty segment of the URL path. ``None`` if neither exists or
	            if the page could not be loaded (the error is logged).
	        """
	        try:
	            document = weasyprint.HTML(self.url).render()
	            return self._extract_title(document) or self._title_from_url(self.url)
	        except Exception as e:
	            logger.error("Error getting webpage title: %s", e)
	            return None

	    @staticmethod
	    def _extract_title(document):
	        """Return the stripped title from a rendered document, or ``None``."""
	        title = document.metadata.title
	        if not title:
	            return None
	        return title.strip() or None

	    @staticmethod
	    def _title_from_url(url):
	        """Return the last non-empty path segment of *url*, or ``None``."""
	        segments = [part for part in urlparse(url).path.split('/') if part]
	        return segments[-1] if segments else None

	    @staticmethod
	    def _safe_component(text):
	        """Replace every character that is unsafe in a file name with ``_``.

	        Letters, digits, ``-``, ``_`` and ``.`` are kept. Path separators are
	        among the replaced characters, so the result can never point outside
	        the directory it is joined to.
	        """
	        return "".join(
	            ch if ch.isalnum() or ch in "-_." else "_" for ch in text
	        )

	    @staticmethod
	    def _build_file_name(url, title):
	        """Build the ``<domain>_<title>.pdf`` file name for *url* and *title*."""
	        domain = DocumentHandler._safe_component(urlparse(url).netloc) or "unknown"
	        trimmed = title.strip()[:DocumentHandler._MAX_TITLE_LENGTH]
	        safe_title = DocumentHandler._safe_component(trimmed) or "Untitled"
	        return f"{domain}_{safe_title}.pdf"