Research_Assistant / pdf_converter.py
lara1510's picture
Create pdf_converter.py
8a25a26 verified
raw history blame
No virus
1.75 kB
import os
import logging
import weasyprint
from urllib.parse import urlparse
# Configure the root logger so INFO-level (and higher) records are emitted.
# NOTE(review): this runs at import time, so merely importing this module
# sets up root logging for the host application -- confirm that is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by DocumentHandler below.
logger = logging.getLogger(__name__)
class DocumentHandler:
    """Render the web page at ``url`` to a PDF named after its domain and title.

    The page is fetched and parsed with WeasyPrint. The output file name has
    the form ``<domain>_<title>.pdf`` with characters that are unsafe in file
    names replaced by underscores.
    """

    # Upper bound on the title part of the file name, so that very long page
    # titles do not exceed common file-system name limits.
    _MAX_TITLE_LENGTH = 100

    def __init__(self, url):
        """Store the URL of the page to convert."""
        self.url = url

    def create_pdf(self, output_dir="data"):
        """Render the page to a PDF inside ``output_dir``.

        Args:
            output_dir: Directory for the PDF; created if it does not exist.

        Returns:
            The path of the written PDF, or ``None`` if anything failed
            (the error is logged).
        """
        try:
            # Parse the page once and reuse it for both rendering and the
            # title lookup, instead of downloading it twice.
            html = weasyprint.HTML(self.url)
            pdf = html.write_pdf()

            os.makedirs(output_dir, exist_ok=True)

            # Prefer the document's <title>, then the last URL path segment.
            title = (
                self._extract_title(html)
                or self._title_from_url(self.url)
                or "Untitled"
            )
            domain = urlparse(self.url).netloc

            file_name = self._build_file_name(domain, title)
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, 'wb') as f:
                f.write(pdf)
            logger.info(f"PDF created successfully: {file_path}")
            return file_path
        except Exception as e:
            # Top-level boundary: callers expect None rather than an exception.
            logger.error(f"Error creating PDF: {e}")
            return None

    def get_webpage_title(self):
        """Return a title for the page, or ``None`` if none can be determined.

        The document's ``<title>`` text is used when present and non-empty;
        otherwise the last non-empty component of the URL path is used.
        Errors while fetching or parsing the page are logged and yield
        ``None``.
        """
        try:
            html = weasyprint.HTML(self.url)
            return (
                self._extract_title(html)
                or self._title_from_url(self.url)
                or None
            )
        except Exception as e:
            logger.error(f"Error getting webpage title: {e}")
            return None

    @staticmethod
    def _extract_title(html):
        """Return the stripped ``<title>`` text of a parsed document, or None."""
        # WeasyPrint exposes the parsed tree as ``etree_element`` (an
        # xml.etree element); it has ``find`` but no ``xpath``.
        root = getattr(html, "etree_element", None)
        if root is None:
            return None
        node = root.find(".//title")
        if node is None or not node.text:
            return None
        return node.text.strip() or None

    @staticmethod
    def _title_from_url(url):
        """Return the last non-empty path component of ``url`` ('' if none)."""
        segments = [part for part in urlparse(url).path.split('/') if part]
        return segments[-1] if segments else ""

    @staticmethod
    def _build_file_name(domain, title):
        """Build a file-system-safe ``<domain>_<title>.pdf`` file name."""
        cleaned_title = title.strip()[:DocumentHandler._MAX_TITLE_LENGTH]
        cleaned_title = cleaned_title.replace(' ', '_')
        # Skip an empty domain (e.g. a local file path) to avoid a leading "_".
        stem = "_".join(part for part in (domain, cleaned_title) if part)
        # Anything other than letters, digits, '-', '_' and '.' (for example
        # '/', ':' or '?') could break open() or escape output_dir.
        safe_stem = "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in stem
        )
        return f"{safe_stem or 'Untitled'}.pdf"