import logging
import os
import re
from urllib.parse import urlparse

import weasyprint

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DocumentHandler:
    """Render the page at ``url`` to a PDF named after its host and title."""

    def __init__(self, url):
        """Store the location of the page to convert.

        Args:
            url: URL (or other source accepted by ``weasyprint.HTML``) of
                the page.
        """
        self.url = url

    def create_pdf(self, output_dir="data"):
        """Render the page to ``<output_dir>/<host>_<title>.pdf``.

        The page is rendered once; the same rendered document supplies both
        the PDF bytes and the title used for the file name.

        Args:
            output_dir: Directory to write the PDF into; created if missing.

        Returns:
            The path of the written file, or ``None`` if any step failed
            (the error is logged).
        """
        try:
            document = weasyprint.HTML(self.url).render()
            pdf = document.write_pdf()

            os.makedirs(output_dir, exist_ok=True)

            # Get the title and domain of the webpage
            title = self._resolve_title(document) or "Untitled"
            domain = urlparse(self.url).hostname or ""

            # Generate the PDF file name. Both parts come from outside the
            # program, so they are sanitized before touching the file system.
            safe_title = self._sanitize(title) or "Untitled"
            safe_domain = self._sanitize(domain)
            if safe_domain:
                file_name = f"{safe_domain}_{safe_title}.pdf"
            else:
                # No host (e.g. a local file path): use the title alone
                # rather than a name starting with a stray underscore.
                file_name = f"{safe_title}.pdf"
            file_path = os.path.join(output_dir, file_name)

            with open(file_path, 'wb') as f:
                f.write(pdf)

            logger.info("PDF created successfully: %s", file_path)
            return file_path
        except Exception as e:
            # Best-effort API: callers receive None instead of an exception.
            logger.error("Error creating PDF: %s", e)
            return None

    def get_webpage_title(self):
        """Return a title for the page.

        Uses the rendered document's metadata title when it is non-empty,
        otherwise the last non-empty segment of the URL path.

        Note: this renders the page (fetch + layout) just to read its title.

        Returns:
            The title string, or ``None`` if no title could be determined
            or an error occurred (the error is logged).
        """
        try:
            document = weasyprint.HTML(self.url).render()
            return self._resolve_title(document)
        except Exception as e:
            logger.error("Error getting webpage title: %s", e)
            return None

    def _resolve_title(self, document):
        """Pick the document's metadata title, else a URL-derived title."""
        title = (document.metadata.title or "").strip()
        return title or self._title_from_url()

    def _title_from_url(self):
        """Return the last non-empty URL path segment, or None if none."""
        segments = [part for part in urlparse(self.url).path.split('/') if part]
        return segments[-1] if segments else None

    @staticmethod
    def _sanitize(text):
        """Replace every character unsafe in a file name with ``_``.

        Only word characters, ``.`` and ``-`` are kept, so path separators
        cannot survive and the result cannot leave the output directory.
        """
        return re.sub(r"[^\w.-]", "_", text.strip())