import os
import logging
import weasyprint
from urllib.parse import urlparse

# Configure the root logger at import time so INFO-level messages are emitted.
# NOTE(review): basicConfig() here is an import-time side effect on the whole
# process; confirm this module is meant to own logging configuration.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by DocumentHandler below.
logger = logging.getLogger(__name__)

class DocumentHandler:
    """Render the web page at a URL to a PDF file using WeasyPrint.

    The PDF is named ``<host>_<title>.pdf``, where the title comes from the
    page's ``<title>`` element, falling back to the last segment of the URL
    path and finally to ``"Untitled"``.
    """

    # Characters that are reserved in file names on common platforms; each is
    # replaced with an underscore so a title can never escape ``output_dir``
    # (e.g. via "/") or produce an invalid file name.
    _RESERVED_CHARS = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})

    def __init__(self, url):
        """Store the URL (or path) of the document to convert."""
        self.url = url

    def create_pdf(self, output_dir="data"):
        """Render ``self.url`` to a PDF inside *output_dir*.

        Args:
            output_dir: Directory the PDF is written to; created if missing.

        Returns:
            The path of the written PDF, or ``None`` if anything failed
            (the error is logged rather than raised).
        """
        try:
            # Render once and reuse the document for both the PDF bytes and
            # the title, so the page is not fetched twice.
            document = weasyprint.HTML(self.url).render()
            pdf = document.write_pdf()

            os.makedirs(output_dir, exist_ok=True)

            title = self._pick_title(document.metadata.title)
            # ``hostname`` drops any port and credentials embedded in the URL.
            domain = urlparse(self.url).hostname or ""

            file_name = self._build_file_name(domain, title)
            file_path = os.path.join(output_dir, file_name)

            with open(file_path, 'wb') as f:
                f.write(pdf)
            logger.info("PDF created successfully: %s", file_path)
            return file_path
        except Exception as e:
            # Best-effort boundary: callers rely on None instead of a raise.
            logger.error("Error creating PDF: %s", e)
            return None

    def get_webpage_title(self):
        """Return a title for the page at ``self.url``.

        Uses the document's ``<title>`` when present, otherwise the last
        non-empty segment of the URL path.

        Returns:
            The title string, or ``None`` if the page could not be loaded or
            no title could be derived.
        """
        try:
            document = weasyprint.HTML(self.url).render()
            return self._pick_title(document.metadata.title)
        except Exception as e:
            logger.error("Error getting webpage title: %s", e)
            return None

    def _pick_title(self, page_title):
        """Return the stripped *page_title*, or a URL-derived fallback."""
        title = (page_title or "").strip()
        return title or self._title_from_url(self.url)

    @staticmethod
    def _title_from_url(url):
        """Return the last non-empty path segment of *url*, or ``None``."""
        segments = [part for part in urlparse(url).path.split('/') if part]
        return segments[-1] if segments else None

    @classmethod
    def _build_file_name(cls, domain, title):
        """Build a safe ``<domain>_<title>.pdf`` file name.

        Whitespace runs become single underscores and reserved characters are
        replaced; an empty title becomes ``"Untitled"`` and an empty domain
        is omitted.
        """
        safe_title = cls._sanitize(title) or "Untitled"
        safe_domain = cls._sanitize(domain)
        stem = f"{safe_domain}_{safe_title}" if safe_domain else safe_title
        return f"{stem}.pdf"

    @classmethod
    def _sanitize(cls, text):
        """Collapse whitespace to underscores and replace reserved chars."""
        collapsed = "_".join((text or "").split())
        return collapsed.translate(cls._RESERVED_CHARS)