# Spaces: Sleeping  (page-status residue from the web view; not part of the program)
import logging
import os
import re
from urllib.parse import urlparse

import weasyprint
# Configure the root logger so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)
class DocumentHandler:
    """Save the web page at a given URL as a PDF file.

    The output file is named ``<domain>_<title>.pdf``. The title comes from
    the page's ``<title>`` element, falling back to the last non-empty
    segment of the URL path and finally to ``"Untitled"``.
    """

    def __init__(self, url):
        """Store the address of the page to convert.

        Args:
            url: Page location; it is passed as-is to ``weasyprint.HTML``.
        """
        self.url = url

    def create_pdf(self, output_dir="data"):
        """Render the page and write it into ``output_dir`` as a PDF.

        Args:
            output_dir: Directory that receives the file; created if missing.

        Returns:
            The path of the written PDF, or ``None`` if any step failed
            (the failure is logged rather than raised).
        """
        try:
            # Render once and reuse the result for both the PDF bytes and the
            # title, so the page is not fetched and laid out twice.
            document = weasyprint.HTML(self.url).render()
            pdf = document.write_pdf()
            title = self._resolve_title(document.metadata.title) or "Untitled"

            os.makedirs(output_dir, exist_ok=True)
            file_path = os.path.join(output_dir, self._build_file_name(title))
            with open(file_path, "wb") as f:
                f.write(pdf)
        except Exception as e:
            # Best-effort API: callers get None instead of an exception.
            logger.error("Error creating PDF: %s", e)
            return None
        else:
            logger.info("PDF created successfully: %s", file_path)
            return file_path

    def get_webpage_title(self):
        """Return a title for the page.

        Returns:
            The stripped ``<title>`` text when the page has one, otherwise
            the last non-empty segment of the URL path. ``None`` when neither
            is available or when the page could not be loaded (the error is
            logged rather than raised).
        """
        try:
            # The title is read from the rendered document's metadata, so the
            # page has to be rendered here.
            document = weasyprint.HTML(self.url).render()
            return self._resolve_title(document.metadata.title)
        except Exception as e:
            logger.error("Error getting webpage title: %s", e)
            return None

    def _resolve_title(self, page_title):
        """Pick the page title, or fall back to the last URL path segment."""
        title = (page_title or "").strip()
        if title:
            return title
        # Dropping empty segments makes "/a/b/" and "/a/b" both yield "b"
        # and keeps a bare-domain URL from raising IndexError.
        segments = [part for part in urlparse(self.url).path.split("/") if part]
        return segments[-1] if segments else None

    def _build_file_name(self, title):
        """Build ``<domain>_<title>.pdf`` using filesystem-safe characters."""
        domain = self._slug(urlparse(self.url).netloc)
        name = self._slug(title or "") or "Untitled"
        # A URL without a network location contributes no domain prefix.
        return "_".join(part for part in (domain, name) if part) + ".pdf"

    @staticmethod
    def _slug(text):
        """Collapse runs of characters unsafe in a file name into ``_``."""
        # Path separators, colons, whitespace, etc. must not reach open():
        # they would either fail or write outside the output directory.
        return re.sub(r"[^\w.-]+", "_", text.strip()).strip("_")