lara1510 committed on
Commit
8a25a26
1 Parent(s): 0bf16bf

Create pdf_converter.py

Browse files
Files changed (1) hide show
  1. pdf_converter.py +51 -0
pdf_converter.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import weasyprint
4
+ from urllib.parse import urlparse
5
+
6
# Configure the root logger at INFO level as an import-time side effect, then
# create the module-level logger that DocumentHandler uses for status and
# error messages.
+ logging.basicConfig(level=logging.INFO)
7
+ logger = logging.getLogger(__name__)
8
+
9
class DocumentHandler:
    """Render a web page to a PDF file named after its domain and title.

    The handler wraps a single URL (or anything else ``weasyprint.HTML``
    accepts as its first positional argument). ``create_pdf`` writes the
    rendered PDF to disk; ``get_webpage_title`` returns a human-readable
    title for the page.
    """

    def __init__(self, url):
        """Remember the location of the page to convert.

        Args:
            url: Address of the page, passed unchanged to ``weasyprint.HTML``.
        """
        self.url = url

    def create_pdf(self, output_dir="data"):
        """Render the page to ``<output_dir>/<domain>_<title>.pdf``.

        Args:
            output_dir: Directory that receives the PDF; created if missing.

        Returns:
            The path of the written PDF, or ``None`` if any step failed (the
            error is logged rather than raised).
        """
        try:
            # Parse the page once and reuse it for both rendering and title
            # extraction, instead of fetching the URL twice.
            html = weasyprint.HTML(self.url)
            pdf = html.write_pdf()

            # exist_ok avoids the check-then-create race of os.path.exists().
            os.makedirs(output_dir, exist_ok=True)

            title = self._resolve_title(html) or "Untitled"
            file_name = self._build_file_name(self.url, title)
            file_path = os.path.join(output_dir, file_name)

            with open(file_path, 'wb') as f:
                f.write(pdf)
            logger.info(f"PDF created successfully: {file_path}")
            return file_path
        except Exception as e:
            # Top-level boundary: callers rely on None instead of an exception.
            logger.error(f"Error creating PDF: {e}")
            return None

    def get_webpage_title(self):
        """Return a title for the page.

        The ``<title>`` element of the page is preferred. If the page cannot
        be loaded or has no usable title, the last non-empty component of the
        URL path is used instead.

        Returns:
            The title string, or ``None`` if neither source yields one.
        """
        try:
            html = weasyprint.HTML(self.url)
        except Exception as e:
            logger.error(f"Error getting webpage title: {e}")
            return self._title_from_url(self.url)
        return self._resolve_title(html)

    def _resolve_title(self, html):
        """Return the title of an already-parsed page, else the URL fallback."""
        try:
            title = self._extract_title(html)
        except Exception as e:
            # A title problem must not abort the whole conversion.
            logger.error(f"Error getting webpage title: {e}")
            title = None
        return title or self._title_from_url(self.url)

    @staticmethod
    def _extract_title(html):
        """Return the stripped ``<title>`` text of a parsed page, or ``None``."""
        # etree_element is a standard ElementTree element, so find() is used;
        # it has no lxml-style xpath() method.
        title_element = html.etree_element.find('.//title')
        if title_element is None or not title_element.text:
            return None
        return title_element.text.strip() or None

    @staticmethod
    def _title_from_url(url):
        """Return the last non-empty path component of *url*, or ``None``."""
        components = [part for part in urlparse(url).path.split('/') if part]
        return components[-1] if components else None

    @staticmethod
    def _safe_component(text):
        """Replace characters that are unsafe in a file name with ``_``."""
        return "".join(
            ch if ch.isalnum() or ch in "-_." else "_" for ch in text.strip()
        )

    @staticmethod
    def _build_file_name(url, title):
        """Return ``<domain>_<title>.pdf``; the domain is omitted if *url* has none."""
        domain = DocumentHandler._safe_component(urlparse(url).netloc)
        safe_title = DocumentHandler._safe_component(title)
        stem = "_".join(part for part in (domain, safe_title) if part)
        return f"{stem}.pdf"
51
+