lara1510 committed on
Commit
a96888a
1 Parent(s): f4c194b

Update pdf_converter.py

Browse files
Files changed (1) hide show
  1. pdf_converter.py +59 -42
pdf_converter.py CHANGED
@@ -6,46 +6,63 @@ from urllib.parse import urlparse
6
  logging.basicConfig(level=logging.INFO)
7
  logger = logging.getLogger(__name__)
8
 
9
class DocumentHandler:
    """Convert the web page at a given URL into a PDF file on disk."""

    def __init__(self, url):
        """Store the URL of the page that will be converted."""
        self.url = url

    def create_pdf(self, output_dir="data"):
        """Render ``self.url`` to a PDF file inside *output_dir*.

        The file is named ``<domain>_<title>.pdf``, with spaces in the title
        replaced by underscores; "Untitled" is used when no title is found.

        Args:
            output_dir: Directory that receives the PDF; created if missing.

        Returns:
            The path of the written file, or ``None`` if any step failed
            (the error is logged rather than raised).
        """
        try:
            pdf = weasyprint.HTML(self.url).write_pdf()

            # exist_ok avoids the check-then-create race of exists()+makedirs().
            os.makedirs(output_dir, exist_ok=True)

            # get_webpage_title() is an instance method that returns only the
            # title, so the domain is taken from the URL directly.
            title = self.get_webpage_title()
            if not title:
                title = "Untitled"
            domain = urlparse(self.url).netloc

            # Generate the PDF file name
            file_name = f"{domain}_{title.strip().replace(' ', '_')}.pdf"
            file_path = os.path.join(output_dir, file_name)

            with open(file_path, 'wb') as f:
                f.write(pdf)
            logger.info(f"PDF created successfully: {file_path}")
            return file_path
        except Exception as e:
            # Best-effort boundary: callers get None instead of an exception.
            logger.error(f"Error creating PDF: {e}")
            return None

    def get_webpage_title(self):
        """Return a title for ``self.url``.

        The document's ``<title>`` text is tried first. If that lookup fails
        or yields nothing, the last non-empty segment of the URL path is used.

        Returns:
            The title string, or ``None`` when neither source provides one.
        """
        title = None
        try:
            pdf = weasyprint.HTML(self.url)
            # NOTE(review): confirm that weasyprint.HTML exposes `.document`
            # with an `xpath` method in the installed WeasyPrint version.
            title = pdf.document.xpath('//title')[0].text
        except Exception as e:
            # A missing <title> raises IndexError here; log it and fall
            # through to the URL-based title instead of giving up.
            logger.error(f"Error getting webpage title: {e}")
        if title:
            return title

        try:
            # Ignore empty segments so trailing slashes and an empty path
            # cannot cause an IndexError.
            segments = [part for part in urlparse(self.url).path.split('/') if part]
        except Exception as e:
            logger.error(f"Error getting webpage title: {e}")
            return None
        return segments[-1] if segments else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
 
6
  logging.basicConfig(level=logging.INFO)
7
  logger = logging.getLogger(__name__)
8
 
9
def create_pdf(url, output_dir="data") -> "str | None":
    """Convert the web page at *url* to a PDF file inside *output_dir*.

    The file is named ``<domain>_<title>.pdf``, where domain and title come
    from ``get_webpage_title(url)`` and spaces in the title are replaced by
    underscores. "Untitled" is used when the title is missing or blank.

    Args:
        url: Address of the page to convert; passed to ``weasyprint.HTML``.
        output_dir: Directory that receives the PDF; created if missing.

    Returns:
        The path of the written PDF, or ``None`` if any step failed (the
        error is logged rather than raised).
    """
    try:
        # Convert the webpage content to a PDF using WeasyPrint
        pdf = weasyprint.HTML(url).write_pdf()

        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(output_dir, exist_ok=True)

        title, domain = get_webpage_title(url)

        # Strip before testing so a whitespace-only title also falls back to
        # "Untitled" instead of producing "<domain>_.pdf".
        title = (title or "").strip() or "Untitled"

        file_name = f"{domain}_{title.replace(' ', '_')}.pdf"
        file_path = os.path.join(output_dir, file_name)

        with open(file_path, 'wb') as f:
            f.write(pdf)

        logger.info(f"PDF created successfully: {file_path}")
        return file_path

    except Exception as e:
        # Best-effort boundary: callers get None instead of an exception.
        logger.error(f"Error creating PDF: {e}")
        return None
45
+
46
+
47
def get_webpage_title(url) -> tuple:
    """Derive a ``(title, domain)`` pair from *url* without fetching it.

    The domain is the URL's network location. The title is the last segment
    of the URL path, with trailing slashes ignored so that
    ``https://host/docs/page/`` yields ``"page"``.

    Args:
        url: The URL to inspect.

    Returns:
        ``(title, domain)``. The title is an empty string when the path has
        no segments. ``(None, None)`` is returned if parsing raises (the
        error is logged).
    """
    try:
        parsed_url = urlparse(url)
        domain = parsed_url.netloc

        # basename() of a path ending in "/" is "", so drop trailing slashes
        # first to keep the last real segment as the title.
        title = os.path.basename(parsed_url.path.rstrip('/'))

        return title, domain

    except Exception as e:
        # Best-effort boundary: callers get (None, None) instead of an exception.
        logger.error(f"Error getting webpage title and domain: {e}")
        return None, None
65
+
66
+
67
+
68