Spaces:
Sleeping
Sleeping
Update pdf_converter.py
Browse files- pdf_converter.py +59 -42
pdf_converter.py
CHANGED
@@ -6,46 +6,63 @@ from urllib.parse import urlparse
|
|
6 |
logging.basicConfig(level=logging.INFO)
|
7 |
logger = logging.getLogger(__name__)
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
|
|
|
6 |
logging.basicConfig(level=logging.INFO)
|
7 |
logger = logging.getLogger(__name__)
|
8 |
|
9 |
+
def create_pdf(url, output_dir="data") -> "str | None":
    """Render the page at *url* to a PDF file inside *output_dir*.

    The file is named ``<domain>_<title>.pdf``. Both parts come from
    ``get_webpage_title(url)``; spaces in the title become underscores and
    characters that are unsafe in file names are replaced with ``_``.

    Args:
        url: Location of the page; passed directly to ``weasyprint.HTML``.
        output_dir: Directory that receives the PDF. Created if missing.

    Returns:
        The path of the written PDF, or ``None`` if any step failed. Errors
        are logged, not raised.
    """
    # Path separators and characters reserved in file names on common
    # platforms. A netloc with a port ("host:8080") contains one of them.
    unsafe_chars = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})

    try:
        # Render before touching the file system, so a failed render does
        # not create the output directory.
        pdf = weasyprint.HTML(url).write_pdf()

        # exist_ok=True avoids the race between checking for the directory
        # and creating it.
        os.makedirs(output_dir, exist_ok=True)

        title, domain = get_webpage_title(url)

        # Either part may come back as None or empty (including a title
        # that is only whitespace); fall back to fixed placeholders so the
        # name never contains "None" or an empty component.
        title = (title or "").strip() or "Untitled"
        if not domain:
            domain = "unknown"

        stem = f"{domain}_{title.replace(' ', '_')}"
        file_name = f"{stem.translate(unsafe_chars)}.pdf"
        file_path = os.path.join(output_dir, file_name)

        with open(file_path, 'wb') as f:
            f.write(pdf)

        logger.info(f"PDF created successfully: {file_path}")
        return file_path

    except Exception as e:
        # Best-effort API: report the failure through the log and signal it
        # to the caller with None instead of propagating.
        logger.error(f"Error creating PDF: {e}")
        return None
|
45 |
+
|
46 |
+
|
47 |
+
def get_webpage_title(url) -> tuple:
    """Derive a ``(title, domain)`` pair from the text of *url*.

    No request is made. The title is the last non-empty segment of the URL
    path, and the domain is the URL's network location (``netloc``), which
    includes a port if one is present.

    Args:
        url: The URL string to inspect.

    Returns:
        ``(title, domain)``. The title is an empty string when the URL has
        no path segments. ``(None, None)`` is returned if parsing raises.
    """
    try:
        parsed_url = urlparse(url)
        domain = parsed_url.netloc

        # Strip trailing slashes first: basename("/blog/post/") is "", which
        # would lose the title for directory-style URLs.
        title = os.path.basename(parsed_url.path.rstrip("/"))

        return title, domain

    except Exception as e:
        # Best-effort: log the failure and let the caller handle the
        # (None, None) result.
        logger.error(f"Error getting webpage title and domain: {e}")
        return None, None
|
65 |
+
|
66 |
+
|
67 |
+
|
68 |
|