Spaces:

lara1510
/

Research_Assistant

Sleeping

App Files Files Community

lara1510 committed on Apr 18

Commit

8a25a26

•

1 Parent(s): 0bf16bf

Create pdf_converter.py

Browse files

Files changed (1) hide show

pdf_converter.py +51 -0

pdf_converter.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import os
+import logging
+import weasyprint
+from urllib.parse import urlparse
+logging.basicConfig(level=logging.INFO)  # runs at import time; sets the root logger to INFO unless handlers are already configured
+logger = logging.getLogger(__name__)  # module-level logger used by DocumentHandler for its success and error messages
+class DocumentHandler:
+    def __init__(self, url):
+        self.url = url
+    def create_pdf(self, output_dir="data"):
+        try:
+            pdf = weasyprint.HTML(self.url).write_pdf()
+            if not os.path.exists(output_dir):
+                os.makedirs(output_dir)
+            # Get the title and domain of the webpage
+            title, domain = DocumentHandler.get_webpage_title()
+            if not title:
+                title = "Untitled"
+            # Generate the PDF file name
+            file_name = f"{domain}_{title.strip().replace(' ', '_')}.pdf"
+            file_path = os.path.join(output_dir, file_name)
+            with open(file_path, 'wb') as f:
+                f.write(pdf)
+            logger.info(f"PDF created successfully: {file_path}")
+            return file_path
+        except Exception as e:
+            logger.error(f"Error creating PDF: {e}")
+            return None
+    def get_webpage_title(self):
+        try:
+            pdf = weasyprint.HTML(self.url)
+            title = pdf.document.xpath('//title')[0].text
+            if title:
+                return title
+            else:
+                parsed_url = urlparse(self.url)
+                path_components = parsed_url.path.split('/')
+                # Extract the last component of the path as the title
+                title = path_components[-2] if path_components[-1] == '' else path_components[-1]
+                return title
+        except Exception as e:
+            logger.error(f"Error getting webpage title: {e}")
+            return None