Spaces:

lara1510
/

Research_Assistant

Sleeping

App Files Files Community

lara1510 committed on Apr 19

Commit

a96888a

•

1 Parent(s): f4c194b

Update pdf_converter.py

Browse files

Files changed (1) hide show

pdf_converter.py +59 -42

pdf_converter.py CHANGED Viewed

@@ -6,46 +6,63 @@ from urllib.parse import urlparse
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Removed implementation: a class that holds one URL and renders it to a PDF
-# file with WeasyPrint.
-class DocumentHandler:
-    # Store the URL that create_pdf and get_webpage_title operate on.
-    def __init__(self, url):
-        self.url = url
-    # Render self.url to a PDF under output_dir and return the file path,
-    # or None if any step raises (the error is logged, not propagated).
-    def create_pdf(self, output_dir="data"):
-        try:
-            pdf = weasyprint.HTML(self.url).write_pdf()
-            if not os.path.exists(output_dir):
-                os.makedirs(output_dir)
-            # Get the title and domain of the webpage
-            # NOTE(review): get_webpage_title is an instance method taking
-            # `self` and returning a single value, yet it is called here on
-            # the class with no arguments and unpacked into two names. The
-            # call raises TypeError, so control always reaches the except
-            # branch below and this method returns None.
-            title, domain = DocumentHandler.get_webpage_title()
-            if not title:
-                title = "Untitled"
-            # Generate the PDF file name
-            file_name = f"{domain}_{title.strip().replace(' ', '_')}.pdf"
-            file_path = os.path.join(output_dir, file_name)
-            with open(file_path, 'wb') as f:
-                f.write(pdf)
-            logger.info(f"PDF created successfully: {file_path}")
-            return file_path
-        except Exception as e:
-            logger.error(f"Error creating PDF: {e}")
-            return None
-    # Return a title string for self.url (title only, no domain), or None if
-    # an exception occurs.
-    def get_webpage_title(self):
-        try:
-            pdf = weasyprint.HTML(self.url)
-            # NOTE(review): assumes the weasyprint.HTML object exposes a
-            # `.document` with an xpath() method -- confirm against the
-            # WeasyPrint API.
-            title = pdf.document.xpath('//title')[0].text
-            if title:
-                return title
-            else:
-                # No <title> text: fall back to the URL path.
-                parsed_url = urlparse(self.url)
-                path_components = parsed_url.path.split('/')
-                # Extract the last component of the path as the title
-                # (second-to-last when the path ends with '/', since the
-                # final component is then the empty string).
-                title = path_components[-2] if path_components[-1] == '' else path_components[-1]
-                return title
-        except Exception as e:
-            logger.error(f"Error getting webpage title: {e}")
-            return None

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+def create_pdf(url, output_dir="data") -> "str | None":
+    """Render the page at ``url`` to a PDF file inside ``output_dir``.
+
+    The file is named ``<domain>_<title>.pdf``. Both parts come from
+    ``get_webpage_title(url)``; spaces in the title become underscores and
+    an empty title is replaced by ``"Untitled"``.
+
+    Args:
+        url: Location passed to ``weasyprint.HTML`` and to
+            ``get_webpage_title``.
+        output_dir: Directory that receives the PDF; created if missing.
+
+    Returns:
+        The path of the written PDF, or ``None`` if any step failed. The
+        failure is logged rather than raised.
+    """
+    try:
+        # Render before touching the file system, so a failed conversion
+        # does not leave an empty output directory behind.
+        pdf = weasyprint.HTML(url).write_pdf()
+        # exist_ok=True avoids the race between a separate exists() check
+        # and the directory creation.
+        os.makedirs(output_dir, exist_ok=True)
+        title, domain = get_webpage_title(url)
+        # Normalise first, then fall back: a None, empty, or whitespace-only
+        # title all end up as "Untitled" instead of an empty name part.
+        title = (title or "").strip().replace(' ', '_') or "Untitled"
+        # get_webpage_title can return None or "" for the domain; join only
+        # the parts that exist so the name never starts with "None_" or "_".
+        file_name = "_".join(part for part in (domain, title) if part) + ".pdf"
+        file_path = os.path.join(output_dir, file_name)
+        with open(file_path, 'wb') as f:
+            f.write(pdf)
+        logger.info(f"PDF created successfully: {file_path}")
+        return file_path
+    except Exception as e:
+        # Deliberate best-effort boundary: callers receive None on failure.
+        # logger.exception keeps the traceback that logger.error dropped.
+        logger.exception(f"Error creating PDF: {e}")
+        return None
+def get_webpage_title(url) -> "tuple[str | None, str | None]":
+    """Derive a file-name title and the domain from ``url``.
+
+    The title is the last non-empty segment of the URL path, so
+    ``https://example.com/docs/guide/`` yields ``"guide"``. It is an empty
+    string when the URL has no path segments. The domain is the URL's
+    network location (host plus port, if present).
+
+    Args:
+        url: The URL to inspect.
+
+    Returns:
+        ``(title, domain)``, or ``(None, None)`` if the URL cannot be
+        parsed. The failure is logged rather than raised.
+    """
+    try:
+        parsed_url = urlparse(url)
+        domain = parsed_url.netloc
+        # URL paths always use "/", so split on it directly rather than via
+        # the platform-dependent os.path.basename. Trailing slashes are
+        # dropped first; otherwise every URL ending in "/" would produce an
+        # empty title.
+        title = parsed_url.path.rstrip('/').rsplit('/', 1)[-1]
+        return title, domain
+    except Exception as e:
+        # Best-effort contract: report the failure and hand back a pair of
+        # None values so callers can still unpack the result.
+        logger.exception(f"Error getting webpage title and domain: {e}")
+        return None, None