Spaces:

oceansweep
/

tldw

Running

App Files Files Community

oceansweep committed on Sep 14

Commit

0b8741e

•

1 Parent(s): 9a2dd5b

Update App_Function_Libraries/PDF_Ingestion_Lib.py

Browse files

Files changed (1) hide show

App_Function_Libraries/PDF_Ingestion_Lib.py +317 -317

App_Function_Libraries/PDF_Ingestion_Lib.py CHANGED Viewed

@@ -1,318 +1,318 @@
-# PDF_Ingestion_Lib.py
-#########################################
-# Library to hold functions for ingesting PDF files.#
-#
-####################
-# Function List
-#
-# 1. convert_pdf_to_markdown(pdf_path)
-# 2. ingest_pdf_file(file_path, title=None, author=None, keywords=None):
-# 3.
-#
-#
-####################
-import re
-# Import necessary libraries
-# Import Local
-#######################################################################################################################
-# Function Definitions
-#
-# Ingest a text file into the database with Title/Author/Keywords
-# Constants
-MAX_FILE_SIZE_MB = 50
-CONVERSION_TIMEOUT_SECONDS = 300
-# Marker PDF solution
-# def convert_pdf_to_markdown(pdf_path):
-#     """
-#     Convert a PDF file to Markdown by calling a script in another virtual environment.
-#     """
-#
-#     logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
-#     # Check if the file size exceeds the maximum allowed size
-#     file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
-#     if file_size_mb > MAX_FILE_SIZE_MB:
-#         raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
-#
-#     logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
-#     # Path to the Python interpreter in the other virtual environment
-#     other_venv_python = "Helper_Scripts/marker_venv/bin/python"
-#
-#     # Path to the conversion script
-#     converter_script = "Helper_Scripts/PDF_Converter.py"
-#
-#     logging.debug("Marker: Attempting to convert PDF file to Markdown...")
-#     try:
-#         result = subprocess.run(
-#             [other_venv_python, converter_script, pdf_path],
-#             capture_output=True,
-#             text=True,
-#             timeout=CONVERSION_TIMEOUT_SECONDS
-#         )
-#         if result.returncode != 0:
-#             raise Exception(f"Conversion failed: {result.stderr}")
-#         return result.stdout
-#     except subprocess.TimeoutExpired:
-#         raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
-#
-#
-# def process_and_ingest_pdf(file, title, author, keywords):
-#     if file is None:
-#         return "Please select a PDF file to upload."
-#
-#     try:
-#         # Create a temporary directory
-#         with tempfile.TemporaryDirectory() as temp_dir:
-#             # Create a path for the temporary PDF file
-#             temp_path = os.path.join(temp_dir, "temp.pdf")
-#
-#             # Copy the contents of the uploaded file to the temporary file
-#             shutil.copy(file.name, temp_path)
-#
-#             # Call the ingest_pdf_file function with the temporary file path
-#             result = ingest_pdf_file(temp_path, title, author, keywords)
-#
-#         return result
-#     except Exception as e:
-#         return f"Error processing PDF: {str(e)}"
-#
-#
-# def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
-#     try:
-#         # Convert PDF to Markdown
-#         markdown_content = convert_pdf_to_markdown(file_path)
-#
-#         # If title is not provided, use the filename without extension
-#         if not title:
-#             title = os.path.splitext(os.path.basename(file_path))[0]
-#
-#         # If author is not provided, set it to 'Unknown'
-#         if not author:
-#             author = 'Unknown'
-#
-#         # If keywords are not provided, use a default keyword
-#         if not keywords:
-#             keywords = 'pdf_file,markdown_converted'
-#         else:
-#             keywords = f'pdf_file,markdown_converted,{keywords}'
-#
-#         # Add the markdown content to the database
-#         add_media_with_keywords(
-#             url=file_path,
-#             title=title,
-#             media_type='document',
-#             content=markdown_content,
-#             keywords=keywords,
-#             prompt='No prompt for PDF files',
-#             summary='No summary for PDF files',
-#             transcription_model='None',
-#             author=author,
-#             ingestion_date=datetime.now().strftime('%Y-%m-%d')
-#         )
-#
-#         return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
-#     except ValueError as e:
-#         logging.error(f"File size error: {str(e)}")
-#         return f"Error: {str(e)}", file_path
-#     except Exception as e:
-#         logging.error(f"Error ingesting PDF file: {str(e)}")
-#         return f"Error ingesting PDF file: {str(e)}", file_path
-#
-#
-# def process_and_cleanup_pdf(file, title, author, keywords):
-#     # FIXME - Update to validate file upload/filetype is pdf....
-#     if file is None:
-#         return "No file uploaded. Please upload a PDF file."
-#
-#     temp_dir = tempfile.mkdtemp()
-#     temp_file_path = os.path.join(temp_dir, "temp.pdf")
-#
-#     try:
-#         # Copy the uploaded file to a temporary location
-#         shutil.copy2(file.name, temp_file_path)
-#
-#         # Process the file
-#         result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
-#
-#         return result
-#     except Exception as e:
-#         logging.error(f"Error in processing and cleanup: {str(e)}")
-#         return f"Error: {str(e)}"
-#     finally:
-#         # Clean up the temporary directory and its contents
-#         try:
-#             shutil.rmtree(temp_dir)
-#             logging.info(f"Removed temporary directory: {temp_dir}")
-#         except Exception as cleanup_error:
-#             logging.error(f"Error during cleanup: {str(cleanup_error)}")
-#             result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
-import logging
-#
-#
-#######################################################################################################################
-#
-# Non-Marker implementation
-import os
-import shutil
-import tempfile
-from datetime import datetime
-import pymupdf
-from App_Function_Libraries.DB_Manager import add_media_with_keywords
-def extract_text_and_format_from_pdf(pdf_path):
-    """
-    Extract text from a PDF file and convert it to Markdown, preserving formatting.
-    """
-    try:
-        markdown_text = ""
-        with pymupdf.open(pdf_path) as doc:
-            for page_num, page in enumerate(doc, 1):
-                markdown_text += f"## Page {page_num}\n\n"
-                blocks = page.get_text("dict")["blocks"]
-                current_paragraph = ""
-                for block in blocks:
-                    if block["type"] == 0:  # Text block
-                        for line in block["lines"]:
-                            line_text = ""
-                            for span in line["spans"]:
-                                text = span["text"]
-                                font_size = span["size"]
-                                font_flags = span["flags"]
-                                # Apply formatting based on font size and flags
-                                if font_size > 20:
-                                    text = f"# {text}"
-                                elif font_size > 16:
-                                    text = f"## {text}"
-                                elif font_size > 14:
-                                    text = f"### {text}"
-                                if font_flags & 2 ** 0:  # Bold
-                                    text = f"**{text}**"
-                                if font_flags & 2 ** 1:  # Italic
-                                    text = f"*{text}*"
-                                line_text += text + " "
-                            # Remove hyphens at the end of lines
-                            line_text = line_text.rstrip()
-                            if line_text.endswith('-'):
-                                line_text = line_text[:-1]
-                            else:
-                                line_text += " "
-                            current_paragraph += line_text
-                        # End of block, add paragraph
-                        if current_paragraph:
-                            # Remove extra spaces
-                            current_paragraph = re.sub(r'\s+', ' ', current_paragraph).strip()
-                            markdown_text += current_paragraph + "\n\n"
-                            current_paragraph = ""
-                    elif block["type"] == 1:  # Image block
-                        markdown_text += "[Image]\n\n"
-                markdown_text += "\n---\n\n"  # Page separator
-        # Clean up hyphenated words
-        markdown_text = re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', markdown_text)
-        return markdown_text
-    except Exception as e:
-        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
-        raise
-def extract_metadata_from_pdf(pdf_path):
-    """
-    Extract metadata from a PDF file using PyMuPDF.
-    """
-    try:
-        with pymupdf.open(pdf_path) as doc:
-            metadata = doc.metadata
-        return metadata
-    except Exception as e:
-        logging.error(f"Error extracting metadata from PDF: {str(e)}")
-        return {}
-def process_and_ingest_pdf(file, title, author, keywords):
-    if file is None:
-        return "Please select a PDF file to upload."
-    try:
-        # Create a temporary directory
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # Create a path for the temporary PDF file
-            temp_path = os.path.join(temp_dir, "temp.pdf")
-            # Copy the contents of the uploaded file to the temporary file
-            shutil.copy(file.name, temp_path)
-            # Extract text and convert to Markdown
-            markdown_text = extract_text_and_format_from_pdf(temp_path)
-            # Extract metadata from PDF
-            metadata = extract_metadata_from_pdf(temp_path)
-            # Use metadata for title and author if not provided
-            if not title:
-                title = metadata.get('title', os.path.splitext(os.path.basename(file.name))[0])
-            if not author:
-                author = metadata.get('author', 'Unknown')
-            # If keywords are not provided, use a default keyword
-            if not keywords:
-                keywords = 'pdf_file,markdown_converted'
-            else:
-                keywords = f'pdf_file,markdown_converted,{keywords}'
-            # Add metadata-based keywords
-            if 'subject' in metadata:
-                keywords += f",{metadata['subject']}"
-            # Add the PDF content to the database
-            add_media_with_keywords(
-                url=file.name,
-                title=title,
-                media_type='document',
-                content=markdown_text,
-                keywords=keywords,
-                prompt='No prompt for PDF files',
-                summary='No summary for PDF files',
-                transcription_model='None',
-                author=author,
-                ingestion_date=datetime.now().strftime('%Y-%m-%d')
-            )
-        return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
-    except Exception as e:
-        logging.error(f"Error ingesting PDF file: {str(e)}")
-        return f"Error ingesting PDF file: {str(e)}"
-def process_and_cleanup_pdf(file, title, author, keywords):
-    if file is None:
-        return "No file uploaded. Please upload a PDF file."
-    try:
-        result = process_and_ingest_pdf(file, title, author, keywords)
-        return result
-    except Exception as e:
-        logging.error(f"Error in processing and cleanup: {str(e)}")
-        return f"Error: {str(e)}"
-#
-# End of PDF_Ingestion_Lib.py
 #######################################################################################################################

+# PDF_Ingestion_Lib.py
+#########################################
+# Library to hold functions for ingesting PDF files.#
+#
+####################
+# Function List
+#
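+# 1. extract_text_and_format_from_pdf(pdf_path)
+# 2. extract_metadata_from_pdf(pdf_path)
+# 3. process_and_ingest_pdf(file, title, author, keywords)
+# 4. process_and_cleanup_pdf(file, title, author, keywords)
+#
+# (The Marker-based convert_pdf_to_markdown(pdf_path) and
+#  ingest_pdf_file(file_path, title=None, author=None, keywords=None)
+#  are kept below only as commented-out code.)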
+#
+#
+####################
+import re
+# Import necessary libraries
+# Import Local
+#######################################################################################################################
+# Function Definitions
+#
+# Ingest a PDF file into the database with Title/Author/Keywords
+# Constants
+MAX_FILE_SIZE_MB = 50
+CONVERSION_TIMEOUT_SECONDS = 300
+# Marker PDF solution
+# def convert_pdf_to_markdown(pdf_path):
+#     """
+#     Convert a PDF file to Markdown by calling a script in another virtual environment.
+#     """
+#
+#     logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
+#     # Check if the file size exceeds the maximum allowed size
+#     file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
+#     if file_size_mb > MAX_FILE_SIZE_MB:
+#         raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
+#
+#     logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
+#     # Path to the Python interpreter in the other virtual environment
+#     other_venv_python = "Helper_Scripts/marker_venv/bin/python"
+#
+#     # Path to the conversion script
+#     converter_script = "Helper_Scripts/PDF_Converter.py"
+#
+#     logging.debug("Marker: Attempting to convert PDF file to Markdown...")
+#     try:
+#         result = subprocess.run(
+#             [other_venv_python, converter_script, pdf_path],
+#             capture_output=True,
+#             text=True,
+#             timeout=CONVERSION_TIMEOUT_SECONDS
+#         )
+#         if result.returncode != 0:
+#             raise Exception(f"Conversion failed: {result.stderr}")
+#         return result.stdout
+#     except subprocess.TimeoutExpired:
+#         raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
+#
+#
+# def process_and_ingest_pdf(file, title, author, keywords):
+#     if file is None:
+#         return "Please select a PDF file to upload."
+#
+#     try:
+#         # Create a temporary directory
+#         with tempfile.TemporaryDirectory() as temp_dir:
+#             # Create a path for the temporary PDF file
+#             temp_path = os.path.join(temp_dir, "temp.pdf")
+#
+#             # Copy the contents of the uploaded file to the temporary file
+#             shutil.copy(file.name, temp_path)
+#
+#             # Call the ingest_pdf_file function with the temporary file path
+#             result = ingest_pdf_file(temp_path, title, author, keywords)
+#
+#         return result
+#     except Exception as e:
+#         return f"Error processing PDF: {str(e)}"
+#
+#
+# def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
+#     try:
+#         # Convert PDF to Markdown
+#         markdown_content = convert_pdf_to_markdown(file_path)
+#
+#         # If title is not provided, use the filename without extension
+#         if not title:
+#             title = os.path.splitext(os.path.basename(file_path))[0]
+#
+#         # If author is not provided, set it to 'Unknown'
+#         if not author:
+#             author = 'Unknown'
+#
+#         # If keywords are not provided, use a default keyword
+#         if not keywords:
+#             keywords = 'pdf_file,markdown_converted'
+#         else:
+#             keywords = f'pdf_file,markdown_converted,{keywords}'
+#
+#         # Add the markdown content to the database
+#         add_media_with_keywords(
+#             url=file_path,
+#             title=title,
+#             media_type='document',
+#             content=markdown_content,
+#             keywords=keywords,
+#             prompt='No prompt for PDF files',
+#             summary='No summary for PDF files',
+#             transcription_model='None',
+#             author=author,
+#             ingestion_date=datetime.now().strftime('%Y-%m-%d')
+#         )
+#
+#         return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
+#     except ValueError as e:
+#         logging.error(f"File size error: {str(e)}")
+#         return f"Error: {str(e)}", file_path
+#     except Exception as e:
+#         logging.error(f"Error ingesting PDF file: {str(e)}")
+#         return f"Error ingesting PDF file: {str(e)}", file_path
+#
+#
+# def process_and_cleanup_pdf(file, title, author, keywords):
+#     # FIXME - Update to validate file upload/filetype is pdf....
+#     if file is None:
+#         return "No file uploaded. Please upload a PDF file."
+#
+#     temp_dir = tempfile.mkdtemp()
+#     temp_file_path = os.path.join(temp_dir, "temp.pdf")
+#
+#     try:
+#         # Copy the uploaded file to a temporary location
+#         shutil.copy2(file.name, temp_file_path)
+#
+#         # Process the file
+#         result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
+#
+#         return result
+#     except Exception as e:
+#         logging.error(f"Error in processing and cleanup: {str(e)}")
+#         return f"Error: {str(e)}"
+#     finally:
+#         # Clean up the temporary directory and its contents
+#         try:
+#             shutil.rmtree(temp_dir)
+#             logging.info(f"Removed temporary directory: {temp_dir}")
+#         except Exception as cleanup_error:
+#             logging.error(f"Error during cleanup: {str(cleanup_error)}")
+#             result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
+import logging
+#
+#
+#######################################################################################################################
+#
+# Non-Marker implementation
+import os
+import shutil
+import tempfile
+from datetime import datetime
+import pymupdf
+from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
+def extract_text_and_format_from_pdf(pdf_path):
+    """
+    Extract text from a PDF file and convert it to Markdown, preserving formatting.
+    """
+    try:
+        markdown_text = ""
+        with pymupdf.open(pdf_path) as doc:
+            for page_num, page in enumerate(doc, 1):
+                markdown_text += f"## Page {page_num}\n\n"
+                blocks = page.get_text("dict")["blocks"]
+                current_paragraph = ""
+                for block in blocks:
+                    if block["type"] == 0:  # Text block
+                        for line in block["lines"]:
+                            line_text = ""
+                            for span in line["spans"]:
+                                text = span["text"]
+                                font_size = span["size"]
+                                font_flags = span["flags"]
+                                # Apply formatting based on font size and flags
+                                if font_size > 20:
+                                    text = f"# {text}"
+                                elif font_size > 16:
+                                    text = f"## {text}"
+                                elif font_size > 14:
+                                    text = f"### {text}"
+                                if font_flags & 2 ** 0:  # Bold
+                                    text = f"**{text}**"
+                                if font_flags & 2 ** 1:  # Italic
+                                    text = f"*{text}*"
+                                line_text += text + " "
+                            # Remove hyphens at the end of lines
+                            line_text = line_text.rstrip()
+                            if line_text.endswith('-'):
+                                line_text = line_text[:-1]
+                            else:
+                                line_text += " "
+                            current_paragraph += line_text
+                        # End of block, add paragraph
+                        if current_paragraph:
+                            # Remove extra spaces
+                            current_paragraph = re.sub(r'\s+', ' ', current_paragraph).strip()
+                            markdown_text += current_paragraph + "\n\n"
+                            current_paragraph = ""
+                    elif block["type"] == 1:  # Image block
+                        markdown_text += "[Image]\n\n"
+                markdown_text += "\n---\n\n"  # Page separator
+        # Clean up hyphenated words
+        markdown_text = re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', markdown_text)
+        return markdown_text
+    except Exception as e:
+        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
+        raise
+def extract_metadata_from_pdf(pdf_path):
+    """
+    Extract metadata from a PDF file using PyMuPDF.
+    """
+    try:
+        with pymupdf.open(pdf_path) as doc:
+            metadata = doc.metadata
+        return metadata
+    except Exception as e:
+        logging.error(f"Error extracting metadata from PDF: {str(e)}")
+        return {}
+def process_and_ingest_pdf(file, title, author, keywords):
+    if file is None:
+        return "Please select a PDF file to upload."
+    try:
+        # Create a temporary directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Create a path for the temporary PDF file
+            temp_path = os.path.join(temp_dir, "temp.pdf")
+            # Copy the contents of the uploaded file to the temporary file
+            shutil.copy(file.name, temp_path)
+            # Extract text and convert to Markdown
+            markdown_text = extract_text_and_format_from_pdf(temp_path)
+            # Extract metadata from PDF
+            metadata = extract_metadata_from_pdf(temp_path)
+            # Use metadata for title and author if not provided
+            if not title:
+                title = metadata.get('title', os.path.splitext(os.path.basename(file.name))[0])
+            if not author:
+                author = metadata.get('author', 'Unknown')
+            # If keywords are not provided, use a default keyword
+            if not keywords:
+                keywords = 'pdf_file,markdown_converted'
+            else:
+                keywords = f'pdf_file,markdown_converted,{keywords}'
+            # Add metadata-based keywords
+            if 'subject' in metadata:
+                keywords += f",{metadata['subject']}"
+            # Add the PDF content to the database
+            add_media_with_keywords(
+                url=file.name,
+                title=title,
+                media_type='document',
+                content=markdown_text,
+                keywords=keywords,
+                prompt='No prompt for PDF files',
+                summary='No summary for PDF files',
+                transcription_model='None',
+                author=author,
+                ingestion_date=datetime.now().strftime('%Y-%m-%d')
+            )
+        return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
+    except Exception as e:
+        logging.error(f"Error ingesting PDF file: {str(e)}")
+        return f"Error ingesting PDF file: {str(e)}"
+def process_and_cleanup_pdf(file, title, author, keywords):
+    if file is None:
+        return "No file uploaded. Please upload a PDF file."
+    try:
+        result = process_and_ingest_pdf(file, title, author, keywords)
+        return result
+    except Exception as e:
+        logging.error(f"Error in processing and cleanup: {str(e)}")
+        return f"Error: {str(e)}"
+#
+# End of PDF_Ingestion_Lib.py
 #######################################################################################################################